Exemplo n.º 1
0
 def test_VC_measure(self):
     """Check that Stemmer.m() returns the expected measure for every VC_DATA entry.

     VC_DATA is iterated as a mapping of word -> expected measure.
     """
     stemmer = Stemmer()
     for word, measure in VC_DATA.items():
         # Compute once so the assertion and its message report the same value.
         calculated = stemmer.m(word)
         # Adjacent string literals (instead of a backslash inside the
         # literal) keep source indentation out of the failure message.
         self.assertEqual(
             calculated, measure,
             "Measure test failed for word '%s' calculated (%d) "
             "should have been (%d)" % (word, calculated, measure))
Exemplo n.º 2
0
    def stem(self, min_word_count=10):
        """Stem every token of every mail, in place.

        A Stemmer is built from the entries of ``self.vocab`` whose count is
        at least ``min_word_count``; each ``mail.sents`` is then replaced by
        its stemmed counterpart and the stemmer is kept on ``self.stemmer``.
        """
        # Restrict the vocabulary to sufficiently frequent words.
        frequent = {}
        for word, count in self.vocab.items():
            if count >= min_word_count:
                frequent[word] = count
        stemmer = Stemmer(frequent)

        for mail in self.mails:
            stemmed_sents = []
            for sent in mail.sents:
                stemmed_sents.append([stemmer.stem(token) for token in sent])
            mail.sents = stemmed_sents

        self.stemmer = stemmer
Exemplo n.º 3
0
 def test_stem(self):
     """Check that each word of voc.txt stems to the same-numbered line of output.txt.

     Both files are read line by line in lockstep; surrounding whitespace
     is stripped from every word and every expected stem.
     """
     stemmer = Stemmer()
     # open() inside `with` replaces the Python-2-only file() builtin and
     # guarantees both handles are closed, even when an assertion fails.
     with open('output.txt') as output:
         with open('voc.txt') as vocabulary:
             for word in vocabulary:
                 word = word.strip()
                 # next() works on Python 2.6+ and 3; it raises StopIteration
                 # if output.txt has fewer lines than voc.txt.
                 stem = next(output).strip()
                 # The message reports stemmer.stemmed, as the original did.
                 # NOTE(review): presumably this holds the last result of
                 # stem() -- confirm against the Stemmer class.
                 self.assertEqual(
                     stemmer.stem(word), stem,
                     "Test failed for word '%s' stemmed "
                     "to %s should have been %s"
                     % (word, stemmer.stemmed, stem))
Exemplo n.º 4
0
    def test(self):
        print 'Starting analysis'

        for trie_name in self.trie_files:
            print 'Starting', trie_name
            correct_number = 0
            all_number = 0
            s = Stemmer(self.plp, filename=trie_name, word_type=None)
            corrects_file = codecs.open('../wyniki/single_name/wies_miasto_kolonia_osada/success_' + trie_name.replace('bak', 'txt'), 'w', 'utf-8')
            result_file = codecs.open('../wyniki/single_name/wies_miasto_kolonia_osada/' + trie_name.replace('bak', 'txt'), 'w', 'utf-8')
            result_file.write(u'Dopełniacz;Mianownik;Wynik Stemmera\n')
            corrects_file.write(u'Dopełniacz;Mianownik;Wynik Stemmera\n')
            # for k, v in self.cities.iteritems():
            cities = codecs.open('../data/cities_wies_miasto_kolonia_osada.csv', 'r', 'utf-8')
            for city in cities:
                k = city.split(';')[1].strip()
                v = city.split(';')[0].strip()
                all_number += 1
                basic_form = ''
                # word_labels = []
                # if k.__contains__('-'):
                #     for city_parts in v.split('-'):
                #         b = s.find_basic_form(city_parts)
                #         basic_form += b.basic_form + '-'
                #         word_labels.append(b.word_labels)
                #     basic_form = basic_form[0:basic_form.__len__() - 1]
                # else:
                #     for city_parts in v.split(' '):
                #         b = s.find_basic_form(city_parts)
                #         basic_form += b.basic_form + ' '
                #         word_labels.append(b.word_labels)

                basic_form = s.find_basic_form(v).basic_form.strip()
                if basic_form != k:
                # if basic_form == k:
                    result_file.write(v + ';' + k + ';' + basic_form + ';')
                    # for w_label in word_labels:
                    #     result_file.write(self.find_most_label(w_label) + ' ')
                    result_file.write('\n')
                else:
                #     corrects_file.write(v + ';' + k + ';' + basic_form + ';')
                    # for label in s.find_labels(word_labels):
                    #     corrects_file.write(label + ' ')
                    # corrects_file.write('\n')
                    correct_number += 1
            result_file.write(u'Liczba miejscowości;Liczba niepoprawnie rozpoznanych;Liczba poprawnie rozpoznanych\n')
            result_file.write(
                str(all_number) + ';' + str(all_number - correct_number) + ';' + str(correct_number))
            print 'Done', trie_name
Exemplo n.º 5
0
Arquivo: main.py Projeto: Attil/WEDT
def main(args):
    """Load and stem every input, then run both clusterers on each one.

    Args:
        args: sequence of inputs; each item is passed to
            ``DataLoader.load_data`` and echoed in the progress message.
            It is iterated twice, so it must not be a one-shot iterator.
    """
    dl = DataLoader()
    stemmer = Stemmer('porter')

    # One dict per input, mapping element[0] -> stemmed element[1]; elements
    # whose stemmed value is falsy are dropped.
    files = []
    for path in args:
        stemmed = {}
        for element in dl.load_data(path):
            # Stem once and reuse the result for both the filter and the value.
            words = stemmer.stem(element[1])
            if words:
                stemmed[element[0]] = words
        files.append(stemmed)

    for stemmed, arg in zip(files, args):
        print('Processing file {}...'.format(arg))
        # Materialise every stemmed value as a list before clustering.
        data = {k: list(v) for k, v in stemmed.items()}

        print('Data Clusterer')
        test_clusterer(DataClusterer(list(data.values()), 'euclidean'), data)

        print('-' * 64)

        print('Description Clusterer')
        test_clusterer(DescriptionClusterer(list(data.values()), 'cosine'), data)
Exemplo n.º 6
0
import codecs

from plp import PLP
from stemmer import Stemmer

__author__ = 'maciej'

plp = PLP()
plp._init()

ile_poprawnych = 0
ile_wszystkich = 0

s = Stemmer(plp, filename='trie.bak', word_type=None)
f = codecs.open('test.txt', 'r', 'utf-8')

for line in f:
    ile_wszystkich += 1
    parts = line.split(',')
    b_form = s.find_basic_form(parts[0])
    if b_form.basic_form.strip() == parts[1].strip():
        ile_poprawnych += 1
    else:
        print b_form.basic_form, ';', parts[1], ';', parts[0]

print 'Liczba poprawnie rozpoznanych: ', ile_poprawnych, '\nLiczba niepoprawnie rozpoznanych:', ile_wszystkich - ile_poprawnych
    "duree",
    "ville",
    "lieu",
    "labo",
]

outdir = "archives_SFBI_AnnotationManuelle"

# Tokenize every mail description and tally how often each token occurs.
words = Counter()
mails = list(mailLoaderGen())
for mail in mails:
    mail.sents = list(iterTokenizedSentences(mail.description))
    for sent in mail.sents:
        words.update(sent)

# The stemmer only receives tokens seen more than 10 times.
stemmer = Stemmer({word for word, n in words.items() if n > 10})

for m in mails:
    outf = outdir + m.mailfile.strip("archives_SFBI")
    d = m.__dict__
    d["date"] = date.fromtimestamp(d["timestamp"]).strftime("%d %B %Y")

    with open(outf, "wt") as f:
        d["from"] = d.pop("sender")
        if m.sfbi:
            ce = d["contact-email"]
            ce = "\t".join(ce) if type(ce) is set else ce
            d["contact-email"] = ce.replace(" [dot] ", ".").replace("[at]", "@")

            cn = d["contact-nom"]
            d["contact-nom"] = "\t".join(cn) if type(cn) is set else cn