Пример #1
0
    def test(self):
        print 'Starting analysis'

        for trie_name in self.trie_files:
            print 'Starting', trie_name
            correct_number = 0
            all_number = 0
            s = Stemmer(self.plp, filename=trie_name, word_type=None)
            corrects_file = codecs.open('../wyniki/single_name/wies_miasto_kolonia_osada/success_' + trie_name.replace('bak', 'txt'), 'w', 'utf-8')
            result_file = codecs.open('../wyniki/single_name/wies_miasto_kolonia_osada/' + trie_name.replace('bak', 'txt'), 'w', 'utf-8')
            result_file.write(u'Dopełniacz;Mianownik;Wynik Stemmera\n')
            corrects_file.write(u'Dopełniacz;Mianownik;Wynik Stemmera\n')
            # for k, v in self.cities.iteritems():
            cities = codecs.open('../data/cities_wies_miasto_kolonia_osada.csv', 'r', 'utf-8')
            for city in cities:
                k = city.split(';')[1].strip()
                v = city.split(';')[0].strip()
                all_number += 1
                basic_form = ''
                # word_labels = []
                # if k.__contains__('-'):
                #     for city_parts in v.split('-'):
                #         b = s.find_basic_form(city_parts)
                #         basic_form += b.basic_form + '-'
                #         word_labels.append(b.word_labels)
                #     basic_form = basic_form[0:basic_form.__len__() - 1]
                # else:
                #     for city_parts in v.split(' '):
                #         b = s.find_basic_form(city_parts)
                #         basic_form += b.basic_form + ' '
                #         word_labels.append(b.word_labels)

                basic_form = s.find_basic_form(v).basic_form.strip()
                if basic_form != k:
                # if basic_form == k:
                    result_file.write(v + ';' + k + ';' + basic_form + ';')
                    # for w_label in word_labels:
                    #     result_file.write(self.find_most_label(w_label) + ' ')
                    result_file.write('\n')
                else:
                #     corrects_file.write(v + ';' + k + ';' + basic_form + ';')
                    # for label in s.find_labels(word_labels):
                    #     corrects_file.write(label + ' ')
                    # corrects_file.write('\n')
                    correct_number += 1
            result_file.write(u'Liczba miejscowości;Liczba niepoprawnie rozpoznanych;Liczba poprawnie rozpoznanych\n')
            result_file.write(
                str(all_number) + ';' + str(all_number - correct_number) + ';' + str(correct_number))
            print 'Done', trie_name
Пример #2
0
import codecs

from plp import PLP
from stemmer import Stemmer

__author__ = 'maciej'

plp = PLP()
plp._init()

ile_poprawnych = 0
ile_wszystkich = 0

s = Stemmer(plp, filename='trie.bak', word_type=None)
f = codecs.open('test.txt', 'r', 'utf-8')

for line in f:
    ile_wszystkich += 1
    parts = line.split(',')
    b_form = s.find_basic_form(parts[0])
    if b_form.basic_form.strip() == parts[1].strip():
        ile_poprawnych += 1
    else:
        print b_form.basic_form, ';', parts[1], ';', parts[0]

print 'Liczba poprawnie rozpoznanych: ', ile_poprawnych, '\nLiczba niepoprawnie rozpoznanych:', ile_wszystkich - ile_poprawnych