Exemplo n.º 1
0
# coding=utf-8
from collections import Counter

from plp import PLP

# Module-level PLP instance; basic_form() calls its rec() and bform() methods.
p = PLP()


def basic_form(word):
    """Return the base form PLP reports for ``word``.

    The word is looked up with ``p.rec``; the base form of the first
    returned id is used.  When the lookup returns nothing, ``word`` is
    handed back unchanged.
    """
    candidate_ids = p.rec(word)
    if len(candidate_ids) > 0:
        return p.bform(candidate_ids[0])
    return word


def stats_sorted(stats):
    """Return the entries of ``stats`` as a new list, largest count first.

    Each entry is ordered by its element at index 1.  Entries with equal
    counts keep their relative input order; the input is not modified.
    """
    def count_of(entry):
        return entry[1]

    ordered = list(stats)
    ordered.sort(key=count_of, reverse=True)
    return ordered


def ranking(words):
    """Count the base forms of ``words`` and rank them by frequency.

    Returns a 2-tuple: every ``(base form, count)`` pair ordered by
    descending count, and the same ordering applied to the 100 most
    common pairs.
    """
    counts = Counter(basic_form(token) for token in words)
    full_ranking = stats_sorted(counts.items())
    top_hundred = stats_sorted(counts.most_common(100))
    return full_ranking, top_hundred
Exemplo n.º 2
0
# NOTE(review): BETA is not referenced in the visible part of this file;
# confirm its role at the place where it is used.
BETA = 0.00002


def pre_process(s):
    """Lower-case ``s`` and drop everything that is not a word character
    or whitespace.

    Matching uses ``re.UNICODE``, so non-ASCII letters count as word
    characters and are kept.  Underscores are word characters too and
    therefore survive.

    :param s: text to normalise
    :return: the lower-cased text with punctuation and symbols removed
    """
    # Raw string: \w and \s are regex classes, not string escapes.  The
    # non-raw form only worked because Python passes unknown escapes
    # through, which newer interpreters warn about.
    return re.sub(r'[^\w\s]', '', s.lower(), flags=re.UNICODE)


def strip_sie(form):
    """Return ``form`` without a trailing ' się'.

    Forms that do not end with that suffix are returned unchanged.
    """
    suffix = ' się'
    return form[:-len(suffix)] if form.endswith(suffix) else form


if __name__ == '__main__':
    p = PLP()
    with open('data/stop_words.json', 'r') as f:
        stop_list = json.load(f)

    words_freq = {}
    total_no = 0
    cooccurence_freq = {}
    associative_strength = {}

    if isfile('results/words_freq.json') and isfile('results/total_no.json') and isfile(
                            'results/' + STIMULUS + '_cooccurence_freq.json'):
        with open('results/words_freq.json', 'r') as f:
            words_freq = json.load(f)
        with open('results/total_no.json', 'r') as f:
            total_no = json.load(f)
        with open('results/' + STIMULUS + '_cooccurence_freq.json', 'r') as f:
Exemplo n.º 3
0
import codecs

from plp import PLP
from stemmer import Stemmer

__author__ = 'maciej'

plp = PLP()
plp._init()

ile_poprawnych = 0
ile_wszystkich = 0

s = Stemmer(plp, filename='trie.bak', word_type=None)
f = codecs.open('test.txt', 'r', 'utf-8')

for line in f:
    ile_wszystkich += 1
    parts = line.split(',')
    b_form = s.find_basic_form(parts[0])
    if b_form.basic_form.strip() == parts[1].strip():
        ile_poprawnych += 1
    else:
        print b_form.basic_form, ';', parts[1], ';', parts[0]

print 'Liczba poprawnie rozpoznanych: ', ile_poprawnych, '\nLiczba niepoprawnie rozpoznanych:', ile_wszystkich - ile_poprawnych
Exemplo n.º 4
0
def getForms(bodziec):
    """Return the forms PLP lists for the first id ``orec`` finds for ``bodziec``.

    :param bodziec: word to look up with ``PLP.orec``
    :return: result of ``PLP.forms`` for the first id returned by ``orec``
    :raises IndexError: when ``orec`` returns no ids
    """
    plp = PLP()
    ids = plp.orec(bodziec)
    # Only the first id is used, so forms() is called once on a single PLP
    # instance instead of building a new PLP and a forms list for every id.
    # Indexing the id list directly also avoids subscripting map(), which
    # only works on Python 2.
    return plp.forms(ids[0])
Exemplo n.º 5
0
# coding: utf-8
import codecs
from collections import defaultdict

from plp import PLP

__author__ = "Michał Ciołczyk"

# Path of the dictionary file read by _load_flection_map().
_FILENAME = "data/odm.txt"
# Encoding used to decode _FILENAME.
_ENCODING = "windows-1250"
# Maps each form found in the dictionary file to the list of base forms
# (first entry of a line) it was listed under.
_basic_forms = defaultdict(list)
# Set to True once _load_flection_map() has filled _basic_forms.
_initialized = False
# Module-level PLP instance (not used in the visible part of this file).
_plp = PLP()
# NOTE(review): presumably the suffix handled by _strip_sie(); its body is
# not visible here, confirm.
_SIE = ' się'


def _load_flection_map():
    """Fill ``_basic_forms`` from the dictionary file, at most once.

    Every line of ``_FILENAME`` is split on ``', '``.  The first entry is
    taken as the base form, and every entry on the line (the base form
    included) is mapped to it.  Each base form is stored once per form.
    Calls made after a successful load return immediately.
    """
    global _initialized
    if _initialized:
        return
    with codecs.open(_FILENAME, 'r', encoding=_ENCODING) as f:
        for line in f:
            # codecs.open reads the file in binary mode, so CRLF line
            # endings are not translated; strip '\r' as well as '\n' or
            # the last form on each line would carry a trailing '\r'.
            forms = line.rstrip('\r\n').split(', ')
            bform = forms[0]
            for form in forms:
                known = _basic_forms[form]
                # De-duplicate on insertion: keeps the first-seen order
                # and avoids a second pass over the whole map.
                if bform not in known:
                    known.append(bform)
    _initialized = True


def _strip_sie(form):
Exemplo n.º 6
0
#!/usr/bin/env python
# encoding: utf-8

from plp import PLP
p = PLP()

VERB = PLP.CZESCI_MOWY.CZASOWNIK

stimulus = u'fajka'
st_forms = set(p.forms(p.rec(u'fajka')[0]))
print st_forms

snippets_count = 0

def parse_file(filename):
    global snippets_count
    with open(filename, 'r') as f:
        all_words = []
        for line in f:
            words = line.strip().split()
            all_words.extend(words)

        stimulus_seen = False
        last_verb = None
        second_to_last_verb = None
        last_verb_index = 0
    
        for i, word in enumerate(all_words):
            word_utf8 = word.decode('utf-8')
            if word_utf8 in st_forms or word_utf8[:-1] in st_forms:
                #print 'stimulus_seen' 
Exemplo n.º 7
0
 def __init__(self):
     """Create and initialise the PLP instance, then load the cities.

     ``self.cities`` is whatever ``TestPreparer().start()`` returns.
     """
     self.plp = PLP()
     self.plp._init()
     print 'Initialized plp'
     self.cities = TestPreparer().start()
Exemplo n.º 8
0
class Test:
    """
    Class responsible for running test against cities retrieved by TestPreparer
    """

    trie_files = ['trie.bak', 'trie_only_nouns.bak', 'trie_nouns_and_adjectives.bak', 'trie_nouns_and_numerals.bak',
                  'trie_nouns_adjectives_and_numerals.bak']
    # trie_files = ['trie.bak']

    def __init__(self):
        self.plp = PLP()
        self.plp._init()
        print 'Initialized plp'
        self.cities = TestPreparer().start()

        # print 'Loaded cities: ', self.cities.__len__()

    def test(self):
        print 'Starting analysis'

        for trie_name in self.trie_files:
            print 'Starting', trie_name
            correct_number = 0
            all_number = 0
            s = Stemmer(self.plp, filename=trie_name, word_type=None)
            corrects_file = codecs.open('../wyniki/single_name/wies_miasto_kolonia_osada/success_' + trie_name.replace('bak', 'txt'), 'w', 'utf-8')
            result_file = codecs.open('../wyniki/single_name/wies_miasto_kolonia_osada/' + trie_name.replace('bak', 'txt'), 'w', 'utf-8')
            result_file.write(u'Dopełniacz;Mianownik;Wynik Stemmera\n')
            corrects_file.write(u'Dopełniacz;Mianownik;Wynik Stemmera\n')
            # for k, v in self.cities.iteritems():
            cities = codecs.open('../data/cities_wies_miasto_kolonia_osada.csv', 'r', 'utf-8')
            for city in cities:
                k = city.split(';')[1].strip()
                v = city.split(';')[0].strip()
                all_number += 1
                basic_form = ''
                # word_labels = []
                # if k.__contains__('-'):
                #     for city_parts in v.split('-'):
                #         b = s.find_basic_form(city_parts)
                #         basic_form += b.basic_form + '-'
                #         word_labels.append(b.word_labels)
                #     basic_form = basic_form[0:basic_form.__len__() - 1]
                # else:
                #     for city_parts in v.split(' '):
                #         b = s.find_basic_form(city_parts)
                #         basic_form += b.basic_form + ' '
                #         word_labels.append(b.word_labels)

                basic_form = s.find_basic_form(v).basic_form.strip()
                if basic_form != k:
                # if basic_form == k:
                    result_file.write(v + ';' + k + ';' + basic_form + ';')
                    # for w_label in word_labels:
                    #     result_file.write(self.find_most_label(w_label) + ' ')
                    result_file.write('\n')
                else:
                #     corrects_file.write(v + ';' + k + ';' + basic_form + ';')
                    # for label in s.find_labels(word_labels):
                    #     corrects_file.write(label + ' ')
                    # corrects_file.write('\n')
                    correct_number += 1
            result_file.write(u'Liczba miejscowości;Liczba niepoprawnie rozpoznanych;Liczba poprawnie rozpoznanych\n')
            result_file.write(
                str(all_number) + ';' + str(all_number - correct_number) + ';' + str(correct_number))
            print 'Done', trie_name

    def find_most_label(self, w_label):
        max_labels = dict()
        for word in w_label:
            for id in self.plp.rec(word):
                label = self.plp.label(id)
                if label in max_labels:
                    max_labels[label] += 1
                else:
                    max_labels[label] = 1
        return max(max_labels.iteritems(), key=operator.itemgetter(1))[0]


    def prepare_cities(self):
        print 'Preparing cities'
        res_file = codecs.open('../data/cities.csv', 'w', 'utf-8')
        res_file.write(u'Dopełniacz;Mianownik\n')
        for k, v in self.cities.iteritems():
            res_file.write(v + ';' + k + '\n')
Exemplo n.º 9
0
 def setUp(self):
     """Create the PLP instance under test from a fixed shared-library path."""
     self.plp = PLP('/usr/local/clp/lib/libclp_2.6.so')
Exemplo n.º 10
0
class PLPTestCase(unittest.TestCase):
    """Checks the PLP wrapper against known ids, labels and forms."""

    def setUp(self):
        """Build a fresh PLP instance from the fixed library path."""
        library_path = '/usr/local/clp/lib/libclp_2.6.so'
        self.plp = PLP(library_path)

    def test_ver(self):
        """ver() returns a unicode string."""
        version = self.plp.ver()
        self.assertIsInstance(version, unicode)

    def test_rec(self):
        """rec() resolves an inflected word to its id."""
        found = self.plp.rec(u'żółwiem')
        self.assertEqual(found, [18660912])

    def test_orec(self):
        """orec() finds the word written without diacritics; rec() does not."""
        strict = self.plp.rec(u'zolwiem')
        self.assertEqual(strict, [])
        relaxed = self.plp.orec(u'zolwiem')
        self.assertEqual(relaxed, [18660912])

    def test_bform(self):
        """bform() maps an id to its base form."""
        base = self.plp.bform(18660912)
        self.assertEqual(base, u'żółw')

    def test_label(self):
        """The first element of label() is the part of speech."""
        noun_label = self.plp.label(18660912)
        self.assertEqual(noun_label[0], PLP.CZESCI_MOWY.RZECZOWNIK)
        verb_id = self.plp.rec(u'idę')[0]
        verb_label = self.plp.label(verb_id)
        self.assertEqual(verb_label[0], PLP.CZESCI_MOWY.CZASOWNIK)

    def test_ogonkify(self):
        """ogonkify() returns the diacritic variants, in any order."""
        expected = [u'gzó', u'gżo', u'gźo', u'gźó', u'gżó']
        variants = self.plp.ogonkify(u'gzo')
        self.assertItemsEqual(variants, expected)

    def test_forms(self):
        """forms() lists the forms of an id in a fixed order."""
        actual = self.plp.forms(17786048)
        expected = [
            u'pogoda',
            u'pogody',
            u'pogodzie',
            u'pogodę',
            u'pogodą',
            u'pogodo',
            u'pogód',
            u'pogodom',
            u'pogodami',
            u'pogodach',
        ]
        self.assertEqual(actual, expected)

    def test_vec(self):
        """The first element of vec() for the given id and form is 5."""
        positions = self.plp.vec(18660912, u'żółwiem')
        self.assertEqual(positions[0], 5)