예제 #1
0
 def __init__(self, directory):
     '''
     Store the dataset location and create the stemmer used by this object.

     directory: dataset directory as a string; kept unchanged on
         self.directory.

     Attributes set here:
     self.directory: the directory argument.
     self.stemmer: a new Stemmer() instance. The original note describes it
         as a Porter stemmer from nltk used for stemming and tokenizing --
         TODO confirm against the file's imports.

     NOTE(review): the original docstring also listed a tf_vectorizer, an
     object that vectorizes terms appearing in at least 15 documents for
     feature extraction. No such attribute is assigned in the visible part
     of this method -- confirm whether it is created elsewhere.
     '''
     self.directory = directory
     self.stemmer = Stemmer()
예제 #2
0
def test_stemming(test_file, lexicon_file):
    """Run every stemming test case in *test_file* through a Stemmer.

    test_file: path to a YAML file containing a sequence of mappings. Each
        mapping must have a "lemma" key and may have "test_length"
        (default True) and "location" (default ""); every remaining
        key/value pair is passed on as a (parse, form) pair.
    lexicon_file: passed unchanged to the Stemmer constructor.

    Returns None. After all cases have been fed to ``stemmer.stem`` the
    function calls ``stemmer.counter.results()`` (presumably to report the
    collected tallies -- confirm against the Stemmer implementation).
    """
    stemmer = Stemmer(lexicon_file)

    # YAML streams are Unicode; do not depend on the platform's locale
    # encoding when reading the test data.
    with open(test_file, encoding="utf-8") as f:
        # yaml.load() without an explicit Loader is rejected by PyYAML >= 6
        # and can construct arbitrary objects on older versions; the test
        # data only needs plain mappings and scalars, so safe_load suffices.
        # An empty file parses to None, which is treated as "no test cases".
        test_cases = yaml.safe_load(f) or []

    for test in test_cases:
        # Pop the control keys first so that only parse -> form pairs remain.
        lemma = strip_length(test.pop("lemma"))
        test_length = test.pop("test_length", True)
        location = test.pop("location", "")

        for parse, form in test.items():
            stemmer.stem(location, lemma, parse, form, test_length)

    stemmer.counter.results()
예제 #3
0
    return remove_stopwords(remove_ponctuation(str(s))).upper()


def toString(sentence):
    """Return the words of *sentence*, each one prefixed by a single space.

    sentence: a string, or a missing-value marker whose ``str()`` is
        ``'nan'`` (for example ``float('nan')``).

    Returns ``''`` when ``str(sentence)`` equals ``'nan'`` or when the
    sentence contains no words; otherwise the whitespace-separated words
    joined as ``" w1 w2 ..."`` (note the leading space, kept for backward
    compatibility). Runs of whitespace in the input are collapsed.
    """
    # Missing values are recognised by their string form, exactly as before.
    if str(sentence) == 'nan':
        return ''
    # str.split() only ever yields strings, so the former
    # isinstance(word, basestring) filter was always true -- and
    # ``basestring`` does not exist on Python 3, where it raised NameError.
    return ''.join(" " + word for word in sentence.split())

# Module-level Stemmer instance, created once when this module is loaded.
stemmer = Stemmer()


def DistJaccard(str1, str2):
    """Return the Jaccard distance between the word sets of two strings.

    str1, str2: strings that are split on whitespace into sets of words.

    Returns ``1 - |A & B| / |A | B|`` as a float, where A and B are the word
    sets; 0.0 means identical word sets and 1.0 means no word in common.
    Returns ``numpy.nan`` when either argument is the empty string, or when
    neither string contains any word (whitespace only), since the distance
    is undefined in that case.
    """
    if str1 == '' or str2 == '':
        return numpy.nan

    words1 = set(str1.split())
    words2 = set(str2.split())
    union = words1 | words2
    # Two whitespace-only strings give an empty union; the original code
    # divided by zero here.
    if not union:
        return numpy.nan
    return 1.0 - float(len(words1 & words2)) / len(union)


#--------------------------------------------------------------------------------#
#                   Get products infos from GoldStandard                         #
#--------------------------------------------------------------------------------#
예제 #4
0
#!/usr/bin/env python3

from pysblgnt import morphgnt_rows

from stemming import Stemmer


# Lemmas for which rows are skipped entirely by the loop below
# (checked with ``lemma in IGNORE_LIST``).
IGNORE_LIST = [
    "σαβαχθάνι",
    "ἔνι",
    "χρή",
]


# Stemmer built from the given path -- presumably the MorphGNT lexicon
# YAML file, resolved relative to the working directory; TODO confirm.
stemmer = Stemmer("lexicons/morphgnt.yaml")

for book_num in range(1, 28):
    for row in morphgnt_rows(book_num):
        ccat_pos = row["ccat-pos"]
        ccat_parse = row["ccat-parse"]
        norm = row["norm"]
        lemma = row["lemma"]

        if ccat_pos != "V-":
            continue

        if lemma in IGNORE_LIST:
            continue

        if ccat_parse[3] == "N":
            parse = ccat_parse[1:4]