예제 #1
0
 def __init__(self, directory):
     '''
     Store the dataset location and create the stemmer used on its text.

     directory: dataset directory as string
     stemmer: Stemmer instance, created here with no arguments (the original
         note describes it as a Porter stemmer from nltk used for stemming
         and tokenizing; confirm against the Stemmer definition)
     NOTE(review): an earlier version of this docstring also listed a
         tf_vectorizer attribute, but this constructor does not set one.
     '''
     self.directory = directory
     self.stemmer = Stemmer()
예제 #2
0
def test_stemming(test_file, lexicon_file):
    """Run every stemming test case in a YAML file through a ``Stemmer``.

    The YAML document is expected to be a sequence of mappings. Each mapping
    must contain a ``lemma`` key and may contain ``test_length`` (default
    ``True``) and ``location`` (default ``""``). Every remaining key/value
    pair is treated as a ``parse -> form`` pair and handed to
    ``stemmer.stem``. Once all cases have been processed,
    ``stemmer.counter.results()`` is called.

    Args:
        test_file: Path of the YAML file holding the test cases.
        lexicon_file: Value passed straight to the ``Stemmer`` constructor.

    Raises:
        KeyError: If a test case has no ``lemma`` key.
    """
    stemmer = Stemmer(lexicon_file)

    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding.
    with open(test_file, encoding="utf-8") as f:
        # safe_load only builds plain data types; yaml.load() without an
        # explicit Loader is unsafe and is rejected by PyYAML >= 6.
        # An empty document loads as None, which is treated as "no tests".
        tests = yaml.safe_load(f) or []

    for test in tests:
        # Pop the bookkeeping keys so that only parse -> form pairs remain.
        lemma = strip_length(test.pop("lemma"))
        test_length = test.pop("test_length", True)
        location = test.pop("location", "")

        for parse, form in test.items():
            stemmer.stem(location, lemma, parse, form, test_length)

    stemmer.counter.results()
예제 #3
0
class Dataset(object):
    """Load labelled documents from a directory tree and stem their text.

    Attributes:
        directory: base dataset directory as a string.
        stemmer: ``Stemmer`` instance whose ``stem_text`` method is applied
            to every selected line of every document.
    """

    def __init__(self, directory):
        """Remember the dataset directory and create the stemmer.

        Args:
            directory: dataset directory as a string. It is concatenated
                verbatim with the ``train_dir`` given to ``get_set``.
        """
        self.directory = directory
        self.stemmer = Stemmer()

    def get_set(self, train_dir):
        """Read every file below ``self.directory + train_dir``.

        Only lines starting with ``<ANCH>`` or ``<P>`` are used; each such
        line is passed to ``self.stemmer.stem_text`` and the results are
        accumulated into one list per file. Files are decoded as
        ISO-8859-9.

        Unlike earlier versions, this method does not change the process
        working directory, so it can be called repeatedly with a relative
        ``self.directory``.

        Args:
            train_dir: string appended to ``self.directory`` to form the
                directory that is walked recursively.

        Returns:
            A pair ``(X, Y)``. ``X`` holds one list of stemmed output per
            file. ``Y`` holds the matching labels: 0 when the file name
            starts with ``'n'`` (the original note calls these
            "nonrelative", presumably non-relevant documents), 1 otherwise.
        """
        X = []
        Y = []
        base_dir = self.directory + train_dir
        for root, _dirs, files in os.walk(base_dir):
            for filename in files:
                # Join with root so files in sub-directories can be opened.
                path = os.path.join(root, filename)
                data = []
                # The context manager closes the handle even if stemming fails.
                with open(path, 'r', encoding='iso-8859-9') as handle:
                    for line in handle:
                        if line.startswith(("<ANCH>", "<P>")):
                            data += self.stemmer.stem_text(line)
                X.append(data)
                Y.append(0 if filename[0] == 'n' else 1)
        return X, Y
예제 #4
0
    return remove_stopwords(remove_ponctuation(str(s))).upper()


def toString(sentence):
    """Return the words of *sentence*, each one prefixed by a single space.

    A value whose ``str()`` form is ``'nan'`` (for example a float NaN, or
    the literal string ``'nan'``) yields the empty string. Otherwise the
    value is split on whitespace and the words are re-joined, so a non-empty
    result always starts with a space, e.g. ``"a  b"`` -> ``" a b"``.

    Args:
        sentence: a string, or a NaN-like value.

    Returns:
        The normalised string, or ``''`` for NaN-like or word-less input.
    """
    if str(sentence) == 'nan':
        return ''
    # split() only ever yields strings, so no per-word type check is needed.
    # The previous check used ``basestring``, which is undefined on Python 3.
    return ''.join(' ' + word for word in sentence.split())

# Module-level Stemmer instance, constructed once with no arguments.
stemmer = Stemmer()


def DistJaccard(str1, str2):
    """Return the Jaccard distance between the word sets of two strings.

    Both strings are split on whitespace. The distance is
    ``1 - |A & B| / |A | B|``: 0.0 for identical word sets and 1.0 for
    disjoint ones.

    Args:
        str1: first string.
        str2: second string.

    Returns:
        The distance as a float, or ``numpy.nan`` when either string is
        empty or when neither string contains any word (for example both
        are whitespace only), since the distance is undefined there.
    """
    if str1 == '' or str2 == '':
        return numpy.nan
    words1 = set(str1.split())
    words2 = set(str2.split())
    union = words1 | words2
    if not union:
        # Both inputs were whitespace only; avoid dividing by zero.
        return numpy.nan
    return 1.0 - float(len(words1 & words2)) / len(union)


#--------------------------------------------------------------------------------#
#                   Get product info from GoldStandard                           #
#--------------------------------------------------------------------------------#
예제 #5
0
#!/usr/bin/env python3

from pysblgnt import morphgnt_rows

from stemming import Stemmer


# Lemmas that the loop below skips outright (see its `lemma in IGNORE_LIST`
# check).
IGNORE_LIST = [
    "σαβαχθάνι",
    "ἔνι",
    "χρή",
]


# Stemmer constructed with the path "lexicons/morphgnt.yaml"; presumably a
# lexicon file it loads -- confirm against the Stemmer definition.
stemmer = Stemmer("lexicons/morphgnt.yaml")

for book_num in range(1, 28):
    for row in morphgnt_rows(book_num):
        ccat_pos = row["ccat-pos"]
        ccat_parse = row["ccat-parse"]
        norm = row["norm"]
        lemma = row["lemma"]

        if ccat_pos != "V-":
            continue

        if lemma in IGNORE_LIST:
            continue

        if ccat_parse[3] == "N":
            parse = ccat_parse[1:4]