Example #1
from HindiTokenizer import Tokenizer


def sent_tokenize(fileid=None):
    """Split every raw text returned by raw(fileid) into a list of sentences."""
    token_list = []
    for text in raw(fileid):          # raw() is a corpus-reader helper defined elsewhere in the project
        t = Tokenizer(text)
        t.generate_sentences()        # populates t.sentences
        token_list.append(t.sentences)

    return token_list
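sent_tokenize() depends on a raw() reader that is not shown in this snippet. A minimal usage sketch, with a hypothetical stand-in for raw() that yields a single UTF-8 file as one raw text (the file name is illustrative):

import io

def raw(fileid=None):
    # Hypothetical stand-in for the missing corpus reader:
    # yields the whole file as one raw text string.
    with io.open(fileid, encoding="utf-8") as f:
        yield f.read()

for sentences in sent_tokenize("hindi_corpus.txt"):
    print(len(sentences))             # number of sentences found in each text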
Example #2
# -*- coding: utf-8 -*-
'''
Sentence tokeniser for Hindi: reads a corpus file line by line and
writes one sentence per line to the output file.
'''
from HindiTokenizer import Tokenizer
import sys

if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.stderr.write("Usage: " + sys.argv[0] + " <corpusfile> <outputfile>\n")
        sys.exit(2)
    file_name = sys.argv[1]
    fopen = open(file_name, "r")
    fout = open(sys.argv[2], "w")
    while True:
        line = fopen.readline().rstrip("\n")
        if line == '':                 # stop at the first empty line / end of file
            break
        t = Tokenizer(line)
        t.generate_sentences()         # populates t.sentences
        for sentence in t.sentences:
            fout.write(sentence + "\n")

    fout.close()
    fopen.close()
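Saved as, say, tokenise_corpus.py (both file names below are illustrative), the script would be run from the command line as:

python tokenise_corpus.py hindi_corpus.txt sentences.txt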
Example #3
import os

import gensim
from gensim import corpora
from HindiTokenizer import Tokenizer


def removeStopWords(list):              # 'list' here shadows the built-in name
    # 'f' must be an already-open handle to a stopword file, one word per line.
    stopwords = [x.strip() for x in f.readlines()]
    tokens = [i for i in list if unicode(i) not in stopwords]   # unicode(): this snippet is Python 2
    return tokens


texts = []
documents = {}

# Collect the sentences of every review file, plus the stopword-filtered
# tokens that feed the LDA model below.
for i in os.listdir("Reviews"):
    if i.endswith(".txt"):
        with open(os.path.join("Reviews", i)) as f:
            documents[i] = []
            for line in f:
                l = line.split('#####')[0]          # keep only the text before the '#####' marker
                t = Tokenizer(l)
                t.generate_sentences()
                for s in t.sentences:
                    if not s.strip() == '':
                        documents[i].append(s)
                t.tokenize()
                tokens = removeStopWords(t.tokens)
                texts.append(tokens)

# Build the bag-of-words corpus and train a 9-topic LDA model on it.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
model = gensim.models.ldamodel.LdaModel(corpus,
                                        num_topics=9,
                                        id2word=dictionary,
                                        passes=100)
val = model.print_topics(num_topics=8, num_words=10)
print val
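Once trained, the model can also be queried for a single review's topic mixture via gensim's bag-of-words lookup; a short sketch using the first tokenised review:

bow = dictionary.doc2bow(texts[0])   # bag-of-words vector of the first review
print(model[bow])                    # list of (topic_id, probability) pairs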
Example #4
import os
import re

from HindiTokenizer import Tokenizer
# TokenizeSentence is assumed here to come from CLTK's sentence-tokenisation module.
from cltk.tokenize.sentence import TokenizeSentence

PATH = '../Data/'
tokenizer = TokenizeSentence('hindi')
files = os.listdir(PATH)
features = []
values = []
# Walk PATH/<label>/<subdir>/<file>; the top-level directory name is the class label.
for file in files:
    if os.path.isdir(PATH + file + '/'):
        for inner_file in os.listdir(PATH + file + '/'):
            if os.path.isdir(PATH + file + '/' + inner_file + '/'):
                for inner_inner_file in os.listdir(PATH + file + '/' +
                                                   inner_file + '/'):
                    values.append(file)
                    t = Tokenizer()
                    t.read_from_file(PATH + file + '/' + inner_file + '/' +
                                     inner_inner_file)
                    t.generate_sentences()
                    # Split the generated sentences further on '?' and '!'.
                    split_sentences = []
                    for i in t.sentences:
                        for k in re.split(r'[?!]', i):
                            split_sentences.append(k)
                    # Drop fragments that are only whitespace.
                    filtered_sentences = []
                    for i in split_sentences:
                        if not re.match(r'^\s+$', i):
                            filtered_sentences.append(i)
                    # Word-tokenise every remaining sentence.
                    words = []
                    for i in filtered_sentences:
                        for k in tokenizer.tokenize(i):
                            words.append(k.strip('\n'))
                    length = [