def wordcount(text, logger=xetrapal.astra.baselogger):
    # count word frequencies in the input text using HindiTokenizer
    wordFreq = collections.Counter()
    t = Tokenizer(text)
    logger.info("Beginning to generate word count on input")
    logger.info("Tokenizing the input")
    t.tokenize()
    parsedText = t.tokens

    # skip empty tokens and count the rest
    for word in tqdm(parsedText):
        if word == '':
            continue
        wordFreq[word] += 1

    return wordFreq
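A minimal usage sketch (not part of the original listing): it assumes xetrapal's default logger is importable, and the sample sentence is purely illustrative.

freqs = wordcount("यह वाक्य हिन्दी में है। यह एक और वाक्य है।")
# Counter exposes most_common() for the top entries
for word, count in freqs.most_common(5):
    print(word, count)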
Example #2
def sent_tokenize(fileid=None):
    token_list = []
    for text in raw(fileid):
        t = Tokenizer(text)
        t.generate_sentences()
        token_list.append(t.sentences)

    return token_list
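A hedged usage sketch: raw(fileid) is assumed to be the surrounding corpus reader, not shown here, that yields one text string per document.

all_sentences = sent_tokenize()       # one list of sentences per document
print(len(all_sentences[0]))          # number of sentences in the first document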
Example #4
def ngramfrequencyht(filename, gramlength=3, logger=xetrapal.astra.baselogger):
    with open(filename, "r") as f:
        intext = f.read()
    logger.info("Read file " + filename)
    logger.info("Cleaning text")
    cleantext = intext.replace("\nENDOFTWEET\n", "\n")
    cleantext = cleantext.lower()
    cleantext = tweet_cleaner(cleantext)
    cleantext = re.sub(r" +", " ", cleantext)
    logger.info("Tokenizing input")
    t = Tokenizer(cleantext)
    t.tokenize()
    grams = nltk.ngrams(t.tokens, gramlength)
    logger.info("Generating freq distribution")
    fdist = nltk.FreqDist(grams)
    freqdist = {}
    for k, v in fdist.items():
        freqdist[" ".join(k)] = v
    logger.info("Returning final values")
    freqdistdf = pandas.Series(freqdist).to_frame()
    return freqdistdf
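A minimal usage sketch, assuming a plain-text tweet dump on disk (the filename is illustrative); the returned DataFrame has a single unnamed count column.

trigram_counts = ngramfrequencyht("tweets.txt")
# show the ten most frequent trigrams
print(trigram_counts.sort_values(by=0, ascending=False).head(10))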
Example #5
def tokenize(fileid=None, remove_stopwords=False):
    token_list = []
    for text in raw(fileid):
        t = Tokenizer(text)
        t.tokenize()
        if remove_stopwords:
            t.remove_stop_words()
            token_list.append(t.final_tokens)
        else:
            token_list.append(t.tokens)

    return token_list
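A hedged sketch comparing the two modes; as above, raw() is assumed to supply the corpus texts.

with_stopwords = tokenize()
without_stopwords = tokenize(remove_stopwords=True)
# stop-word removal should shrink each token list
print(len(with_stopwords[0]), len(without_stopwords[0]))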
Example #6
def tokenize():
    t = Tokenizer("यह वाक्य हिन्दी में है।")
    t.tokenize()
    return t
Example #7
def removeStopWords(token_list):
    f = codecs.open("stopwords.txt", encoding='utf-8')
    stopwords = [x.strip() for x in f.readlines()]
    tokens = [i for i in token_list if unicode(i) not in stopwords]
    return tokens


texts = []
documents = {}

for i in os.listdir("Reviews"):
    if i.endswith(".txt"):
        with open("Reviews\\" + i) as f:
            documents[i] = []
            for line in f:
                l = line.split('#####')[0]
                t = Tokenizer(l)
                t.generate_sentences()
                for s in t.sentences:
                    if not s.strip() == '':
                        documents[i].append(s)
                t.tokenize()
                tokens = removeStopWords(t.tokens)
                texts.append(tokens)
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
model = gensim.models.ldamodel.LdaModel(corpus,
                                        num_topics=9,
                                        id2word=dictionary,
                                        passes=100)
val = model.print_topics(num_topics=8, num_words=10)
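A brief hedged follow-up (not in the original snippet): print_topics returns (topic_id, topic_string) pairs, which can be printed directly.

for topic_id, topic_terms in val:
    print(topic_id, topic_terms)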
Corpus = pd.read_csv("train_hindi.tsv",
                     encoding='utf-8',
                     sep="\t",
                     names=header_list)
Corpus['text'] = [
    re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', entry)
    for entry in Corpus['text']
]
Corpus['text'] = [regex.sub(' ', entry) for entry in Corpus['text']]
Corpus['text'] = [entry.split() for entry in Corpus['text']]

for index, entry in enumerate(Corpus['text']):
    Final_words = []
    for word in entry:
        if word not in hindi_stopwords:
            t = Tokenizer()
            Final_words.append(t.generate_stem_words(word))
    Corpus.loc[index, 'text_final'] = str(Final_words)

Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
    Corpus['text_final'], Corpus['task_1'], test_size=0.3)
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf, Train_Y)
predictions_SVM = SVM.predict(Test_X_Tfidf)
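The snippet stops at predict; a hedged follow-up showing how the predictions would typically be scored, assuming accuracy_score is imported from sklearn.metrics alongside the other scikit-learn imports.

# assumes: from sklearn.metrics import accuracy_score
print("SVM accuracy:", accuracy_score(Test_Y, predictions_SVM) * 100)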
    f = open("data/eng-hin-modified.txt", "r+")
    s = f.readlines()
    f.close()

    sentences = []

    # tokenize the whole thing into sentences
    for line in s[1:2000]:
        t_ = sent_tokenize(line, delim)
        t_ = [x for x in t_ if x != "\n"]
        sentences += t_

    # tokenize the whole thing into words
    words = []
    for sent in sentences:
        tok_ = Tokenizer(sent)
        tok_.tokenize()
        words += tok_.tokens

    unigrams = unigrammatize(words)
    unigrams = freq_sorted_unigrams(unigrams)

    #stopwords = []
    for gram in unigrams:
        print gram[0].decode("utf-8")
    #    if gram[1] > 270:
    #        stopwords.append(gram[0])
    #    else:
    #        break
    #for stop in stopwords:
    #    print stop.decode("utf-8")
Example #10
from HindiTokenizer import Tokenizer
import sys
if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.stderr.write("Usage: " + sys.argv[0] + " <corpusfile> <outputfile>\n")
        sys.exit(2)
    file_name = sys.argv[1]
    fopen = open(file_name, "r")
    a = open(sys.argv[2], "w")
    dic_tok = {}
    while True:
        line = fopen.readline()[0:-1]
        if line == '':
            break
        else:
            t = Tokenizer(line)
            t.tokenize()
            for i in t.tokens:
                try:
                    dic_tok[i] += 1
                except KeyError:
                    dic_tok[i] = 1
    final_list = []
    for i in dic_tok.items():
        i = list(i)
        i.reverse()
        final_list.append(i)
    final_list.sort()
    final_list.reverse()
    final_list = final_list[0:50]
    for i in final_list:
        # assumed completion: write "token count" lines and close the files
        a.write(i[1] + " " + str(i[0]) + "\n")
    a.close()
    fopen.close()

# -*- coding: utf-8 -*-
'''
Tokeniser for hindi
'''
from HindiTokenizer import Tokenizer
import sys
if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.stderr.write("Usage: " + sys.argv[0] + " <corpusfile> <outputfile>\n " )
        sys.exit(2)
    file_name = sys.argv[1]
    fopen = open(file_name, "r")
    a = open(sys.argv[2], "w")
    while True:
        line = fopen.readline()[0:-1]
        if line == '':
            break
        else:
            t = Tokenizer(line)
            t.generate_sentences()
            for i in t.sentences:
                a.write(i + "\n")

    a.close()
    fopen.close()
Example #12
def removeStopWords(token_list):
    f = codecs.open("stopwords.txt", encoding='utf-8')
    stopwords = [x.strip() for x in f.readlines()]
    tokens = [i for i in token_list if unicode(i) not in stopwords]
    return tokens

texts = []
documents = {}

for i in os.listdir("Reviews"):
    if i.endswith(".txt"):
        with open("Reviews\\"+i) as f:
            documents[i] = []
            for line in f:
                l = line.split('#####')[0]
                t = Tokenizer(l)
                t.generate_sentences()
                for s in t.sentences:
                    if not s.strip() == '':
                        documents[i].append(s)
                t.tokenize()
                tokens = removeStopWords(t.tokens)
                texts.append(tokens)
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
model = gensim.models.ldamodel.LdaModel(corpus, num_topics=9, id2word = dictionary, passes=100)
val = model.print_topics(num_topics=8, num_words=10)
print val
for value in val:
    a,b=value
Example #13
from cltk.tokenize.sentence import TokenizeSentence
import statistics
import pickle
PATH = '../Data/'
tokenizer = TokenizeSentence('hindi')
files = os.listdir(PATH)
features = []
values = []
for file in files:
    if os.path.isdir(PATH + file + '/'):
        for inner_file in os.listdir(PATH + file + '/'):
            if os.path.isdir(PATH + file + '/' + inner_file + '/'):
                for inner_inner_file in os.listdir(PATH + file + '/' +
                                                   inner_file + '/'):
                    values.append(file)
                    t = Tokenizer()
                    t.read_from_file(PATH + file + '/' + inner_file + '/' +
                                     inner_inner_file)
                    t.generate_sentences()
                    raw_sentences = t.sentences
                    # split each sentence further on '?' and '!'
                    split_sentences = []
                    for i in raw_sentences:
                        for part in re.split(r'\?|!', i):
                            split_sentences.append(part)
                    # drop whitespace-only fragments
                    filtered_sentences = []
                    for i in split_sentences:
                        if not re.match(r'^\s+$', i):
                            filtered_sentences.append(i)
                    words = []
                    for i in filtered_sentences:
                        sentence_tokenized = tokenizer.tokenize(i)