def get_features1(tweets, subj_dict):
    print("Getting features type 1 ... : "
          "[p_verb, n_verb, p_noun, n_noun, punctuation, negations]")
    features = []
    tknzr = Tokenizer(lang='hin')
    tagger = Tagger(lang='hin')
    # take positive and negative noun/verb phrases
    for tweet in tweets:
        feature_list = [0.0] * 6
        tokens = tknzr.tokenize(tweet)
        try:
            pos = tagger.tag(tokens)
        except Exception:
            pos = []
        # print("=>", pos, '\n')
        # keep only verbs and nouns
        pos = [p for p in pos if 'V' in p[1] or 'NN' in p[1]]
        # print("==>", pos, '\n')
        for p in pos:
            word = p[0]
            if 'V' in p[1] and word in subj_dict:
                if 'verb' in subj_dict[word]:
                    if 'positive' in subj_dict[word]['verb']:
                        feature_list[0] += 1.0
                    if 'negative' in subj_dict[word]['verb']:
                        feature_list[1] += 1.0
                elif 'anypos' in subj_dict[word]:
                    if 'positive' in subj_dict[word]['anypos']:
                        feature_list[0] += 1.0
                    if 'negative' in subj_dict[word]['anypos']:
                        feature_list[1] += 1.0
            if 'NN' in p[1] and word in subj_dict:
                if 'noun' in subj_dict[word]:
                    if 'positive' in subj_dict[word]['noun']:
                        feature_list[2] += 1.0
                    if 'negative' in subj_dict[word]['noun']:
                        feature_list[3] += 1.0
                elif 'anypos' in subj_dict[word]:
                    if 'positive' in subj_dict[word]['anypos']:
                        feature_list[2] += 1.0
                    if 'negative' in subj_dict[word]['anypos']:
                        feature_list[3] += 1.0
        # derive feature from punctuation marks
        feature_list[4] += count_apparitions(tokens, helper.punctuation)
        # derive number of strong negation words
        feature_list[5] += count_apparitions(tokens, helper.strong_negations)
        features.append(feature_list)
    print("Done")
    return features
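# A minimal usage sketch (not from the original file): it assumes the
# subjectivity lexicon maps word -> POS class -> polarity labels, and that
# Tokenizer, Tagger, count_apparitions and helper are already imported.
if __name__ == '__main__':
    sample_tweets = ["राम बहुत अच्छा खेल रहा है !"]
    sample_subj_dict = {'अच्छा': {'anypos': ['positive']}}
    # each feature vector is [p_verb, n_verb, p_noun, n_noun, punctuation, negations]
    print(get_features1(sample_tweets, sample_subj_dict))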
@author: vaibhav
"""

# question-answer
q1 = "What is Inexhaustible Natural Resource?"
a1 = "The resources which are present in unlimited quantity in nature and are not likely to be exhausted by human activities are known as Inexhaustible Resources. For Example: Sunlight, air"
a2 = "It is a natural resource that will never run out so if we take advantage of the greatest natural resources will not be depleted and will continue to exist, such as water, sunlight, tidal energy, ocean energy and wind energy."

# importing tokenizer, tagger, parser, stemmer
from isc_tokenizer import Tokenizer
tk = Tokenizer(lang='en')

from isc_tagger import Tagger
tagger = Tagger(lang='eng')

"""
from __future__ import unicode_literals
from isc_parser import Parser
parser = Parser(lang='eng')
"""

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
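# A minimal preprocessing sketch (an assumption, not part of the original
# file): tokenize an answer with the objects created above, keep word-like
# tokens, drop English stopwords, and stem what remains.
def preprocess(text):
    stop_words = set(stopwords.words('english'))
    tokens = tk.tokenize(text)
    tokens = [t.lower() for t in tokens if re.match(r'\w+', t)]
    return [ps.stem(t) for t in tokens if t not in stop_words]

# Example: the stemmed token lists of preprocess(a1) and preprocess(a2)
# could then be compared against preprocess(q1).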
# --------------------------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument("-in", "--input_file", help="Input File")
parser.add_argument("-out", "--output_file", help="Output File")
args = parser.parse_args()
# print("input {} output {}".format(args.input_file, args.output_file))

input_file = open(args.input_file)
input_text = input_file.read()
output_file = open(args.output_file, "w")

dict_file_nel = open('./NER-LIST-UTF/NEL.txt.utf')

tk_ner = Tokenizer(lang='hin')
tagger = Tagger(lang='hin')
seq = tk_ner.tokenize(input_text)
# print(seq[0])
# list_ner = tagger.tag(seq)

dict_read_nel = dict_file_nel.read()
dict_word_nel = tk_ner.tokenize(dict_read_nel)

count = -1
tag_dict = {}

dict_nem = open('./NER-LIST-UTF/NEM.txt.utf')
text = dict_nem.read()
seq_dict_nem = tk_ner.tokenize(text)

dict_ned = open('./NER-LIST-UTF/NED.txt.utf')
from __future__ import unicode_literals
from source import sentences, source
from isc_tokenizer import Tokenizer
from isc_tagger import Tagger
import math
from kartakaram import kartafunc

# ensures the question is split according to sentences
tk = Tokenizer(lang='hin', split_sen=True)
tagger = Tagger(lang='hin')


def finalsent(sent, assign, is_transfer, default_change):
    container = []
    positive = ['कुल', 'मिलकर', 'मिलाकर', 'अखंडित']
    negative = [
        'पहले', 'पेहले', 'ज़्यादा', 'ज्यादा', 'बाकी', 'खर्च', 'खरीदीं', 'बच',
        'खरीदी', 'बची'
    ]
    tagged_sent = tagger.tag(sent)
    foundcont = "*"
    foundobj = "*"
    got_adj = ""
    qf_flag = 0
    got_one = False
    if is_transfer:  # there is a transfer occurring
        for index in range(0, len(tagged_sent) - 1):
            current_word = tagged_sent[index][0].strip()
            current_tag = tagged_sent[index][1]
            next_word = tagged_sent[index + 1][0].strip()
            next_tag = tagged_sent[index + 1][1]
from __future__ import unicode_literals
import time

start = time.time()

from isc_tokenizer import Tokenizer
from isc_tagger import Tagger

tk = Tokenizer(lang='hin')
tagger = Tagger(lang='hin')
print(str(time.time() - start) +
      " seconds in initializing tokenizer and tagger.\n")

# sequence = tk.tokenize("राम फल खा रहा है| :-)")

tweets = ''
with open('tokens_clean_original_train_hn.txt', 'r') as filename:
    print("Loading File....")
    tweets = filename.read().split("\n")
print("Number of sentences loaded .. " + str(len(tweets)))


def get_tag(pos_list):
    # join the POS tags of a (word, tag) list into one space-separated string
    tag_list = []
    for word, tag in pos_list:
        tag_list.append(tag)
    return ' '.join(tag_list)


start = time.time()
pos_tweets = []
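# A quick usage sketch for get_tag (illustrative only, not from the original
# script): tag one sentence and flatten its POS sequence into a string.
# sample_pos = tagger.tag(tk.tokenize("राम फल खा रहा है"))
# print(get_tag(sample_pos))  # prints the space-separated tag sequence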
def hin_tool():
    tk = Tokenizer(lang='hin')
    tagger = Tagger(lang='hin')
    return tk, tagger
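# A minimal usage sketch (an assumption, not part of the original module):
# fetch the Hindi tokenizer/tagger pair and run it on one sample sentence.
if __name__ == '__main__':
    tk, tagger = hin_tool()
    tokens = tk.tokenize("राम फल खा रहा है")
    print(tagger.tag(tokens))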
    topic_features = {}
    doc_topics, word_topics, phi_values = ldamodel.get_document_topics(
        corpus, per_word_topics=True)[index]
    for topic in doc_topics:
        topic_features['topic ' + str(topic[0])] = topic[1]
    return topic_features


if __name__ == '__main__':
    sample = 'मैं लगातार ट्विटर पर आर्सेनल के बारे में ट्वीट्स देखता हूं। दुनिया को अपडेट करने के लिए धन्यवाद @उपयोगकर्ता & @उपयोगकर्ता शॉनक्स। #'
    tknzr = Tokenizer(lang='hin')
    sys.stdout = open("toutput.txt", "a", encoding='utf-8')
    tokens = tknzr.tokenize(sample)
    tagger = Tagger(lang='hin')
    tags = tagger.tag(tokens)
    valid_tokens = []
    for p in tags:
        if p[1] != 'SYM' and p[0] != '#':
            valid_tokens.append(p[0])
    # for t in tokens:
    #     print("=>", tokens)
    # ngram_list = [gram for gram in ngrams(tokens, 2)]
    # print(get_ngrams(tokens, [1, 2]))
    print("Tokens ", tokens)
    print("POS ", tags)
    print("Filtered:", valid_tokens)
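    # A small follow-on sketch (an assumption, not in the original script)
    # showing how the commented-out n-gram experiment above could be run
    # with nltk:
    from nltk.util import ngrams
    bigram_list = [gram for gram in ngrams(valid_tokens, 2)]
    print("Bigrams:", bigram_list)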
s = "" dataset = [] for i in range(1, 47): s = 'A' + str(i) print(sheet_ranges[s].value) dataset.append(sheet_ranges[s].value) #feature extraction-unigram,nouns,window #tokenizer for unigram from isc_tokenizer import Tokenizer tk = Tokenizer(lang='eng') #tagger for tagging nouns from isc_tagger import Tagger tagger = Tagger(lang='eng') #unigram temp_u = [] for i in range(0, len(dataset)): if (dataset[i] is not None): #print(dataset[i]) temp_u.append(tk.tokenize(dataset[i])) else: temp_u.append("") unigram = [] temp = "" for i in range(0, len(temp_u)): for j in range(1, len(temp_u[i])): #started from 1 as we don't need indexing
# __future__ imports must be the very first statement in a module,
# so this line belongs at the top of the file.
from __future__ import unicode_literals

## finding corresponding corpus and train data
c = 0
corpus_train = []
for i in range(0, 500):
    temp = id.index(id_train[i])
    print(id[temp])
    print(title[temp])
    corpus_train.append(title[temp])

from isc_tokenizer import Tokenizer
tk = Tokenizer(lang='en')

from isc_tagger import Tagger
tagger = Tagger(lang='eng')

from isc_parser import Parser
parser = Parser(lang='eng')

from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

tokenized = tk.tokenize(corpus_train[10])
print(tokenized)
print(tagger.tag(corpus_train[10].split()))
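# A small follow-on sketch (an assumption, not in the original script):
# stem the tokenized title with the PorterStemmer created above.
stemmed = [ps.stem(w) for w in tokenized]
print(stemmed)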
from __future__ import unicode_literals
from source import sentences, source
from isc_tokenizer import Tokenizer
from isc_tagger import Tagger
from isc_parser import Parser
import math
from kartakaram import kartafunc
from finalsentenceanalyze import finalsent
from calculate import eq_builder

# ensures the question is split according to sentences
tk = Tokenizer(lang='hin', split_sen=True)
tagger = Tagger(lang='hin')
parser = Parser(lang='hin')

correct = 0
total = 0
negative = ['टूटे', 'खर्च', 'देने', 'नहीं', 'फटे']
wrong = []
y = []
for i in range(0, 100):
    y.append(source[i])
    total += 1

for i in y:
    # Stores the list of separated sentences within a question
    sep_sentence = tk.tokenize(i[0])
    tag_sep_sent = []  # Stores the corresponding tags
    for j in sep_sentence: