Example #1
def get_features1(tweets, subj_dict):

    print(
        "Getting features type 1 ... : [p_verb, n_verb, p_noun, n_noun, punctation, negations]"
    )
    features = []
    tknzr = Tokenizer(lang='hin')
    tagger = Tagger(lang='hin')
    #take positive and negative noun/verb phrases
    for tweet in tweets:
        feature_list = [0.0] * 6
        tokens = tknzr.tokenize(tweet)
        try:
            pos = tagger.tag(tokens)
        except Exception:
            pos = []
        #print("=>",pos,'\n')
        pos = [p for p in pos if 'V' in p[1] or 'NN' in p[1]]
        #print("==>",pos,'\n')
        for p in pos:
            word = p[0]
            if 'V' in p[1] and word in subj_dict:
                if 'verb' in subj_dict[word]:
                    if 'positive' in subj_dict[word]['verb']:
                        feature_list[0] += 1.0
                    if 'negative' in subj_dict[word]['verb']:
                        feature_list[1] += 1.0
                elif 'anypos' in subj_dict[word]:
                    if 'positive' in subj_dict[word]['anypos']:
                        feature_list[0] += 1.0
                    if 'negative' in subj_dict[word]['anypos']:
                        feature_list[1] += 1.0
            if 'NN' in p[1] and word in subj_dict:
                if 'noun' in subj_dict[word]:
                    if 'positive' in subj_dict[word]['noun']:
                        feature_list[2] += 1.0
                    if 'negative' in subj_dict[word]['noun']:
                        feature_list[3] += 1.0
                elif 'anypos' in subj_dict[word]:
                    if 'positive' in subj_dict[word]['anypos']:
                        feature_list[2] += 1.0
                    if 'negative' in subj_dict[word]['anypos']:
                        feature_list[3] += 1.0
        #derive feature from punctuations
        feature_list[4] += count_apparitions(tokens, helper.punctuation)
        #derive number of strong negations words
        feature_list[5] += count_apparitions(tokens, helper.strong_negations)

        features.append(feature_list)
    print("Done")
    return features
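# Hypothetical usage sketch, not from the original project: the exact layout of
# subj_dict is not shown in this snippet, but the lookups above suggest it maps
# word -> {pos category: polarity info}, where the inner value supports
# membership tests for 'positive'/'negative'. The Hindi entries below are
# illustrative only, and running this still requires the project's helper
# module with count_apparitions, plus isc_tokenizer/isc_tagger.
sample_subj_dict = {
    'खुश': {'anypos': ['positive']},   # "happy"
    'रोना': {'verb': ['negative']},    # "to cry"
    'दोस्त': {'noun': ['positive']},   # "friend"
}
sample_tweets = ['मैं आज बहुत खुश हूँ !!']
feature_matrix = get_features1(sample_tweets, sample_subj_dict)
# each row: [p_verb, n_verb, p_noun, n_noun, punctuation, negations]
print(feature_matrix)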
Example #2
"""
@author: vaibhav
"""

#question-answer
q1 = "What is Inexhaustible Natural Resource?"
a1 = "The resources which are present in unlimited quantity in nature and are not likely to be exhausted by human activities are known as Inexhaustible Resources. For Example: Sunlight, air"

a2 = "It is a natural resource that will never run out so if we take advantage of the greatest natural resources will not be depleted and will continue to exist, such as water, sunlight, tidal energy, ocean energy and wind energy."

#importing tokenizer,tagger,parser,stemmer
from isc_tokenizer import Tokenizer
tk = Tokenizer(lang='en')

from isc_tagger import Tagger
tagger = Tagger(lang='eng')
"""
from __future__ import unicode_literals
from isc_parser import Parser
parser = Parser(lang='eng')
"""

from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
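
# A minimal sketch (not part of the original file) of how the imports above can
# be combined: tokenize an answer, drop English stopwords, stem what remains,
# and compare the model answer a1 against the student answer a2. The Jaccard
# overlap used here is an assumption for illustration, not the author's method.
stop_words = set(stopwords.words('english'))


def preprocess(text):
    tokens = tk.tokenize(text)            # isc_tokenizer word tokens
    return {ps.stem(t.lower()) for t in tokens
            if t.isalpha() and t.lower() not in stop_words}


ref = preprocess(a1)       # model answer
cand = preprocess(a2)      # student answer
overlap = len(ref & cand) / len(ref | cand)
print("Token overlap between a1 and a2:", round(overlap, 3))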
Example #3
#--------------------------------------------------------------
import argparse
from isc_tokenizer import Tokenizer
from isc_tagger import Tagger

parser = argparse.ArgumentParser()

parser.add_argument("-in", "--input_file", help="Input File")
parser.add_argument("-out", "--output_file", help="Output File")

args = parser.parse_args()

# print( "input {} output {}".format(args.input_file,args.output_file))

input_file = open(args.input_file)
input_text = input_file.read()
output_file = open(args.output_file, "w")
dict_file_nel = open('./NER-LIST-UTF/NEL.txt.utf')
tk_ner = Tokenizer(lang='hin')
tagger = Tagger(lang='hin')

seq = tk_ner.tokenize(input_text)
# print(seq[0])
# list_ner = tagger.tag(seq)

dict_read_nel = dict_file_nel.read()
dict_word_nel = tk_ner.tokenize(dict_read_nel)
count = -1

tag_dict = {}

dict_nem = open('./NER-LIST-UTF/NEM.txt.utf')
text = dict_nem.read()
seq_dict_nem = tk_ner.tokenize(text)
dict_ned = open('./NER-LIST-UTF/NED.txt.utf')
Example #4
from __future__ import unicode_literals
from source import sentences, source
from isc_tokenizer import Tokenizer
from isc_tagger import Tagger
import math
from kartakaram import kartafunc

# ensures the question is split according to sentences
tk = Tokenizer(lang='hin', split_sen=True)
tagger = Tagger(lang='hin')


def finalsent(sent, assign, is_transfer, default_change):
    container = []
    positive = ['कुल', 'मिलकर', 'मिलाकर', 'अखंडित']
    negative = [
        'पहले', 'पेहले', 'ज़्यादा', 'ज्यादा', 'बाकी', 'खर्च', 'खरीदीं', 'बच',
        'खरीदी', 'बची'
    ]
    tagged_sent = tagger.tag(sent)
    foundcont = "*"
    foundobj = "*"
    got_adj = ""
    qf_flag = 0
    got_one = False
    if is_transfer:  # there is a transfer occurring
        for index in range(0, len(tagged_sent) - 1):
            current_word = tagged_sent[index][0].strip()
            current_tag = tagged_sent[index][1]
            next_word = tagged_sent[index + 1][0].strip()
            next_tag = tagged_sent[index + 1][1]
Example #5
from __future__ import unicode_literals
import time
start = time.time()

from isc_tokenizer import Tokenizer
from isc_tagger import Tagger

tk = Tokenizer(lang='hin')
tagger = Tagger(lang='hin')
print(
    str(time.time() - start) +
    " seconds in intializing tokenizer and tagger.\n")
#sequence = tk.tokenize("राम फल खा रहा है| :-)")

tweets = ''
with open('tokens_clean_original_train_hn.txt', 'r') as filename:
    print("Loading File....")
    tweets = filename.read().split("\n")

print("Number of sentences loaded .. " + str(len(tweets)))


def get_tag(pos_list):
    tag_list = []
    for word, tag in pos_list:
        tag_list.append(tag)
    return ' '.join(tag_list)
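

# Illustrative use of get_tag (sample sentence borrowed from the commented-out
# line above): tagger.tag returns (word, tag) pairs, get_tag keeps only the tags.
sample_pos = tagger.tag(tk.tokenize("राम फल खा रहा है"))
print(get_tag(sample_pos))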


start = time.time()
pos_tweets = []
Example #6
File: test.py Project: poojay1/HSDS
def hin_tool():
    tk = Tokenizer(lang='hin')
    tagger = Tagger(lang='hin')
    return tk, tagger
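

# Possible usage of hin_tool, assuming isc_tokenizer/isc_tagger are imported as
# in the other examples; the sample sentence is illustrative only.
tk, tagger = hin_tool()
tokens = tk.tokenize("राम फल खा रहा है")
print(tagger.tag(tokens))   # list of (word, POS tag) pairs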
Example #7
    topic_features = {}
    doc_topics, word_topics, phi_values = ldamodel.get_document_topics(
        corpus, per_word_topics=True)[index]
    for topic in doc_topics:
        topic_features['topic ' + str(topic[0])] = topic[1]
    return topic_features


if __name__ == '__main__':

    sample = 'मैं लगातार ट्विटर पर आर्सेनल के बारे में ट्वीट्स देखता हूं। दुनिया को अपडेट करने के लिए धन्यवाद @उपयोगकर्ता & @उपयोगकर्ता शॉनक्स। #'

    tknzr = Tokenizer(lang='hin')
    sys.stdout = open("toutput.txt", "a", encoding='utf-8')
    tokens = tknzr.tokenize(sample)
    tagger = Tagger(lang='hin')
    tags = tagger.tag(tokens)
    valid_tokens = []
    for p in tags:
        if p[1] != 'SYM' and p[0] != '#':
            valid_tokens.append(p[0])

    #for t in tokens:

    #print("=>",tokens)
    #ngram_list = [gram for gram in ngrams(tokens, 2)]
    #print(get_ngrams(tokens, [1,2]))
    print("Tokens ", tokens)
    print("POS ", tags)
    print("Filtered:", valid_tokens)
s = ""
dataset = []
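# sheet_ranges is not created in this excerpt; a typical openpyxl setup that
# could provide it looks like this (workbook path and sheet name are
# placeholders, not taken from the original project):
from openpyxl import load_workbook
wb = load_workbook('dataset.xlsx')   # hypothetical workbook path
sheet_ranges = wb['Sheet1']          # hypothetical sheet name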
for i in range(1, 47):
    s = 'A' + str(i)
    print(sheet_ranges[s].value)
    dataset.append(sheet_ranges[s].value)

#feature extraction-unigram,nouns,window

#tokenizer for unigram
from isc_tokenizer import Tokenizer
tk = Tokenizer(lang='eng')

#tagger for tagging nouns
from isc_tagger import Tagger
tagger = Tagger(lang='eng')

#unigram
temp_u = []
for i in range(0, len(dataset)):
    if (dataset[i] is not None):
        #print(dataset[i])
        temp_u.append(tk.tokenize(dataset[i]))
    else:
        temp_u.append("")

unigram = []
temp = ""
for i in range(0, len(temp_u)):
    for j in range(1,
                   len(temp_u[i])):  #started from 1 as we don't need indexing
Example #9
##finding corresponding corpus and train data
c = 0
corpus_train = []
for i in range(0, 500):
    temp = id.index(id_train[i])
    print(id[temp])
    print(title[temp])
    corpus_train.append(title[temp])

from isc_tokenizer import Tokenizer

tk = Tokenizer(lang='en')

from isc_tagger import Tagger

tagger = Tagger(lang='eng')

from __future__ import unicode_literals
from isc_parser import Parser

parser = Parser(lang='eng')

from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

tokenized = tk.tokenize(corpus_train[10])
print(tokenized)

print(tagger.tag(corpus_train[10].split()))
Example #10
from __future__ import unicode_literals
from source import sentences, source
from isc_tokenizer import Tokenizer
from isc_tagger import Tagger
from isc_parser import Parser
import math
from kartakaram import kartafunc
from finalsentenceanalyze import finalsent
from calculate import eq_builder

tk = Tokenizer(
    lang='hin',
    split_sen=True)  # ensures the question is split according to sentences
tagger = Tagger(lang='hin')
parser = Parser(lang='hin')

correct = 0
total = 0
negative = ['टूटे', 'खर्च', 'देने', 'नहीं', 'फटे']
wrong = []

y = []
for i in range(0, 100):
    y.append(source[i])
    total += 1

for i in y:
    sep_sentence = tk.tokenize(
        i[0])  # Stores the list of separated sentences within a question
    tag_sep_sent = []  # Stores the corresponding tags
    for j in sep_sentence: