# Imports required by the feature-extraction functions below. Project-specific helpers
# (getBingLiusCounters, getClusters, get_sentiwordnet, sent140aff_bigrams,
# get_pos_tags_and_hashtags, exclamations, questions, ...) are assumed to be
# defined elsewhere in the repository.
import codecs
import sys
from collections import Counter, OrderedDict

import numpy as np
import scipy.sparse
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, normalize
from twitterTokenizer import Tokenizer


def embeddings(tweetText, path2voca):
    # Average, element-wise max and element-wise min of the 50-dim word embeddings
    # of each tweet (150 features per tweet). Each line of the vocabulary file is
    # a token followed by its embedding values.
    f = codecs.open(path2voca).read().splitlines()
    dico = {i.split()[0]: np.array([float(x) for x in i.split()[1:]]) for i in f}
    tokenizer = Tokenizer(preserve_case=False)
    feat = []
    for key, tweet in enumerate(tweetText):
        words = tokenizer.tokenize(tweet)
        my_vec, cnt, min_max = np.zeros(50), 0, []
        for i in words:
            j = i.strip("_neg")
            try:
                my_vec += dico[j]
                cnt += 1
                min_max.append(dico[j])
            except KeyError:  # word not in the embedding vocabulary
                pass
        if len(min_max) > 1:
            min_max = np.array(min_max)
            my_min = np.amin(min_max, axis=0)
            my_max = np.amax(min_max, axis=0)
        else:
            my_min, my_max = np.zeros(50), np.zeros(50)
        if cnt > 1:
            my_vec /= cnt
        feat.append(np.hstack((my_vec, my_max, my_min)))
    return np.array(feat)
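# A minimal, hypothetical usage sketch for embeddings(); the file name below is
# illustrative, not from this repository. The vocabulary file is assumed to hold
# one token per line followed by its 50 space-separated embedding values, and the
# returned matrix has one row per tweet and 150 columns (average, max, min).
#
#   tweets = ["I love this movie !", "worst day ever"]
#   X_emb = embeddings(tweets, "../embeddings/twitter_vectors_50d.txt")
#   print X_emb.shape   # (2, 150)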
def bing_lius(tweetText, pos, different_pos_tags, pos_text):
    # Bing Liu opinion-lexicon counts, split by capitalisation, negation context
    # and POS tag.
    with codecs.open('../lexicons/positive-words_bing_liu.txt', 'r') as inFile:
        positive = set(inFile.read().splitlines())
    with codecs.open('../lexicons/negative-words_bing_liu.txt', 'r') as inFile:
        negative = set(inFile.read().splitlines())
    feat = []
    tokenizer = Tokenizer(preserve_case=True)
    for key, tweet in enumerate(tweetText):
        words = tokenizer.tokenize(tweet)
        counters, counters_cap = np.zeros(8), np.zeros(8)
        for j in words:
            if j.isupper():  # all-caps tokens are counted separately
                counters_cap += np.array(getBingLiusCounters(positive, negative, j.lower()))
            else:
                counters += np.array(getBingLiusCounters(positive, negative, j.lower()))
        # Per-POS-tag counts: [positive, negated positive, negative, negated negative]
        pos_sen = OrderedDict({x: [0, 0, 0, 0] for x in different_pos_tags})
        for k_key, k in enumerate(pos_text[key]):
            if k in positive:
                pos_sen[pos[key][k_key]][0] += 1
            if k in negative:
                pos_sen[pos[key][k_key]][2] += 1
            if k.endswith("_NEG"):
                if k.strip("_NEG") in positive:
                    pos_sen[pos[key][k_key]][1] += 1
                if k.strip("_NEG") in negative:
                    pos_sen[pos[key][k_key]][3] += 1
        # my_feat = list(counters)+list(counters_cap)+[g for gg in pos_sen.values() for g in gg]
        my_feat = list(counters + counters_cap) + [g for gg in pos_sen.values() for g in gg]
        feat.append(np.array(my_feat))
    return np.array(feat)
def createDataMatrix(ngram_features, character_gram_features, tweetText, pos,
                     pos_features, different_pos_tags, pos_text, voca_clusters,
                     categories):
    # Builds the full sparse feature matrix (n-grams, char-grams, lexicon, POS,
    # cluster and handmade features) and the label vector y.
    tokenizer_case_preserve = Tokenizer(preserve_case=True)
    tokenizer = Tokenizer(preserve_case=False)
    handmade_features, cll, cll2 = [], [], []
    for tweet in tweetText:
        feat = []
        feat.append(exclamations(tweet))
        feat.append(questions(tweet))
        feat.append(questions_and_exclamation(tweet))
        feat.append(emoticon_negative(tweet))
        feat.append(emoticon_positive(tweet))
        words = tokenizer_case_preserve.tokenize(tweet)  #preserving casing
        feat.append(allCaps(words))
        feat.append(elongated(words))
        feat.append(questions_and_exclamation(words[-1]))
        handmade_features.append(np.array(feat))
        words = tokenizer.tokenize(tweet)
        words = [word.strip("_NEG") for word in words]
        cll.append(getClusters(voca_clusters, words))
        #cll2.append(getClusters(voca_handmade, words))
    bl = csr_matrix(bing_lius(tweetText, pos, different_pos_tags, pos_text))
    nrc_emo = csr_matrix(nrc_emotion(tweetText, pos, different_pos_tags, pos_text))
    mpqa_feat = csr_matrix(mpqa(tweetText, pos, different_pos_tags, pos_text))
    handmade_features = np.array(handmade_features)
    mlb = MultiLabelBinarizer(sparse_output=True, classes=list(set(voca_clusters.values())))
    cluster_memberships_binarized = csr_matrix(mlb.fit_transform(cll))
    #mlb = MultiLabelBinarizer(sparse_output=True, classes=list(set(voca_handmade.values())))
    #cluster_memberships_binarized_2 = csr_matrix(mlb.fit_transform(cll2))
    hasht = csr_matrix(sent140aff(tweetText, pos, different_pos_tags, pos_text,
                                  '../lexicons/HashtagSentimentAffLexNegLex/HS-AFFLEX-NEGLEX-unigrams.txt'))
    # sent140aff_data = csr_matrix(sent140aff(tweetText, pos, different_pos_tags, pos_text, '../../lexicons/Sentiment140AffLexNegLex/S140-AFFLEX-NEGLEX-unigrams.txt'))
    hasht_bigrams = csr_matrix(sent140aff_bigrams(tweetText, pos, different_pos_tags, pos_text,
                                                  '../lexicons/HashtagSentimentAffLexNegLex/HS-AFFLEX-NEGLEX-bigrams.txt'))
    # sent140affBigrams = csr_matrix(sent140aff_bigrams(tweetText, pos, different_pos_tags, pos_text, '../../lexicons/Sentiment140AffLexNegLex/S140-AFFLEX-NEGLEX-bigrams.txt'))
    sentQ = csr_matrix(get_sentiwordnet(pos_text, pos))
    pos_features = csr_matrix(pos_features)
    handmade_features = csr_matrix(handmade_features)
    # ffeatures = scipy.sparse.hstack((ngram_features, character_gram_features, cluster_memberships_binarized, handmade_features, pos_features, sent140affBigrams, hasht_bigrams, hasht, sent140aff_data, bl, mpqa_feat, nrc_emo), dtype=float)
    ffeatures = scipy.sparse.hstack((ngram_features, character_gram_features, sentQ,
                                     handmade_features, pos_features,
                                     cluster_memberships_binarized, bl, mpqa_feat,
                                     nrc_emo, hasht, hasht_bigrams), dtype=float)
    # print ngram_features.shape, character_gram_features.shape, cluster_memberships_binarized.shape, handmade_features.shape, pos_features.shape, hasht_bigrams.shape, hasht.shape, bl.shape, mpqa_feat.shape, nrc_emo.shape
    y = []
    for i in categories:
        if i == 'positive':
            y.append(1)
        elif i == 'negative':
            y.append(-1)
        elif i == 'UNKNOWN':
            y.append(0)
        else:
            print i
    ffeatures = normalize(ffeatures)
    # ffeatures, y = shuffle(ffeatures, y)
    return ffeatures, y
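# A hedged sketch of how createDataMatrix() is typically consumed; the classifier
# choice (LinearSVC) and its hyper-parameter are illustrative assumptions, not
# taken from this file.
#
#   from sklearn.svm import LinearSVC
#   X_train, y_train = createDataMatrix(ngram_features, character_gram_features,
#                                       tweetText, pos, pos_features,
#                                       different_pos_tags, pos_text,
#                                       voca_clusters, categories)
#   clf = LinearSVC(C=1.0).fit(X_train, y_train)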
def sent140aff(tweetText, pos, different_pos_tags, pos_text, path2lexicon):
    # Score statistics from an AffLex/NegLex unigram lexicon. The lexicon file is
    # tab-separated: term (possibly suffixed with _NEG or _NEGFIRST), score, and
    # further columns that are ignored here.
    with codecs.open(path2lexicon, 'r') as inFile:
        wds = inFile.read().splitlines()
    pos_cont, nega_cont, nega_cont_first = {}, {}, {}
    for i in wds:
        i = i.split("\t")
        if i[0].endswith("_NEG"):
            name = "".join(i[0].split('_')[:-1])
            nega_cont[name] = float(i[1])
        elif i[0].endswith('_NEGFIRST'):
            name = "".join(i[0].split('_')[:-1])
            nega_cont_first[name] = float(i[1])
        else:
            pos_cont[i[0]] = float(i[1])
    feat = []
    tokenizer = Tokenizer(preserve_case=False)
    for key, tweet in enumerate(tweetText):
        cnt, scor = 0, []
        words = tokenizer.tokenize(tweet)
        for my_key, i in enumerate(words):
            if i in pos_cont:
                scor.append(pos_cont[i])
            if i.endswith('_neg'):
                j = i.strip("_neg")
                flag = 0
                # first token of a negated span (note: for the first token of the
                # tweet this index wraps around to the last token)
                if not words[my_key - 1].endswith('_neg'):
                    if j in nega_cont_first:
                        scor.append(nega_cont_first[j])
                        flag = 1
                    elif j in nega_cont:
                        scor.append(nega_cont[j])
                        flag = 1
                if j in nega_cont and flag == 0:
                    scor.append(nega_cont[j])
        if len(scor) > 0:
            pos_scores = [x for x in scor if x > 0]
            neg_scores = [x for x in scor if x < 0]
            if len(pos_scores) == 0:
                pos_scores = [0]
            if len(neg_scores) == 0:
                neg_scores = [0]
            feat.append([len(scor), len(pos_scores), len(neg_scores),
                         sum(scor), sum(pos_scores), sum(neg_scores),
                         max(scor), max(pos_scores), max(neg_scores),
                         scor[-1], pos_scores[-1], neg_scores[-1]])
        else:
            feat.append(list(np.zeros(12)))
    return np.array(feat)
def nrc_emotion(tweetText, pos, different_pos_tags, pos_text):
    # NRC Emotion lexicon counts, computed the same way as the Bing Liu features.
    # The lexicon file is tab-separated: word, affect category, association flag.
    with codecs.open('../lexicons/NRC-Emotion-Lexicon-v0.92/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt', 'r') as inFile:
        wds = inFile.read().splitlines()
    positive, negative = [], []
    for i in wds:
        my_i = i.split('\t')
        if my_i[1] == 'positive' and my_i[2] == '1':
            positive.append(my_i[0])
        if my_i[1] == 'negative' and my_i[2] == '1':
            negative.append(my_i[0])
    feat = []
    positive, negative = set(positive), set(negative)
    tokenizer = Tokenizer(preserve_case=True)
    for key, tweet in enumerate(tweetText):
        words = tokenizer.tokenize(tweet)
        counters, counters_caps = np.zeros(8), np.zeros(8)
        for i in words:
            if i.isupper():
                counters_caps += np.array(getBingLiusCounters(positive, negative, i.lower()))
            else:
                counters += np.array(getBingLiusCounters(positive, negative, i.lower()))
        # Per-POS-tag counts: [positive, negated positive, negative, negated negative]
        pos_sen = OrderedDict({x: [0, 0, 0, 0] for x in different_pos_tags})
        for k_key, k in enumerate(pos_text[key]):
            if k in positive:
                pos_sen[pos[key][k_key]][0] += 1
            if k in negative:
                pos_sen[pos[key][k_key]][2] += 1
            if k.endswith("_NEG"):
                if k.strip("_NEG") in positive:
                    pos_sen[pos[key][k_key]][1] += 1
                if k.strip("_NEG") in negative:
                    pos_sen[pos[key][k_key]][3] += 1
        # my_feat = list(counters)+list(counters_caps)+[g for gg in pos_sen.values() for g in gg]
        my_feat = list(counters + counters_caps) + [g for gg in pos_sen.values() for g in gg]
        feat.append(np.array(my_feat))
    return np.array(feat)
def construct_features(self, tokenized_tweet, nrc_lexicons, bing_liu, mpqa, clusters, negations):
    #print "Tweet : ", tokenized_tweet
    f = []

    #NRC Lexicon: [min, max, avg of lexicon]
    #tokenized_tweet = ['hello', 'world', 'great', 'worst']
    #print nrc_lexicons.get_features(tokenized_tweet)
    f += nrc_lexicons.get_features(tokenized_tweet)

    #Bing Liu Lexicon: [no_of_positive_words, no_of_negative_words]
    #print bing_liu.get_features(tokenized_tweet)
    #f += bing_liu.get_features(tokenized_tweet)

    #MPQA subjectivity lexicon
    #print mpqa.get_features(tokenized_tweet)
    f += mpqa.get_features(tokenized_tweet)
    #print f

    #Find 1000 clusters
    #f += clusters.get_features(tokenized_tweet)

    #Negation words
    f += negations.get_features(tokenized_tweet)

    from twitterTokenizer import Tokenizer
    tokenizer = Tokenizer()

    #Char grams (4- and 5-grams), hashed to 22000 dimensions
    char_gram = HashingVectorizer(strip_accents='unicode', binary=True,
                                  ngram_range=(4, 5), stop_words=None,
                                  lowercase=True, analyzer='char',
                                  tokenizer=tokenizer.tokenize, n_features=22000)
    char_gram_features = char_gram.fit_transform([' '.join(tokenized_tweet)])
    char_grams = char_gram_features.toarray()
    print len(char_grams[0])
    #print len(f)
    return f
#g = codecs.open('../SemEval2016-task4-test.subtask-BD.txt', encoding='utf8').read().splitlines()  #Test data to generate final predictions
g = [i.split("\t") for i in g if i.split("\t")[-1] != 'Not Available']
tweetTest, categories_test = [i[-1] for i in g], [i[2] for i in g]

l = [i[1] for i in g]  #This is to group tweets by topic. Can be improved!!
cnt = Counter(l)
yo = [0]
test_cats = []
for i in range(len(set(l))):
    num = cnt[l[yo[i]]]
    test_cats.append(l[num + yo[i] - 1])
    yo.append(num + yo[i])

tokenizer = Tokenizer()
#N-gram feature vectorizer
ngram = HashingVectorizer(strip_accents='unicode', binary=True, ngram_range=(1, 4),
                          stop_words=None, lowercase=True,
                          tokenizer=tokenizer.tokenize, n_features=10000)
#Char-gram feature vectorizer
character_gram = HashingVectorizer(strip_accents='unicode', binary=True, ngram_range=(4, 5),
                                   stop_words=None, lowercase=True, analyzer='char',
                                   tokenizer=tokenizer.tokenize, n_features=22000)
n_power = float(sys.argv[1])  #parameter of the a-power transformation; 0.9 was used for the submission

#Linguistic, POS, sentiment dictionaries etc.
#Get POS tags of everything (train + test), then split back into train and test
pos1, pos_features1, different_pos_tags1, pos_text1 = get_pos_tags_and_hashtags(tweetText + tweetTest)
pos, pos_features, different_pos_tags, pos_text = (pos1[:len(categories)], pos_features1[:len(categories)],
                                                   different_pos_tags1, pos_text1[:len(categories)])
pos_test, pos_features_test, different_pos_tags_test, pos_text_test = (pos1[len(categories):], pos_features1[len(categories):],
                                                                       different_pos_tags1, pos_text1[len(categories):])

ngram_features = ngram.fit_transform(tweetText)  #Get n-gram features
character_gram_features = character_gram.fit_transform(tweetText)  #Get char-gram features
ngram_features.data **= n_power  #a-power transformation
character_gram_features.data **= n_power  #a-power transformation
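# A hedged sketch of the step that would naturally follow: the test tweets are
# transformed with the already-fitted hashing vectorizers and the same a-power
# transformation (the *_test variable names mirror the training side and are
# assumptions, not taken from this snippet).
#
#   ngram_features_test = ngram.transform(tweetTest)
#   character_gram_features_test = character_gram.transform(tweetTest)
#   ngram_features_test.data **= n_power
#   character_gram_features_test.data **= n_power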
def mpqa(tweetText, pos, different_pos_tags, pos_text):
    # MPQA subjectivity lexicon features: polarity (positive=+1, negative=-1)
    # weighted by subjectivity strength (weaksubj=1, strongsubj=2), with separate
    # scores for plain, negated and hashtag occurrences, plus per-POS-tag counts.
    voca = codecs.open('../lexicons/subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff', 'r').read().splitlines()
    wds1, wds = {}, {}
    for i in voca:
        i = i.split()
        try:
            if wds1[i[2].split('=')[1]] != i[5].split('=')[1]:
                pass
        except KeyError:  # first time this word is seen
            if i[5].split('=')[1] in ['positive', 'negative']:
                wds1[i[2].split('=')[1]] = i[5].split('=')[1]
                wds[i[2].split('=')[1]] = (i[0].split('=')[1], i[5].split('=')[1])
    feat = []
    tokenizer = Tokenizer(preserve_case=False)
    for key, tweet in enumerate(tweetText):
        direction = {'negative': -1, 'positive': 1, 'neutral': 0, 'both': 0,
                     'weaksubj': 1, 'strongsubj': 2}
        pp, pn, npp, nn, pp_hash, pn_hash, npp_hash, nn_hash = 0, 0, 0, 0, 0, 0, 0, 0
        words = tokenizer.tokenize(tweet)
        for i in words:
            if i in wds:
                if direction[wds[i][1]] > 0:
                    pp += direction[wds[i][0]] * direction[wds[i][1]]
                if direction[wds[i][1]] < 0:
                    pn += direction[wds[i][0]] * direction[wds[i][1]]
            if i.endswith("_neg"):
                my_i = i.strip("_neg")
                if my_i in wds:
                    if direction[wds[my_i][1]] > 0:
                        npp += direction[wds[my_i][0]] * direction[wds[my_i][1]]
                    if direction[wds[my_i][1]] < 0:
                        nn += direction[wds[my_i][0]] * direction[wds[my_i][1]]
            if i[0] == "#":  # hashtag occurrences are scored separately
                if i[1:] in wds:
                    if direction[wds[i[1:]][1]] > 0:
                        pp_hash += direction[wds[i[1:]][0]] * direction[wds[i[1:]][1]]
                    if direction[wds[i[1:]][1]] < 0:
                        pn_hash += direction[wds[i[1:]][0]] * direction[wds[i[1:]][1]]
                if i.endswith("_neg"):
                    my_i = i[1:].strip("_neg")
                    if my_i in wds:
                        if direction[wds[my_i][1]] > 0:
                            npp_hash += direction[wds[my_i][0]] * direction[wds[my_i][1]]
                        if direction[wds[my_i][1]] < 0:
                            nn_hash += direction[wds[my_i][0]] * direction[wds[my_i][1]]
        # Per-POS-tag counts: [positive, negative, negated positive, negated negative]
        pos_sen = OrderedDict({x: [0, 0, 0, 0] for x in different_pos_tags})
        for k_key, i in enumerate(pos_text[key]):
            if i in wds:
                if direction[wds[i][1]] > 0:
                    pos_sen[pos[key][k_key]][0] += 1
                if direction[wds[i][1]] < 0:
                    pos_sen[pos[key][k_key]][1] += 1
            if i.endswith("_NEG"):
                if i.strip('_NEG') in wds:
                    ii = i.strip('_NEG')
                    if direction[wds[ii][1]] > 0:
                        pos_sen[pos[key][k_key]][2] += 1
                    if direction[wds[ii][1]] < 0:
                        pos_sen[pos[key][k_key]][3] += 1
        my_feat = [pp, pn, npp, nn, pp_hash, pn_hash, npp_hash, nn_hash] + \
                  [g for gg in pos_sen.values() for g in gg]
        feat.append(np.array(my_feat))
    return np.array(feat)