def t2():
    # Read the CRF input file: one token per line, blank lines separate utterances.
    f = open('crf-input-data')
    clines = f.readlines()
    f.close()
    u2 = list()    # token sequences, one list per utterance
    utt = list()
    t2 = list()    # tag sequences, one list per utterance
    tutt = list()
    for cl in clines:
        parts = cl.strip()
        if parts == '':
            if utt != []:
                u2.append(utt)
                t2.append(tutt)
                utt = list()
                tutt = list()
        else:
            parts = parts.split()
            utt.append(parts[0])
            tutt.append(parts[2])
    if utt != []:
        u2.append(utt)
        t2.append(tutt)
        utt = list()
        tutt = list()
    # Compare the entities recovered from the CRF data against the annotated slots.
    lines = NLU.getALines()
    utterances = NLU.getUtterances(lines)
    for u in range(0, len(utterances)):
        slots = NLU.getSlots(utterances[u])
        sclist = list()
        for slot in slots[0]:
            sclist.append([slot[1], slot[2]])
        entlist = NLU.getEntities(u2[u], t2[u])[0]
        l1 = list()
        l2 = sclist
        for ent in entlist:
            l1.append([ent[1], ent[2]])
        if l1 != l2:
            print(str(l1) + '_' + str(l2))

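# A minimal sketch (not part of the original script) of the 'crf-input-data' layout that
# t2() assumes: whitespace-separated columns with the token in column 0 and the predicted
# tag in column 2, and a blank line between utterances. The sample rows and the helper
# below are illustrative only.
def _parse_crf_blocks(text):
    tokens, tags, block_toks, block_tags = [], [], [], []
    for line in text.splitlines():
        line = line.strip()
        if line == '':
            if block_toks:
                tokens.append(block_toks)
                tags.append(block_tags)
                block_toks, block_tags = [], []
        else:
            cols = line.split()
            block_toks.append(cols[0])
            block_tags.append(cols[2])
    if block_toks:
        tokens.append(block_toks)
        tags.append(block_tags)
    return tokens, tags

# Example (hypothetical rows):
#   I       PRP  O
#   liked   VBD  O
#   482     CD   B-CLASS
#
#   Stout   NNP  B-INSTRUCTOR
# -> ([['I', 'liked', '482'], ['Stout']], [['O', 'O', 'B-CLASS'], ['B-INSTRUCTOR']])
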
def instructorLevel():
    # Baseline instructor extraction: greedily chain consecutive tokens that appear in
    # the professor-word gazetteer and compare the guesses against the annotated slots.
    ICOR = 0    # correct guesses
    IGUE = 0    # total guesses
    IACT = 0    # actual annotated instructor slots
    profs = dicts.getProfWords()
    pattern = re.compile(r"[\W_]+")
    print(profs)
    sentences = NLU.getALines()
    utterances = NLU.getUtterances(sentences)
    for u in utterances:
        names = list()
        cname = ""
        slots = NLU.getSlots(u)[1]
        tutt = u[0].strip().lower().split()
        print(slots)
        for tok in tutt:
            ttok = pattern.sub("", tok)
            if ttok in profs:
                if cname != "":
                    cname += " "
                cname += ttok
            else:
                if cname != "":
                    names.append(cname)
                cname = ""
        if cname != "":
            names.append(cname)
        print(names)
        slist = list()
        for slot in slots:
            slist.append(slot[0].lower())
        IACT += len(slots)
        IGUE += len(names)
        for name in names:
            if name in slist:
                ICOR += 1
    print(str(ICOR * 1.0 / IGUE))   # precision of the gazetteer guesses
    print(str(ICOR * 1.0 / IACT))   # recall against the annotated slots
    print(IACT)
    return ICOR, IGUE, IACT

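# A small sketch (assumption, not in the original) that turns the (correct, guessed,
# actual) counts returned by instructorLevel() above (or classLevel() below) into
# precision, recall and F1, instead of the two raw ratios printed there.
def prf(correct, guessed, actual):
    precision = correct * 1.0 / guessed if guessed else 0.0
    recall = correct * 1.0 / actual if actual else 0.0
    f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0.0
    return precision, recall, f1

# Usage: p, r, f = prf(*instructorLevel())
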
def genLikesMap(utterances):
    likes_map.clear()
    for i in s_set:
        likes_map[i] = [list(), list()]
    for i in range(0, len(utterances)):
        slots = NLU.getSlots(utterances[i])
        speaker = s_map[utterances[i][0].strip()]
        if slots[0]:
            likes_map[speaker][0].extend(slots[0])
        if slots[1]:
            likes_map[speaker][1].extend(slots[1])
    # generate dictionary for similar likes for each person
    for q in s_set:
        simlikeq = list()
        for i in s_set:
            if i == q:
                continue
            found = False
            for j in range(0, len(likes_map[i][0])):
                if (("EECS", likes_map[i][0][j][1], likes_map[i][0][j][2]) in likes_map[q][0]
                        or ("", likes_map[i][0][j][1], likes_map[i][0][j][2]) in likes_map[q][0]) \
                        and likes_map[i][0][j][2] != "neutral":
                    #print("similar likes for " + i + " and " + q + ": " + str(likes_map[i][0][j]))
                    simlikeq.append(i)
                    found = True
                    break
            if not found:
                for j in range(0, len(likes_map[i][1])):
                    if (likes_map[i][1][j] in likes_map[q][1]
                            and likes_map[i][1][j][1] != "neutral"):
                        #print("similar likes for " + i + " and " + q + ": " + str(likes_map[i][1][j]))
                        simlikeq.append(i)
                        found = True
                        break
        sim_likes[q] = simlikeq

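# A hedged usage sketch: build the likes/similarity maps from the annotated corpus and
# inspect one speaker. It assumes the module-level structures used by genLikesMap()
# (s_set, s_map, likes_map, sim_likes) have already been initialized elsewhere in this
# module; the speaker id "p01" is purely illustrative.
def _demo_genLikesMap():
    utterances = NLU.getUtterances(NLU.getALines())
    genLikesMap(utterances)
    speaker = "p01"  # hypothetical speaker id
    if speaker in sim_likes:
        print(speaker + " shares non-neutral sentiments with: " + str(sim_likes[speaker]))
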
def classLevel():
    # Baseline class extraction: any numeric token or any EECS-dictionary phrase found
    # in the utterance counts as a guessed class mention.
    CCOR = 0    # correct guesses
    CGUE = 0    # total guesses
    CACT = 0    # actual annotated class slots
    pattern = re.compile(r"[\W_]+")
    w = dicts.getEECSdict()
    ww = list()
    for key in w.keys():
        ww.append(w[key])
    sentences = NLU.getALines()
    utterances = NLU.getUtterances(sentences)
    for u in utterances:
        xmatches = list()
        tutt = u[0].strip().lower()
        slots = NLU.getSlots(u)[0]
        for q in tutt.split():
            qq = pattern.sub("", q)
            if is_number(qq):
                xmatches.append(qq)
        for q in ww:
            if q.lower() in tutt:
                xmatches.append(q.lower())
        slist = list()
        for slot in slots:
            slist.append(slot[1].lower())
        print(slist)
        print(xmatches)
        CACT += len(slots)
        CGUE += len(xmatches)
        for name in xmatches:
            if name in slist:
                CCOR += 1
    print(str(CCOR * 1.0 / CGUE))   # precision of the guesses
    print(str(CCOR * 1.0 / CACT))   # recall against the annotated slots
    print(CACT)
    return CCOR, CGUE, CACT

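# classLevel() above and getMatrix() below rely on an is_number() helper that is not
# defined in this section; a minimal sketch of what it presumably does (the original
# implementation may differ):
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False
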
def main(): name = "MEGHAN" fi = open("../data/extract_samples/pID_AEU") pid = fi.readlines() fi.close() pidmap = dict() pset = set() for i in range(0, len(pid)): parts = pid[i].split("\t") pset.add(parts[0]) pidmap[parts[1].strip()] = parts[0] fl = open("EECS_annotated_samples_anonymized") lines = fl.readlines() fl.close() utterances = NLU.getUtterances(lines) print(utterances[0]) print("Speaker: " + pidmap[utterances[0][0].strip()]) slots = NLU.getSlots(utterances[0]) print(slots) plikes = dict() for i in pset: plikes[i] = [list(), list()] for i in range(0, len(utterances)): slots = NLU.getSlots(utterances[i]) speaker = pidmap[utterances[i][0].strip()] if slots[0]: plikes[speaker][0].extend(slots[0]) if slots[1]: plikes[speaker][1].extend(slots[1]) print("\n\nGiven that EECS 492 sentiment is neutral...") #print(plikes[name]); wholikes = ("EECS", "492", "neutral") likers = list() for i in pset: if wholikes in plikes[i][0]: likers.append(i) # check instructors in likers ucontains_i = "Quentin Stout" print("\n\nWho likes " + ucontains_i) for i in likers: for j in range(0, len(plikes[i][1])): if plikes[i][1][j][0] == ucontains_i: print(i + ": " + str(plikes[i][1][j])) # check classes in likers ucontains_cd = "EECS" ucontains_cid = "545" print("\n\nWho likes " + ucontains_cd + " " + ucontains_cid) for i in likers: for j in range(0, len(plikes[i][0])): # don't worry about department but if you want to... then use this line # plikes[i][0][j][0] == ucontains_cd and if plikes[i][0][j][1] == ucontains_cid: print(i + ": " + str(plikes[i][0][j])) # find all people with similar sentiments to <name> in the data set print("\n\nSimlikes!") simlikesmap = dict() for q in pset: simlikes = list() for i in pset: if i == q: continue found = False for j in range(0, len(plikes[i][0])): if (("EECS", plikes[i][0][j][1], plikes[i][0][j][2]) in plikes[name][0] or ("", plikes[i][0][j][1], plikes[i][0][j][2] ) in plikes[name][0]) and plikes[i][0][j][2] != "neutral": print("similar likes for " + i + " and " + name + ": " + str(plikes[i][0][j])) simlikes.append(i) found = True break if not found: for j in range(0, len(plikes[i][1])): if plikes[i][1][j] in plikes[name][ 1] and plikes[i][1][j][1] != "neutral": print("similar likes for " + i + " and " + name + ": " + str(plikes[i][1][j])) simlikes.append(i) found = True break simlikesmap[q] = simlikes # calculate % of times where OSCORE will be nonzero times = 0 ttimes = 0 for u in utterances: slots = NLU.getSlots(u) speaker = pidmap[u[0].strip()] for slot in slots[0]: ttimes += 1 oscore = 0 for i in simlikesmap[speaker]: pscore = 0 for j in range(0, len(plikes[i][0])): if slot[1] == plikes[i][0][j][1]: if plikes[i][0][j][2] == "positive": pscore += 1 elif plikes[i][0][j][2] == "negative": pscore -= 1 if pscore > 0: oscore += 1 elif pscore < 0: oscore -= 1 if oscore != 0: times += 1 for slot in slots[1]: ttimes += 1 oscore = 0 for i in simlikesmap[speaker]: pscore = 0 for j in range(0, len(plikes[i][1])): if slot[0] == plikes[i][1][j][0]: if plikes[i][1][j][1] == "positive": pscore += 1 elif plikes[i][1][j][1] == "negative": pscore -= 1 if pscore > 0: oscore += 1 elif pscore < 0: oscore -= 1 if oscore != 0: times += 1 print("Times: " + str(times)) print("Total Times: " + str(ttimes)) print("Percentage: " + str(times * 100.0 / ttimes))
def main(): fi = open("sentimentAnnotations") line1 = fi.readlines() fi.close() fo = open("EECS_annotated_samples_anonymized") line2 = fo.readlines() fo.close() utt1 = NLU.getUtterances(line1) utt2 = NLU.getUtterances(line2) correct = 0 wrong = 0 NEU_NEG = 0 NEU_POS = 0 POS_NEG = 0 SNEU_NEG = set() SNEU_NEG.add("neutral") SNEU_NEG.add("negative") SNEU_POS = set() SNEU_POS.add("neutral") SNEU_POS.add("positive") SPOS_NEG = set() SPOS_NEG.add("negative") SPOS_NEG.add("positive") disagrees = list() inst = 1 insttype = "neutral" for i in range(0, len(utt1)): slots1 = NLU.getSlots(utt1[i]) slots2 = NLU.getSlots(utt2[i]) for j in range(0, len(slots1[0])): if insttype == slots2[0][j][2]: inst += 1 if slots1[0][j][3] == slots2[0][j][3]: correct += 1 else: tset = set() tset.add(slots1[0][j][3]) tset.add(slots2[0][j][3]) disagrees.append(utt1[i]) if slots2[0][j][3] == insttype: if tset == SNEU_NEG: NEU_NEG += 1 elif tset == SNEU_POS: NEU_POS += 1 elif tset == SPOS_NEG: POS_NEG += 1 wrong += 1 for j in range(0, len(slots1[1])): if slots1[1][j][1] == slots2[1][j][1]: correct += 1 else: tset = set() disagrees.append(utt1[i]) tset.add(slots1[1][j][1]) tset.add(slots2[1][j][1]) if slots2[1][j][1] == insttype: if tset == SNEU_NEG: NEU_NEG += 1 elif tset == SNEU_POS: NEU_POS += 1 elif tset == SPOS_NEG: POS_NEG += 1 wrong += 1 print("Agree on " + str(correct)) print("Disagree on " + str(wrong)) print("Percent agreement is " + str(correct * 1.0 / (correct + wrong)) + "%") #print("NEU_NEG: " + str(NEU_NEG*1.0/(correct+wrong))); #print("NEU_POS: " + str(NEU_POS*1.0/(correct+wrong))); #print("POS_NEG: " + str(POS_NEG*1.0/(correct+wrong))); print("NEU_NEG: " + str(NEU_NEG * 1.0 / inst)) print("NEU_POS: " + str(NEU_POS * 1.0 / inst)) print("POS_NEG: " + str(POS_NEG * 1.0 / inst))
def main():
    if not os.path.exists('classifiers'):
        os.makedirs('classifiers')
    allines = NLU.getALines()
    allU = NLU.getUtterances(allines)
    textLines = NLU.getTextLines(allU)
    slots = [NLU.getSlots(i) for i in allU]
    # Flatten the annotated slots into one training target per entity mention and
    # remember which mentions belong to which utterance.
    sents = list()
    targets = list()
    tagset = list()
    sent_to_xtc = dict()
    index = 0
    for i in range(len(slots)):
        tstx = []
        for etype in ENT_TYPES:
            for j in range(len(slots[i][etype])):
                tstx.append(index)
                index += 1
                targets.append(slots[i][etype][j]['sentiment'])
                ttags = [
                    slots[i][etype][j][k] for k in ALL_IDS
                    if k in slots[i][etype][j]
                ]
                tagset.append(ttags)
                sents.append(textLines[i])
        sent_to_xtc[i] = tstx
    cprint('Number of Utterances: ' + str(index))
    cprint('Length of Lines: ' + str(len(sents)))
    cprint('Length of Targets: ' + str(len(targets)))
    # Build the vocabulary.
    cv = set()
    regex = re.compile(r'[^a-zA-Z0-9_\~\- ]+')
    for sent in range(0, len(sents)):
        parts = sents[sent].split(' ')
        for part in range(0, len(parts)):
            thepart = regex.sub('', parts[part])
            # corner case for hyphens
            hps = thepart.split('-')
            if len(hps) > 1:
                for hi in range(0, len(hps)):
                    cv.add(hps[hi].lower())
            # end corner case for hyphens
            thepart = thepart.lower()
            cv.add(thepart)
    cv = list(cv)
    cprint('Vocabulary Size: ' + str(len(cv)))
    # Build one feature row per entity mention.
    xtc = []
    for sent in range(0, len(sents)):
        #print('sentence: ' + str(sent))
        #print('s1: ' + str(sents[sent]))
        #print(sents[sent] + ' - with tagset - ' + str(tagset[sent]))
        #dparse = spwrap.parse(sents[sent])
        #print('DPARSE: ' + dparse)
        # add token boundaries to the sentence
        tokenSent = sents[sent]
        for tag in range(0, len(tagset[sent])):
            tokenSent = tokenSent.replace(tagset[sent][tag],
                                          ' ~~t~~ ' + tagset[sent][tag])
        #print(tokenSent)
        parts = regex.sub('', tokenSent)
        # this handles split and hyphen corner case
        parts = re.split(' |-', parts)
        # remove empty parts from the sentence
        while '' in parts:
            parts.remove('')
        # locate window feature indices
        windowFeatures = []
        done = False
        while not done:
            for part in range(0, len(parts)):
                if '~~t~~' == parts[part]:
                    windowFeatures += [part]
                    parts.remove(parts[part])
                    #print('parts?: ' + str(parts))
                    break
                if part == len(parts) - 1:
                    done = True
        #print('window features: ' + str(windowFeatures))
        #print('parts: ' + str(parts))
        row = []
        # featureMapG = [[0]*300]*4
        featureMap = {}
        Nflag = 0
        for part in range(0, len(parts)):
            #thepart = regex.sub('', parts[part])
            #thepart = thepart.lower()
            thepart = parts[part].lower()
            theid = cv.index(thepart)
            #print(theid)
            #g_vec = glove_features.getGloveWord(glove_dict, parts[part])
            mindist = 999
            for wf in range(0, len(windowFeatures)):
                ##############################################################
                ## This is the distance measure for window linear distance!
                distance = abs(windowFeatures[wf] - part)
                ##############################################################
                ## This is the distance measure for dependency tree distance!
                ## distance = spwrap.treeDistance(parts[windowFeatures[wf]], parts[part], dparse)
                ##############################################################
                if distance < mindist:
                    mindist = distance
            mindist += 1
            sentiz = senti_lexis.lexCounts(thepart)
            #for g_vi in range(0, len(g_vec)):
            #    featureMapG[0][g_vi] += g_vec[g_vi]  # - mindist/10.0
            #    featureMapG[1][g_vi] += g_vec[g_vi]  # - mindist/10.0
            #    featureMapG[2][g_vi] += g_vec[g_vi]  # - mindist/10.0
            #    featureMapG[3][g_vi] += g_vec[g_vi]  # - mindist/10.0
            if theid in featureMap:
                # 1.0 - mindist / 10.0 worked well for the first distance measure...
                # featureMap[theid] += 1.0 / mindist
                featureMap[theid][0] += 1.0 - mindist / 10.0
                featureMap[theid][1] += (1.0 - mindist / 10.0) * sentiz[0]
                featureMap[theid][2] += (1.0 - mindist / 10.0) * sentiz[1]
                featureMap[theid][3] += (1.0 - mindist / 10.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            else:
                # featureMap[theid] = 1.0 / mindist
                # count, positive, negative, neutral, negate
                featureMap[theid] = [0, 0, 0, 0, 0]
                featureMap[theid][0] = 1.0 - mindist / 10.0
                featureMap[theid][1] = (1.0 - mindist / 10.0) * sentiz[0]
                featureMap[theid][2] = (1.0 - mindist / 10.0) * sentiz[1]
                featureMap[theid][3] = (1.0 - mindist / 10.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            if Nflag > 0:
                Nflag -= 1
            if senti_lexis.lexNegate(thepart):
                Nflag = 2
        # expand the sparse feature map into a dense row: five values per vocabulary entry
        for i in range(0, len(cv)):
            if i in featureMap:
                row.extend(featureMap[i])
            else:
                row.extend([0, 0, 0, 0, 0])
        # add on the glove features
        # for a in range(0, len(featureMapG)):
        #     temp_vec = []
        #     for a_a in range(0, len(featureMapG[a])):
        #         temp_vec.append(featureMapG[a][a_a]*1.0/len(parts))
        #     row.extend(temp_vec)
        xtc.append(row)
    # instead read the data from splits file
    fsplits = open('splits')
    lines = fsplits.readlines()
    splits = list()
    for i in range(0, len(lines)):
        parts = lines[i].strip().split(':')
        train = list()
        test = list()
        for s in parts[0][1:-1].split(', '):
            train.append(int(s))
        for s in parts[1][1:-1].split(', '):
            test.append(int(s))
        splits.append((train, test))
    fsplits.close()
    # test print the first split
    #print(splits[0][0])
    #print(splits[0][1])
    # do gridsearch + evaluation
    fscores = open('scores_sentiment', 'w')
    bestsplit = -1
    BSscore = 0
    for i in range(0, len(splits)):
        bestC = 0
        bestGamma = 0
        bestScore = 0
        xtest = list()
        xtrain = list()
        ytest = list()
        ytrain = list()
        # add the utterance set generation here for senti_set
        # senti_utters = list()
        # for j in range(0, len(splits[i][0])):
        #     senti_utters.append(utterances[splits[i][0][j]])
        # likesMatrix, slist = leastSquares.getMatrix(senti_utters)
        # do train-test split
        for j in range(0, len(splits[i][0])):
            #speaker = senti_set.getSpeaker(utterances[splits[i][0][j]][0])
            #cossim = leastSquares.consineUser(likesMatrix, slist.index(speaker))
            #print('\n' + speaker + ': ' + utterances[splits[i][0][j]][0].strip())
            # VECTOR is 38 x 141 -> 264 total
            for LL in range(0, len(sent_to_xtc[splits[i][0][j]])):
                #fvector = likesMatrix[slist.index(speaker)]
                #fvector = fvector.tolist()[0]
                fvector = xtc[sent_to_xtc[splits[i][0][j]][LL]]
                #fvector.append(slist.index(speaker))
                ##############################################################
                #entity = tagset[sent_to_xtc[splits[i][0][j]][LL]]
                #entity = tagset2entity(entity)
                #gscore = leastSquares.getGuess(likesMatrix, entity, slist.index(speaker))
                #gscore = leastSquares.getWeightedGuess(cossim, likesMatrix, entity)
                #print('speaker: ' + str(speaker) + ' - ' + str(slist.index(speaker)))
                #fvector.append(gscore)
                ########fvector = [gscore]
                ##############################################################
                xtrain.append(fvector)
                ytrain.append(targets[sent_to_xtc[splits[i][0][j]][LL]])
        for j in range(0, len(splits[i][1])):
            #speaker = senti_set.getSpeaker(utterances[splits[i][1][j]][0])
            #cossim = leastSquares.consineUser(likesMatrix, slist.index(speaker))
            for LL in range(0, len(sent_to_xtc[splits[i][1][j]])):
                #fvector = likesMatrix[slist.index(speaker)]
                #fvector = fvector.tolist()[0]
                fvector = xtc[sent_to_xtc[splits[i][1][j]][LL]]
                #fvector.append(slist.index(speaker))
                ##############################################################
                #entity = tagset[sent_to_xtc[splits[i][1][j]][LL]]
                #entity = tagset2entity(entity)
                #gscore = leastSquares.getGuess(likesMatrix, entity, slist.index(speaker))
                #gscore = leastSquares.getWeightedGuess(cossim, likesMatrix, entity)
                #fvector.append(gscore)
                ########fvector = [gscore]
                ##############################################################
                xtest.append(fvector)
                ytest.append(targets[sent_to_xtc[splits[i][1][j]][LL]])
        # grid search over gamma and C with a simple 2-fold split of the training data
        score = 0
        for gamma in numpy.linspace(0.0001, 0.05, 10):  # 10 steps
            for C in numpy.linspace(0.1, 10, 10):  # 10 steps
                # 2 fold
                x1 = xtrain[len(xtrain) // 2:]
                x2 = xtrain[:len(xtrain) // 2]
                y1 = ytrain[len(ytrain) // 2:]
                y2 = ytrain[:len(ytrain) // 2]
                x11 = csr_matrix(x1)
                x22 = csr_matrix(x2)
                clf = svm.SVC(gamma=gamma, C=C)
                testout = clf.fit(x1, y1)
                score = clf.score(x2, y2)
                clf = svm.SVC(gamma=gamma, C=C)
                testout = clf.fit(x2, y2)
                score += clf.score(x1, y1)
                score /= 2
                if score > bestScore:
                    bestC = C
                    bestGamma = gamma
                    bestScore = score
                cprint('Cross Validation Score: ' + str(score))
                cprint('Gamma = ' + str(gamma) + ' and C = ' + str(C))
        ################ THIS IS FOR CvI EVALUATION ################
        #Ixtest = list()
        #Iytest = list()
        #Cxtest = list()
        #Cytest = list()
        #for j in range(0, len(splits[i][1])):
        #    for LL in range(0, len(sent_to_xtc[splits[i][1][j]])):
        #        fvector = xtc[sent_to_xtc[splits[i][1][j]][LL]]
        #        if coriset[sent_to_xtc[splits[i][1][j]][LL]]:
        #            Cxtest.append(fvector)
        #            Cytest.append(targets[sent_to_xtc[splits[i][1][j]][LL]])
        #        else:
        #            Ixtest.append(fvector)
        #            Iytest.append(targets[sent_to_xtc[splits[i][1][j]][LL]])
        #xtrain = csr_matrix(xtrain)
        #Cxtest = csr_matrix(Cxtest)
        #Ixtest = csr_matrix(Ixtest)
        #clf = svm.SVC(gamma=bestGamma, C=bestC)
        #testout = clf.fit(xtrain, ytrain)
        #CBscore = clf.score(Cxtest, Cytest)
        #IBscore = clf.score(Ixtest, Iytest)
        #cprint('Actual Score: ' + str(CBscore) + ':' + str(IBscore))
        #fscores.write(str(CBscore) + ':' + str(IBscore) + '\n')
        #fscores.flush()
        ###############################################################
        ################ THIS IS FOR NORMAL EVALUATION ################
        xtrain = csr_matrix(xtrain)
        xtest = csr_matrix(xtest)
        clf = svm.SVC(gamma=bestGamma, C=bestC)
        testout = clf.fit(xtrain, ytrain)
        bestScore = clf.score(xtest, ytest)
        cprint('Actual Score: ' + str(bestScore))
        fscores.write(str(bestScore) + '\n')
        ###############################################################
        # save best classifier per fold (binary mode so the pickled bytes round-trip)
        cString = pickle.dumps(clf)
        fsave1 = open('classifiers/sentiment_classifier' + str(i), 'wb')
        fsave1.write(cString)
        fsave1.close()
    fscores.close()
    # save feature dictionary
    cvString = pickle.dumps(cv)
    fsave2 = open('sentiment_dictionary', 'wb')
    fsave2.write(cvString)
    fsave2.close()

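# A hedged sketch (not part of the original) of how the artifacts written above could be
# loaded back for prediction: each fold's classifier is pickled to
# 'classifiers/sentiment_classifier<i>' and the vocabulary list to 'sentiment_dictionary',
# so reloading is a pair of pickle reads. The fold index 0 is just an example.
import pickle

def load_sentiment_model(fold=0):
    with open('classifiers/sentiment_classifier' + str(fold), 'rb') as f:
        clf = pickle.loads(f.read())
    with open('sentiment_dictionary', 'rb') as f:
        cv = pickle.loads(f.read())
    return clf, cv

# Usage: clf, cv = load_sentiment_model(0); then build a feature row with the same
# five-values-per-vocabulary-entry layout used above and call clf.predict([row]).
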
def getMatrix(utterances):
    GROUNDTRUTHS = True
    np.set_printoptions(threshold=np.inf)
    #lines = NLU.getALines()
    # Build the speaker-by-entity sentiment matrix (class columns first, then instructor
    # columns); unknown cells stay at -1, and sentiments map to 0 / 5 / 10.
    ioffset = len(classes)
    X = np.ones((len(sset), len(classes) + len(instructors))) * -1
    #print(X.shape)
    for i in range(0, len(utterances)):
        slots = NLU.getSlots(utterances[i])
        cslots = slots[0]
        islots = slots[1]
        for slot in islots:
            iname = ""
            if GROUNDTRUTHS:
                iname = slot[0]
            else:
                if slot[0] in entcache.keys():
                    iname = entcache[slot[0]]
                else:
                    iname = ed.entityDistance(slot[0])[1][1]
                    entcache[slot[0]] = iname
            if slot[1] == "positive":
                X[slist.index(smap[utterances[i][0].strip()])][ioffset + instructors.index(iname)] = 10
            elif slot[1] == "negative":
                X[slist.index(smap[utterances[i][0].strip()])][ioffset + instructors.index(iname)] = 0
            elif slot[1] == "neutral":
                X[slist.index(smap[utterances[i][0].strip()])][ioffset + instructors.index(iname)] = 5
        for slot in cslots:
            if is_number(slot[1]):
                if slot[1] in classes:
                    if slot[2] == "positive":
                        X[slist.index(smap[utterances[i][0].strip()])][classes.index(slot[1])] = 10
                    elif slot[2] == "negative":
                        X[slist.index(smap[utterances[i][0].strip()])][classes.index(slot[1])] = 0
                    elif slot[2] == "neutral":
                        X[slist.index(smap[utterances[i][0].strip()])][classes.index(slot[1])] = 5
                else:
                    pass  #print(slot[1] + " is not a class...")
            else:
                classname = ""
                if GROUNDTRUTHS:
                    classname = slot[1]
                else:
                    if slot[1] in entcache.keys():
                        classname = entcache[slot[1]]
                    else:
                        classname = ed.entityDistance(slot[1])[0][1]
                        entcache[slot[1]] = classname
                if slot[2] == "positive":
                    X[slist.index(smap[utterances[i][0].strip()])][classNames.index(classname)] = 10
                elif slot[2] == "negative":
                    X[slist.index(smap[utterances[i][0].strip()])][classNames.index(classname)] = 0
                elif slot[2] == "neutral":
                    X[slist.index(smap[utterances[i][0].strip()])][classNames.index(classname)] = 5
    # These four lines apply the low-rank (NMF/ALS) completion; return X, slist instead of
    # newX, slist to use the raw sentiment matrix.
    A, Y = nmf(X, 50)
    A = np.matrix(A)
    Y = np.matrix(Y)
    newX = A * Y
    return newX, slist

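# A hedged illustration (not part of the original) of reading a sentiment guess back out
# of the completed matrix: getMatrix() encodes negative/neutral/positive as 0/5/10, so a
# reconstructed cell can be thresholded back to a label. The speaker/class arguments and
# the 2.5/7.5 thresholds here are assumptions for illustration only.
def guess_class_sentiment(newX, slist, speaker, class_number):
    row = slist.index(speaker)
    col = classes.index(class_number)   # class columns come before instructor columns
    value = newX[row, col]
    if value >= 7.5:
        return "positive"
    elif value <= 2.5:
        return "negative"
    return "neutral"
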