def main():
    # np.inf prints full arrays; the old threshold='nan' hack breaks on recent numpy
    np.set_printoptions(threshold=np.inf)
    lines = NLU.getALines()
    utterances = NLU.getUtterances(lines)
    # do ALS stuff
    X = getMatrix(utterances)[0]
    idtestp = 28
    print(X[idtestp])
    cossim = consineUser(X, idtestp)
    classtest = 280
    print(getWeightedGuess(cossim, X, classtest))
def xactinst():
    w = dicts.getEECSprofs()
    sentences = NLU.getALines()
    utterances = NLU.getUtterances(sentences)
    xmatches = list()
    for i in utterances:
        tutt = i[0].strip().lower()
        for q in w:
            if q.lower() in tutt:
                xmatches.append(q)
    bees = len(xmatches)
    eyes = 0
    for x in xmatches:
        ptz = x.split()
        eyes += len(ptz) - 1
    print(bees)
    print(eyes)
def xactclass():
    w = dicts.getEECSdict()
    ww = list()
    for key in w.keys():
        ww.append(w[key])
    sentences = NLU.getALines()
    utterances = NLU.getUtterances(sentences)
    xmatches = list()
    for i in utterances:
        tutt = i[0].strip().lower()
        for q in ww:
            if q.lower() in tutt:
                xmatches.append(q)
    bees = len(xmatches)
    eyes = 0
    for x in xmatches:
        ptz = x.split()
        eyes += len(ptz) - 1
    print(bees)
    print(eyes)
def t2():
    f = open('crf-input-data')
    clines = f.readlines()
    f.close()
    u2 = list()
    utt = list()
    t2 = list()
    tutt = list()
    for cl in clines:
        parts = cl.strip()
        if parts == '':
            if utt != []:
                u2.append(utt)
                t2.append(tutt)
                utt = list()
                tutt = list()
        else:
            parts = parts.split()
            utt.append(parts[0])
            tutt.append(parts[2])
    if utt != []:
        u2.append(utt)
        t2.append(tutt)
        utt = list()
        tutt = list()
    lines = NLU.getALines()
    utterances = NLU.getUtterances(lines)
    for u in range(0, len(utterances)):
        slots = NLU.getSlots(utterances[u])
        sclist = list()
        for slot in slots[0]:
            sclist.append([slot[1], slot[2]])
        entlist = NLU.getEntities(u2[u], t2[u])[0]
        l1 = list()
        l2 = sclist
        for ent in entlist:
            l1.append([ent[1], ent[2]])
        if l1 != l2:
            print(str(l1) + '_' + str(l2))
def instructorLevel():
    ICOR = 0
    IGUE = 0
    IACT = 0
    profs = dicts.getProfWords()
    pattern = re.compile("[\W_]+")
    print(profs)
    sentences = NLU.getALines()
    utterances = NLU.getUtterances(sentences)
    for u in utterances:
        names = list()
        cname = ""
        slots = NLU.getSlots(u)[1]
        tutt = u[0].strip().lower().split()
        print(slots)
        for tok in tutt:
            ttok = pattern.sub("", tok)
            if ttok in profs:
                if cname != "":
                    cname += " "
                cname += ttok
            else:
                if cname != "":
                    names.append(cname)
                cname = ""
        if cname != "":
            names.append(cname)
        print(names)
        slist = list()
        for slot in slots:
            slist.append(slot[0].lower())
        IACT += len(slots)
        IGUE += len(names)
        for name in names:
            if name in slist:
                ICOR += 1
    print(str(ICOR * 1.0 / IGUE))
    print(str(ICOR * 1.0 / IACT))
    print(IACT)
    return ICOR, IGUE, IACT
def classLevel():
    CCOR = 0
    CGUE = 0
    CACT = 0
    pattern = re.compile("[\W_]+")
    w = dicts.getEECSdict()
    ww = list()
    for key in w.keys():
        ww.append(w[key])
    sentences = NLU.getALines()
    utterances = NLU.getUtterances(sentences)
    for u in utterances:
        xmatches = list()
        tutt = u[0].strip().lower()
        slots = NLU.getSlots(u)[0]
        for q in tutt.split():
            qq = pattern.sub("", q)
            if is_number(qq):
                xmatches.append(qq)
        for q in ww:
            if q.lower() in tutt:
                xmatches.append(q.lower())
        slist = list()
        for slot in slots:
            slist.append(slot[1].lower())
        print(slist)
        print(xmatches)
        CACT += len(slots)
        CGUE += len(xmatches)
        for name in xmatches:
            if name in slist:
                CCOR += 1
    print(str(CCOR * 1.0 / CGUE))
    print(str(CCOR * 1.0 / CACT))
    print(CACT)
    return CCOR, CGUE, CACT
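# Note: classLevel() above and the RNTN comparison script below both call an
# is_number() helper that is not shown in this section. The sketch here is an
# assumption based on how it is used (checking whether a punctuation-stripped
# token such as "492" parses as a number), not the repo's actual definition.
def is_number(s):
    # True if the token parses as a number (e.g. a course id), False otherwise.
    try:
        float(s)
        return True
    except ValueError:
        return False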
def main(): fo = open("EECS_annotated_samples_anonymized", "r") lines = fo.readlines() utterances = NLU.getUtterances(lines) mode = False sents = list() targets = list() lastTaken = "" lastSent = "" isclass = False tagset = list() coriset = list() lastTagset = list() index = 0 # to make cross validation work after sentences are duplicated for entities sent_to_xtc = dict() sent_to_xtc[0] = list() for i in range(len(lines)): data = lines[i].strip() if "" == data: index += 1 sent_to_xtc[index] = list() if data.startswith("<class") or data.startswith("<instructor"): mode = True lastTaken = "" lastTagset = list() if data.startswith("<class"): isclass = True if mode and data.startswith("sentiment="): lastTaken = data[10:] if lastTaken.endswith(">"): lastTaken = lastTaken[:-1] if mode and data.startswith("name="): temp = data[5:] if temp.endswith(">"): temp = temp[:-1] lastTagset.append(temp) if mode and data.startswith("id="): temp = data[3:] if temp.endswith(">"): temp = temp[:-1] lastTagset.append(temp) if mode and data.startswith("department="): temp = data[11:] if temp.endswith(">"): temp = temp[:-1] lastTagset.append(temp) if not mode and "" != data: lastSent = data if data.endswith(">"): mode = False coriset.append(isclass) isclass = False sents.append(lastSent) tagset.append(lastTagset) sent_to_xtc[index].append(len(sents) - 1) if lastTaken == "": targets.append("neutral") else: targets.append(lastTaken) # This will print out mapping from sentences to entity vectors (XTC) #foutest = open("outtestJ", "w"); #for key in sent_to_xtc: # foutest.write(str(key) + " : " + str(sent_to_xtc[key]) + "\n"); #foutest.flush(); #foutest.close(); #randomly sample utterances #testdata = random.sample(range(0, index), index/5); print("number of utterances: " + str(index)) print("length of lines: " + str(len(sents))) print("length of targets: " + str(len(targets))) print("sent 2: " + str(sents[2])) print("tagset 2: " + str(tagset[2])) cv = set() regex = re.compile(r"[^a-zA-Z0-9_\~\- ]+") for sent in range(0, len(sents)): parts = sents[sent].split(" ") for part in range(0, len(parts)): thepart = regex.sub("", parts[part]) # corner case for hyphens hps = thepart.split("-") if len(hps) > 1: for hi in range(0, len(hps)): cv.add(hps[hi].lower()) # end corner case for hyphens thepart = thepart.lower() cv.add(thepart) cv = list(cv) cv.append("452") #bug? 
print("vocabulary size: " + str(len(cv))) print("index of I: " + str(cv.index("i"))) xtc = [] for sent in range(0, len(sents)): print("sentence: " + str(sent)) print("s1: " + str(sents[sent])) #print(sents[sent] + " - with tagset - " + str(tagset[sent])); #dparse = spwrap.parse(sents[sent]); #print("DPARSE: " + dparse); # add token boundaries to the sentence tokenSent = sents[sent] for tag in range(0, len(tagset[sent])): tokenSent = tokenSent.replace(tagset[sent][tag], " ~~t~~ " + tagset[sent][tag]) print(tokenSent) parts = regex.sub("", tokenSent) # this handles split and hyphen corner case parts = re.split(" |-", parts) # remove empty parts from the sentence while "" in parts: parts.remove("") # locate window feature indicies windowFeatures = [] done = False while not done: for part in range(0, len(parts)): if "~~t~~" == parts[part]: windowFeatures += [part] parts.remove(parts[part]) print("parts?: " + str(parts)) break if part == len(parts) - 1: done = True print("window features: " + str(windowFeatures)) print("parts: " + str(parts)) row = [] featureMap = {} Nflag = 0 for part in range(0, len(parts)): #thepart = regex.sub("", parts[part]); #thepart = thepart.lower(); thepart = parts[part].lower() theid = cv.index(thepart) print(theid) mindist = 999 for wf in range(0, len(windowFeatures)): ############################################################## ## This is the distance measure for window linear distance! distance = abs(windowFeatures[wf] - part) ############################################################## ## This is the distance measure for dependency tree distnace! ## distance = spwrap.treeDistance(parts[windowFeatures[wf]], parts[part], dparse); ############################################################## if distance < mindist: mindist = distance mindist += 1 sentiz = senti_lexis.lexCounts(thepart) if theid in featureMap: # 2.0 - mindist / 7.0 worked well for the first distance measure... 
                # featureMap[theid] += 1.0 / mindist;
                featureMap[theid][0] += 2.0 - mindist / 7.0
                featureMap[theid][1] += (2.0 - mindist / 7.0) * sentiz[0]
                featureMap[theid][2] += (2.0 - mindist / 7.0) * sentiz[1]
                featureMap[theid][3] += (2.0 - mindist / 7.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            else:
                # featureMap[theid] = 1.0 / mindist;
                # count, positive, negative, neutral, negate
                featureMap[theid] = [0, 0, 0, 0, 0]
                featureMap[theid][0] = 2.0 - mindist / 7.0
                featureMap[theid][1] = (2.0 - mindist / 7.0) * sentiz[0]
                featureMap[theid][2] = (2.0 - mindist / 7.0) * sentiz[1]
                featureMap[theid][3] = (2.0 - mindist / 7.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            if Nflag > 0:
                Nflag -= 1
            if senti_lexis.lexNegate(thepart):
                Nflag = 2
        for i in range(0, len(cv)):
            if i in featureMap:
                row.extend(featureMap[i])
            else:
                row.extend([0, 0, 0, 0, 0])
        xtc.append(row)
    #instead read the data from splits file
    fsplits = open("splits")
    lines = fsplits.readlines()
    splits = list()
    for i in range(0, len(lines)):
        parts = lines[i].strip().split(":")
        train = list()
        test = list()
        for s in parts[0][1:-1].split(", "):
            train.append(int(s))
        for s in parts[1][1:-1].split(", "):
            test.append(int(s))
        splits.append((train, test))
    fsplits.close()
    #test print the first split
    #print(splits[0][0]);
    #print(splits[0][1]);
    bestsplit = -1
    BSscore = 0
    for i in range(0, len(splits)):
        bestC = 0
        bestGamma = 0
        bestScore = 0
        xtest = list()
        xtrain = list()
        ytest = list()
        ytrain = list()
        # add the utterance set generation here for senti_set
        senti_utters = list()
        for j in range(0, len(splits[i][0])):
            senti_utters.append(utterances[splits[i][0][j]])
        likesMatrix, slist = leastSquares.getMatrix(senti_utters)
        # do train-test split
        csims = np.array([0.0] * 38)
        totz = 0
        #for j in range(0, len(splits[i][0])):
        #    speaker = senti_set.getSpeaker(utterances[splits[i][0][j]][0]);
        #    cossim = leastSquares.cosineUserWE(likesMatrix, slist.index(speaker));
        #    np.add(csims, cossim);
        #    totz += 1;
        for j in range(0, len(splits[i][1])):
            speaker = senti_set.getSpeaker(utterances[splits[i][1][j]][0])
            cossim = leastSquares.cosineUserWE(likesMatrix, slist.index(speaker))
            cossim = np.array(cossim)
            csims = np.add(csims, cossim)
            totz += 1
        for j in range(0, len(csims)):
            csims[j] /= totz
        print(csims.tolist())
def main(): fo = open("../data/extract_samples/EECS_annotated_samples_anonymized", "r") lines = fo.readlines() utterances = NLU.getUtterances(lines) mode = False sents = list() targets = list() lastTaken = "" lastSent = "" isclass = False tagset = list() coriset = list() lastTagset = list() index = 0 sent_to_xtc = dict() sent_to_xtc[0] = list() for i in range(len(lines)): data = lines[i].strip() if "" == data: index += 1 sent_to_xtc[index] = list() if data.startswith("<class") or data.startswith("<instructor"): mode = True lastTaken = "" lastTagset = list() if data.startswith("<class"): isclass = True if mode and data.startswith("sentiment="): lastTaken = data[10:] if lastTaken.endswith(">"): lastTaken = lastTaken[:-1] if mode and data.startswith("name="): temp = data[5:] if temp.endswith(">"): temp = temp[:-1] lastTagset.append(temp) if mode and data.startswith("id="): temp = data[3:] if temp.endswith(">"): temp = temp[:-1] lastTagset.append(temp) if mode and data.startswith("department="): temp = data[11:] if temp.endswith(">"): temp = temp[:-1] lastTagset.append(temp) if not mode and "" != data: lastSent = data if data.endswith(">"): mode = False coriset.append(isclass) isclass = False sents.append(lastSent) tagset.append(lastTagset) sent_to_xtc[index].append(len(sents) - 1) if lastTaken == "": targets.append("neutral") else: targets.append(lastTaken) f2 = open("RNTN_sent") gendict = f2.readlines() f2.close() sdict = dict() slist = list() parent = None lastpart = None for i in gendict: if i.startswith(":"): parent = i[1:].strip() sdict[parent] = dict() slist.append(parent) elif is_number(i.strip()): sdict[parent][lastpart] = int(i.strip()) else: lastpart = i.strip() sdict[parent][lastpart] = -1 print(len(tagset)) print(len(sdict.keys())) print(len(sent_to_xtc)) print(len(targets)) tries = 0 correct = 0 for q in range(0, len(slist)): print(sdict[slist[q]]) print(sent_to_xtc[q]) for i in sent_to_xtc[q]: print(str(tagset[i]) + ":" + str(targets[i])) for j in sdict[slist[q]]: if tagset[i][0] in j: asent = "neutral" if int(sdict[slist[q]][j]) > 2: asent = "positive" elif int(sdict[slist[q]][j]) < 1: asent = "negative" print(asent) tries += 1 if targets[i] == asent: correct += 1 print("correct: " + str(correct * 1.0 / tries))
def main():
    # get scores
    fscores = open("S1feature")  # S1feature - S1single_lies
    lines = fscores.readlines()
    fscores.close()
    scores = list()
    for i in lines:
        scores.append(float(i.strip()))
    sort_scores = [i[0] for i in sorted(enumerate(scores), key=lambda x: x[1])]
    sort_scores.reverse()
    # get splits
    fsplits = open("splits")
    splines = fsplits.readlines()
    splits = list()
    for i in range(0, len(splines)):
        parts = splines[i].strip().split(":")
        train = list()
        test = list()
        for s in parts[0][1:-1].split(", "):
            train.append(int(s))
        for s in parts[1][1:-1].split(", "):
            test.append(int(s))
        splits.append((train, test))
    fsplits.close()
    # get speakers
    nlines = NLU.getALines()
    utterances = NLU.getUtterances(nlines)
    nlist = list()
    for i in range(0, len(splits)):
        senti_utters = list()
        for j in range(0, len(splits[i][0])):
            senti_utters.append(utterances[splits[i][0][j]])
        likesMatrix, slist = leastSquares.getMatrix(senti_utters)
        test_utters = list()
        for j in range(0, len(splits[i][1])):
            test_utters.append(utterances[splits[i][1][j]])
        TlikesMatrix, Tslist = leastSquares.getMatrix(test_utters)
        nonneus = 0
        nnews = 0
        density = 0.0
        counts = list()
        # iterate over rows
        for k in range(0, len(likesMatrix)):
            nonneu = 0
            for j in range(0, len(likesMatrix[k])):
                if int(likesMatrix[k][j]) != 5:
                    nonneu += 1
            if nonneu > 0:
                nnews += 1
            nonneus += nonneu
            counts.append(nonneu)
        # iterate over columns
        elaps = 0
        for k in range(0, len(likesMatrix[0])):
            nonneu = 0
            TNEW = 0
            for j in range(0, len(likesMatrix)):
                if int(likesMatrix[j][k]) != 5:
                    nonneu = 1
                if int(TlikesMatrix[j][k]) != 5:
                    TNEW = 1
            if nonneu == 1 and TNEW == 1:
                elaps += 1
        nlist.append(str(nnews) + ":" + str(nonneus) + ":" + str(counts) + ":" + str(elaps))
    # print correlations
    for i in sort_scores:
        print(str(scores[i]) + " - " + nlist[i])
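# The "splits" file is re-parsed with the same inline loop by several of these
# scripts (the grid-search drivers and the baseline below). A shared helper could
# replace those copies; this is a sketch under the assumption that each line has
# the form "[i, j, ...]:[k, l, ...]" with train indices before the colon and test
# indices after it. readSplits is a hypothetical name, not an existing function.
def readSplits(path="splits"):
    splits = list()
    with open(path) as fsplits:
        for line in fsplits:
            trainPart, testPart = line.strip().split(":")
            train = [int(s) for s in trainPart[1:-1].split(", ")]
            test = [int(s) for s in testPart[1:-1].split(", ")]
            splits.append((train, test))
    return splits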
def main(): name = "MEGHAN" fi = open("../data/extract_samples/pID_AEU") pid = fi.readlines() fi.close() pidmap = dict() pset = set() for i in range(0, len(pid)): parts = pid[i].split("\t") pset.add(parts[0]) pidmap[parts[1].strip()] = parts[0] fl = open("EECS_annotated_samples_anonymized") lines = fl.readlines() fl.close() utterances = NLU.getUtterances(lines) print(utterances[0]) print("Speaker: " + pidmap[utterances[0][0].strip()]) slots = NLU.getSlots(utterances[0]) print(slots) plikes = dict() for i in pset: plikes[i] = [list(), list()] for i in range(0, len(utterances)): slots = NLU.getSlots(utterances[i]) speaker = pidmap[utterances[i][0].strip()] if slots[0]: plikes[speaker][0].extend(slots[0]) if slots[1]: plikes[speaker][1].extend(slots[1]) print("\n\nGiven that EECS 492 sentiment is neutral...") #print(plikes[name]); wholikes = ("EECS", "492", "neutral") likers = list() for i in pset: if wholikes in plikes[i][0]: likers.append(i) # check instructors in likers ucontains_i = "Quentin Stout" print("\n\nWho likes " + ucontains_i) for i in likers: for j in range(0, len(plikes[i][1])): if plikes[i][1][j][0] == ucontains_i: print(i + ": " + str(plikes[i][1][j])) # check classes in likers ucontains_cd = "EECS" ucontains_cid = "545" print("\n\nWho likes " + ucontains_cd + " " + ucontains_cid) for i in likers: for j in range(0, len(plikes[i][0])): # don't worry about department but if you want to... then use this line # plikes[i][0][j][0] == ucontains_cd and if plikes[i][0][j][1] == ucontains_cid: print(i + ": " + str(plikes[i][0][j])) # find all people with similar sentiments to <name> in the data set print("\n\nSimlikes!") simlikesmap = dict() for q in pset: simlikes = list() for i in pset: if i == q: continue found = False for j in range(0, len(plikes[i][0])): if (("EECS", plikes[i][0][j][1], plikes[i][0][j][2]) in plikes[name][0] or ("", plikes[i][0][j][1], plikes[i][0][j][2] ) in plikes[name][0]) and plikes[i][0][j][2] != "neutral": print("similar likes for " + i + " and " + name + ": " + str(plikes[i][0][j])) simlikes.append(i) found = True break if not found: for j in range(0, len(plikes[i][1])): if plikes[i][1][j] in plikes[name][ 1] and plikes[i][1][j][1] != "neutral": print("similar likes for " + i + " and " + name + ": " + str(plikes[i][1][j])) simlikes.append(i) found = True break simlikesmap[q] = simlikes # calculate % of times where OSCORE will be nonzero times = 0 ttimes = 0 for u in utterances: slots = NLU.getSlots(u) speaker = pidmap[u[0].strip()] for slot in slots[0]: ttimes += 1 oscore = 0 for i in simlikesmap[speaker]: pscore = 0 for j in range(0, len(plikes[i][0])): if slot[1] == plikes[i][0][j][1]: if plikes[i][0][j][2] == "positive": pscore += 1 elif plikes[i][0][j][2] == "negative": pscore -= 1 if pscore > 0: oscore += 1 elif pscore < 0: oscore -= 1 if oscore != 0: times += 1 for slot in slots[1]: ttimes += 1 oscore = 0 for i in simlikesmap[speaker]: pscore = 0 for j in range(0, len(plikes[i][1])): if slot[0] == plikes[i][1][j][0]: if plikes[i][1][j][1] == "positive": pscore += 1 elif plikes[i][1][j][1] == "negative": pscore -= 1 if pscore > 0: oscore += 1 elif pscore < 0: oscore -= 1 if oscore != 0: times += 1 print("Times: " + str(times)) print("Total Times: " + str(ttimes)) print("Percentage: " + str(times * 100.0 / ttimes))
def main(): fi = open("sentimentAnnotations") line1 = fi.readlines() fi.close() fo = open("EECS_annotated_samples_anonymized") line2 = fo.readlines() fo.close() utt1 = NLU.getUtterances(line1) utt2 = NLU.getUtterances(line2) correct = 0 wrong = 0 NEU_NEG = 0 NEU_POS = 0 POS_NEG = 0 SNEU_NEG = set() SNEU_NEG.add("neutral") SNEU_NEG.add("negative") SNEU_POS = set() SNEU_POS.add("neutral") SNEU_POS.add("positive") SPOS_NEG = set() SPOS_NEG.add("negative") SPOS_NEG.add("positive") disagrees = list() inst = 1 insttype = "neutral" for i in range(0, len(utt1)): slots1 = NLU.getSlots(utt1[i]) slots2 = NLU.getSlots(utt2[i]) for j in range(0, len(slots1[0])): if insttype == slots2[0][j][2]: inst += 1 if slots1[0][j][3] == slots2[0][j][3]: correct += 1 else: tset = set() tset.add(slots1[0][j][3]) tset.add(slots2[0][j][3]) disagrees.append(utt1[i]) if slots2[0][j][3] == insttype: if tset == SNEU_NEG: NEU_NEG += 1 elif tset == SNEU_POS: NEU_POS += 1 elif tset == SPOS_NEG: POS_NEG += 1 wrong += 1 for j in range(0, len(slots1[1])): if slots1[1][j][1] == slots2[1][j][1]: correct += 1 else: tset = set() disagrees.append(utt1[i]) tset.add(slots1[1][j][1]) tset.add(slots2[1][j][1]) if slots2[1][j][1] == insttype: if tset == SNEU_NEG: NEU_NEG += 1 elif tset == SNEU_POS: NEU_POS += 1 elif tset == SPOS_NEG: POS_NEG += 1 wrong += 1 print("Agree on " + str(correct)) print("Disagree on " + str(wrong)) print("Percent agreement is " + str(correct * 1.0 / (correct + wrong)) + "%") #print("NEU_NEG: " + str(NEU_NEG*1.0/(correct+wrong))); #print("NEU_POS: " + str(NEU_POS*1.0/(correct+wrong))); #print("POS_NEG: " + str(POS_NEG*1.0/(correct+wrong))); print("NEU_NEG: " + str(NEU_NEG * 1.0 / inst)) print("NEU_POS: " + str(NEU_POS * 1.0 / inst)) print("POS_NEG: " + str(POS_NEG * 1.0 / inst))
def main():
    if not os.path.exists('classifiers'):
        os.makedirs('classifiers')
    allines = NLU.getALines()
    allU = NLU.getUtterances(allines)
    textLines = NLU.getTextLines(allU)
    slots = [NLU.getSlots(i) for i in allU]
    sents = list()
    targets = list()
    tagset = list()
    sent_to_xtc = dict()
    index = 0
    for i in range(len(slots)):
        tstx = []
        for etype in ENT_TYPES:
            for j in range(len(slots[i][etype])):
                tstx.append(index)
                index += 1
                targets.append(slots[i][etype][j]['sentiment'])
                ttags = [
                    slots[i][etype][j][k] for k in ALL_IDS
                    if k in slots[i][etype][j]
                ]
                tagset.append(ttags)
                sents.append(textLines[i])
        sent_to_xtc[i] = tstx
    cprint('Number of Utterances: ' + str(index))
    cprint('Length of Lines: ' + str(len(sents)))
    cprint('Length of Targets: ' + str(len(targets)))
    cv = set()
    regex = re.compile(r'[^a-zA-Z0-9_\~\- ]+')
    for sent in range(0, len(sents)):
        parts = sents[sent].split(' ')
        for part in range(0, len(parts)):
            thepart = regex.sub('', parts[part])
            # corner case for hyphens
            hps = thepart.split('-')
            if len(hps) > 1:
                for hi in range(0, len(hps)):
                    cv.add(hps[hi].lower())
            # end corner case for hyphens
            thepart = thepart.lower()
            cv.add(thepart)
    cv = list(cv)
    cprint('Vocabulary Size: ' + str(len(cv)))
    xtc = []
    for sent in range(0, len(sents)):
        #print('sentence: ' + str(sent))
        #print('s1: ' + str(sents[sent]))
        #print(sents[sent] + ' - with tagset - ' + str(tagset[sent]))
        #dparse = spwrap.parse(sents[sent])
        #print('DPARSE: ' + dparse)
        # add token boundaries to the sentence
        tokenSent = sents[sent]
        for tag in range(0, len(tagset[sent])):
            tokenSent = tokenSent.replace(tagset[sent][tag], ' ~~t~~ ' + tagset[sent][tag])
        #print(tokenSent)
        parts = regex.sub('', tokenSent)
        # this handles split and hyphen corner case
        parts = re.split(' |-', parts)
        # remove empty parts from the sentence
        while '' in parts:
            parts.remove('')
        # locate window feature indices
        windowFeatures = []
        done = False
        while not done:
            for part in range(0, len(parts)):
                if '~~t~~' == parts[part]:
                    windowFeatures += [part]
                    parts.remove(parts[part])
                    #print('parts?: ' + str(parts))
                    break
                if part == len(parts) - 1:
                    done = True
        #print('window features: ' + str(windowFeatures))
        #print('parts: ' + str(parts))
        row = []
        # featureMapG = [[0]*300]*4
        featureMap = {}
        Nflag = 0
        for part in range(0, len(parts)):
            #thepart = regex.sub('', parts[part])
            #thepart = thepart.lower()
            thepart = parts[part].lower()
            theid = cv.index(thepart)
            #print(theid)
            #g_vec = glove_features.getGloveWord(glove_dict, parts[part])
            mindist = 999
            for wf in range(0, len(windowFeatures)):
                ##############################################################
                ## This is the distance measure for window linear distance!
                distance = abs(windowFeatures[wf] - part)
                ##############################################################
                ## This is the distance measure for dependency tree distance!
                ## distance = spwrap.treeDistance(parts[windowFeatures[wf]], parts[part], dparse)
                ##############################################################
                if distance < mindist:
                    mindist = distance
            mindist += 1
            sentiz = senti_lexis.lexCounts(thepart)
            #for g_vi in range(0, len(g_vec)):
            #    featureMapG[0][g_vi] += g_vec[g_vi];# - mindist/10.0
            #    featureMapG[1][g_vi] += g_vec[g_vi];# - mindist/10.0
            #    featureMapG[2][g_vi] += g_vec[g_vi];# - mindist/10.0
            #    featureMapG[3][g_vi] += g_vec[g_vi];# - mindist/10.0
            if theid in featureMap:
                # 1.0 - mindist / 10.0 worked well for the first distance measure...
                # featureMap[theid] += 1.0 / mindist
                featureMap[theid][0] += 1.0 - mindist / 10.0
                featureMap[theid][1] += (1.0 - mindist / 10.0) * sentiz[0]
                featureMap[theid][2] += (1.0 - mindist / 10.0) * sentiz[1]
                featureMap[theid][3] += (1.0 - mindist / 10.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            else:
                # featureMap[theid] = 1.0 / mindist
                # count, positive, negative, neutral, negate
                featureMap[theid] = [0, 0, 0, 0, 0]
                featureMap[theid][0] = 1.0 - mindist / 10.0
                featureMap[theid][1] = (1.0 - mindist / 10.0) * sentiz[0]
                featureMap[theid][2] = (1.0 - mindist / 10.0) * sentiz[1]
                featureMap[theid][3] = (1.0 - mindist / 10.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            if Nflag > 0:
                Nflag -= 1
            if senti_lexis.lexNegate(thepart):
                Nflag = 2
        for i in range(0, len(cv)):
            if i in featureMap:
                row.extend(featureMap[i])
            else:
                row.extend([0, 0, 0, 0, 0])
        # add on the glove features
        # for a in range(0, len(featureMapG)):
        #     temp_vec = []
        #     for a_a in range(0, len(featureMapG[a])):
        #         temp_vec.append(featureMapG[a][a_a]*1.0/len(parts))
        #     row.extend(temp_vec)
        xtc.append(row)
    #instead read the data from splits file
    fsplits = open('splits')
    lines = fsplits.readlines()
    splits = list()
    for i in range(0, len(lines)):
        parts = lines[i].strip().split(':')
        train = list()
        test = list()
        for s in parts[0][1:-1].split(', '):
            train.append(int(s))
        for s in parts[1][1:-1].split(', '):
            test.append(int(s))
        splits.append((train, test))
    fsplits.close()
    #test print the first split
    #print(splits[0][0])
    #print(splits[0][1])
    #do gridsearch + evaluation
    fscores = open('scores_sentiment', 'w')
    bestsplit = -1
    BSscore = 0
    for i in range(0, len(splits)):
        bestC = 0
        bestGamma = 0
        bestScore = 0
        xtest = list()
        xtrain = list()
        ytest = list()
        ytrain = list()
        # add the utterance set generation here for senti_set
        # senti_utters = list()
        # for j in range(0, len(splits[i][0])):
        #     senti_utters.append(utterances[splits[i][0][j]])
        #likesMatrix, slist = leastSquares.getMatrix(senti_utters)
        # do train-test split
        for j in range(0, len(splits[i][0])):
            #speaker = senti_set.getSpeaker(utterances[splits[i][0][j]][0])
            #cossim = leastSquares.consineUser(likesMatrix, slist.index(speaker))
            #print('\n' + speaker + ': ' + utterances[splits[i][0][j]][0].strip())
            # VECTOR is 38 x 141 -> 264 total
            for LL in range(0, len(sent_to_xtc[splits[i][0][j]])):
                #fvector = likesMatrix[slist.index(speaker)]
                #fvector = fvector.tolist()[0]
                fvector = xtc[sent_to_xtc[splits[i][0][j]][LL]]
                #fvector.append(slist.index(speaker))
                ##############################################################
                #entity = tagset[sent_to_xtc[splits[i][0][j]][LL]]
                #entity = tagset2entity(entity)
                #gscore = leastSquares.getGuess(likesMatrix, entity, slist.index(speaker))
                #gscore = leastSquares.getWeightedGuess(cossim, likesMatrix, entity)
                #print('speaker: ' + str(speaker) + ' - ' + str(slist.index(speaker)))
                #fvector.append(gscore)
                ########fvector = [gscore]
                ##############################################################
                xtrain.append(fvector)
                ytrain.append(targets[sent_to_xtc[splits[i][0][j]][LL]])
        for j in range(0, len(splits[i][1])):
            #speaker = senti_set.getSpeaker(utterances[splits[i][1][j]][0])
            #cossim = leastSquares.consineUser(likesMatrix, slist.index(speaker))
            for LL in range(0, len(sent_to_xtc[splits[i][1][j]])):
                #fvector = likesMatrix[slist.index(speaker)]
                #fvector = fvector.tolist()[0]
                fvector = xtc[sent_to_xtc[splits[i][1][j]][LL]]
                #fvector.append(slist.index(speaker))
                ##############################################################
                #entity = tagset[sent_to_xtc[splits[i][1][j]][LL]]
                #entity = tagset2entity(entity)
                #gscore = leastSquares.getGuess(likesMatrix, entity, slist.index(speaker))
                #gscore = leastSquares.getWeightedGuess(cossim, likesMatrix, entity)
                #fvector.append(gscore)
                ########fvector = [gscore]
                ##############################################################
                xtest.append(fvector)
                ytest.append(targets[sent_to_xtc[splits[i][1][j]][LL]])
        score = 0
        for gamma in numpy.linspace(0.0001, 0.05, 10):  # 10 steps
            for C in numpy.linspace(0.1, 10, 10):  # 10 steps
                # 2 fold (integer division so the slice indices stay ints)
                x1 = xtrain[len(xtrain) // 2:]
                x2 = xtrain[:len(xtrain) // 2]
                y1 = ytrain[len(ytrain) // 2:]
                y2 = ytrain[:len(ytrain) // 2]
                x11 = csr_matrix(x1)
                x22 = csr_matrix(x2)
                clf = svm.SVC(gamma=gamma, C=C)
                testout = clf.fit(x1, y1)
                score = clf.score(x2, y2)
                clf = svm.SVC(gamma=gamma, C=C)
                testout = clf.fit(x2, y2)
                score += clf.score(x1, y1)
                score /= 2
                if score > bestScore:
                    bestC = C
                    bestGamma = gamma
                    bestScore = score
                cprint('Cross Validation Score: ' + str(score))
                cprint('Gamma = ' + str(gamma) + ' and C = ' + str(C))
        ################ THIS IS FOR CvI EVALUATION ################
        #Ixtest = list()
        #Iytest = list()
        #Cxtest = list()
        #Cytest = list()
        #for j in range(0, len(splits[i][1])):
        #    for LL in range(0, len(sent_to_xtc[splits[i][1][j]])):
        #        fvector = xtc[sent_to_xtc[splits[i][1][j]][LL]]
        #        if coriset[sent_to_xtc[splits[i][1][j]][LL]]:
        #            Cxtest.append(fvector)
        #            Cytest.append(targets[sent_to_xtc[splits[i][1][j]][LL]])
        #        else:
        #            Ixtest.append(fvector)
        #            Iytest.append(targets[sent_to_xtc[splits[i][1][j]][LL]])
        #xtrain = csr_matrix(xtrain)
        #Cxtest = csr_matrix(Cxtest)
        #Ixtest = csr_matrix(Ixtest)
        #clf = svm.SVC(gamma=bestGamma, C=bestC)
        #testout = clf.fit(xtrain, ytrain)
        #CBscore = clf.score(Cxtest, Cytest)
        #IBscore = clf.score(Ixtest, Iytest)
        #cprint('Actual Score: ' + str(CBscore) + ':' + str(IBscore))
        #fscores.write(str(CBscore) + ':' + str(IBscore) + '\n')
        #fscores.flush()
        ###############################################################
        ################ THIS IS FOR NORMAL EVALUATION ################
        xtrain = csr_matrix(xtrain)
        xtest = csr_matrix(xtest)
        clf = svm.SVC(gamma=bestGamma, C=bestC)
        testout = clf.fit(xtrain, ytrain)
        bestScore = clf.score(xtest, ytest)
        cprint('Actual Score: ' + str(bestScore))
        fscores.write(str(bestScore) + '\n')
        ###############################################################
        # save best classifier per fold
        cString = pickle.dumps(clf)
        fsave1 = open('classifiers/sentiment_classifier' + str(i), 'w')
        fsave1.write(cString)
        fsave1.close()
    fscores.close()
    # save feature dictionary
    cvString = pickle.dumps(cv)
    fsave2 = open('sentiment_dictionary', 'w')
    fsave2.write(cvString)
    fsave2.close()
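# The loop above pickles one classifier per fold into classifiers/sentiment_classifier<i>
# and the vocabulary list into sentiment_dictionary. The sketch below shows one way those
# artifacts could be reloaded for prediction; the file names match what main() writes, but
# loadFoldClassifier is a hypothetical helper, and it assumes feature rows are built the
# same way as the xtc rows above.
def loadFoldClassifier(fold):
    # read back the pickled SVM for this fold and the shared feature dictionary
    with open('classifiers/sentiment_classifier' + str(fold)) as fclf:
        clf = pickle.loads(fclf.read())
    with open('sentiment_dictionary') as fcv:
        cv = pickle.loads(fcv.read())
    return clf, cv

# Example usage with an already-built feature row:
# clf, cv = loadFoldClassifier(0)
# print(clf.predict(csr_matrix([row])))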
def main(): fo = open("EECS_annotated_samples_anonymized", "r") lines = fo.readlines() utterances = NLU.getUtterances(lines) mode = False sents = list() targets = list() lastTaken = "" lastSent = "" isclass = False tagset = list() lastTagset = list() index = 0 # to make cross validation work after sentences are duplicated for entities sent_to_xtc = dict() sent_to_xtc[0] = list() for i in range(len(lines)): data = lines[i].strip() if "" == data: index += 1 sent_to_xtc[index] = list() if data.startswith("<class") or data.startswith("<instructor"): mode = True lastTaken = "" lastTagset = list() if data.startswith("<class"): isclass = True if mode and data.startswith("sentiment="): lastTaken = data[10:] if lastTaken.endswith(">"): lastTaken = lastTaken[:-1] if mode and data.startswith("name="): temp = data[5:] if temp.endswith(">"): temp = temp[:-1] lastTagset.append(temp) if mode and data.startswith("id="): temp = data[3:] if temp.endswith(">"): temp = temp[:-1] lastTagset.append(temp) if mode and data.startswith("department="): temp = data[11:] if temp.endswith(">"): temp = temp[:-1] lastTagset.append(temp) if not mode and "" != data: lastSent = data if data.endswith(">"): mode = False isclass = False sents.append(lastSent) tagset.append(lastTagset) sent_to_xtc[index].append(len(sents) - 1) if lastTaken == "": targets.append("neutral") else: targets.append(lastTaken) # This will print out mapping from sentences to entity vectors (XTC) #foutest = open("outtestJ", "w"); #for key in sent_to_xtc: # foutest.write(str(key) + " : " + str(sent_to_xtc[key]) + "\n"); #foutest.flush(); #foutest.close(); #randomly sample utterances #testdata = random.sample(range(0, index), index/5); print("number of utterances: " + str(index)) print("length of lines: " + str(len(sents))) print("length of targets: " + str(len(targets))) print("sent 2: " + str(sents[2])) print("tagset 2: " + str(tagset[2])) cv = set() regex = re.compile(r"[^a-zA-Z0-9_\~\- ]+") for sent in range(0, len(sents)): parts = sents[sent].split(" ") for part in range(0, len(parts)): thepart = regex.sub("", parts[part]) # corner case for hyphens hps = thepart.split("-") if len(hps) > 1: for hi in range(0, len(hps)): cv.add(hps[hi].lower()) # end corner case for hyphens thepart = thepart.lower() cv.add(thepart) cv = list(cv) cv.append("452") #bug? 
print("vocabulary size: " + str(len(cv))) print("index of I: " + str(cv.index("i"))) xtc = [] for sent in range(0, len(sents)): print("sentence: " + str(sent)) print("s1: " + str(sents[sent])) #print(sents[sent] + " - with tagset - " + str(tagset[sent])); #dparse = spwrap.parse(sents[sent]); #print("DPARSE: " + dparse); # add token boundaries to the sentence tokenSent = sents[sent] for tag in range(0, len(tagset[sent])): tokenSent = tokenSent.replace(tagset[sent][tag], " ~~t~~ " + tagset[sent][tag]) print(tokenSent) parts = regex.sub("", tokenSent) # this handles split and hyphen corner case parts = re.split(" |-", parts) # remove empty parts from the sentence while "" in parts: parts.remove("") # locate window feature indicies windowFeatures = [] done = False while not done: for part in range(0, len(parts)): if "~~t~~" == parts[part]: windowFeatures += [part] parts.remove(parts[part]) print("parts?: " + str(parts)) break if part == len(parts) - 1: done = True print("window features: " + str(windowFeatures)) print("parts: " + str(parts)) row = [] featureMap = {} Nflag = 0 for part in range(0, len(parts)): #thepart = regex.sub("", parts[part]); #thepart = thepart.lower(); thepart = parts[part].lower() theid = cv.index(thepart) print(theid) mindist = 999 for wf in range(0, len(windowFeatures)): ############################################################## ## This is the distance measure for window linear distance! distance = abs(windowFeatures[wf] - part) ############################################################## ## This is the distance measure for dependency tree distnace! ## distance = spwrap.treeDistance(parts[windowFeatures[wf]], parts[part], dparse); ############################################################## if distance < mindist: mindist = distance mindist += 1 sentiz = senti_lexis.lexCounts(thepart) if theid in featureMap: # 2.0 - mindist / 7.0 worked well for the first distance measure... 
                # featureMap[theid] += 1.0 / mindist;
                featureMap[theid][0] += 2.0 - mindist / 7.0
                featureMap[theid][1] += (2.0 - mindist / 7.0) * sentiz[0]
                featureMap[theid][2] += (2.0 - mindist / 7.0) * sentiz[1]
                featureMap[theid][3] += (2.0 - mindist / 7.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            else:
                # featureMap[theid] = 1.0 / mindist;
                # count, positive, negative, neutral, negate
                featureMap[theid] = [0, 0, 0, 0, 0]
                featureMap[theid][0] = 2.0 - mindist / 7.0
                featureMap[theid][1] = (2.0 - mindist / 7.0) * sentiz[0]
                featureMap[theid][2] = (2.0 - mindist / 7.0) * sentiz[1]
                featureMap[theid][3] = (2.0 - mindist / 7.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            if Nflag > 0:
                Nflag -= 1
            if senti_lexis.lexNegate(thepart):
                Nflag = 2
        for i in range(0, len(cv)):
            if i in featureMap:
                row.extend(featureMap[i])
            else:
                row.extend([0, 0, 0, 0, 0])
        xtc.append(row)
    #cv = CountVectorizer();
    #xtc = cv.fit_transform(sents);
    #examining data structures here
    #parts = sents[0].split(" ");
    #for part in range(0, len(parts)):
    #    print("PART: " + parts[part]);
    #print("WORD TAKE: " + str(cv.vocabulary_.get(u'i')));
    #print("WORD TAKE: " + str(cv.vocabulary_.get(u'took')));
    #print("WORD DONT: " + str(cv.vocabulary_.get(u'don')));
    #print("WORD DONT: " + str(cv.vocabulary_.get(u't')));
    #print("WORD TAKE: " + str(cv.vocabulary_.get(u'183')));
    #print(str(xtc.shape));
    #print("ROW0");
    #print(xtc[0]);
    #print("ROW1");
    #print(xtc[1]);
    print("ROW2")
    print(xtc[2])
    print(len(xtc[2]))
    #print(type(xtc[0]));
    #print(type(xtc));
    #print(str(len(sents)));
    #endtest
    #xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(xtc, targets, test_size=0.2, random_state=0);
    #use this section of code to do cross validation.
    #shuffle and split into Nfolds parts.
    #testdata = range(0, index);
    #random.shuffle(testdata);
    #folds = list();
    #Nfolds = 10;
    #fsavef = open("folds", "w");
    #for i in range(0, Nfolds):
    #    print("i = " + str(i));
    #    nthfold = testdata[i*index/Nfolds:(i+1)*index/Nfolds];
    #    folds.append(nthfold);
    #    fsavef.write(str(nthfold) + "\n");
    #    print("fold(" + str(i) + "): " + str(nthfold));
    #fsavef.flush();
    #fsavef.close();
    #instead read the data from splits file
    fsplits = open("splits")
    lines = fsplits.readlines()
    splits = list()
    for i in range(0, len(lines)):
        parts = lines[i].strip().split(":")
        train = list()
        test = list()
        for s in parts[0][1:-1].split(", "):
            train.append(int(s))
        for s in parts[1][1:-1].split(", "):
            test.append(int(s))
        splits.append((train, test))
    fsplits.close()
    #test print the first split
    #print(splits[0][0]);
    #print(splits[0][1]);
    #do gridsearch + evaluation
    fscores = open("baseline_scores", "w")
    for i in range(0, len(splits)):
        bestC = 0
        bestGamma = 0
        bestScore = 0
        xtest = list()
        xtrain = list()
        ytest = list()
        ytrain = list()
        # do train-test split
        for j in range(0, len(splits[i][0])):
            # VECTOR is 38 x 141 -> 264 total
            for LL in range(0, len(sent_to_xtc[splits[i][0][j]])):
                ytrain.append(targets[sent_to_xtc[splits[i][0][j]][LL]])
        for j in range(0, len(splits[i][1])):
            for LL in range(0, len(sent_to_xtc[splits[i][1][j]])):
                ytest.append(targets[sent_to_xtc[splits[i][1][j]][LL]])
        score = ytrain.count("neutral") * 1.0 / len(ytrain)
        print("Actual Score: " + str(score))
        fscores.write(str(score) + "\n")
        fscores.flush()
    fscores.close()