Example #1
def main():
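    # Sanity check for the ALS/collaborative-filtering code: build the sentiment
    # matrix from the annotated utterances, then print a cosine-similarity
    # weighted guess for test user index 28 and class/entity index 280.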
    np.set_printoptions(threshold=np.inf)
    lines = NLU.getALines()
    utterances = NLU.getUtterances(lines)
    #do ALS stuff
    X = getMatrix(utterances)[0]
    idtestp = 28
    print(X[idtestp])
    cossim = consineUser(X, idtestp)
    classtest = 280
    print(getWeightedGuess(cossim, X, classtest))
def xactinst():
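    # Count exact (substring) matches of EECS professor names in the utterances:
    # bees = total matches, eyes = extra tokens beyond the first word of each match.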
    w = dicts.getEECSprofs()
    sentences = NLU.getALines()
    utterances = NLU.getUtterances(sentences)
    xmatches = list()
    for i in utterances:
        tutt = i[0].strip().lower()
        for q in w:
            if q.lower() in tutt:
                xmatches.append(q)
    bees = len(xmatches)
    eyes = 0
    for x in xmatches:
        ptz = x.split()
        eyes += len(ptz) - 1
    print(bees)
    print(eyes)
def xactclass():
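    # Same as xactinst(), but matching against class names from the EECS dictionary.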
    w = dicts.getEECSdict()
    ww = list()
    for key in w.keys():
        ww.append(w[key])
    sentences = NLU.getALines()
    utterances = NLU.getUtterances(sentences)
    xmatches = list()
    for i in utterances:
        tutt = i[0].strip().lower()
        for q in ww:
            if q.lower() in tutt:
                xmatches.append(q)
    bees = len(xmatches)
    eyes = 0
    for x in xmatches:
        ptz = x.split()
        eyes += len(ptz) - 1
    print(bees)
    print(eyes)
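# t2(): compare entities recovered from the CRF output file ('crf-input-data')
# against the NLU slot annotations and print every utterance where the two disagree.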
def t2():
    f = open('crf-input-data')
    clines = f.readlines()
    f.close()
    u2 = list()
    utt = list()
    t2 = list()
    tutt = list()
    for cl in clines:
        parts = cl.strip()
        if parts == '':
            if utt != []:
                u2.append(utt)
                t2.append(tutt)
                utt = list()
                tutt = list()
        else:
            parts = parts.split()
            utt.append(parts[0])
            tutt.append(parts[2])
    if utt != []:
        u2.append(utt)
        t2.append(tutt)
        utt = list()
        tutt = list()

    lines = NLU.getALines()
    utterances = NLU.getUtterances(lines)
    for u in range(0, len(utterances)):
        slots = NLU.getSlots(utterances[u])
        sclist = list()
        for slot in slots[0]:
            sclist.append([slot[1], slot[2]])
        entlist = NLU.getEntities(u2[u], t2[u])[0]
        l1 = list()
        l2 = sclist
        for ent in entlist:
            l1.append([ent[1], ent[2]])
        if l1 != l2:
            print(str(l1) + '_' + str(l2))
def instructorLevel():
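    # Dictionary-based instructor spotter: join consecutive professor-word tokens
    # into candidate names and compare them to the annotated instructor slots.
    # Prints precision (ICOR/IGUE), recall (ICOR/IACT), and the slot count.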
    ICOR = 0
    IGUE = 0
    IACT = 0
    profs = dicts.getProfWords()
    pattern = re.compile(r"[\W_]+")
    print(profs)
    sentences = NLU.getALines()
    utterances = NLU.getUtterances(sentences)
    for u in utterances:
        names = list()
        cname = ""
        slots = NLU.getSlots(u)[1]
        tutt = u[0].strip().lower().split()
        print(slots)
        for tok in tutt:
            ttok = pattern.sub("", tok)
            if ttok in profs:
                if cname != "":
                    cname += " "
                cname += ttok
            else:
                if cname != "":
                    names.append(cname)
                cname = ""
        if cname != "":
            names.append(cname)
        print(names)
        slist = list()
        for slot in slots:
            slist.append(slot[0].lower())
        IACT += len(slots)
        IGUE += len(names)
        for name in names:
            if name in slist:
                ICOR += 1
    print(str(ICOR * 1.0 / IGUE))
    print(str(ICOR * 1.0 / IACT))
    print(IACT)
    return ICOR, IGUE, IACT
def classLevel():
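    # Dictionary/number-based class spotter: numeric tokens and known class names
    # count as guesses, compared to the annotated class slots.
    # Prints precision (CCOR/CGUE), recall (CCOR/CACT), and the slot count.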
    CCOR = 0
    CGUE = 0
    CACT = 0
    pattern = re.compile(r"[\W_]+")
    w = dicts.getEECSdict()
    ww = list()
    for key in w.keys():
        ww.append(w[key])
    sentences = NLU.getALines()
    utterances = NLU.getUtterances(sentences)
    for u in utterances:
        xmatches = list()
        tutt = u[0].strip().lower()
        slots = NLU.getSlots(u)[0]
        for q in tutt.split():
            qq = pattern.sub("", q)
            if is_number(qq):
                xmatches.append(qq)
        for q in ww:
            if q.lower() in tutt:
                xmatches.append(q.lower())
        slist = list()
        for slot in slots:
            slist.append(slot[1].lower())
        print(slist)
        print(xmatches)
        CACT += len(slots)
        CGUE += len(xmatches)
        for name in xmatches:
            if name in slist:
                CCOR += 1
    print(str(CCOR * 1.0 / CGUE))
    print(str(CCOR * 1.0 / CACT))
    print(CACT)
    return CCOR, CGUE, CACT
Example #7
def main():
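    # Parse the annotated corpus into (sentence, entity tagset, sentiment target)
    # triples, build distance-weighted sentiment-lexicon features per entity, then
    # average the test-split speakers' cosine similarities for each split.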
    fo = open("EECS_annotated_samples_anonymized", "r")
    lines = fo.readlines()
    utterances = NLU.getUtterances(lines)
    mode = False
    sents = list()
    targets = list()
    lastTaken = ""
    lastSent = ""
    isclass = False
    tagset = list()
    coriset = list()
    lastTagset = list()
    index = 0
    # to make cross validation work after sentences are duplicated for entities
    sent_to_xtc = dict()
    sent_to_xtc[0] = list()
    for i in range(len(lines)):
        data = lines[i].strip()
        if "" == data:
            index += 1
            sent_to_xtc[index] = list()
        if data.startswith("<class") or data.startswith("<instructor"):
            mode = True
            lastTaken = ""
            lastTagset = list()
        if data.startswith("<class"):
            isclass = True
        if mode and data.startswith("sentiment="):
            lastTaken = data[10:]
            if lastTaken.endswith(">"):
                lastTaken = lastTaken[:-1]
        if mode and data.startswith("name="):
            temp = data[5:]
            if temp.endswith(">"):
                temp = temp[:-1]
            lastTagset.append(temp)
        if mode and data.startswith("id="):
            temp = data[3:]
            if temp.endswith(">"):
                temp = temp[:-1]
            lastTagset.append(temp)
        if mode and data.startswith("department="):
            temp = data[11:]
            if temp.endswith(">"):
                temp = temp[:-1]
            lastTagset.append(temp)
        if not mode and "" != data:
            lastSent = data
        if data.endswith(">"):
            mode = False
            coriset.append(isclass)
            isclass = False
            sents.append(lastSent)
            tagset.append(lastTagset)
            sent_to_xtc[index].append(len(sents) - 1)
            if lastTaken == "":
                targets.append("neutral")
            else:
                targets.append(lastTaken)

    # This will print out mapping from sentences to entity vectors (XTC)
    #foutest = open("outtestJ", "w");
    #for key in sent_to_xtc:
    #	foutest.write(str(key) + " : " + str(sent_to_xtc[key]) + "\n");
    #foutest.flush();
    #foutest.close();

    #randomly sample utterances
    #testdata = random.sample(range(0, index), index/5);

    print("number of utterances: " + str(index))
    print("length of lines: " + str(len(sents)))
    print("length of targets: " + str(len(targets)))
    print("sent 2: " + str(sents[2]))
    print("tagset 2: " + str(tagset[2]))

    cv = set()
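    # Build the vocabulary: lowercased tokens with punctuation stripped, with
    # hyphenated words also added as their individual parts.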
    regex = re.compile(r"[^a-zA-Z0-9_\~\- ]+")
    for sent in range(0, len(sents)):
        parts = sents[sent].split(" ")
        for part in range(0, len(parts)):
            thepart = regex.sub("", parts[part])
            # corner case for hyphens
            hps = thepart.split("-")
            if len(hps) > 1:
                for hi in range(0, len(hps)):
                    cv.add(hps[hi].lower())
            # end corner case for hyphens
            thepart = thepart.lower()
            cv.add(thepart)
    cv = list(cv)
    cv.append("452")
    #bug?
    print("vocabulary size: " + str(len(cv)))
    print("index of I: " + str(cv.index("i")))
    xtc = []
    for sent in range(0, len(sents)):
        print("sentence: " + str(sent))
        print("s1: " + str(sents[sent]))

        #print(sents[sent] + " - with tagset - " + str(tagset[sent]));
        #dparse = spwrap.parse(sents[sent]);
        #print("DPARSE: " + dparse);

        # add token boundaries to the sentence
        tokenSent = sents[sent]
        for tag in range(0, len(tagset[sent])):
            tokenSent = tokenSent.replace(tagset[sent][tag],
                                          " ~~t~~ " + tagset[sent][tag])
        print(tokenSent)
        parts = regex.sub("", tokenSent)
        # this handles split and hyphen corner case
        parts = re.split(" |-", parts)

        # remove empty parts from the sentence
        while "" in parts:
            parts.remove("")

        # locate window feature indices
        windowFeatures = []
        done = False
        while not done:
            for part in range(0, len(parts)):
                if "~~t~~" == parts[part]:
                    windowFeatures += [part]
                    parts.remove(parts[part])
                    print("parts?: " + str(parts))
                    break
                if part == len(parts) - 1:
                    done = True
        print("window features: " + str(windowFeatures))

        print("parts: " + str(parts))
        row = []
        featureMap = {}
        Nflag = 0
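        # Nflag > 0 marks a negation window: the next two tokens get their
        # negate feature (index 4) set to 1.0.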
        for part in range(0, len(parts)):
            #thepart = regex.sub("", parts[part]);
            #thepart = thepart.lower();
            thepart = parts[part].lower()
            theid = cv.index(thepart)
            print(theid)
            mindist = 999
            for wf in range(0, len(windowFeatures)):
                ##############################################################
                ## This is the distance measure for window linear distance!
                distance = abs(windowFeatures[wf] - part)
                ##############################################################
                ## This is the distance measure for dependency tree distance!
                ## distance = spwrap.treeDistance(parts[windowFeatures[wf]], parts[part], dparse);
                ##############################################################
                if distance < mindist:
                    mindist = distance
            mindist += 1
            sentiz = senti_lexis.lexCounts(thepart)
            if theid in featureMap:
                # 2.0 - mindist / 7.0 worked well for the first distance measure...
                # featureMap[theid] += 1.0 / mindist;
                featureMap[theid][0] += 2.0 - mindist / 7.0
                featureMap[theid][1] += (2.0 - mindist / 7.0) * sentiz[0]
                featureMap[theid][2] += (2.0 - mindist / 7.0) * sentiz[1]
                featureMap[theid][3] += (2.0 - mindist / 7.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            else:
                # featureMap[theid] = 1.0 / mindist;
                # count, positive, negative, neutral, negate
                featureMap[theid] = [0, 0, 0, 0, 0]
                featureMap[theid][0] = 2.0 - mindist / 7.0
                featureMap[theid][1] = (2.0 - mindist / 7.0) * sentiz[0]
                featureMap[theid][2] = (2.0 - mindist / 7.0) * sentiz[1]
                featureMap[theid][3] = (2.0 - mindist / 7.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            if Nflag > 0:
                Nflag -= 1
            if senti_lexis.lexNegate(thepart):
                Nflag = 2
        for i in range(0, len(cv)):
            if i in featureMap:
                row.extend(featureMap[i])
            else:
                row.extend([0, 0, 0, 0, 0])
        xtc.append(row)

    #instead read the data from splits file
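    # Each line of "splits" is "<train>:<test>", where both sides are bracketed,
    # comma-separated lists of utterance indices.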
    fsplits = open("splits")
    lines = fsplits.readlines()
    splits = list()
    for i in range(0, len(lines)):
        parts = lines[i].strip().split(":")
        train = list()
        test = list()
        for s in parts[0][1:-1].split(", "):
            train.append(int(s))
        for s in parts[1][1:-1].split(", "):
            test.append(int(s))
        splits.append((train, test))
    fsplits.close()
    #test print the first split
    #print(splits[0][0]);
    #print(splits[0][1]);

    bestsplit = -1
    BSscore = 0
    for i in range(0, len(splits)):
        bestC = 0
        bestGamma = 0
        bestScore = 0
        xtest = list()
        xtrain = list()
        ytest = list()
        ytrain = list()
        # add the utterance set generation here for senti_set
        senti_utters = list()
        for j in range(0, len(splits[i][0])):
            senti_utters.append(utterances[splits[i][0][j]])
        likesMatrix, slist = leastSquares.getMatrix(senti_utters)
        # do train-test split
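        # Average the cosine-similarity vectors of the test-split speakers
        # against the training likes matrix and print the mean vector.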
        csims = np.array([0.0] * 38)
        totz = 0
        #for j in range(0, len(splits[i][0])):
        #	speaker = senti_set.getSpeaker(utterances[splits[i][0][j]][0]);
        #	cossim = leastSquares.cosineUserWE(likesMatrix, slist.index(speaker));
        #	np.add(csims, cossim);
        #	totz += 1;
        for j in range(0, len(splits[i][1])):
            speaker = senti_set.getSpeaker(utterances[splits[i][1][j]][0])
            cossim = leastSquares.cosineUserWE(likesMatrix,
                                               slist.index(speaker))
            cossim = np.array(cossim)
            csims = np.add(csims, cossim)
            totz += 1
        for j in range(0, len(csims)):
            csims[j] /= totz
        print(csims.tolist())
Example #8
def main():
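    # Re-parse the annotated corpus, then score the sentence-level output in
    # "RNTN_sent" against the per-entity gold sentiment labels.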
    fo = open("../data/extract_samples/EECS_annotated_samples_anonymized", "r")
    lines = fo.readlines()
    utterances = NLU.getUtterances(lines)
    mode = False
    sents = list()
    targets = list()
    lastTaken = ""
    lastSent = ""
    isclass = False
    tagset = list()
    coriset = list()
    lastTagset = list()
    index = 0
    sent_to_xtc = dict()
    sent_to_xtc[0] = list()
    for i in range(len(lines)):
        data = lines[i].strip()
        if "" == data:
            index += 1
            sent_to_xtc[index] = list()
        if data.startswith("<class") or data.startswith("<instructor"):
            mode = True
            lastTaken = ""
            lastTagset = list()
        if data.startswith("<class"):
            isclass = True
        if mode and data.startswith("sentiment="):
            lastTaken = data[10:]
            if lastTaken.endswith(">"):
                lastTaken = lastTaken[:-1]
        if mode and data.startswith("name="):
            temp = data[5:]
            if temp.endswith(">"):
                temp = temp[:-1]
            lastTagset.append(temp)
        if mode and data.startswith("id="):
            temp = data[3:]
            if temp.endswith(">"):
                temp = temp[:-1]
            lastTagset.append(temp)
        if mode and data.startswith("department="):
            temp = data[11:]
            if temp.endswith(">"):
                temp = temp[:-1]
            lastTagset.append(temp)
        if not mode and "" != data:
            lastSent = data
        if data.endswith(">"):
            mode = False
            coriset.append(isclass)
            isclass = False
            sents.append(lastSent)
            tagset.append(lastTagset)
            sent_to_xtc[index].append(len(sents) - 1)
            if lastTaken == "":
                targets.append("neutral")
            else:
                targets.append(lastTaken)

    f2 = open("RNTN_sent")
    gendict = f2.readlines()
    f2.close()

    sdict = dict()
    slist = list()
    parent = None
    lastpart = None
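    # "RNTN_sent" format: a line starting with ":" opens a new sentence entry,
    # a bare number is the score for the most recent phrase, and any other line
    # is a phrase (initialised to -1 until its score arrives).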
    for i in gendict:
        if i.startswith(":"):
            parent = i[1:].strip()
            sdict[parent] = dict()
            slist.append(parent)
        elif is_number(i.strip()):
            sdict[parent][lastpart] = int(i.strip())
        else:
            lastpart = i.strip()
            sdict[parent][lastpart] = -1

    print(len(tagset))
    print(len(sdict.keys()))
    print(len(sent_to_xtc))
    print(len(targets))

    tries = 0
    correct = 0
    for q in range(0, len(slist)):
        print(sdict[slist[q]])
        print(sent_to_xtc[q])
        for i in sent_to_xtc[q]:
            print(str(tagset[i]) + ":" + str(targets[i]))
            for j in sdict[slist[q]]:
                if tagset[i][0] in j:
                    asent = "neutral"
                    if int(sdict[slist[q]][j]) > 2:
                        asent = "positive"
                    elif int(sdict[slist[q]][j]) < 1:
                        asent = "negative"
                    print(asent)
                    tries += 1
                    if targets[i] == asent:
                        correct += 1
    print("correct: " + str(correct * 1.0 / tries))
def main():
    # get scores
    fscores = open("S1feature")  # S1feature -S1single_lies
    lines = fscores.readlines()
    fscores.close()
    scores = list()
    for i in lines:
        scores.append(float(i.strip()))
    sort_scores = [i[0] for i in sorted(enumerate(scores), key=lambda x: x[1])]
    sort_scores.reverse()

    # get splits
    fsplits = open("splits")
    splines = fsplits.readlines()
    splits = list()
    for i in range(0, len(splines)):
        parts = splines[i].strip().split(":")
        train = list()
        test = list()
        for s in parts[0][1:-1].split(", "):
            train.append(int(s))
        for s in parts[1][1:-1].split(", "):
            test.append(int(s))
        splits.append((train, test))
    fsplits.close()

    # get speakers
    nlines = NLU.getALines()
    utterances = NLU.getUtterances(nlines)
    nlist = list()
    for i in range(0, len(splits)):
        senti_utters = list()
        for j in range(0, len(splits[i][0])):
            senti_utters.append(utterances[splits[i][0][j]])
        likesMatrix, slist = leastSquares.getMatrix(senti_utters)

        test_utters = list()
        for j in range(0, len(splits[i][1])):
            test_utters.append(utterances[splits[i][1][j]])
        TlikesMatrix, Tslist = leastSquares.getMatrix(test_utters)

        nonneus = 0
        nnews = 0
        density = 0.0
        counts = list()
        # iterate over rows
        for k in range(0, len(likesMatrix)):
            nonneu = 0
            for j in range(0, len(likesMatrix[k])):
                if int(likesMatrix[k][j]) != 5:
                    nonneu += 1
            if nonneu > 0:
                nnews += 1
            nonneus += nonneu
            counts.append(nonneu)
        # iterate over columns
        elaps = 0
        for k in range(0, len(likesMatrix[0])):
            nonneu = 0
            TNEW = 0
            for j in range(0, len(likesMatrix)):
                if int(likesMatrix[j][k]) != 5:
                    nonneu = 1
                if int(TlikesMatrix[j][k]) != 5:
                    TNEW = 1
            if nonneu == 1 and TNEW == 1:
                elaps += 1

        nlist.append(str(nnews) + ":" + str(nonneus) + ":" + str(counts) + ":" + str(elaps))

    # print correlations
    for i in sort_scores:
        print(str(scores[i]) + " - " + nlist[i])
Example #10
def main():
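    # Map utterances to speakers, collect each speaker's class and instructor
    # sentiment slots, find speakers whose likes overlap with `name`, and report
    # how often a neighbourhood opinion score (OSCORE) would be nonzero.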
    name = "MEGHAN"
    fi = open("../data/extract_samples/pID_AEU")
    pid = fi.readlines()
    fi.close()
    pidmap = dict()
    pset = set()
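    # "pID_AEU" is tab-separated: column 0 is the participant id and column 1 is
    # the utterance key (its first line), so pidmap maps utterance -> speaker.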
    for i in range(0, len(pid)):
        parts = pid[i].split("\t")
        pset.add(parts[0])
        pidmap[parts[1].strip()] = parts[0]
    fl = open("EECS_annotated_samples_anonymized")
    lines = fl.readlines()
    fl.close()
    utterances = NLU.getUtterances(lines)
    print(utterances[0])
    print("Speaker: " + pidmap[utterances[0][0].strip()])
    slots = NLU.getSlots(utterances[0])
    print(slots)
    plikes = dict()
    for i in pset:
        plikes[i] = [list(), list()]
    for i in range(0, len(utterances)):
        slots = NLU.getSlots(utterances[i])
        speaker = pidmap[utterances[i][0].strip()]
        if slots[0]:
            plikes[speaker][0].extend(slots[0])
        if slots[1]:
            plikes[speaker][1].extend(slots[1])
    print("\n\nGiven that EECS 492 sentiment is neutral...")
    #print(plikes[name]);
    wholikes = ("EECS", "492", "neutral")
    likers = list()
    for i in pset:
        if wholikes in plikes[i][0]:
            likers.append(i)
    # check instructors in likers
    ucontains_i = "Quentin Stout"
    print("\n\nWho likes " + ucontains_i)
    for i in likers:
        for j in range(0, len(plikes[i][1])):
            if plikes[i][1][j][0] == ucontains_i:
                print(i + ": " + str(plikes[i][1][j]))
    # check classes in likers
    ucontains_cd = "EECS"
    ucontains_cid = "545"
    print("\n\nWho likes " + ucontains_cd + " " + ucontains_cid)
    for i in likers:
        for j in range(0, len(plikes[i][0])):
            # don't worry about department but if you want to... then use this line
            # plikes[i][0][j][0] == ucontains_cd and
            if plikes[i][0][j][1] == ucontains_cid:
                print(i + ": " + str(plikes[i][0][j]))
    # find all people with similar sentiments to <name> in the data set
    print("\n\nSimlikes!")
    simlikesmap = dict()
    for q in pset:
        simlikes = list()
        for i in pset:
            if i == q:
                continue
            found = False
            for j in range(0, len(plikes[i][0])):
                if (("EECS", plikes[i][0][j][1], plikes[i][0][j][2])
                        in plikes[name][0] or
                    ("", plikes[i][0][j][1], plikes[i][0][j][2]
                     ) in plikes[name][0]) and plikes[i][0][j][2] != "neutral":
                    print("similar likes for " + i + " and " + name + ": " +
                          str(plikes[i][0][j]))
                    simlikes.append(i)
                    found = True
                    break
            if not found:
                for j in range(0, len(plikes[i][1])):
                    if plikes[i][1][j] in plikes[name][
                            1] and plikes[i][1][j][1] != "neutral":
                        print("similar likes for " + i + " and " + name +
                              ": " + str(plikes[i][1][j]))
                        simlikes.append(i)
                        found = True
                        break
        simlikesmap[q] = simlikes
    # calculate % of times where OSCORE will be nonzero
    times = 0
    ttimes = 0
    for u in utterances:
        slots = NLU.getSlots(u)
        speaker = pidmap[u[0].strip()]
        for slot in slots[0]:
            ttimes += 1
            oscore = 0
            for i in simlikesmap[speaker]:
                pscore = 0
                for j in range(0, len(plikes[i][0])):
                    if slot[1] == plikes[i][0][j][1]:
                        if plikes[i][0][j][2] == "positive":
                            pscore += 1
                        elif plikes[i][0][j][2] == "negative":
                            pscore -= 1
                if pscore > 0:
                    oscore += 1
                elif pscore < 0:
                    oscore -= 1
            if oscore != 0:
                times += 1
        for slot in slots[1]:
            ttimes += 1
            oscore = 0
            for i in simlikesmap[speaker]:
                pscore = 0
                for j in range(0, len(plikes[i][1])):
                    if slot[0] == plikes[i][1][j][0]:
                        if plikes[i][1][j][1] == "positive":
                            pscore += 1
                        elif plikes[i][1][j][1] == "negative":
                            pscore -= 1
                if pscore > 0:
                    oscore += 1
                elif pscore < 0:
                    oscore -= 1
            if oscore != 0:
                times += 1
    print("Times: " + str(times))
    print("Total Times: " + str(ttimes))
    print("Percentage: " + str(times * 100.0 / ttimes))
def main():
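    # Inter-annotator agreement: compare the sentiment labels of the two
    # annotation files slot by slot, and tally the disagreement types
    # (neutral/negative, neutral/positive, positive/negative).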
    fi = open("sentimentAnnotations")
    line1 = fi.readlines()
    fi.close()
    fo = open("EECS_annotated_samples_anonymized")
    line2 = fo.readlines()
    fo.close()
    utt1 = NLU.getUtterances(line1)
    utt2 = NLU.getUtterances(line2)
    correct = 0
    wrong = 0
    NEU_NEG = 0
    NEU_POS = 0
    POS_NEG = 0
    SNEU_NEG = set()
    SNEU_NEG.add("neutral")
    SNEU_NEG.add("negative")
    SNEU_POS = set()
    SNEU_POS.add("neutral")
    SNEU_POS.add("positive")
    SPOS_NEG = set()
    SPOS_NEG.add("negative")
    SPOS_NEG.add("positive")
    disagrees = list()
    inst = 1
    insttype = "neutral"
    for i in range(0, len(utt1)):
        slots1 = NLU.getSlots(utt1[i])
        slots2 = NLU.getSlots(utt2[i])
        for j in range(0, len(slots1[0])):
            if insttype == slots2[0][j][2]:
                inst += 1
            if slots1[0][j][3] == slots2[0][j][3]:
                correct += 1
            else:
                tset = set()
                tset.add(slots1[0][j][3])
                tset.add(slots2[0][j][3])
                disagrees.append(utt1[i])
                if slots2[0][j][3] == insttype:
                    if tset == SNEU_NEG:
                        NEU_NEG += 1
                    elif tset == SNEU_POS:
                        NEU_POS += 1
                    elif tset == SPOS_NEG:
                        POS_NEG += 1
                wrong += 1
        for j in range(0, len(slots1[1])):
            if slots1[1][j][1] == slots2[1][j][1]:
                correct += 1
            else:
                tset = set()
                disagrees.append(utt1[i])
                tset.add(slots1[1][j][1])
                tset.add(slots2[1][j][1])
                if slots2[1][j][1] == insttype:
                    if tset == SNEU_NEG:
                        NEU_NEG += 1
                    elif tset == SNEU_POS:
                        NEU_POS += 1
                    elif tset == SPOS_NEG:
                        POS_NEG += 1
                wrong += 1
    print("Agree on " + str(correct))
    print("Disagree on " + str(wrong))
    print("Percent agreement is " + str(correct * 1.0 / (correct + wrong)) +
          "%")
    #print("NEU_NEG: " + str(NEU_NEG*1.0/(correct+wrong)));
    #print("NEU_POS: " + str(NEU_POS*1.0/(correct+wrong)));
    #print("POS_NEG: " + str(POS_NEG*1.0/(correct+wrong)));
    print("NEU_NEG: " + str(NEU_NEG * 1.0 / inst))
    print("NEU_POS: " + str(NEU_POS * 1.0 / inst))
    print("POS_NEG: " + str(POS_NEG * 1.0 / inst))
def main():
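    # Train per-split sentiment classifiers: build distance-weighted lexicon
    # features per entity, grid-search an SVM over gamma and C with a 2-fold
    # check on the training half, score on the test split, and pickle the best
    # classifier per split plus the vocabulary.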
    if not os.path.exists('classifiers'):
        os.makedirs('classifiers')

    allines = NLU.getALines()
    allU = NLU.getUtterances(allines)
    textLines = NLU.getTextLines(allU)
    slots = [NLU.getSlots(i) for i in allU]

    sents = list()
    targets = list()
    tagset = list()
    sent_to_xtc = dict()

    index = 0
    for i in range(len(slots)):
        tstx = []
        for etype in ENT_TYPES:
            for j in range(len(slots[i][etype])):
                tstx.append(index)
                index += 1
                targets.append(slots[i][etype][j]['sentiment'])
                ttags = [
                    slots[i][etype][j][k] for k in ALL_IDS
                    if k in slots[i][etype][j]
                ]
                tagset.append(ttags)
                sents.append(textLines[i])
        sent_to_xtc[i] = tstx

    cprint('Number of Utterances: ' + str(index))
    cprint('Length of Lines: ' + str(len(sents)))
    cprint('Length of Targets: ' + str(len(targets)))

    cv = set()
    regex = re.compile(r'[^a-zA-Z0-9_\~\- ]+')
    for sent in range(0, len(sents)):
        parts = sents[sent].split(' ')
        for part in range(0, len(parts)):
            thepart = regex.sub('', parts[part])
            # corner case for hyphens
            hps = thepart.split('-')
            if len(hps) > 1:
                for hi in range(0, len(hps)):
                    cv.add(hps[hi].lower())
            # end corner case for hyphens
            thepart = thepart.lower()
            cv.add(thepart)
    cv = list(cv)
    cprint('Vocabulary Size: ' + str(len(cv)))

    xtc = []
    for sent in range(0, len(sents)):
        #print('sentence: ' + str(sent))
        #print('s1: ' + str(sents[sent]))

        #print(sents[sent] + ' - with tagset - ' + str(tagset[sent]))
        #dparse = spwrap.parse(sents[sent])
        #print('DPARSE: ' + dparse)

        # add token boundaries to the sentence
        tokenSent = sents[sent]
        for tag in range(0, len(tagset[sent])):
            tokenSent = tokenSent.replace(tagset[sent][tag],
                                          ' ~~t~~ ' + tagset[sent][tag])
        #print(tokenSent)
        parts = regex.sub('', tokenSent)
        # this handles split and hyphen corner case
        parts = re.split(' |-', parts)

        # remove empty parts from the sentence
        while '' in parts:
            parts.remove('')

        # locate window feature indices
        windowFeatures = []
        done = False
        while not done:
            for part in range(0, len(parts)):
                if '~~t~~' == parts[part]:
                    windowFeatures += [part]
                    parts.remove(parts[part])
                    #print('parts?: ' + str(parts))
                    break
                if part == len(parts) - 1:
                    done = True
        #print('window features: ' + str(windowFeatures))

        #print('parts: ' + str(parts))
        row = []
        # featureMapG = [[0]*300]*4
        featureMap = {}
        Nflag = 0
        for part in range(0, len(parts)):
            #thepart = regex.sub('', parts[part])
            #thepart = thepart.lower()
            thepart = parts[part].lower()
            theid = cv.index(thepart)
            #print(theid)
            #g_vec = glove_features.getGloveWord(glove_dict, parts[part])
            mindist = 999
            for wf in range(0, len(windowFeatures)):
                ##############################################################
                ## This is the distance measure for window linear distance!
                distance = abs(windowFeatures[wf] - part)
                ##############################################################
                ## This is the distance measure for dependency tree distance!
                ## distance = spwrap.treeDistance(parts[windowFeatures[wf]], parts[part], dparse)
                ##############################################################
                if distance < mindist:
                    mindist = distance
            mindist += 1
            sentiz = senti_lexis.lexCounts(thepart)
            #for g_vi in range(0, len(g_vec)):
            #	featureMapG[0][g_vi] += g_vec[g_vi];# - mindist/10.0
            #	featureMapG[1][g_vi] += g_vec[g_vi];# - mindist/10.0
            #	featureMapG[2][g_vi] += g_vec[g_vi];# - mindist/10.0
            #	featureMapG[3][g_vi] += g_vec[g_vi];# - mindist/10.0
            if theid in featureMap:
                # 1.0 - mindist / 10.0 worked well for the first distance measure...
                # featureMap[theid] += 1.0 / mindist
                featureMap[theid][0] += 1.0 - mindist / 10.0
                featureMap[theid][1] += (1.0 - mindist / 10.0) * sentiz[0]
                featureMap[theid][2] += (1.0 - mindist / 10.0) * sentiz[1]
                featureMap[theid][3] += (1.0 - mindist / 10.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            else:
                # featureMap[theid] = 1.0 / mindist
                # count, positive, negative, neutral, negate
                featureMap[theid] = [0, 0, 0, 0, 0]
                featureMap[theid][0] = 1.0 - mindist / 10.0
                featureMap[theid][1] = (1.0 - mindist / 10.0) * sentiz[0]
                featureMap[theid][2] = (1.0 - mindist / 10.0) * sentiz[1]
                featureMap[theid][3] = (1.0 - mindist / 10.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            if Nflag > 0:
                Nflag -= 1
            if senti_lexis.lexNegate(thepart):
                Nflag = 2
        for i in range(0, len(cv)):
            if i in featureMap:
                row.extend(featureMap[i])
            else:
                row.extend([0, 0, 0, 0, 0])
        # add on the glove features
        # for a in range(0, len(featureMapG)):
        # 	temp_vec = []
        # 	for a_a in range(0, len(featureMapG[a])):
        # 		temp_vec.append(featureMapG[a][a_a]*1.0/len(parts))
        # 	row.extend(temp_vec)
        xtc.append(row)

    #instead read the data from splits file
    fsplits = open('splits')
    lines = fsplits.readlines()
    splits = list()
    for i in range(0, len(lines)):
        parts = lines[i].strip().split(':')
        train = list()
        test = list()
        for s in parts[0][1:-1].split(', '):
            train.append(int(s))
        for s in parts[1][1:-1].split(', '):
            test.append(int(s))
        splits.append((train, test))
    fsplits.close()
    #test print the first split
    #print(splits[0][0])
    #print(splits[0][1])

    #do gridsearch + evaluation
    fscores = open('scores_sentiment', 'w')
    bestsplit = -1
    BSscore = 0
    for i in range(0, len(splits)):
        bestC = 0
        bestGamma = 0
        bestScore = 0
        xtest = list()
        xtrain = list()
        ytest = list()
        ytrain = list()
        # add the utterance set generation here for senti_set
        # senti_utters = list()
        # for j in range(0, len(splits[i][0])):
        # 	senti_utters.append(utterances[splits[i][0][j]])
        #likesMatrix, slist = leastSquares.getMatrix(senti_utters)
        # do train-test split
        for j in range(0, len(splits[i][0])):
            #speaker = senti_set.getSpeaker(utterances[splits[i][0][j]][0])
            #cossim = leastSquares.consineUser(likesMatrix, slist.index(speaker))
            #print('\n' + speaker + ': ' + utterances[splits[i][0][j]][0].strip())
            # VECTOR is 38 x 141 -> 264 total
            for LL in range(0, len(sent_to_xtc[splits[i][0][j]])):
                #fvector = likesMatrix[slist.index(speaker)]
                #fvector = fvector.tolist()[0]
                fvector = xtc[sent_to_xtc[splits[i][0][j]][LL]]
                #fvector.append(slist.index(speaker))
                ##############################################################
                #entity = tagset[sent_to_xtc[splits[i][0][j]][LL]]
                #entity = tagset2entity(entity)
                #gscore = leastSquares.getGuess(likesMatrix, entity, slist.index(speaker))
                #gscore = leastSquares.getWeightedGuess(cossim, likesMatrix, entity)
                #print('speaker: ' + str(speaker) + ' - ' + str(slist.index(speaker)))
                #fvector.append(gscore)
                ########fvector = [gscore]
                ##############################################################
                xtrain.append(fvector)
                ytrain.append(targets[sent_to_xtc[splits[i][0][j]][LL]])
        for j in range(0, len(splits[i][1])):
            #speaker = senti_set.getSpeaker(utterances[splits[i][1][j]][0])
            #cossim = leastSquares.consineUser(likesMatrix, slist.index(speaker))
            for LL in range(0, len(sent_to_xtc[splits[i][1][j]])):
                #fvector = likesMatrix[slist.index(speaker)]
                #fvector = fvector.tolist()[0]
                fvector = xtc[sent_to_xtc[splits[i][1][j]][LL]]
                #fvector.append(slist.index(speaker))
                ##############################################################
                #entity = tagset[sent_to_xtc[splits[i][1][j]][LL]]
                #entity = tagset2entity(entity)
                #gscore = leastSquares.getGuess(likesMatrix, entity, slist.index(speaker))
                #gscore = leastSquares.getWeightedGuess(cossim, likesMatrix, entity)
                #fvector.append(gscore)
                ########fvector = [gscore]
                ##############################################################
                xtest.append(fvector)
                ytest.append(targets[sent_to_xtc[splits[i][1][j]][LL]])
        score = 0

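        # Grid search: 10 x 10 grid over (gamma, C), scored by a simple 2-fold
        # split of the training data.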
        for gamma in numpy.linspace(0.0001, 0.05, 10):  #10steps
            for C in numpy.linspace(0.1, 10, 10):  #10steps
                #2 fold
                x1 = xtrain[len(xtrain) // 2:]
                x2 = xtrain[:len(xtrain) // 2]
                y1 = ytrain[len(ytrain) // 2:]
                y2 = ytrain[:len(ytrain) // 2]
                x11 = csr_matrix(x1)
                x22 = csr_matrix(x2)
                clf = svm.SVC(gamma=gamma, C=C)
                testout = clf.fit(x11, y1)
                score = clf.score(x22, y2)
                clf = svm.SVC(gamma=gamma, C=C)
                testout = clf.fit(x22, y2)
                score += clf.score(x11, y1)
                score /= 2
                if score > bestScore:
                    bestC = C
                    bestGamma = gamma
                    bestScore = score
                    cprint('Cross Validation Score: ' + str(score))
                    cprint('Gamma = ' + str(gamma) + ' and C = ' + str(C))

        ################ THIS IS FOR CvI EVALUATION ################
        #Ixtest = list()
        #Iytest = list()
        #Cxtest = list()
        #Cytest = list()
        #for j in range(0, len(splits[i][1])):
        #	for LL in range(0, len(sent_to_xtc[splits[i][1][j]])):
        #		fvector = xtc[sent_to_xtc[splits[i][1][j]][LL]]
        #		if coriset[sent_to_xtc[splits[i][1][j]][LL]]:
        #			Cxtest.append(fvector)
        #			Cytest.append(targets[sent_to_xtc[splits[i][1][j]][LL]])
        #		else:
        #			Ixtest.append(fvector)
        #			Iytest.append(targets[sent_to_xtc[splits[i][1][j]][LL]])
        #xtrain = csr_matrix(xtrain)
        #Cxtest = csr_matrix(Cxtest)
        #Ixtest = csr_matrix(Ixtest)
        #clf = svm.SVC(gamma=bestGamma, C=bestC)
        #testout = clf.fit(xtrain, ytrain)
        #CBscore = clf.score(Cxtest, Cytest)
        #IBscore = clf.score(Ixtest, Iytest)
        #cprint('Actual Score: ' + str(CBscore) + ':' + str(IBscore))
        #fscores.write(str(CBscore) + ':' + str(IBscore) + '\n')
        #fscores.flush()
        ###############################################################
        ################ THIS IS FOR NORMAL EVALUATION ################
        xtrain = csr_matrix(xtrain)
        xtest = csr_matrix(xtest)
        clf = svm.SVC(gamma=bestGamma, C=bestC)
        testout = clf.fit(xtrain, ytrain)
        bestScore = clf.score(xtest, ytest)
        cprint('Actual Score: ' + str(bestScore))
        fscores.write(str(bestScore) + '\n')
        ###############################################################
        # save best classifier per fold
        cString = pickle.dumps(clf)
        fsave1 = open('classifiers/sentiment_classifier' + str(i), 'wb')
        fsave1.write(cString)
        fsave1.close()

    fscores.close()
    # save feature dictionary
    cvString = pickle.dumps(cv)
    fsave2 = open('sentiment_dictionary', 'wb')
    fsave2.write(cvString)
    fsave2.close()
Example #13
def main():
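    # Baseline: rebuild the per-entity sentiment targets and, for each split,
    # write the fraction of "neutral" labels in its training portion to
    # "baseline_scores".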
    fo = open("EECS_annotated_samples_anonymized", "r")
    lines = fo.readlines()
    utterances = NLU.getUtterances(lines)
    mode = False
    sents = list()
    targets = list()
    lastTaken = ""
    lastSent = ""
    isclass = False
    tagset = list()
    lastTagset = list()
    index = 0
    # to make cross validation work after sentences are duplicated for entities
    sent_to_xtc = dict()
    sent_to_xtc[0] = list()
    for i in range(len(lines)):
        data = lines[i].strip()
        if "" == data:
            index += 1
            sent_to_xtc[index] = list()
        if data.startswith("<class") or data.startswith("<instructor"):
            mode = True
            lastTaken = ""
            lastTagset = list()
        if data.startswith("<class"):
            isclass = True
        if mode and data.startswith("sentiment="):
            lastTaken = data[10:]
            if lastTaken.endswith(">"):
                lastTaken = lastTaken[:-1]
        if mode and data.startswith("name="):
            temp = data[5:]
            if temp.endswith(">"):
                temp = temp[:-1]
            lastTagset.append(temp)
        if mode and data.startswith("id="):
            temp = data[3:]
            if temp.endswith(">"):
                temp = temp[:-1]
            lastTagset.append(temp)
        if mode and data.startswith("department="):
            temp = data[11:]
            if temp.endswith(">"):
                temp = temp[:-1]
            lastTagset.append(temp)
        if not mode and "" != data:
            lastSent = data
        if data.endswith(">"):
            mode = False
            isclass = False
            sents.append(lastSent)
            tagset.append(lastTagset)
            sent_to_xtc[index].append(len(sents) - 1)
            if lastTaken == "":
                targets.append("neutral")
            else:
                targets.append(lastTaken)

    # This will print out mapping from sentences to entity vectors (XTC)
    #foutest = open("outtestJ", "w");
    #for key in sent_to_xtc:
    #	foutest.write(str(key) + " : " + str(sent_to_xtc[key]) + "\n");
    #foutest.flush();
    #foutest.close();

    #randomly sample utterances
    #testdata = random.sample(range(0, index), index/5);

    print("number of utterances: " + str(index))
    print("length of lines: " + str(len(sents)))
    print("length of targets: " + str(len(targets)))
    print("sent 2: " + str(sents[2]))
    print("tagset 2: " + str(tagset[2]))

    cv = set()
    regex = re.compile(r"[^a-zA-Z0-9_\~\- ]+")
    for sent in range(0, len(sents)):
        parts = sents[sent].split(" ")
        for part in range(0, len(parts)):
            thepart = regex.sub("", parts[part])
            # corner case for hyphens
            hps = thepart.split("-")
            if len(hps) > 1:
                for hi in range(0, len(hps)):
                    cv.add(hps[hi].lower())
            # end corner case for hyphens
            thepart = thepart.lower()
            cv.add(thepart)
    cv = list(cv)
    cv.append("452")
    #bug?
    print("vocabulary size: " + str(len(cv)))
    print("index of I: " + str(cv.index("i")))
    xtc = []
    for sent in range(0, len(sents)):
        print("sentence: " + str(sent))
        print("s1: " + str(sents[sent]))

        #print(sents[sent] + " - with tagset - " + str(tagset[sent]));
        #dparse = spwrap.parse(sents[sent]);
        #print("DPARSE: " + dparse);

        # add token boundaries to the sentence
        tokenSent = sents[sent]
        for tag in range(0, len(tagset[sent])):
            tokenSent = tokenSent.replace(tagset[sent][tag],
                                          " ~~t~~ " + tagset[sent][tag])
        print(tokenSent)
        parts = regex.sub("", tokenSent)
        # this handles split and hyphen corner case
        parts = re.split(" |-", parts)

        # remove empty parts from the sentence
        while "" in parts:
            parts.remove("")

        # locate window feature indices
        windowFeatures = []
        done = False
        while not done:
            for part in range(0, len(parts)):
                if "~~t~~" == parts[part]:
                    windowFeatures += [part]
                    parts.remove(parts[part])
                    print("parts?: " + str(parts))
                    break
                if part == len(parts) - 1:
                    done = True
        print("window features: " + str(windowFeatures))

        print("parts: " + str(parts))
        row = []
        featureMap = {}
        Nflag = 0
        for part in range(0, len(parts)):
            #thepart = regex.sub("", parts[part]);
            #thepart = thepart.lower();
            thepart = parts[part].lower()
            theid = cv.index(thepart)
            print(theid)
            mindist = 999
            for wf in range(0, len(windowFeatures)):
                ##############################################################
                ## This is the distance measure for window linear distance!
                distance = abs(windowFeatures[wf] - part)
                ##############################################################
                ## This is the distance measure for dependency tree distance!
                ## distance = spwrap.treeDistance(parts[windowFeatures[wf]], parts[part], dparse);
                ##############################################################
                if distance < mindist:
                    mindist = distance
            mindist += 1
            sentiz = senti_lexis.lexCounts(thepart)
            if theid in featureMap:
                # 2.0 - mindist / 7.0 worked well for the first distance measure...
                # featureMap[theid] += 1.0 / mindist;
                featureMap[theid][0] += 2.0 - mindist / 7.0
                featureMap[theid][1] += (2.0 - mindist / 7.0) * sentiz[0]
                featureMap[theid][2] += (2.0 - mindist / 7.0) * sentiz[1]
                featureMap[theid][3] += (2.0 - mindist / 7.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            else:
                # featureMap[theid] = 1.0 / mindist;
                # count, positive, negative, neutral, negate
                featureMap[theid] = [0, 0, 0, 0, 0]
                featureMap[theid][0] = 2.0 - mindist / 7.0
                featureMap[theid][1] = (2.0 - mindist / 7.0) * sentiz[0]
                featureMap[theid][2] = (2.0 - mindist / 7.0) * sentiz[1]
                featureMap[theid][3] = (2.0 - mindist / 7.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            if Nflag > 0:
                Nflag -= 1
            if senti_lexis.lexNegate(thepart):
                Nflag = 2
        for i in range(0, len(cv)):
            if i in featureMap:
                row.extend(featureMap[i])
            else:
                row.extend([0, 0, 0, 0, 0])
        xtc.append(row)

    #cv = CountVectorizer();
    #xtc = cv.fit_transform(sents);

    #examining data structures here
    #parts = sents[0].split(" ");
    #for part in range(0, len(parts)):
    #	print("PART: " + parts[part]);
    #print("WORD TAKE: " + str(cv.vocabulary_.get(u'i')));
    #print("WORD TAKE: " + str(cv.vocabulary_.get(u'took')));
    #print("WORD DONT: " + str(cv.vocabulary_.get(u'don')));
    #print("WORD DONT: " + str(cv.vocabulary_.get(u't')));
    #print("WORD TAKE: " + str(cv.vocabulary_.get(u'183')));
    #print(str(xtc.shape));
    #print("ROW0");
    #print(xtc[0]);
    #print("ROW1");
    #print(xtc[1]);
    print("ROW2")
    print(xtc[2])
    print(len(xtc[2]))
    #print(type(xtc[0]));
    #print(type(xtc));
    #print(str(len(sents)));
    #endtest

    #xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(xtc, targets, test_size=0.2, random_state=0);

    #use this section of code to do cross validation.
    #shuffle and split into Nfolds parts.
    #testdata = range(0, index);
    #random.shuffle(testdata);
    #folds = list();
    #Nfolds = 10;
    #fsavef = open("folds", "w");
    #for i in range(0, Nfolds):
    #	print("i = " + str(i));
    #	nthfold = testdata[i*index/Nfolds:(i+1)*index/Nfolds];
    #	folds.append(nthfold);
    #	fsavef.write(str(nthfold) + "\n");
    #	print("fold(" + str(i) + "): " + str(nthfold));
    #fsavef.flush();
    #fsavef.close();

    #instead read the data from splits file
    fsplits = open("splits")
    lines = fsplits.readlines()
    splits = list()
    for i in range(0, len(lines)):
        parts = lines[i].strip().split(":")
        train = list()
        test = list()
        for s in parts[0][1:-1].split(", "):
            train.append(int(s))
        for s in parts[1][1:-1].split(", "):
            test.append(int(s))
        splits.append((train, test))
    fsplits.close()
    #test print the first split
    #print(splits[0][0]);
    #print(splits[0][1]);

    #do gridsearch + evaluation
    fscores = open("baseline_scores", "w")
    for i in range(0, len(splits)):
        bestC = 0
        bestGamma = 0
        bestScore = 0
        xtest = list()
        xtrain = list()
        ytest = list()
        ytrain = list()
        # do train-test split
        for j in range(0, len(splits[i][0])):
            # VECTOR is 38 x 141 -> 264 total
            for LL in range(0, len(sent_to_xtc[splits[i][0][j]])):
                ytrain.append(targets[sent_to_xtc[splits[i][0][j]][LL]])
        for j in range(0, len(splits[i][1])):
            for LL in range(0, len(sent_to_xtc[splits[i][1][j]])):
                ytest.append(targets[sent_to_xtc[splits[i][1][j]][LL]])
        score = ytrain.count("neutral") * 1.0 / len(ytrain)

        print("Actual Score: " + str(score))
        fscores.write(str(score) + "\n")
        fscores.flush()
    fscores.close()