Example #1
def main():
    np.set_printoptions(threshold=np.inf)  # 'nan' is rejected by modern NumPy
    lines = NLU.getALines()
    utterances = NLU.getUtterances(lines)
    #do ALS stuff
    X = getMatrix(utterances)[0]
    idtestp = 28
    print(X[idtestp])
    cossim = consineUser(X, idtestp)
    classtest = 280
    print(getWeightedGuess(cossim, X, classtest))
def xactinst():
    w = dicts.getEECSprofs()
    sentences = NLU.getALines()
    utterances = NLU.getUtterances(sentences)
    xmatches = list()
    for i in utterances:
        tutt = i[0].strip().lower()
        for q in w:
            if q.lower() in tutt:
                xmatches.append(q)
    bees = len(xmatches)
    eyes = 0
    for x in xmatches:
        ptz = x.split()
        eyes += len(ptz) - 1
    print(bees)
    print(eyes)
def xactclass():
    w = dicts.getEECSdict()
    ww = list()
    for key in w.keys():
        ww.append(w[key])
    sentences = NLU.getALines()
    utterances = NLU.getUtterances(sentences)
    xmatches = list()
    for i in utterances:
        tutt = i[0].strip().lower()
        for q in ww:
            if q.lower() in tutt:
                xmatches.append(q)
    bees = len(xmatches)
    eyes = 0
    for x in xmatches:
        ptz = x.split()
        eyes += len(ptz) - 1
    print(bees)
    print(eyes)
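The consineUser and getWeightedGuess helpers called in main are project-local and not shown here. As a rough sketch of the underlying idea only (the function name, signature, and scoring are assumptions, not this project's code), cosine similarity between one user's row of the ratings matrix and every other row could be computed like this:

import numpy as np

def cosine_to_user(X, user_idx):
    # similarity of every row of X to the row for user_idx
    X = np.asarray(X, dtype=float)
    target = X[user_idx]
    norms = np.linalg.norm(X, axis=1) * np.linalg.norm(target)
    norms[norms == 0] = 1e-12  # guard against all-zero rows
    return X.dot(target) / norms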
Example #4
def t2():
	f = open('crf-input-data')
	clines = f.readlines()
	f.close()
	u2 = list()
	utt = list()
	t2 = list()
	tutt = list()
	for cl in clines:
		parts = cl.strip()
		if parts == '':
			if utt != []:
				u2.append(utt)
				t2.append(tutt)
				utt = list()
				tutt = list()
		else:
			parts = parts.split()
			utt.append(parts[0])
			tutt.append(parts[2])
	if utt != []:
		u2.append(utt)
		t2.append(tutt)
		utt = list()
		tutt = list()

	lines = NLU.getALines()
	utterances = NLU.getUtterances(lines)
	for u in range(0, len(utterances)):
		slots = NLU.getSlots(utterances[u])
		sclist = list()
		for slot in slots[0]:
			sclist.append([slot[1], slot[2]])
		entlist = NLU.getEntities(u2[u], t2[u])[0]
		l1 = list()
		l2 = sclist
		for ent in entlist:
			l1.append([ent[1], ent[2]])
		if l1 != l2:
			print(str(l1) + '_' + str(l2))
def instructorLevel():
    ICOR = 0
    IGUE = 0
    IACT = 0
    profs = dicts.getProfWords()
    pattern = re.compile(r"[\W_]+")
    print(profs)
    sentences = NLU.getALines()
    utterances = NLU.getUtterances(sentences)
    for u in utterances:
        names = list()
        cname = ""
        slots = NLU.getSlots(u)[1]
        tutt = u[0].strip().lower().split()
        print(slots)
        for tok in tutt:
            ttok = pattern.sub("", tok)
            if ttok in profs:
                if cname != "":
                    cname += " "
                cname += ttok
            else:
                if cname != "":
                    names.append(cname)
                cname = ""
        if cname != "":
            names.append(cname)
        print(names)
        slist = list()
        for slot in slots:
            slist.append(slot[0].lower())
        IACT += len(slots)
        IGUE += len(names)
        for name in names:
            if name in slist:
                ICOR += 1
    print(str(ICOR * 1.0 / IGUE))
    print(str(ICOR * 1.0 / IACT))
    print(IACT)
    return ICOR, IGUE, IACT
def classLevel():
    CCOR = 0
    CGUE = 0
    CACT = 0
    pattern = re.compile(r"[\W_]+")
    w = dicts.getEECSdict()
    ww = list()
    for key in w.keys():
        ww.append(w[key])
    sentences = NLU.getALines()
    utterances = NLU.getUtterances(sentences)
    for u in utterances:
        xmatches = list()
        tutt = u[0].strip().lower()
        slots = NLU.getSlots(u)[0]
        for q in tutt.split():
            qq = pattern.sub("", q)
            if is_number(qq):
                xmatches.append(qq)
        for q in ww:
            if q.lower() in tutt:
                xmatches.append(q.lower())
        slist = list()
        for slot in slots:
            slist.append(slot[1].lower())
        print(slist)
        print(xmatches)
        CACT += len(slots)
        CGUE += len(xmatches)
        for name in xmatches:
            if name in slist:
                CCOR += 1
    print(str(CCOR * 1.0 / CGUE))
    print(str(CCOR * 1.0 / CACT))
    print(CACT)
    return CCOR, CGUE, CACT
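classLevel (and several later examples) call an is_number helper that is defined elsewhere in the project; presumably it just tests whether a token parses as a number, e.g. a course id like "482". A minimal sketch under that assumption:

def is_number(s):
    # True if s parses as a float, e.g. "482" or "4.0"
    try:
        float(s)
        return True
    except ValueError:
        return False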
Example #7
def genLikesMap(utterances):
    likes_map.clear()
    for i in s_set:
        likes_map[i] = [list(), list()]
    for i in range(0, len(utterances)):
        slots = NLU.getSlots(utterances[i])
        speaker = s_map[utterances[i][0].strip()]
        if slots[0]:
            likes_map[speaker][0].extend(slots[0])
        if slots[1]:
            likes_map[speaker][1].extend(slots[1])
    # generate dictionary for similar likes for each person
    for q in s_set:
        simlikeq = list()
        for i in s_set:
            if i == q:
                continue
            found = False
            for j in range(0, len(likes_map[i][0])):
                if (("EECS", likes_map[i][0][j][1], likes_map[i][0][j][2])
                        in likes_map[q][0] or
                    ("", likes_map[i][0][j][1], likes_map[i][0][j][2])
                        in likes_map[q][0]
                    ) and likes_map[i][0][j][2] != "neutral":
                    #print("similar likes for " + i + " and " + q + ": " + str(likes_map[i][0][j]));
                    simlikeq.append(i)
                    found = True
                    break
            if not found:
                for j in range(0, len(likes_map[i][1])):
                    if likes_map[i][1][j] in likes_map[q][
                            1] and likes_map[i][1][j][1] != "neutral":
                        #print("similar likes for " + i + " and " + q + ": " + str(likes_map[i][1][j]));
                        simlikeq.append(i)
                        found = True
                        break
        sim_likes[q] = simlikeq
Example #8
      print("\ngoodbye")
      sys.exit(0)
  data = {
    'id' : id,
    'user_utterance' : user_utterance,
    'prefs' : set(),
    'imdbi' : imdb,
    'outputs' : [],
    'act' : None,
  }
  NLU.NLU(data)
  DM.DM(data)
  for key, value in data.items():
    if type(value) == type(set()):
      data[key] = list(value)
  NLU.add_entity_names(data)
  del data['imdbi']
  result = json.dumps(imdbi.clean_unicode_errors(data))
  if server:
    sys.stderr.write('> ' + user_utterance + '\n')
    sys.stderr.write('< ' + result + '\n')
  if server:
  	sys.stdout.write(result + '\n')
  else:
  	sys.stdout.write(result + '\n')
  	for output in data['outputs']:
  		sys.stdout.write(str(output) + '\n')
  sys.stdout.flush()  
  
os.remove('/tmp/imdbot_pid')
Example #9
def main():
    fo = open("../data/extract_samples/EECS_annotated_samples_anonymized", "r")
    lines = fo.readlines()
    utterances = NLU.getUtterances(lines)
    mode = False
    sents = list()
    targets = list()
    lastTaken = ""
    lastSent = ""
    isclass = False
    tagset = list()
    coriset = list()
    lastTagset = list()
    index = 0
    sent_to_xtc = dict()
    sent_to_xtc[0] = list()
    for i in range(len(lines)):
        data = lines[i].strip()
        if "" == data:
            index += 1
            sent_to_xtc[index] = list()
        if data.startswith("<class") or data.startswith("<instructor"):
            mode = True
            lastTaken = ""
            lastTagset = list()
        if data.startswith("<class"):
            isclass = True
        if mode and data.startswith("sentiment="):
            lastTaken = data[10:]
            if lastTaken.endswith(">"):
                lastTaken = lastTaken[:-1]
        if mode and data.startswith("name="):
            temp = data[5:]
            if temp.endswith(">"):
                temp = temp[:-1]
            lastTagset.append(temp)
        if mode and data.startswith("id="):
            temp = data[3:]
            if temp.endswith(">"):
                temp = temp[:-1]
            lastTagset.append(temp)
        if mode and data.startswith("department="):
            temp = data[11:]
            if temp.endswith(">"):
                temp = temp[:-1]
            lastTagset.append(temp)
        if not mode and "" != data:
            lastSent = data
        if data.endswith(">"):
            mode = False
            coriset.append(isclass)
            isclass = False
            sents.append(lastSent)
            tagset.append(lastTagset)
            sent_to_xtc[index].append(len(sents) - 1)
            if lastTaken == "":
                targets.append("neutral")
            else:
                targets.append(lastTaken)

    f2 = open("RNTN_sent")
    gendict = f2.readlines()
    f2.close()

    sdict = dict()
    slist = list()
    parent = None
    lastpart = None
    for i in gendict:
        if i.startswith(":"):
            parent = i[1:].strip()
            sdict[parent] = dict()
            slist.append(parent)
        elif is_number(i.strip()):
            sdict[parent][lastpart] = int(i.strip())
        else:
            lastpart = i.strip()
            sdict[parent][lastpart] = -1

    print(len(tagset))
    print(len(sdict.keys()))
    print(len(sent_to_xtc))
    print(len(targets))

    tries = 0
    correct = 0
    for q in range(0, len(slist)):
        print(sdict[slist[q]])
        print(sent_to_xtc[q])
        for i in sent_to_xtc[q]:
            print(str(tagset[i]) + ":" + str(targets[i]))
            for j in sdict[slist[q]]:
                if tagset[i][0] in j:
                    asent = "neutral"
                    if int(sdict[slist[q]][j]) > 2:
                        asent = "positive"
                    elif int(sdict[slist[q]][j]) < 1:
                        asent = "negative"
                    print(asent)
                    tries += 1
                    if targets[i] == asent:
                        correct += 1
    print("correct: " + str(correct * 1.0 / tries))
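The RNTN_sent file read above is not included in the excerpt. Judging from the parser, each block starts with a ':'-prefixed sentence key, followed by phrase lines, each optionally followed by an integer sentiment score (apparently on a small 0-4-style scale). An invented fragment that this loop would accept, shown only to illustrate the assumed layout:

:0
the professor
3
EECS 280
1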
Example #10
def main():
    # get scores
    fscores = open("S1feature")  # S1feature - S1single_lies
    lines = fscores.readlines()
    fscores.close()
    scores = list()
    for i in lines:
        scores.append(float(i.strip()))
    sort_scores = [i[0] for i in sorted(enumerate(scores), key=lambda x: x[1])]
    sort_scores.reverse()

    # get splits
    fsplits = open("splits")
    splines = fsplits.readlines()
    splits = list()
    for i in range(0, len(splines)):
        parts = splines[i].strip().split(":")
        train = list()
        test = list()
        for s in parts[0][1:-1].split(", "):
            train.append(int(s))
        for s in parts[1][1:-1].split(", "):
            test.append(int(s))
        splits.append((train, test))
    fsplits.close()

    # get speakers
    nlines = NLU.getALines()
    utterances = NLU.getUtterances(nlines)
    nlist = list()
    for i in range(0, len(splits)):
        senti_utters = list()
        for j in range(0, len(splits[i][0])):
            senti_utters.append(utterances[splits[i][0][j]])
        likesMatrix, slist = leastSquares.getMatrix(senti_utters)

        test_utters = list()
        for j in range(0, len(splits[i][1])):
            test_utters.append(utterances[splits[i][1][j]])
        TlikesMatrix, Tslist = leastSquares.getMatrix(test_utters)

        nonneus = 0
        nnews = 0
        density = 0.0
        counts = list()
        # iterate over rows
        for k in range(0, len(likesMatrix)):
            nonneu = 0
            for j in range(0, len(likesMatrix[k])):
                if int(likesMatrix[k][j]) != 5:
                    nonneu += 1
            if nonneu > 0:
                nnews += 1
            nonneus += nonneu
            counts.append(nonneu)
        # iterate over columns
        elaps = 0
        for k in range(0, len(likesMatrix[0])):
            nonneu = 0
            TNEW = 0
            for j in range(0, len(likesMatrix)):
                if int(likesMatrix[j][k]) != 5:
                    nonneu = 1
                if int(TlikesMatrix[j][k]) != 5:
                    TNEW = 1
            if nonneu == 1 and TNEW == 1:
                elaps += 1

        nlist.append(str(nnews) + ":" + str(nonneus) + ":" + str(counts) + ":" + str(elaps))

    # print correlations
    for i in sort_scores:
        print(str(scores[i]) + " - " + nlist[i])
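The final loop prints each fold's score next to its density statistics rather than an actual correlation number. If one is wanted, a sketch of the kind of lines that could be appended at the end of main (it assumes numpy is imported as np, and correlating against the nnews count is an assumption about which statistic matters):

    # correlate per-fold scores with the number of users that have non-neutral rows
    nnews_counts = [int(entry.split(":")[0]) for entry in nlist]
    print(np.corrcoef(scores, nnews_counts)[0][1])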
Example #11
def main():
    name = "MEGHAN"
    fi = open("../data/extract_samples/pID_AEU")
    pid = fi.readlines()
    fi.close()
    pidmap = dict()
    pset = set()
    for i in range(0, len(pid)):
        parts = pid[i].split("\t")
        pset.add(parts[0])
        pidmap[parts[1].strip()] = parts[0]
    fl = open("EECS_annotated_samples_anonymized")
    lines = fl.readlines()
    fl.close()
    utterances = NLU.getUtterances(lines)
    print(utterances[0])
    print("Speaker: " + pidmap[utterances[0][0].strip()])
    slots = NLU.getSlots(utterances[0])
    print(slots)
    plikes = dict()
    for i in pset:
        plikes[i] = [list(), list()]
    for i in range(0, len(utterances)):
        slots = NLU.getSlots(utterances[i])
        speaker = pidmap[utterances[i][0].strip()]
        if slots[0]:
            plikes[speaker][0].extend(slots[0])
        if slots[1]:
            plikes[speaker][1].extend(slots[1])
    print("\n\nGiven that EECS 492 sentiment is neutral...")
    #print(plikes[name]);
    wholikes = ("EECS", "492", "neutral")
    likers = list()
    for i in pset:
        if wholikes in plikes[i][0]:
            likers.append(i)
    # check instructors in likers
    ucontains_i = "Quentin Stout"
    print("\n\nWho likes " + ucontains_i)
    for i in likers:
        for j in range(0, len(plikes[i][1])):
            if plikes[i][1][j][0] == ucontains_i:
                print(i + ": " + str(plikes[i][1][j]))
    # check classes in likers
    ucontains_cd = "EECS"
    ucontains_cid = "545"
    print("\n\nWho likes " + ucontains_cd + " " + ucontains_cid)
    for i in likers:
        for j in range(0, len(plikes[i][0])):
            # don't worry about department but if you want to... then use this line
            # plikes[i][0][j][0] == ucontains_cd and
            if plikes[i][0][j][1] == ucontains_cid:
                print(i + ": " + str(plikes[i][0][j]))
    # find all people with similar sentiments to <name> in the data set
    print("\n\nSimlikes!")
    simlikesmap = dict()
    for q in pset:
        simlikes = list()
        for i in pset:
            if i == q:
                continue
            found = False
            for j in range(0, len(plikes[i][0])):
                if (("EECS", plikes[i][0][j][1], plikes[i][0][j][2])
                        in plikes[name][0] or
                    ("", plikes[i][0][j][1], plikes[i][0][j][2]
                     ) in plikes[name][0]) and plikes[i][0][j][2] != "neutral":
                    print("similar likes for " + i + " and " + name + ": " +
                          str(plikes[i][0][j]))
                    simlikes.append(i)
                    found = True
                    break
            if not found:
                for j in range(0, len(plikes[i][1])):
                    if plikes[i][1][j] in plikes[name][
                            1] and plikes[i][1][j][1] != "neutral":
                        print("similar likes for " + i + " and " + name +
                              ": " + str(plikes[i][1][j]))
                        simlikes.append(i)
                        found = True
                        break
        simlikesmap[q] = simlikes
    # calculate % of times where OSCORE will be nonzero
    times = 0
    ttimes = 0
    for u in utterances:
        slots = NLU.getSlots(u)
        speaker = pidmap[u[0].strip()]
        for slot in slots[0]:
            ttimes += 1
            oscore = 0
            for i in simlikesmap[speaker]:
                pscore = 0
                for j in range(0, len(plikes[i][0])):
                    if slot[1] == plikes[i][0][j][1]:
                        if plikes[i][0][j][2] == "positive":
                            pscore += 1
                        elif plikes[i][0][j][2] == "negative":
                            pscore -= 1
                if pscore > 0:
                    oscore += 1
                elif pscore < 0:
                    oscore -= 1
            if oscore != 0:
                times += 1
        for slot in slots[1]:
            ttimes += 1
            oscore = 0
            for i in simlikesmap[speaker]:
                pscore = 0
                for j in range(0, len(plikes[i][1])):
                    if slot[0] == plikes[i][1][j][0]:
                        if plikes[i][1][j][1] == "positive":
                            pscore += 1
                        elif plikes[i][1][j][1] == "negative":
                            pscore -= 1
                if pscore > 0:
                    oscore += 1
                elif pscore < 0:
                    oscore -= 1
            if oscore != 0:
                times += 1
    print("Times: " + str(times))
    print("Total Times: " + str(ttimes))
    print("Percentage: " + str(times * 100.0 / ttimes))
def main():
    fi = open("sentimentAnnotations")
    line1 = fi.readlines()
    fi.close()
    fo = open("EECS_annotated_samples_anonymized")
    line2 = fo.readlines()
    fo.close()
    utt1 = NLU.getUtterances(line1)
    utt2 = NLU.getUtterances(line2)
    correct = 0
    wrong = 0
    NEU_NEG = 0
    NEU_POS = 0
    POS_NEG = 0
    SNEU_NEG = set()
    SNEU_NEG.add("neutral")
    SNEU_NEG.add("negative")
    SNEU_POS = set()
    SNEU_POS.add("neutral")
    SNEU_POS.add("positive")
    SPOS_NEG = set()
    SPOS_NEG.add("negative")
    SPOS_NEG.add("positive")
    disagrees = list()
    inst = 1
    insttype = "neutral"
    for i in range(0, len(utt1)):
        slots1 = NLU.getSlots(utt1[i])
        slots2 = NLU.getSlots(utt2[i])
        for j in range(0, len(slots1[0])):
            if insttype == slots2[0][j][2]:
                inst += 1
            if slots1[0][j][3] == slots2[0][j][3]:
                correct += 1
            else:
                tset = set()
                tset.add(slots1[0][j][3])
                tset.add(slots2[0][j][3])
                disagrees.append(utt1[i])
                if slots2[0][j][3] == insttype:
                    if tset == SNEU_NEG:
                        NEU_NEG += 1
                    elif tset == SNEU_POS:
                        NEU_POS += 1
                    elif tset == SPOS_NEG:
                        POS_NEG += 1
                wrong += 1
        for j in range(0, len(slots1[1])):
            if slots1[1][j][1] == slots2[1][j][1]:
                correct += 1
            else:
                tset = set()
                disagrees.append(utt1[i])
                tset.add(slots1[1][j][1])
                tset.add(slots2[1][j][1])
                if slots2[1][j][1] == insttype:
                    if tset == SNEU_NEG:
                        NEU_NEG += 1
                    elif tset == SNEU_POS:
                        NEU_POS += 1
                    elif tset == SPOS_NEG:
                        POS_NEG += 1
                wrong += 1
    print("Agree on " + str(correct))
    print("Disagree on " + str(wrong))
    print("Percent agreement is " + str(correct * 100.0 / (correct + wrong)) +
          "%")
    #print("NEU_NEG: " + str(NEU_NEG*1.0/(correct+wrong)));
    #print("NEU_POS: " + str(NEU_POS*1.0/(correct+wrong)));
    #print("POS_NEG: " + str(POS_NEG*1.0/(correct+wrong)));
    print("NEU_NEG: " + str(NEU_NEG * 1.0 / inst))
    print("NEU_POS: " + str(NEU_POS * 1.0 / inst))
    print("POS_NEG: " + str(POS_NEG * 1.0 / inst))
Example #13
def main():
    with open(prior_property_file) as json_file:
        prior_property_dict = json.load(json_file)

    dialogDB_json = {}

    input_json = request.get_json()
    print(input_json)
    user_ID = input_json['user_id']
    userDB_json = {'userID': user_ID, 'command': 'LOGIN'}
    userDB_response = Dialog_Manager.UserDBaccess(userDB_json)

    KB_language = {
        'user_ID': user_ID,
        'entities': [],
        'frames': [],
        'triples': []
    }

    result_json = {
        'user_id': user_ID,
        'frames': [],
        'entities': [],
        'q_list': [],
        'knowledge': []
    }

    user_input = input_json['utterance']
    dialogDB_json['user_id'] = KB_language['user_ID']
    dialogDB_json['utterance'] = user_input
    dialogDB_json['speaker'] = 'user'
    dialogDB_json['mode'] = 'make_table'
    dialog_index = DialogDBaccess(dialogDB_json)

    dialogDB_json['mode'] = 'add_data'
    dialogDB_json['utterance'] = user_input
    dialogDB_json['speaker'] = 'user'
    dialog_index = DialogDBaccess(dialogDB_json)

    if not input_json['q_list'] and not input_json['knowledge']:
        NLU_response = NLU.Frame_Interpreter(user_input)
        # frame-based
        if NLU_response['frames']:
            KB_language['frames'] = NLU_response['frames']
            result_json['frames'] = KB_language['frames']
            now_frame = KB_language['frames'][-1]
            question_list = []
            with open('./frame_info_full.json', 'r', encoding='utf-8') as f:
                frame_json_data = json.load(f)
                frame_data = frame_json_data[now_frame['frame']]
                for ele in frame_data['arguments']:
                    if ele['coreType'] == 'Core':
                        question_list.append(ele['fe'])
            for exi_ele in now_frame['ele']:
                if exi_ele in question_list:
                    question_list.remove(exi_ele)
            if question_list:
                result_json['q_list'] = question_list
                system_output = \
                 NLG([now_frame['frame'] + '\t' + question_list[0] + '\t' + '?o'],
                  'Knowledge_question')[0]
                result_json['utterance'] = system_output
                dialogDB_json['utterance'] = system_output
                dialogDB_json['speaker'] = 'system'
                dialog_index = DialogDBaccess(dialogDB_json)
            else:
                system_output = '감사합니다.'  # "Thank you."
                result_json['utterance'] = system_output
                dialogDB_json['utterance'] = system_output
                dialogDB_json['speaker'] = 'system'
                dialog_index = DialogDBaccess(dialogDB_json)
        # entity-based
        elif NLU_response['entities']:
            KB_language['entities'] = NLU_response['entities']
            result_json['entities'] = KB_language['entities']
            KBM_response = Entity_Summarization.ES(
                KB_language['entities'][0]['text'])
            KB_language['triples'] = KBM_response
            utterances = NLG(KB_language['triples'], 'Knowledge_inform')
            system_output = ''
            for candi in utterances:
                if 'wiki' in candi or 'abstract' in candi:
                    continue
                system_output += candi
            tmp_list = []
            for ele in NLU_response['entities'][0]['type']:
                if 'http://dbpedia.org/ontology/' in ele:
                    tmp_list.append(ele.split('/')[-1])
            KB_language['entities'][0]['type'] = copy.deepcopy(tmp_list)
            for tmp_class in KB_language['entities'][0]['type']:
                if tmp_class in class_dict['level_4']:
                    entity_class = tmp_class
                    break
                elif tmp_class in class_dict['level_3']:
                    entity_class = tmp_class
                    break
                elif tmp_class in class_dict['level_2']:
                    entity_class = tmp_class
                    break
                elif tmp_class in class_dict['level_1']:
                    entity_class = tmp_class
                    break
            question_list = []
            question_property_list = prior_property_dict[entity_class]
            question_num = 0
            for candi_question in question_property_list:
                if question_num == 3:
                    break
                tmp_user_query = Dialog_Manager.SPARQL_Generation(
                    'ASK',
                    [KB_language['entities'][0]['uri'], candi_question, '?o'],
                    KB_language['user_ID'])
                tmp_master_query = Dialog_Manager.SPARQL_Generation(
                    'ASK',
                    [KB_language['entities'][0]['uri'], candi_question, '?o'],
                    "")
                userDB_json['query'] = tmp_user_query
                userDB_json['command'] = 'QUERY'
                #print(UserDBaccess(userDB_json))
                if not Dialog_Manager.MasterDBaccess(
                        tmp_master_query) and not Dialog_Manager.UserDBaccess(
                            userDB_json)['query_result']:
                    question_list.append([
                        KB_language['entities'][0]['text'], candi_question,
                        '?o'
                    ])
                    question_num += 1
            if question_list:
                result_json['q_list'] = question_list
                # "There are a few things I would like to ask about <entity>."
                system_output += '\n' + KB_language['entities'][0][
                    'text'] + '에 대해서 몇 가지 물어보고 싶은게 있어요.'
                system_output += '\n' + NLG([
                    question_list[0][0] + '\t' + question_list[0][1] + '\t' +
                    question_list[0][2]
                ], 'Knowledge_question')[0]
                result_json['utterance'] = system_output
                dialogDB_json['utterance'] = system_output
                dialogDB_json['speaker'] = 'system'
                dialog_index = DialogDBaccess(dialogDB_json)
            else:
                system_output = '감사합니다.'
                result_json['utterance'] = system_output
                dialogDB_json['utterance'] = system_output
                dialogDB_json['speaker'] = 'system'
                dialog_index = DialogDBaccess(dialogDB_json)
        else:
            system_output = '제가 아직 이해할 수 없는 문장이에요.'  # "That's a sentence I can't understand yet."
            result_json['utterance'] = system_output
            dialogDB_json['utterance'] = system_output
            dialogDB_json['speaker'] = 'system'
            dialog_index = DialogDBaccess(dialogDB_json)
    elif input_json['q_list']:
        dialogDB_json['utterance'] = user_input
        dialogDB_json['speaker'] = 'user'
        dialog_index = DialogDBaccess(dialogDB_json)
        if input_json['frames']:
            result_json['frames'] = input_json['frames']
            tmp_user_answer = ETRI_NER(user_input)
            tmp_knowledge_list = input_json['knowledge']
            if tmp_user_answer:
                tmp_knowledge_list.append(tmp_user_answer[0][0])
            result_json['knowledge'] = tmp_knowledge_list
            result_json['q_list'] = input_json['q_list']
            del result_json['q_list'][0]
            if result_json['q_list']:
                now_frame = input_json['frames'][-1]
                system_output = \
                 NLG([now_frame['frame'] + '\t' + result_json['q_list'][0] + '\t' + '?o'],
                  'Knowledge_question')[0]
                result_json['utterance'] = system_output
                dialogDB_json['utterance'] = system_output
                dialogDB_json['speaker'] = 'system'
                dialog_index = DialogDBaccess(dialogDB_json)
            else:
                if input_json['knowledge']:
                    # store the frame knowledge
                    system_output = '감사합니다'
                    result_json['frames'] = []
                    result_json['entities'] = []
                    result_json['knowledge'] = []
                    result_json['utterance'] = system_output
                    dialogDB_json['utterance'] = system_output
                    dialogDB_json['speaker'] = 'system'
                    dialog_index = DialogDBaccess(dialogDB_json)
                else:
                    system_output = '감사합니다'
                    result_json['frames'] = []
                    result_json['entities'] = []
                    result_json['utterance'] = system_output
                    dialogDB_json['utterance'] = system_output
                    dialogDB_json['speaker'] = 'system'
                    dialog_index = DialogDBaccess(dialogDB_json)
        elif input_json['entities']:
            result_json['entities'] = input_json['entities']
            tmp_user_answer = ETRI_NER(user_input)
            new_triple = []
            tmp_knowledge_list = input_json['knowledge']
            result_json['q_list'] = input_json['q_list']
            if tmp_user_answer:
                new_triple.append('http://kbox.kaist.ac.kr/resource/' +
                                  result_json['q_list'][0][0] + '\t' +
                                  result_json['q_list'][0][1] +
                                  '\thttp://kbox.kaist.ac.kr/resource/' +
                                  tmp_user_answer[0][0])
                new_triple.append(
                    'http://kbox.kaist.ac.kr/resource/' +
                    result_json['q_list'][0][0] + result_json['q_list'][0][1] +
                    'http://kbox.kaist.ac.kr/resource/' +
                    tmp_user_answer[0][0] +
                    '\thttp://kbox.kaist.ac.kr/flagship/dialogid\thttp://ko.dbpedia.org/resource/'
                    + str(dialog_index))
                s1, p1, o1 = new_triple[0].split('\t')
                s2, p2, o2 = new_triple[1].split('\t')
                tmp_knowledge_list.append([s1, p1, o1])
                tmp_knowledge_list.append([s2, p2, o2])
            result_json['knowledge'] = tmp_knowledge_list
            del result_json['q_list'][0]
            if result_json['q_list']:
                system_output = '\n' + NLG([
                    result_json['q_list'][0][0] + '\t' +
                    result_json['q_list'][0][1] + '\t' +
                    result_json['q_list'][0][2]
                ], 'Knowledge_question')[0]
                result_json['utterance'] = system_output
                dialogDB_json['utterance'] = system_output
                dialogDB_json['speaker'] = 'system'
                dialog_index = DialogDBaccess(dialogDB_json)
            else:
                if result_json['knowledge']:
                    system_output = ''
                    for triple in result_json['knowledge']:
                        if 'flagship/dialogid' not in triple:
                            system_output += NLG(['\t'.join(triple)],
                                                 'Knowledge_inform')[0] + '\n'
                    userDB_json['command'] = 'REGISTER'
                    userDB_json['triple'] = result_json['knowledge']
                    Dialog_Manager.UserDBaccess(userDB_json)
                    result_json['entities'] = []
                    result_json['knowledge'] = []
                    system_output += '감사합니다'
                    result_json['utterance'] = system_output
                    dialogDB_json['utterance'] = system_output
                    dialogDB_json['speaker'] = 'system'
                    dialog_index = DialogDBaccess(dialogDB_json)
                else:
                    result_json['entities'] = []
                    system_output = '감사합니다'
                    result_json['utterance'] = system_output
                    dialogDB_json['utterance'] = system_output
                    dialogDB_json['speaker'] = 'system'
                    dialog_index = DialogDBaccess(dialogDB_json)
        else:
            print('input error')
            result_json = input_json
            result_json['utterance'] = 'input error'
    else:
        print('input error')
        result_json = input_json
        result_json['utterance'] = 'input error'

    return jsonify(result_json)
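This handler calls request.get_json() and returns jsonify(...), so it reads like a Flask view function; the application wiring is not part of the excerpt. A minimal sketch of how it might be registered (the route path and run settings are assumptions, not taken from the project):

from flask import Flask

app = Flask(__name__)
# expose the dialogue handler defined above under an assumed route
app.add_url_rule('/dialog', view_func=main, methods=['POST'])

if __name__ == '__main__':
    app.run()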
Example #14
			print(system_output)
			dialogDB_json['mode'] = 'add_data'
			dialogDB_json['utterance'] = system_output
			dialogDB_json['speaker'] = 'system'
			dialog_index = DialogDBaccess(dialogDB_json)
			'''
            user_input = input()
            if user_input == '로그아웃':  # "log out"
                break
            elif user_input == '끝':  # "quit"
                stop = True
                break
            dialogDB_json['utterance'] = user_input
            dialogDB_json['speaker'] = 'user'
            dialog_index = DialogDBaccess(dialogDB_json)
            NLU_response = NLU.Frame_Interpreter(user_input)
            # frame-based
            if NLU_response['frames']:
                KB_language['frames'] = NLU_response['frames']
                #for now_frame in KB_language['frames']:
                now_frame = KB_language['frames'][-1]
                question_list = []
                with open('./frame_info_full.json', 'r',
                          encoding='utf-8') as f:
                    frame_json_data = json.load(f)
                    frame_data = frame_json_data[now_frame['frame']]
                    for ele in frame_data['arguments']:
                        if ele['coreType'] == 'Core':
                            question_list.append(ele['fe'])
                print(question_list)
                for exi_ele in now_frame['ele']:
Example #15
import datetime
from dateutil.relativedelta import relativedelta
import sys, json

data = sys.argv

weekday = {
    "月曜日": 0,  # Monday
    "火曜日": 1,  # Tuesday
    "水曜日": 2,  # Wednesday
    "木曜日": 3,  # Thursday
    "金曜日": 4,  # Friday
    "土曜日": 5,  # Saturday
    "日曜日": 6   # Sunday
}
model = NLU.main()


def check(text):
    ur, da, concept = model(text)

    if da == "negative":
        return [0]
    elif not ("plan" in concept):
        return [0]
    else:
        year, month, day = date_detector(concept.get("month"),
                                         concept.get("day"),
                                         concept.get("youbi"),
                                         concept.get("today"),
                                         concept.get("tomorrow"),
Example #16
def getMatrix(utterances):
    GROUNDTRUTHS = True
    np.set_printoptions(threshold=np.inf)  # 'nan' is rejected by modern NumPy
    #lines = NLU.getALines();
    #do ALS stuff
    ioffset = len(classes)
    X = np.ones((len(sset), len(classes) + len(instructors))) * -1
    #print(X.shape);
    for i in range(0, len(utterances)):
        slots = NLU.getSlots(utterances[i])
        cslots = slots[0]
        islots = slots[1]
        for slot in islots:
            iname = ""
            if GROUNDTRUTHS:
                iname = slot[0]
            else:
                if slot[0] in entcache.keys():
                    iname = entcache[slot[0]]
                else:
                    iname = ed.entityDistance(slot[0])[1][1]
                    entcache[slot[0]] = iname
            if slot[1] == "positive":
                X[slist.index(smap[utterances[i][0].strip()])][
                    ioffset + instructors.index(iname)] = 10
            elif slot[1] == "negative":
                X[slist.index(smap[utterances[i][0].strip()])][
                    ioffset + instructors.index(iname)] = 0
            elif slot[1] == "neutral":
                X[slist.index(smap[utterances[i][0].strip()])][
                    ioffset + instructors.index(iname)] = 5
        for slot in cslots:
            if is_number(slot[1]):
                if slot[1] in classes:
                    if slot[2] == "positive":
                        X[slist.index(
                            smap[utterances[i][0].strip()])][classes.index(
                                slot[1])] = 10
                    elif slot[2] == "negative":
                        X[slist.index(
                            smap[utterances[i][0].strip()])][classes.index(
                                slot[1])] = 0
                    elif slot[2] == "neutral":
                        X[slist.index(
                            smap[utterances[i][0].strip()])][classes.index(
                                slot[1])] = 5
                else:
                    pass
                    #print(slot[1] + " is not a class...");
            else:
                classname = ""
                if GROUNDTRUTHS:
                    classname = slot[1]
                else:
                    if slot[1] in entcache.keys():
                        classname = entcache[slot[1]]
                    else:
                        classname = ed.entityDistance(slot[1])[0][1]
                        entcache[slot[1]] = classname
                if slot[2] == "positive":
                    X[slist.index(smap[utterances[i][0].strip()])][
                        classNames.index(classname)] = 10
                elif slot[2] == "negative":
                    X[slist.index(smap[utterances[i][0].strip()])][
                        classNames.index(classname)] = 0
                elif slot[2] == "neutral":
                    X[slist.index(smap[utterances[i][0].strip()])][
                        classNames.index(classname)] = 5
    # These four lines apply the low-rank factorization; return X instead of newX to skip it
    A, Y = nmf(X, 50)
    A = np.matrix(A)
    Y = np.matrix(Y)
    newX = A * Y
    return newX, slist
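Here nmf is a project-local factorization of rank 50, and newX = A * Y is the dense low-rank reconstruction returned in place of the raw ratings. Purely as an illustration of that reconstruction step (this is not the project's nmf; a truncated SVD stands in for it):

import numpy as np

def low_rank_approx(X, k=50):
    # rebuild X from its top-k singular triplets, analogous to the A * Y product above
    U, s, Vt = np.linalg.svd(np.asarray(X, dtype=float), full_matrices=False)
    return (U[:, :k] * s[:k]) @ Vt[:k, :]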
Example #17
import telebot

import Controller
import BlackBox

import NLG
import NLU

from configuration import API_TOKEN

bot = telebot.TeleBot(API_TOKEN)
black_box = BlackBox.BlackBox()

controller = Controller.Controller()
nlg = NLG.NLG()
nlu = NLU.NLU()


@bot.message_handler(commands=['start'])
def start_message(message):
    controller.add_user(message.from_user.id)
    # answer = black_box.get_greeting_message()
    answer = nlg.get_message(
        'none', controller.users[message.from_user.id]["status"], nlu,
        black_box, controller.users[message.from_user.id]["right_answer"])
    bot.send_message(message.from_user.id, answer)
    bot.send_message(message.from_user.id,
                     "Вам нужно получить такую картинку: ")  # "You need to get a picture like this:"
    #bot.send_photo(message.from_user.id, photo=black_box.get_response(controller.users[message.from_user.id]["right_answer"]))
    black_box.get_response(
        controller.users[message.from_user.id]["right_answer"])
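Only the /start handler is shown in this excerpt; with pyTelegramBotAPI the script would normally finish by starting the polling loop so the handler actually receives updates:

if __name__ == '__main__':
    bot.polling()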
Example #18
def main():
    if not os.path.exists('classifiers'):
        os.makedirs('classifiers')

    allines = NLU.getALines()
    allU = NLU.getUtterances(allines)
    textLines = NLU.getTextLines(allU)
    slots = [NLU.getSlots(i) for i in allU]

    sents = list()
    targets = list()
    tagset = list()
    sent_to_xtc = dict()

    index = 0
    for i in range(len(slots)):
        tstx = []
        for etype in ENT_TYPES:
            for j in range(len(slots[i][etype])):
                tstx.append(index)
                index += 1
                targets.append(slots[i][etype][j]['sentiment'])
                ttags = [
                    slots[i][etype][j][k] for k in ALL_IDS
                    if k in slots[i][etype][j]
                ]
                tagset.append(ttags)
                sents.append(textLines[i])
        sent_to_xtc[i] = tstx

    cprint('Number of Utterances: ' + str(index))
    cprint('Length of Lines: ' + str(len(sents)))
    cprint('Length of Targets: ' + str(len(targets)))

    cv = set()
    regex = re.compile(r'[^a-zA-Z0-9_\~\- ]+')
    for sent in range(0, len(sents)):
        parts = sents[sent].split(' ')
        for part in range(0, len(parts)):
            thepart = regex.sub('', parts[part])
            # corner case for hyphens
            hps = thepart.split('-')
            if len(hps) > 1:
                for hi in range(0, len(hps)):
                    cv.add(hps[hi].lower())
            # end corner case for hyphens
            thepart = thepart.lower()
            cv.add(thepart)
    cv = list(cv)
    cprint('Vocabulary Size: ' + str(len(cv)))

    xtc = []
    for sent in range(0, len(sents)):
        #print('sentence: ' + str(sent))
        #print('s1: ' + str(sents[sent]))

        #print(sents[sent] + ' - with tagset - ' + str(tagset[sent]))
        #dparse = spwrap.parse(sents[sent])
        #print('DPARSE: ' + dparse)

        # add token boundaries to the sentence
        tokenSent = sents[sent]
        for tag in range(0, len(tagset[sent])):
            tokenSent = tokenSent.replace(tagset[sent][tag],
                                          ' ~~t~~ ' + tagset[sent][tag])
        #print(tokenSent)
        parts = regex.sub('', tokenSent)
        # this handles split and hyphen corner case
        parts = re.split(' |-', parts)

        # remove empty parts from the sentence
        while '' in parts:
            parts.remove('')

        # locate window feature indicies
        windowFeatures = []
        done = False
        while not done:
            for part in range(0, len(parts)):
                if '~~t~~' == parts[part]:
                    windowFeatures += [part]
                    parts.remove(parts[part])
                    #print('parts?: ' + str(parts))
                    break
                if part == len(parts) - 1:
                    done = True
        #print('window features: ' + str(windowFeatures))

        #print('parts: ' + str(parts))
        row = []
        # featureMapG = [[0]*300]*4
        featureMap = {}
        Nflag = 0
        for part in range(0, len(parts)):
            #thepart = regex.sub('', parts[part])
            #thepart = thepart.lower()
            thepart = parts[part].lower()
            theid = cv.index(thepart)
            #print(theid)
            #g_vec = glove_features.getGloveWord(glove_dict, parts[part])
            mindist = 999
            for wf in range(0, len(windowFeatures)):
                ##############################################################
                ## This is the distance measure for window linear distance!
                distance = abs(windowFeatures[wf] - part)
                ##############################################################
                ## This is the distance measure for dependency tree distance!
                ## distance = spwrap.treeDistance(parts[windowFeatures[wf]], parts[part], dparse)
                ##############################################################
                if distance < mindist:
                    mindist = distance
            mindist += 1
            sentiz = senti_lexis.lexCounts(thepart)
            #for g_vi in range(0, len(g_vec)):
            #	featureMapG[0][g_vi] += g_vec[g_vi];# - mindist/10.0
            #	featureMapG[1][g_vi] += g_vec[g_vi];# - mindist/10.0
            #	featureMapG[2][g_vi] += g_vec[g_vi];# - mindist/10.0
            #	featureMapG[3][g_vi] += g_vec[g_vi];# - mindist/10.0
            if theid in featureMap:
                # 1.0 - mindist / 10.0 worked well for the first distance measure...
                # featureMap[theid] += 1.0 / mindist
                featureMap[theid][0] += 1.0 - mindist / 10.0
                featureMap[theid][1] += (1.0 - mindist / 10.0) * sentiz[0]
                featureMap[theid][2] += (1.0 - mindist / 10.0) * sentiz[1]
                featureMap[theid][3] += (1.0 - mindist / 10.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            else:
                # featureMap[theid] = 1.0 / mindist
                # count, positive, negative, neutral, negate
                featureMap[theid] = [0, 0, 0, 0, 0]
                featureMap[theid][0] = 1.0 - mindist / 10.0
                featureMap[theid][1] = (1.0 - mindist / 10.0) * sentiz[0]
                featureMap[theid][2] = (1.0 - mindist / 10.0) * sentiz[1]
                featureMap[theid][3] = (1.0 - mindist / 10.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            if Nflag > 0:
                Nflag -= 1
            if senti_lexis.lexNegate(thepart):
                Nflag = 2
        for i in range(0, len(cv)):
            if i in featureMap:
                row.extend(featureMap[i])
            else:
                row.extend([0, 0, 0, 0, 0])
        # add on the glove features
        # for a in range(0, len(featureMapG)):
        # 	temp_vec = []
        # 	for a_a in range(0, len(featureMapG[a])):
        # 		temp_vec.append(featureMapG[a][a_a]*1.0/len(parts))
        # 	row.extend(temp_vec)
        xtc.append(row)

    #instead read the data from splits file
    fsplits = open('splits')
    lines = fsplits.readlines()
    splits = list()
    for i in range(0, len(lines)):
        parts = lines[i].strip().split(':')
        train = list()
        test = list()
        for s in parts[0][1:-1].split(', '):
            train.append(int(s))
        for s in parts[1][1:-1].split(', '):
            test.append(int(s))
        splits.append((train, test))
    fsplits.close()
    #test print the first split
    #print(splits[0][0])
    #print(splits[0][1])

    #do gridsearch + evaluation
    fscores = open('scores_sentiment', 'w')
    bestsplit = -1
    BSscore = 0
    for i in range(0, len(splits)):
        bestC = 0
        bestGamma = 0
        bestScore = 0
        xtest = list()
        xtrain = list()
        ytest = list()
        ytrain = list()
        # add the utterance set generation here for senti_set
        # senti_utters = list()
        # for j in range(0, len(splits[i][0])):
        # 	senti_utters.append(utterances[splits[i][0][j]])
        #likesMatrix, slist = leastSquares.getMatrix(senti_utters)
        # do train-test split
        for j in range(0, len(splits[i][0])):
            #speaker = senti_set.getSpeaker(utterances[splits[i][0][j]][0])
            #cossim = leastSquares.consineUser(likesMatrix, slist.index(speaker))
            #print('\n' + speaker + ': ' + utterances[splits[i][0][j]][0].strip())
            # VECTOR is 38 x 141 -> 264 total
            for LL in range(0, len(sent_to_xtc[splits[i][0][j]])):
                #fvector = likesMatrix[slist.index(speaker)]
                #fvector = fvector.tolist()[0]
                fvector = xtc[sent_to_xtc[splits[i][0][j]][LL]]
                #fvector.append(slist.index(speaker))
                ##############################################################
                #entity = tagset[sent_to_xtc[splits[i][0][j]][LL]]
                #entity = tagset2entity(entity)
                #gscore = leastSquares.getGuess(likesMatrix, entity, slist.index(speaker))
                #gscore = leastSquares.getWeightedGuess(cossim, likesMatrix, entity)
                #print('speaker: ' + str(speaker) + ' - ' + str(slist.index(speaker)))
                #fvector.append(gscore)
                ########fvector = [gscore]
                ##############################################################
                xtrain.append(fvector)
                ytrain.append(targets[sent_to_xtc[splits[i][0][j]][LL]])
        for j in range(0, len(splits[i][1])):
            #speaker = senti_set.getSpeaker(utterances[splits[i][1][j]][0])
            #cossim = leastSquares.consineUser(likesMatrix, slist.index(speaker))
            for LL in range(0, len(sent_to_xtc[splits[i][1][j]])):
                #fvector = likesMatrix[slist.index(speaker)]
                #fvector = fvector.tolist()[0]
                fvector = xtc[sent_to_xtc[splits[i][1][j]][LL]]
                #fvector.append(slist.index(speaker))
                ##############################################################
                #entity = tagset[sent_to_xtc[splits[i][1][j]][LL]]
                #entity = tagset2entity(entity)
                #gscore = leastSquares.getGuess(likesMatrix, entity, slist.index(speaker))
                #gscore = leastSquares.getWeightedGuess(cossim, likesMatrix, entity)
                #fvector.append(gscore)
                ########fvector = [gscore]
                ##############################################################
                xtest.append(fvector)
                ytest.append(targets[sent_to_xtc[splits[i][1][j]][LL]])
        score = 0

        for gamma in numpy.linspace(0.0001, 0.05, 10):  #10steps
            for C in numpy.linspace(0.1, 10, 10):  #10steps
                # 2-fold cross-validation
                x1 = xtrain[len(xtrain) // 2:]
                x2 = xtrain[:len(xtrain) // 2]
                y1 = ytrain[len(ytrain) // 2:]
                y2 = ytrain[:len(ytrain) // 2]
                x11 = csr_matrix(x1)
                x22 = csr_matrix(x2)
                clf = svm.SVC(gamma=gamma, C=C)
                testout = clf.fit(x11, y1)
                score = clf.score(x22, y2)
                clf = svm.SVC(gamma=gamma, C=C)
                testout = clf.fit(x22, y2)
                score += clf.score(x11, y1)
                score /= 2
                if score > bestScore:
                    bestC = C
                    bestGamma = gamma
                    bestScore = score
                    cprint('Cross Validation Score: ' + str(score))
                    cprint('Gamma = ' + str(gamma) + ' and C = ' + str(C))

        ################ THIS IS FOR CvI EVALUATION ################
        #Ixtest = list()
        #Iytest = list()
        #Cxtest = list()
        #Cytest = list()
        #for j in range(0, len(splits[i][1])):
        #	for LL in range(0, len(sent_to_xtc[splits[i][1][j]])):
        #		fvector = xtc[sent_to_xtc[splits[i][1][j]][LL]]
        #		if coriset[sent_to_xtc[splits[i][1][j]][LL]]:
        #			Cxtest.append(fvector)
        #			Cytest.append(targets[sent_to_xtc[splits[i][1][j]][LL]])
        #		else:
        #			Ixtest.append(fvector)
        #			Iytest.append(targets[sent_to_xtc[splits[i][1][j]][LL]])
        #xtrain = csr_matrix(xtrain)
        #Cxtest = csr_matrix(Cxtest)
        #Ixtest = csr_matrix(Ixtest)
        #clf = svm.SVC(gamma=bestGamma, C=bestC)
        #testout = clf.fit(xtrain, ytrain)
        #CBscore = clf.score(Cxtest, Cytest)
        #IBscore = clf.score(Ixtest, Iytest)
        #cprint('Actual Score: ' + str(CBscore) + ':' + str(IBscore))
        #fscores.write(str(CBscore) + ':' + str(IBscore) + '\n')
        #fscores.flush()
        ###############################################################
        ################ THIS IS FOR NORMAL EVALUATION ################
        xtrain = csr_matrix(xtrain)
        xtest = csr_matrix(xtest)
        clf = svm.SVC(gamma=bestGamma, C=bestC)
        testout = clf.fit(xtrain, ytrain)
        bestScore = clf.score(xtest, ytest)
        cprint('Actual Score: ' + str(bestScore))
        fscores.write(str(bestScore) + '\n')
        ###############################################################
        # save best classifier per fold
        cString = pickle.dumps(clf)
        fsave1 = open('classifiers/sentiment_classifier' + str(i), 'wb')  # pickle.dumps returns bytes
        fsave1.write(cString)
        fsave1.close()

    fscores.close()
    # save feature dictionary
    cvString = pickle.dumps(cv)
    fsave2 = open('sentiment_dictionary', 'wb')  # binary mode for pickled bytes
    fsave2.write(cvString)
    fsave2.close()
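The per-fold classifiers and the feature dictionary are written out as pickled bytes; reading them back later would use binary mode and pickle.loads, for example:

import pickle

with open('classifiers/sentiment_classifier0', 'rb') as f:
    clf = pickle.loads(f.read())
with open('sentiment_dictionary', 'rb') as f:
    cv = pickle.loads(f.read())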
Example #19
def main():
    fo = open("EECS_annotated_samples_anonymized", "r")
    lines = fo.readlines()
    utterances = NLU.getUtterances(lines)
    mode = False
    sents = list()
    targets = list()
    lastTaken = ""
    lastSent = ""
    isclass = False
    tagset = list()
    coriset = list()
    lastTagset = list()
    index = 0
    # to make cross validation work after sentences are duplicated for entities
    sent_to_xtc = dict()
    sent_to_xtc[0] = list()
    for i in range(len(lines)):
        data = lines[i].strip()
        if "" == data:
            index += 1
            sent_to_xtc[index] = list()
        if data.startswith("<class") or data.startswith("<instructor"):
            mode = True
            lastTaken = ""
            lastTagset = list()
        if data.startswith("<class"):
            isclass = True
        if mode and data.startswith("sentiment="):
            lastTaken = data[10:]
            if lastTaken.endswith(">"):
                lastTaken = lastTaken[:-1]
        if mode and data.startswith("name="):
            temp = data[5:]
            if temp.endswith(">"):
                temp = temp[:-1]
            lastTagset.append(temp)
        if mode and data.startswith("id="):
            temp = data[3:]
            if temp.endswith(">"):
                temp = temp[:-1]
            lastTagset.append(temp)
        if mode and data.startswith("department="):
            temp = data[11:]
            if temp.endswith(">"):
                temp = temp[:-1]
            lastTagset.append(temp)
        if not mode and "" != data:
            lastSent = data
        if data.endswith(">"):
            mode = False
            coriset.append(isclass)
            isclass = False
            sents.append(lastSent)
            tagset.append(lastTagset)
            sent_to_xtc[index].append(len(sents) - 1)
            if lastTaken == "":
                targets.append("neutral")
            else:
                targets.append(lastTaken)

    # This will print out mapping from sentences to entity vectors (XTC)
    #foutest = open("outtestJ", "w");
    #for key in sent_to_xtc:
    #	foutest.write(str(key) + " : " + str(sent_to_xtc[key]) + "\n");
    #foutest.flush();
    #foutest.close();

    #randomly sample utterances
    #testdata = random.sample(range(0, index), index/5);

    print("number of utterances: " + str(index))
    print("length of lines: " + str(len(sents)))
    print("length of targets: " + str(len(targets)))
    print("sent 2: " + str(sents[2]))
    print("tagset 2: " + str(tagset[2]))

    cv = set()
    regex = re.compile(r"[^a-zA-Z0-9_\~\- ]+")
    for sent in range(0, len(sents)):
        parts = sents[sent].split(" ")
        for part in range(0, len(parts)):
            thepart = regex.sub("", parts[part])
            # corner case for hyphens
            hps = thepart.split("-")
            if len(hps) > 1:
                for hi in range(0, len(hps)):
                    cv.add(hps[hi].lower())
            # end corner case for hyphens
            thepart = thepart.lower()
            cv.add(thepart)
    cv = list(cv)
    cv.append("452")
    #bug?
    print("vocabulary size: " + str(len(cv)))
    print("index of I: " + str(cv.index("i")))
    xtc = []
    for sent in range(0, len(sents)):
        print("sentence: " + str(sent))
        print("s1: " + str(sents[sent]))

        #print(sents[sent] + " - with tagset - " + str(tagset[sent]));
        #dparse = spwrap.parse(sents[sent]);
        #print("DPARSE: " + dparse);

        # add token boundaries to the sentence
        tokenSent = sents[sent]
        for tag in range(0, len(tagset[sent])):
            tokenSent = tokenSent.replace(tagset[sent][tag],
                                          " ~~t~~ " + tagset[sent][tag])
        print(tokenSent)
        parts = regex.sub("", tokenSent)
        # this handles split and hyphen corner case
        parts = re.split(" |-", parts)

        # remove empty parts from the sentence
        while "" in parts:
            parts.remove("")

        # locate window feature indicies
        windowFeatures = []
        done = False
        while not done:
            for part in range(0, len(parts)):
                if "~~t~~" == parts[part]:
                    windowFeatures += [part]
                    parts.remove(parts[part])
                    print("parts?: " + str(parts))
                    break
                if part == len(parts) - 1:
                    done = True
        print("window features: " + str(windowFeatures))

        print("parts: " + str(parts))
        row = []
        featureMap = {}
        Nflag = 0
        for part in range(0, len(parts)):
            #thepart = regex.sub("", parts[part]);
            #thepart = thepart.lower();
            thepart = parts[part].lower()
            theid = cv.index(thepart)
            print(theid)
            mindist = 999
            for wf in range(0, len(windowFeatures)):
                ##############################################################
                ## This is the distance measure for window linear distance!
                distance = abs(windowFeatures[wf] - part)
                ##############################################################
                ## This is the distance measure for dependency tree distance!
                ## distance = spwrap.treeDistance(parts[windowFeatures[wf]], parts[part], dparse);
                ##############################################################
                if distance < mindist:
                    mindist = distance
            mindist += 1
            sentiz = senti_lexis.lexCounts(thepart)
            if theid in featureMap:
                # 2.0 - mindist / 7.0 worked well for the first distance measure...
                # featureMap[theid] += 1.0 / mindist;
                featureMap[theid][0] += 2.0 - mindist / 7.0
                featureMap[theid][1] += (2.0 - mindist / 7.0) * sentiz[0]
                featureMap[theid][2] += (2.0 - mindist / 7.0) * sentiz[1]
                featureMap[theid][3] += (2.0 - mindist / 7.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            else:
                # featureMap[theid] = 1.0 / mindist;
                # count, positive, negative, neutral, negate
                featureMap[theid] = [0, 0, 0, 0, 0]
                featureMap[theid][0] = 2.0 - mindist / 7.0
                featureMap[theid][1] = (2.0 - mindist / 7.0) * sentiz[0]
                featureMap[theid][2] = (2.0 - mindist / 7.0) * sentiz[1]
                featureMap[theid][3] = (2.0 - mindist / 7.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            if Nflag > 0:
                Nflag -= 1
            if senti_lexis.lexNegate(thepart):
                Nflag = 2
        for i in range(0, len(cv)):
            if i in featureMap:
                row.extend(featureMap[i])
            else:
                row.extend([0, 0, 0, 0, 0])
        xtc.append(row)

    #instead read the data from splits file
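    # each line of "splits" is assumed (from the slicing below) to look like
    #   [0, 1, 2, ...]:[5, 9, ...]
    # i.e. a bracketed list of training utterance indices, a colon, then the test indices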
    fsplits = open("splits")
    lines = fsplits.readlines()
    splits = list()
    for i in range(0, len(lines)):
        parts = lines[i].strip().split(":")
        train = list()
        test = list()
        for s in parts[0][1:-1].split(", "):
            train.append(int(s))
        for s in parts[1][1:-1].split(", "):
            test.append(int(s))
        splits.append((train, test))
    fsplits.close()
    #test print the first split
    #print(splits[0][0]);
    #print(splits[0][1]);

    bestsplit = -1
    BSscore = 0
    for i in range(0, len(splits)):
        bestC = 0
        bestGamma = 0
        bestScore = 0
        xtest = list()
        xtrain = list()
        ytest = list()
        ytrain = list()
        # add the utterance set generation here for senti_set
        senti_utters = list()
        for j in range(0, len(splits[i][0])):
            senti_utters.append(utterances[splits[i][0][j]])
        likesMatrix, slist = leastSquares.getMatrix(senti_utters)
        # do train-test split
        csims = np.array([0.0] * 38)  # accumulator for the 38-entry cosine-similarity vectors summed below
        totz = 0
        #for j in range(0, len(splits[i][0])):
        #	speaker = senti_set.getSpeaker(utterances[splits[i][0][j]][0]);
        #	cossim = leastSquares.cosineUserWE(likesMatrix, slist.index(speaker));
        #	np.add(csims, cossim);
        #	totz += 1;
        for j in range(0, len(splits[i][1])):
            speaker = senti_set.getSpeaker(utterances[splits[i][1][j]][0])
            cossim = leastSquares.cosineUserWE(likesMatrix,
                                               slist.index(speaker))
            cossim = np.array(cossim)
            csims = np.add(csims, cossim)
            totz += 1
        for j in range(0, len(csims)):
            csims[j] /= totz
        print(csims.tolist())
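
The fragment above ends by averaging the cosine-similarity vectors returned by leastSquares.cosineUserWE over the test split. That helper is not shown in this excerpt, so the following is only a rough sketch of what a row-vs-all cosine similarity over the likes matrix might look like (hypothetical code, not the project's implementation):

# Hypothetical sketch: cosine similarity between one speaker's row of the likes
# matrix and every row, as a stand-in for leastSquares.cosineUserWE (whose real
# implementation is not part of this excerpt).
import numpy as np

def cosine_user_sketch(likes_matrix, user_index):
    X = np.asarray(likes_matrix, dtype=float)
    target = X[user_index]
    denom = np.linalg.norm(X, axis=1) * np.linalg.norm(target)
    denom[denom == 0] = 1.0  # guard against all-zero rows
    return X.dot(target) / denom
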
Example #20
0
def main():
    fo = open("EECS_annotated_samples_anonymized", "r")
    lines = fo.readlines()
    utterances = NLU.getUtterances(lines)
    mode = False
    sents = list()
    targets = list()
    lastTaken = ""
    lastSent = ""
    isclass = False
    tagset = list()
    lastTagset = list()
    index = 0
    # sent_to_xtc maps each utterance index to its row indices in sents/xtc, so the
    # cross-validation splits (defined over utterances) still line up after sentences
    # are duplicated once per annotated entity
    sent_to_xtc = dict()
    sent_to_xtc[0] = list()
    for i in range(len(lines)):
        data = lines[i].strip()
        if "" == data:
            index += 1
            sent_to_xtc[index] = list()
        if data.startswith("<class") or data.startswith("<instructor"):
            mode = True
            lastTaken = ""
            lastTagset = list()
        if data.startswith("<class"):
            isclass = True
        if mode and data.startswith("sentiment="):
            lastTaken = data[10:]
            if lastTaken.endswith(">"):
                lastTaken = lastTaken[:-1]
        if mode and data.startswith("name="):
            temp = data[5:]
            if temp.endswith(">"):
                temp = temp[:-1]
            lastTagset.append(temp)
        if mode and data.startswith("id="):
            temp = data[3:]
            if temp.endswith(">"):
                temp = temp[:-1]
            lastTagset.append(temp)
        if mode and data.startswith("department="):
            temp = data[11:]
            if temp.endswith(">"):
                temp = temp[:-1]
            lastTagset.append(temp)
        if not mode and "" != data:
            lastSent = data
        if data.endswith(">"):
            mode = False
            isclass = False
            sents.append(lastSent)
            tagset.append(lastTagset)
            sent_to_xtc[index].append(len(sents) - 1)
            if lastTaken == "":
                targets.append("neutral")
            else:
                targets.append(lastTaken)
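
    # Illustrative shape of one annotated utterance in EECS_annotated_samples_anonymized,
    # reconstructed from the parsing above (a guess at the format -- the real file may differ):
    #   I really enjoyed EECS 280 last fall.
    #   <class
    #   name=EECS 280
    #   id=280
    #   department=EECS
    #   sentiment=positive>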

    # This will print out mapping from sentences to entity vectors (XTC)
    #foutest = open("outtestJ", "w");
    #for key in sent_to_xtc:
    #	foutest.write(str(key) + " : " + str(sent_to_xtc[key]) + "\n");
    #foutest.flush();
    #foutest.close();

    #randomly sample utterances
    #testdata = random.sample(range(0, index), index/5);

    print("number of utterances: " + str(index))
    print("length of lines: " + str(len(sents)))
    print("length of targets: " + str(len(targets)))
    print("sent 2: " + str(sents[2]))
    print("tagset 2: " + str(tagset[2]))

    cv = set()
    regex = re.compile(r"[^a-zA-Z0-9_\~\- ]+")
    for sent in range(0, len(sents)):
        parts = sents[sent].split(" ")
        for part in range(0, len(parts)):
            thepart = regex.sub("", parts[part])
            # corner case for hyphens
            hps = thepart.split("-")
            if len(hps) > 1:
                for hi in range(0, len(hps)):
                    cv.add(hps[hi].lower())
            # end corner case for hyphens
            thepart = thepart.lower()
            cv.add(thepart)
    cv = list(cv)
    cv.append("452")
    #bug?
    print("vocabulary size: " + str(len(cv)))
    print("index of I: " + str(cv.index("i")))
    xtc = []
    for sent in range(0, len(sents)):
        print("sentence: " + str(sent))
        print("s1: " + str(sents[sent]))

        #print(sents[sent] + " - with tagset - " + str(tagset[sent]));
        #dparse = spwrap.parse(sents[sent]);
        #print("DPARSE: " + dparse);

        # add token boundaries to the sentence
        tokenSent = sents[sent]
        for tag in range(0, len(tagset[sent])):
            tokenSent = tokenSent.replace(tagset[sent][tag],
                                          " ~~t~~ " + tagset[sent][tag])
        print(tokenSent)
        parts = regex.sub("", tokenSent)
        # split on spaces and hyphens (handles the hyphen corner case)
        parts = re.split(" |-", parts)

        # remove empty parts from the sentence
        while "" in parts:
            parts.remove("")

        # locate window feature indices (positions where the ~~t~~ entity markers sit)
        windowFeatures = []
        done = False
        while not done:
            for part in range(0, len(parts)):
                if "~~t~~" == parts[part]:
                    windowFeatures += [part]
                    parts.remove(parts[part])
                    print("parts?: " + str(parts))
                    break
                if part == len(parts) - 1:
                    done = True
        print("window features: " + str(windowFeatures))

        print("parts: " + str(parts))
        row = []
        featureMap = {}
        Nflag = 0
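        # Nflag implements a small negation window: when senti_lexis.lexNegate flags a
        # token, the next two tokens get their "negate" feature (index 4) set to 1.0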
        for part in range(0, len(parts)):
            #thepart = regex.sub("", parts[part]);
            #thepart = thepart.lower();
            thepart = parts[part].lower()
            theid = cv.index(thepart)
            print(theid)
            mindist = 999  # sentinel: no entity marker found yet
            for wf in range(0, len(windowFeatures)):
                ##############################################################
                ## This is the distance measure for window linear distance!
                distance = abs(windowFeatures[wf] - part)
                ##############################################################
                ## This is the distance measure for dependency tree distance!
                ## distance = spwrap.treeDistance(parts[windowFeatures[wf]], parts[part], dparse);
                ##############################################################
                if distance < mindist:
                    mindist = distance
            mindist += 1  # shift so the closest possible distance is 1, not 0
            sentiz = senti_lexis.lexCounts(thepart)
            if theid in featureMap:
                # 2.0 - mindist / 7.0 worked well for the first distance measure...
                # featureMap[theid] += 1.0 / mindist;
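                # the linear weight 2.0 - mindist / 7.0 is about 1.86 for a token adjacent
                # to an entity (mindist = 1), reaches 0 at mindist = 14, and goes negative
                # beyond that, so far-away tokens contribute little or are penalized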
                featureMap[theid][0] += 2.0 - mindist / 7.0
                featureMap[theid][1] += (2.0 - mindist / 7.0) * sentiz[0]
                featureMap[theid][2] += (2.0 - mindist / 7.0) * sentiz[1]
                featureMap[theid][3] += (2.0 - mindist / 7.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            else:
                # featureMap[theid] = 1.0 / mindist;
                # count, positive, negative, neutral, negate
                featureMap[theid] = [0, 0, 0, 0, 0]
                featureMap[theid][0] = 2.0 - mindist / 7.0
                featureMap[theid][1] = (2.0 - mindist / 7.0) * sentiz[0]
                featureMap[theid][2] = (2.0 - mindist / 7.0) * sentiz[1]
                featureMap[theid][3] = (2.0 - mindist / 7.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            if Nflag > 0:
                Nflag -= 1
            if senti_lexis.lexNegate(thepart):
                Nflag = 2
        for i in range(0, len(cv)):
            if i in featureMap:
                row.extend(featureMap[i])
            else:
                row.extend([0, 0, 0, 0, 0])
        xtc.append(row)
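        # each row has len(cv) * 5 entries: [distance-weighted count, positive, negative,
        # neutral, negation] for every vocabulary word, in vocabulary order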

    #cv = CountVectorizer();
    #xtc = cv.fit_transform(sents);

    #examining data structures here
    #parts = sents[0].split(" ");
    #for part in range(0, len(parts)):
    #	print("PART: " + parts[part]);
    #print("WORD TAKE: " + str(cv.vocabulary_.get(u'i')));
    #print("WORD TAKE: " + str(cv.vocabulary_.get(u'took')));
    #print("WORD DONT: " + str(cv.vocabulary_.get(u'don')));
    #print("WORD DONT: " + str(cv.vocabulary_.get(u't')));
    #print("WORD TAKE: " + str(cv.vocabulary_.get(u'183')));
    #print(str(xtc.shape));
    #print("ROW0");
    #print(xtc[0]);
    #print("ROW1");
    #print(xtc[1]);
    print("ROW2")
    print(xtc[2])
    print(len(xtc[2]))
    #print(type(xtc[0]));
    #print(type(xtc));
    #print(str(len(sents)));
    #endtest

    #xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(xtc, targets, test_size=0.2, random_state=0);

    #use this section of code to do cross validation.
    #shuffle and split into Nfolds parts.
    #testdata = range(0, index);
    #random.shuffle(testdata);
    #folds = list();
    #Nfolds = 10;
    #fsavef = open("folds", "w");
    #for i in range(0, Nfolds):
    #	print("i = " + str(i));
    #	nthfold = testdata[i*index/Nfolds:(i+1)*index/Nfolds];
    #	folds.append(nthfold);
    #	fsavef.write(str(nthfold) + "\n");
    #	print("fold(" + str(i) + "): " + str(nthfold));
    #fsavef.flush();
    #fsavef.close();

    #instead read the data from splits file
    fsplits = open("splits")
    lines = fsplits.readlines()
    splits = list()
    for i in range(0, len(lines)):
        parts = lines[i].strip().split(":")
        train = list()
        test = list()
        for s in parts[0][1:-1].split(", "):
            train.append(int(s))
        for s in parts[1][1:-1].split(", "):
            test.append(int(s))
        splits.append((train, test))
    fsplits.close()
    #test print the first split
    #print(splits[0][0]);
    #print(splits[0][1]);

    #do gridsearch + evaluation
    fscores = open("baseline_scores", "w")
    for i in range(0, len(splits)):
        bestC = 0  # grid-search placeholders; unused in this baseline-only pass
        bestGamma = 0
        bestScore = 0
        xtest = list()
        xtrain = list()
        ytest = list()
        ytrain = list()
        # do train-test split
        for j in range(0, len(splits[i][0])):
            # VECTOR is 38 x 141 -> 264 total
            for LL in range(0, len(sent_to_xtc[splits[i][0][j]])):
                ytrain.append(targets[sent_to_xtc[splits[i][0][j]][LL]])
        for j in range(0, len(splits[i][1])):
            for LL in range(0, len(sent_to_xtc[splits[i][1][j]])):
                ytest.append(targets[sent_to_xtc[splits[i][1][j]][LL]])
        score = ytrain.count("neutral") * 1.0 / len(ytrain)  # baseline: fraction of "neutral" labels in the training portion

        print("Actual Score: " + str(score))
        fscores.write(str(score) + "\n")
        fscores.flush()
    fscores.close()
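
The bestC, bestGamma, and bestScore variables above are initialized but never used in this pass, which only reports the all-"neutral" baseline; they point at an SVM grid search carried out elsewhere in the project. A rough, hypothetical sketch of such a search with scikit-learn (an assumed dependency, not code from this example) might look like:

# Hypothetical grid-search sketch; illustration only, not the original evaluation code.
from sklearn.svm import SVC

def grid_search_sketch(xtrain, ytrain, xtest, ytest):
    bestC, bestGamma, bestScore = 0, 0, 0.0
    for C in [0.1, 1, 10, 100]:
        for gamma in [0.001, 0.01, 0.1, 1]:
            clf = SVC(C=C, gamma=gamma)
            clf.fit(xtrain, ytrain)
            score = clf.score(xtest, ytest)
            if score > bestScore:
                bestC, bestGamma, bestScore = C, gamma, score
    return bestC, bestGamma, bestScore

In practice the search would score against a held-out validation split rather than the final test fold, but the loop structure is the same.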