def main():
    np.set_printoptions(threshold='nan')
    lines = NLU.getALines()
    utterances = NLU.getUtterances(lines)
    # do ALS stuff
    X = getMatrix(utterances)[0]
    idtestp = 28
    print(X[idtestp])
    cossim = consineUser(X, idtestp)
    classtest = 280
    print(getWeightedGuess(cossim, X, classtest))
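# The helpers consineUser() and getWeightedGuess() are called above but are not
# shown in this excerpt. Below is a minimal sketch of what they are assumed to do
# (user-user cosine similarity over the ratings matrix, then a similarity-weighted
# average over one column). Names, signatures, and behavior are assumptions, and
# numpy is assumed to be imported as np, as elsewhere in this module.
def _cosine_user_sketch(X, user_idx):
    Xa = np.asarray(X, dtype=float)
    u = Xa[user_idx]
    sims = []
    for row in Xa:
        denom = np.linalg.norm(u) * np.linalg.norm(row)
        sims.append(float(np.dot(u, row) / denom) if denom else 0.0)
    return sims


def _weighted_guess_sketch(sims, X, col):
    # similarity-weighted average of one column (one class/instructor) of X
    Xa = np.asarray(X, dtype=float)
    w = np.asarray(sims, dtype=float)
    return float(np.dot(w, Xa[:, col]) / w.sum()) if w.sum() else 0.0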
def xactinst():
    w = dicts.getEECSprofs()
    sentences = NLU.getALines()
    utterances = NLU.getUtterances(sentences)
    xmatches = list()
    for i in utterances:
        tutt = i[0].strip().lower()
        for q in w:
            if q.lower() in tutt:
                xmatches.append(q)
    bees = len(xmatches)
    eyes = 0
    for x in xmatches:
        ptz = x.split()
        eyes += len(ptz) - 1
    print(bees)
    print(eyes)
def xactclass():
    w = dicts.getEECSdict()
    ww = list()
    for key in w.keys():
        ww.append(w[key])
    sentences = NLU.getALines()
    utterances = NLU.getUtterances(sentences)
    xmatches = list()
    for i in utterances:
        tutt = i[0].strip().lower()
        for q in ww:
            if q.lower() in tutt:
                xmatches.append(q)
    bees = len(xmatches)
    eyes = 0
    for x in xmatches:
        ptz = x.split()
        eyes += len(ptz) - 1
    print(bees)
    print(eyes)
def t2():
    f = open('crf-input-data')
    clines = f.readlines()
    f.close()
    u2 = list()
    utt = list()
    t2 = list()
    tutt = list()
    for cl in clines:
        parts = cl.strip()
        if parts == '':
            if utt != []:
                u2.append(utt)
                t2.append(tutt)
                utt = list()
                tutt = list()
        else:
            parts = parts.split()
            utt.append(parts[0])
            tutt.append(parts[2])
    if utt != []:
        u2.append(utt)
        t2.append(tutt)
        utt = list()
        tutt = list()
    lines = NLU.getALines()
    utterances = NLU.getUtterances(lines)
    for u in range(0, len(utterances)):
        slots = NLU.getSlots(utterances[u])
        sclist = list()
        for slot in slots[0]:
            sclist.append([slot[1], slot[2]])
        entlist = NLU.getEntities(u2[u], t2[u])[0]
        l1 = list()
        l2 = sclist
        for ent in entlist:
            l1.append([ent[1], ent[2]])
        if l1 != l2:
            print(str(l1) + '_' + str(l2))
def instructorLevel():
    ICOR = 0
    IGUE = 0
    IACT = 0
    profs = dicts.getProfWords()
    pattern = re.compile("[\W_]+")
    print(profs)
    sentences = NLU.getALines()
    utterances = NLU.getUtterances(sentences)
    for u in utterances:
        names = list()
        cname = ""
        slots = NLU.getSlots(u)[1]
        tutt = u[0].strip().lower().split()
        print(slots)
        for tok in tutt:
            ttok = pattern.sub("", tok)
            if ttok in profs:
                if cname != "":
                    cname += " "
                cname += ttok
            else:
                if cname != "":
                    names.append(cname)
                cname = ""
        if cname != "":
            names.append(cname)
        print(names)
        slist = list()
        for slot in slots:
            slist.append(slot[0].lower())
        IACT += len(slots)
        IGUE += len(names)
        for name in names:
            if name in slist:
                ICOR += 1
    print(str(ICOR * 1.0 / IGUE))
    print(str(ICOR * 1.0 / IACT))
    print(IACT)
    return ICOR, IGUE, IACT
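# Hedged usage sketch (not in the original): instructorLevel() returns
# (correct, guessed, actual), from which precision, recall, and F1 can be
# derived; the two ratios printed above are the precision and recall terms.
def _instructor_f1_sketch():
    cor, gue, act = instructorLevel()
    precision = cor * 1.0 / gue if gue else 0.0
    recall = cor * 1.0 / act if act else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, f1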
def classLevel():
    CCOR = 0
    CGUE = 0
    CACT = 0
    pattern = re.compile("[\W_]+")
    w = dicts.getEECSdict()
    ww = list()
    for key in w.keys():
        ww.append(w[key])
    sentences = NLU.getALines()
    utterances = NLU.getUtterances(sentences)
    for u in utterances:
        xmatches = list()
        tutt = u[0].strip().lower()
        slots = NLU.getSlots(u)[0]
        for q in tutt.split():
            qq = pattern.sub("", q)
            if is_number(qq):
                xmatches.append(qq)
        for q in ww:
            if q.lower() in tutt:
                xmatches.append(q.lower())
        slist = list()
        for slot in slots:
            slist.append(slot[1].lower())
        print(slist)
        print(xmatches)
        CACT += len(slots)
        CGUE += len(xmatches)
        for name in xmatches:
            if name in slist:
                CCOR += 1
    print(str(CCOR * 1.0 / CGUE))
    print(str(CCOR * 1.0 / CACT))
    print(CACT)
    return CCOR, CGUE, CACT
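# is_number() is used above (and in several other functions here) but is not
# shown in this excerpt. A minimal sketch of the helper as it is assumed to behave:
def _is_number_sketch(s):
    try:
        float(s)
        return True
    except ValueError:
        return False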
def genLikesMap(utterances):
    likes_map.clear()
    for i in s_set:
        likes_map[i] = [list(), list()]
    for i in range(0, len(utterances)):
        slots = NLU.getSlots(utterances[i])
        speaker = s_map[utterances[i][0].strip()]
        if slots[0]:
            likes_map[speaker][0].extend(slots[0])
        if slots[1]:
            likes_map[speaker][1].extend(slots[1])
    # generate dictionary for similar likes for each person
    for q in s_set:
        simlikeq = list()
        for i in s_set:
            if i == q:
                continue
            found = False
            for j in range(0, len(likes_map[i][0])):
                if (("EECS", likes_map[i][0][j][1], likes_map[i][0][j][2]) in likes_map[q][0]
                        or ("", likes_map[i][0][j][1], likes_map[i][0][j][2]) in likes_map[q][0]) \
                        and likes_map[i][0][j][2] != "neutral":
                    #print("similar likes for " + i + " and " + q + ": " + str(likes_map[i][0][j]));
                    simlikeq.append(i)
                    found = True
                    break
            if not found:
                for j in range(0, len(likes_map[i][1])):
                    if likes_map[i][1][j] in likes_map[q][1] and likes_map[i][1][j][1] != "neutral":
                        #print("similar likes for " + i + " and " + q + ": " + str(likes_map[i][1][j]));
                        simlikeq.append(i)
                        found = True
                        break
        sim_likes[q] = simlikeq
print "\ngoodbye" sys.exit(0) data = { 'id' : id, 'user_utterance' : user_utterance, 'prefs' : set(), 'imdbi' : imdb, 'outputs' : [], 'act' : None, } NLU.NLU(data) DM.DM(data) for key, value in data.items(): if type(value) == type(set()): data[key] = list(value) NLU.add_entity_names(data) del data['imdbi'] result = json.dumps(imdbi.clean_unicode_errors(data)) if server: sys.stderr.write('> ' + user_utterance + '\n') sys.stderr.write('< ' + result + '\n') if server: sys.stdout.write(result + '\n') else: sys.stdout.write(result + '\n') for output in data['outputs']: sys.stdout.write(str(output) + '\n') sys.stdout.flush() os.remove('/tmp/imdbot_pid')
def main(): fo = open("../data/extract_samples/EECS_annotated_samples_anonymized", "r") lines = fo.readlines() utterances = NLU.getUtterances(lines) mode = False sents = list() targets = list() lastTaken = "" lastSent = "" isclass = False tagset = list() coriset = list() lastTagset = list() index = 0 sent_to_xtc = dict() sent_to_xtc[0] = list() for i in range(len(lines)): data = lines[i].strip() if "" == data: index += 1 sent_to_xtc[index] = list() if data.startswith("<class") or data.startswith("<instructor"): mode = True lastTaken = "" lastTagset = list() if data.startswith("<class"): isclass = True if mode and data.startswith("sentiment="): lastTaken = data[10:] if lastTaken.endswith(">"): lastTaken = lastTaken[:-1] if mode and data.startswith("name="): temp = data[5:] if temp.endswith(">"): temp = temp[:-1] lastTagset.append(temp) if mode and data.startswith("id="): temp = data[3:] if temp.endswith(">"): temp = temp[:-1] lastTagset.append(temp) if mode and data.startswith("department="): temp = data[11:] if temp.endswith(">"): temp = temp[:-1] lastTagset.append(temp) if not mode and "" != data: lastSent = data if data.endswith(">"): mode = False coriset.append(isclass) isclass = False sents.append(lastSent) tagset.append(lastTagset) sent_to_xtc[index].append(len(sents) - 1) if lastTaken == "": targets.append("neutral") else: targets.append(lastTaken) f2 = open("RNTN_sent") gendict = f2.readlines() f2.close() sdict = dict() slist = list() parent = None lastpart = None for i in gendict: if i.startswith(":"): parent = i[1:].strip() sdict[parent] = dict() slist.append(parent) elif is_number(i.strip()): sdict[parent][lastpart] = int(i.strip()) else: lastpart = i.strip() sdict[parent][lastpart] = -1 print(len(tagset)) print(len(sdict.keys())) print(len(sent_to_xtc)) print(len(targets)) tries = 0 correct = 0 for q in range(0, len(slist)): print(sdict[slist[q]]) print(sent_to_xtc[q]) for i in sent_to_xtc[q]: print(str(tagset[i]) + ":" + str(targets[i])) for j in sdict[slist[q]]: if tagset[i][0] in j: asent = "neutral" if int(sdict[slist[q]][j]) > 2: asent = "positive" elif int(sdict[slist[q]][j]) < 1: asent = "negative" print(asent) tries += 1 if targets[i] == asent: correct += 1 print("correct: " + str(correct * 1.0 / tries))
def main():
    # get scores
    fscores = open("S1feature")  # S1feature - S1single_lies
    lines = fscores.readlines()
    fscores.close()
    scores = list()
    for i in lines:
        scores.append(float(i.strip()))
    sort_scores = [i[0] for i in sorted(enumerate(scores), key=lambda x: x[1])]
    sort_scores.reverse()
    # get splits
    fsplits = open("splits")
    splines = fsplits.readlines()
    splits = list()
    for i in range(0, len(splines)):
        parts = splines[i].strip().split(":")
        train = list()
        test = list()
        for s in parts[0][1:-1].split(", "):
            train.append(int(s))
        for s in parts[1][1:-1].split(", "):
            test.append(int(s))
        splits.append((train, test))
    fsplits.close()
    # get speakers
    nlines = NLU.getALines()
    utterances = NLU.getUtterances(nlines)
    nlist = list()
    for i in range(0, len(splits)):
        senti_utters = list()
        for j in range(0, len(splits[i][0])):
            senti_utters.append(utterances[splits[i][0][j]])
        likesMatrix, slist = leastSquares.getMatrix(senti_utters)
        test_utters = list()
        for j in range(0, len(splits[i][1])):
            test_utters.append(utterances[splits[i][1][j]])
        TlikesMatrix, Tslist = leastSquares.getMatrix(test_utters)
        nonneus = 0
        nnews = 0
        density = 0.0
        counts = list()
        # iterate over rows
        for k in range(0, len(likesMatrix)):
            nonneu = 0
            for j in range(0, len(likesMatrix[k])):
                if int(likesMatrix[k][j]) != 5:
                    nonneu += 1
            if nonneu > 0:
                nnews += 1
            nonneus += nonneu
            counts.append(nonneu)
        # iterate over columns
        elaps = 0
        for k in range(0, len(likesMatrix[0])):
            nonneu = 0
            TNEW = 0
            for j in range(0, len(likesMatrix)):
                if int(likesMatrix[j][k]) != 5:
                    nonneu = 1
                if int(TlikesMatrix[j][k]) != 5:
                    TNEW = 1
            if nonneu == 1 and TNEW == 1:
                elaps += 1
        nlist.append(str(nnews) + ":" + str(nonneus) + ":" + str(counts) + ":" + str(elaps))
    # print correlations
    for i in sort_scores:
        print(str(scores[i]) + " - " + nlist[i])
def main(): name = "MEGHAN" fi = open("../data/extract_samples/pID_AEU") pid = fi.readlines() fi.close() pidmap = dict() pset = set() for i in range(0, len(pid)): parts = pid[i].split("\t") pset.add(parts[0]) pidmap[parts[1].strip()] = parts[0] fl = open("EECS_annotated_samples_anonymized") lines = fl.readlines() fl.close() utterances = NLU.getUtterances(lines) print(utterances[0]) print("Speaker: " + pidmap[utterances[0][0].strip()]) slots = NLU.getSlots(utterances[0]) print(slots) plikes = dict() for i in pset: plikes[i] = [list(), list()] for i in range(0, len(utterances)): slots = NLU.getSlots(utterances[i]) speaker = pidmap[utterances[i][0].strip()] if slots[0]: plikes[speaker][0].extend(slots[0]) if slots[1]: plikes[speaker][1].extend(slots[1]) print("\n\nGiven that EECS 492 sentiment is neutral...") #print(plikes[name]); wholikes = ("EECS", "492", "neutral") likers = list() for i in pset: if wholikes in plikes[i][0]: likers.append(i) # check instructors in likers ucontains_i = "Quentin Stout" print("\n\nWho likes " + ucontains_i) for i in likers: for j in range(0, len(plikes[i][1])): if plikes[i][1][j][0] == ucontains_i: print(i + ": " + str(plikes[i][1][j])) # check classes in likers ucontains_cd = "EECS" ucontains_cid = "545" print("\n\nWho likes " + ucontains_cd + " " + ucontains_cid) for i in likers: for j in range(0, len(plikes[i][0])): # don't worry about department but if you want to... then use this line # plikes[i][0][j][0] == ucontains_cd and if plikes[i][0][j][1] == ucontains_cid: print(i + ": " + str(plikes[i][0][j])) # find all people with similar sentiments to <name> in the data set print("\n\nSimlikes!") simlikesmap = dict() for q in pset: simlikes = list() for i in pset: if i == q: continue found = False for j in range(0, len(plikes[i][0])): if (("EECS", plikes[i][0][j][1], plikes[i][0][j][2]) in plikes[name][0] or ("", plikes[i][0][j][1], plikes[i][0][j][2] ) in plikes[name][0]) and plikes[i][0][j][2] != "neutral": print("similar likes for " + i + " and " + name + ": " + str(plikes[i][0][j])) simlikes.append(i) found = True break if not found: for j in range(0, len(plikes[i][1])): if plikes[i][1][j] in plikes[name][ 1] and plikes[i][1][j][1] != "neutral": print("similar likes for " + i + " and " + name + ": " + str(plikes[i][1][j])) simlikes.append(i) found = True break simlikesmap[q] = simlikes # calculate % of times where OSCORE will be nonzero times = 0 ttimes = 0 for u in utterances: slots = NLU.getSlots(u) speaker = pidmap[u[0].strip()] for slot in slots[0]: ttimes += 1 oscore = 0 for i in simlikesmap[speaker]: pscore = 0 for j in range(0, len(plikes[i][0])): if slot[1] == plikes[i][0][j][1]: if plikes[i][0][j][2] == "positive": pscore += 1 elif plikes[i][0][j][2] == "negative": pscore -= 1 if pscore > 0: oscore += 1 elif pscore < 0: oscore -= 1 if oscore != 0: times += 1 for slot in slots[1]: ttimes += 1 oscore = 0 for i in simlikesmap[speaker]: pscore = 0 for j in range(0, len(plikes[i][1])): if slot[0] == plikes[i][1][j][0]: if plikes[i][1][j][1] == "positive": pscore += 1 elif plikes[i][1][j][1] == "negative": pscore -= 1 if pscore > 0: oscore += 1 elif pscore < 0: oscore -= 1 if oscore != 0: times += 1 print("Times: " + str(times)) print("Total Times: " + str(ttimes)) print("Percentage: " + str(times * 100.0 / ttimes))
def main(): fi = open("sentimentAnnotations") line1 = fi.readlines() fi.close() fo = open("EECS_annotated_samples_anonymized") line2 = fo.readlines() fo.close() utt1 = NLU.getUtterances(line1) utt2 = NLU.getUtterances(line2) correct = 0 wrong = 0 NEU_NEG = 0 NEU_POS = 0 POS_NEG = 0 SNEU_NEG = set() SNEU_NEG.add("neutral") SNEU_NEG.add("negative") SNEU_POS = set() SNEU_POS.add("neutral") SNEU_POS.add("positive") SPOS_NEG = set() SPOS_NEG.add("negative") SPOS_NEG.add("positive") disagrees = list() inst = 1 insttype = "neutral" for i in range(0, len(utt1)): slots1 = NLU.getSlots(utt1[i]) slots2 = NLU.getSlots(utt2[i]) for j in range(0, len(slots1[0])): if insttype == slots2[0][j][2]: inst += 1 if slots1[0][j][3] == slots2[0][j][3]: correct += 1 else: tset = set() tset.add(slots1[0][j][3]) tset.add(slots2[0][j][3]) disagrees.append(utt1[i]) if slots2[0][j][3] == insttype: if tset == SNEU_NEG: NEU_NEG += 1 elif tset == SNEU_POS: NEU_POS += 1 elif tset == SPOS_NEG: POS_NEG += 1 wrong += 1 for j in range(0, len(slots1[1])): if slots1[1][j][1] == slots2[1][j][1]: correct += 1 else: tset = set() disagrees.append(utt1[i]) tset.add(slots1[1][j][1]) tset.add(slots2[1][j][1]) if slots2[1][j][1] == insttype: if tset == SNEU_NEG: NEU_NEG += 1 elif tset == SNEU_POS: NEU_POS += 1 elif tset == SPOS_NEG: POS_NEG += 1 wrong += 1 print("Agree on " + str(correct)) print("Disagree on " + str(wrong)) print("Percent agreement is " + str(correct * 1.0 / (correct + wrong)) + "%") #print("NEU_NEG: " + str(NEU_NEG*1.0/(correct+wrong))); #print("NEU_POS: " + str(NEU_POS*1.0/(correct+wrong))); #print("POS_NEG: " + str(POS_NEG*1.0/(correct+wrong))); print("NEU_NEG: " + str(NEU_NEG * 1.0 / inst)) print("NEU_POS: " + str(NEU_POS * 1.0 / inst)) print("POS_NEG: " + str(POS_NEG * 1.0 / inst))
def main():
    with open(prior_property_file) as json_file:
        prior_property_dict = json.load(json_file)
    dialogDB_json = {}
    input_json = request.get_json()
    print(input_json)
    user_ID = input_json['user_id']
    userDB_json = {'userID': user_ID, 'command': 'LOGIN'}
    userDB_response = Dialog_Manager.UserDBaccess(userDB_json)
    KB_language = {
        'user_ID': user_ID,
        'entities': [],
        'frames': [],
        'triples': []
    }
    result_json = {
        'user_id': user_ID,
        'frames': [],
        'entities': [],
        'q_list': [],
        'knowledge': []
    }
    user_input = input_json['utterance']
    dialogDB_json['user_id'] = KB_language['user_ID']
    dialogDB_json['utterance'] = user_input
    dialogDB_json['speaker'] = 'user'
    dialogDB_json['mode'] = 'make_table'
    dialog_index = DialogDBaccess(dialogDB_json)
    dialogDB_json['mode'] = 'add_data'
    dialogDB_json['utterance'] = user_input
    dialogDB_json['speaker'] = 'user'
    dialog_index = DialogDBaccess(dialogDB_json)
    if not input_json['q_list'] and not input_json['knowledge']:
        NLU_response = NLU.Frame_Interpreter(user_input)
        # frame-based
        if NLU_response['frames']:
            KB_language['frames'] = NLU_response['frames']
            result_json['frames'] = KB_language['frames']
            now_frame = KB_language['frames'][-1]
            question_list = []
            with open('./frame_info_full.json', 'r', encoding='utf-8') as f:
                frame_json_data = json.load(f)
            frame_data = frame_json_data[now_frame['frame']]
            for ele in frame_data['arguments']:
                if ele['coreType'] == 'Core':
                    question_list.append(ele['fe'])
            for exi_ele in now_frame['ele']:
                if exi_ele in question_list:
                    question_list.remove(exi_ele)
            if question_list:
                result_json['q_list'] = question_list
                system_output = \
                    NLG([now_frame['frame'] + '\t' + question_list[0] + '\t' + '?o'],
                        'Knowledge_question')[0]
                result_json['utterance'] = system_output
                dialogDB_json['utterance'] = system_output
                dialogDB_json['speaker'] = 'system'
                dialog_index = DialogDBaccess(dialogDB_json)
            else:
                system_output = '감사합니다.'  # "Thank you."
                result_json['utterance'] = system_output
                dialogDB_json['utterance'] = system_output
                dialogDB_json['speaker'] = 'system'
                dialog_index = DialogDBaccess(dialogDB_json)
        # entity-based
        elif NLU_response['entities']:
            KB_language['entities'] = NLU_response['entities']
            result_json['entities'] = KB_language['entities']
            KBM_response = Entity_Summarization.ES(KB_language['entities'][0]['text'])
            KB_language['triples'] = KBM_response
            utterances = NLG(KB_language['triples'], 'Knowledge_inform')
            system_output = ''
            for candi in utterances:
                if 'wiki' in candi or 'abstract' in candi:
                    continue
                system_output += candi
            tmp_list = []
            for ele in NLU_response['entities'][0]['type']:
                if 'http://dbpedia.org/ontology/' in ele:
                    tmp_list.append(ele.split('/')[-1])
            KB_language['entities'][0]['type'] = copy.deepcopy(tmp_list)
            for tmp_class in KB_language['entities'][0]['type']:
                if tmp_class in class_dict['level_4']:
                    entity_class = tmp_class
                    break
                elif tmp_class in class_dict['level_3']:
                    entity_class = tmp_class
                    break
                elif tmp_class in class_dict['level_2']:
                    entity_class = tmp_class
                    break
                elif tmp_class in class_dict['level_1']:
                    entity_class = tmp_class
                    break
            question_list = []
            question_property_list = prior_property_dict[entity_class]
            question_num = 0
            for candi_question in question_property_list:
                if question_num == 3:
                    break
                tmp_user_query = Dialog_Manager.SPARQL_Generation(
                    'ASK', [KB_language['entities'][0]['uri'], candi_question, '?o'],
                    KB_language['user_ID'])
                tmp_master_query = Dialog_Manager.SPARQL_Generation(
                    'ASK', [KB_language['entities'][0]['uri'], candi_question, '?o'], "")
                userDB_json['query'] = tmp_user_query
                userDB_json['command'] = 'QUERY'
                #print(UserDBaccess(userDB_json))
                if not Dialog_Manager.MasterDBaccess(tmp_master_query) \
                        and not Dialog_Manager.UserDBaccess(userDB_json)['query_result']:
                    question_list.append([KB_language['entities'][0]['text'], candi_question, '?o'])
                    question_num += 1
            if question_list:
                result_json['q_list'] = question_list
                system_output += '\n' + KB_language['entities'][0]['text'] \
                    + '에 대해서 몇 가지 물어보고 싶은게 있어요.'  # "I have a few questions about ..."
                system_output += '\n' + NLG([
                    question_list[0][0] + '\t' + question_list[0][1] + '\t' + question_list[0][2]
                ], 'Knowledge_question')[0]
                result_json['utterance'] = system_output
                dialogDB_json['utterance'] = system_output
                dialogDB_json['speaker'] = 'system'
                dialog_index = DialogDBaccess(dialogDB_json)
            else:
                system_output = '감사합니다.'  # "Thank you."
                result_json['utterance'] = system_output
                dialogDB_json['utterance'] = system_output
                dialogDB_json['speaker'] = 'system'
                dialog_index = DialogDBaccess(dialogDB_json)
        else:
            system_output = '제가 아직 이해할 수 없는 문장이에요.'  # "I can't understand that sentence yet."
            result_json['utterance'] = system_output
            dialogDB_json['utterance'] = system_output
            dialogDB_json['speaker'] = 'system'
            dialog_index = DialogDBaccess(dialogDB_json)
    elif input_json['q_list']:
        dialogDB_json['utterance'] = user_input
        dialogDB_json['speaker'] = 'user'
        dialog_index = DialogDBaccess(dialogDB_json)
        if input_json['frames']:
            result_json['frames'] = input_json['frames']
            tmp_user_answer = ETRI_NER(user_input)
            tmp_knowledge_list = input_json['knowledge']
            if tmp_user_answer:
                tmp_knowledge_list.append(tmp_user_answer[0][0])
            result_json['knowledge'] = tmp_knowledge_list
            result_json['q_list'] = input_json['q_list']
            del result_json['q_list'][0]
            if result_json['q_list']:
                now_frame = input_json['frames'][-1]
                system_output = \
                    NLG([now_frame['frame'] + '\t' + result_json['q_list'][0] + '\t' + '?o'],
                        'Knowledge_question')[0]
                result_json['utterance'] = system_output
                dialogDB_json['utterance'] = system_output
                dialogDB_json['speaker'] = 'system'
                dialog_index = DialogDBaccess(dialogDB_json)
            else:
                if input_json['knowledge']:
                    # store frame knowledge
                    system_output = '감사합니다'  # "Thank you"
                    result_json['frames'] = []
                    result_json['entities'] = []
                    result_json['knowledge'] = []
                    result_json['utterance'] = system_output
                    dialogDB_json['utterance'] = system_output
                    dialogDB_json['speaker'] = 'system'
                    dialog_index = DialogDBaccess(dialogDB_json)
                else:
                    system_output = '감사합니다'  # "Thank you"
                    result_json['frames'] = []
                    result_json['entities'] = []
                    result_json['utterance'] = system_output
                    dialogDB_json['utterance'] = system_output
                    dialogDB_json['speaker'] = 'system'
                    dialog_index = DialogDBaccess(dialogDB_json)
        elif input_json['entities']:
            result_json['entities'] = input_json['entities']
            tmp_user_answer = ETRI_NER(user_input)
            new_triple = []
            tmp_knowledge_list = input_json['knowledge']
            result_json['q_list'] = input_json['q_list']
            if tmp_user_answer:
                new_triple.append('http://kbox.kaist.ac.kr/resource/' + result_json['q_list'][0][0]
                                  + '\t' + result_json['q_list'][0][1]
                                  + '\thttp://kbox.kaist.ac.kr/resource/' + tmp_user_answer[0][0])
                new_triple.append(
                    'http://kbox.kaist.ac.kr/resource/' + result_json['q_list'][0][0]
                    + result_json['q_list'][0][1]
                    + 'http://kbox.kaist.ac.kr/resource/' + tmp_user_answer[0][0]
                    + '\thttp://kbox.kaist.ac.kr/flagship/dialogid\thttp://ko.dbpedia.org/resource/'
                    + str(dialog_index))
                s1, p1, o1 = new_triple[0].split('\t')
                s2, p2, o2 = new_triple[1].split('\t')
                tmp_knowledge_list.append([s1, p1, o1])
                tmp_knowledge_list.append([s2, p2, o2])
                result_json['knowledge'] = tmp_knowledge_list
            del result_json['q_list'][0]
            if result_json['q_list']:
                system_output = '\n' + NLG([
                    result_json['q_list'][0][0] + '\t' + result_json['q_list'][0][1] + '\t'
                    + result_json['q_list'][0][2]
                ], 'Knowledge_question')[0]
                result_json['utterance'] = system_output
                dialogDB_json['utterance'] = system_output
                dialogDB_json['speaker'] = 'system'
                dialog_index = DialogDBaccess(dialogDB_json)
            else:
                if result_json['knowledge']:
                    system_output = ''
                    for triple in result_json['knowledge']:
                        if 'flagship/dialogid' not in triple:
                            system_output += NLG(['\t'.join(triple)], 'Knowledge_inform')[0] + '\n'
                    userDB_json['command'] = 'REGISTER'
                    userDB_json['triple'] = result_json['knowledge']
                    Dialog_Manager.UserDBaccess(userDB_json)
                    result_json['entities'] = []
                    result_json['knowledge'] = []
                    system_output += '감사합니다'  # "Thank you"
                    result_json['utterance'] = system_output
                    dialogDB_json['utterance'] = system_output
                    dialogDB_json['speaker'] = 'system'
                    dialog_index = DialogDBaccess(dialogDB_json)
                else:
                    result_json['entities'] = []
                    system_output = '감사합니다'  # "Thank you"
                    result_json['utterance'] = system_output
                    dialogDB_json['utterance'] = system_output
                    dialogDB_json['speaker'] = 'system'
                    dialog_index = DialogDBaccess(dialogDB_json)
        else:
            print('input error')
            result_json = input_json
            result_json['utterance'] = 'input error'
    else:
        print('input error')
        result_json = input_json
        result_json['utterance'] = 'input error'
    return jsonify(result_json)
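# Hedged usage sketch (not part of the original source): assuming the handler
# above is registered as a Flask POST route (the '/dialog' path and field values
# below are illustrative only), one dialogue turn could be exercised like this.
import requests

payload = {
    'user_id': 'test_user',
    'utterance': '안녕하세요',  # any user utterance ("hello")
    'frames': [],
    'entities': [],
    'q_list': [],
    'knowledge': []
}
resp = requests.post('http://localhost:5000/dialog', json=payload)
print(resp.json().get('utterance'))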
print(system_output)
dialogDB_json['mode'] = 'add_data'
dialogDB_json['utterance'] = system_output
dialogDB_json['speaker'] = 'system'
dialog_index = DialogDBaccess(dialogDB_json)
'''
user_input = input()
if user_input == '로그아웃':  # "log out"
    break
elif user_input == '끝':  # "end"
    stop = True
    break
dialogDB_json['utterance'] = user_input
dialogDB_json['speaker'] = 'user'
dialog_index = DialogDBaccess(dialogDB_json)
NLU_response = NLU.Frame_Interpreter(user_input)
# frame-based
if NLU_response['frames']:
    KB_language['frames'] = NLU_response['frames']
    #for now_frame in KB_language['frames']:
    now_frame = KB_language['frames'][-1]
    question_list = []
    with open('./frame_info_full.json', 'r', encoding='utf-8') as f:
        frame_json_data = json.load(f)
    frame_data = frame_json_data[now_frame['frame']]
    for ele in frame_data['arguments']:
        if ele['coreType'] == 'Core':
            question_list.append(ele['fe'])
    print(question_list)
    for exi_ele in now_frame['ele']:
import datetime
from dateutil.relativedelta import relativedelta
import sys, json

data = sys.argv
weekday = {
    "月曜日": 0,  # Monday
    "火曜日": 1,  # Tuesday
    "水曜日": 2,  # Wednesday
    "木曜日": 3,  # Thursday
    "金曜日": 4,  # Friday
    "土曜日": 5,  # Saturday
    "日曜日": 6   # Sunday
}
model = NLU.main()


def check(text):
    ur, da, concept = model(text)
    if da == "negative":
        return [0]
    elif not ("plan" in concept):
        return [0]
    else:
        year, month, day = date_detector(concept.get("month"), concept.get("day"),
                                         concept.get("youbi"), concept.get("today"),
                                         concept.get("tomorrow"),
def getMatrix(utterances):
    GROUNDTRUTHS = True
    np.set_printoptions(threshold='nan')
    #lines = NLU.getALines();
    # do ALS stuff
    ioffset = len(classes)
    X = np.ones((len(sset), len(classes) + len(instructors))) * -1
    #print(X.shape);
    for i in range(0, len(utterances)):
        slots = NLU.getSlots(utterances[i])
        cslots = slots[0]
        islots = slots[1]
        for slot in islots:
            iname = ""
            if GROUNDTRUTHS:
                iname = slot[0]
            else:
                if slot[0] in entcache.keys():
                    iname = entcache[slot[0]]
                else:
                    iname = ed.entityDistance(slot[0])[1][1]
                    entcache[slot[0]] = iname
            if slot[1] == "positive":
                X[slist.index(smap[utterances[i][0].strip()])][ioffset + instructors.index(iname)] = 10
            elif slot[1] == "negative":
                X[slist.index(smap[utterances[i][0].strip()])][ioffset + instructors.index(iname)] = 0
            elif slot[1] == "neutral":
                X[slist.index(smap[utterances[i][0].strip()])][ioffset + instructors.index(iname)] = 5
        for slot in cslots:
            if is_number(slot[1]):
                if slot[1] in classes:
                    if slot[2] == "positive":
                        X[slist.index(smap[utterances[i][0].strip()])][classes.index(slot[1])] = 10
                    elif slot[2] == "negative":
                        X[slist.index(smap[utterances[i][0].strip()])][classes.index(slot[1])] = 0
                    elif slot[2] == "neutral":
                        X[slist.index(smap[utterances[i][0].strip()])][classes.index(slot[1])] = 5
                else:
                    pass
                    #print(slot[1] + " is not a class...");
            else:
                classname = ""
                if GROUNDTRUTHS:
                    classname = slot[1]
                else:
                    if slot[1] in entcache.keys():
                        classname = entcache[slot[1]]
                    else:
                        classname = ed.entityDistance(slot[1])[0][1]
                        entcache[slot[1]] = classname
                if slot[2] == "positive":
                    X[slist.index(smap[utterances[i][0].strip()])][classNames.index(classname)] = 10
                elif slot[2] == "negative":
                    X[slist.index(smap[utterances[i][0].strip()])][classNames.index(classname)] = 0
                elif slot[2] == "neutral":
                    X[slist.index(smap[utterances[i][0].strip()])][classNames.index(classname)] = 5
    # These four lines run the ALS/NMF factorization; return X instead of newX to skip it
    A, Y = nmf(X, 50)
    A = np.matrix(A)
    Y = np.matrix(Y)
    newX = A * Y
    return newX, slist
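# nmf() is called above but is not defined in this excerpt. Below is a minimal,
# hedged sketch of a rank-k nonnegative matrix factorization using multiplicative
# updates; it assumes the input is clipped to be nonnegative and is only an
# illustration of the idea, not the project's actual factorization routine.
def _nmf_sketch(V, k, iters=200, eps=1e-9):
    V = np.clip(np.asarray(V, dtype=float), 0, None)
    n, m = V.shape
    W = np.random.rand(n, k) + eps
    H = np.random.rand(k, m) + eps
    for _ in range(iters):
        # Lee & Seung multiplicative update rules
        H *= np.dot(W.T, V) / (np.dot(np.dot(W.T, W), H) + eps)
        W *= np.dot(V, H.T) / (np.dot(W, np.dot(H, H.T)) + eps)
    return W, H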
import telebot

import Controller
import BlackBox
import NLG
import NLU
from configuration import API_TOKEN

bot = telebot.TeleBot(API_TOKEN)
black_box = BlackBox.BlackBox()
controller = Controller.Controller()
nlg = NLG.NLG()
nlu = NLU.NLU()


@bot.message_handler(commands=['start'])
def start_message(message):
    controller.add_user(message.from_user.id)
    # answer = black_box.get_greeting_message()
    answer = nlg.get_message(
        'none', controller.users[message.from_user.id]["status"], nlu, black_box,
        controller.users[message.from_user.id]["right_answer"])
    bot.send_message(message.from_user.id, answer)
    bot.send_message(message.from_user.id, "Вам нужно получить такую картинку: ")  # "You need to get this picture:"
    #bot.send_photo(message.from_user.id, photo=black_box.get_response(controller.users[message.from_user.id]["right_answer"]))
    black_box.get_response(controller.users[message.from_user.id]["right_answer"])
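# Hedged note (the rest of this file is not shown in the excerpt): a
# pyTelegramBotAPI bot only starts receiving updates once polling is started,
# so the original script presumably ends with something like this after all
# handlers are registered.
if __name__ == '__main__':
    bot.polling(none_stop=True)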
def main():
    if not os.path.exists('classifiers'):
        os.makedirs('classifiers')
    allines = NLU.getALines()
    allU = NLU.getUtterances(allines)
    textLines = NLU.getTextLines(allU)
    slots = [NLU.getSlots(i) for i in allU]
    sents = list()
    targets = list()
    tagset = list()
    sent_to_xtc = dict()
    index = 0
    for i in range(len(slots)):
        tstx = []
        for etype in ENT_TYPES:
            for j in range(len(slots[i][etype])):
                tstx.append(index)
                index += 1
                targets.append(slots[i][etype][j]['sentiment'])
                ttags = [
                    slots[i][etype][j][k] for k in ALL_IDS if k in slots[i][etype][j]
                ]
                tagset.append(ttags)
                sents.append(textLines[i])
        sent_to_xtc[i] = tstx
    cprint('Number of Utterances: ' + str(index))
    cprint('Length of Lines: ' + str(len(sents)))
    cprint('Length of Targets: ' + str(len(targets)))
    cv = set()
    regex = re.compile(r'[^a-zA-Z0-9_\~\- ]+')
    for sent in range(0, len(sents)):
        parts = sents[sent].split(' ')
        for part in range(0, len(parts)):
            thepart = regex.sub('', parts[part])
            # corner case for hyphens
            hps = thepart.split('-')
            if len(hps) > 1:
                for hi in range(0, len(hps)):
                    cv.add(hps[hi].lower())
            # end corner case for hyphens
            thepart = thepart.lower()
            cv.add(thepart)
    cv = list(cv)
    cprint('Vocabulary Size: ' + str(len(cv)))
    xtc = []
    for sent in range(0, len(sents)):
        #print('sentence: ' + str(sent))
        #print('s1: ' + str(sents[sent]))
        #print(sents[sent] + ' - with tagset - ' + str(tagset[sent]))
        #dparse = spwrap.parse(sents[sent])
        #print('DPARSE: ' + dparse)
        # add token boundaries to the sentence
        tokenSent = sents[sent]
        for tag in range(0, len(tagset[sent])):
            tokenSent = tokenSent.replace(tagset[sent][tag], ' ~~t~~ ' + tagset[sent][tag])
        #print(tokenSent)
        parts = regex.sub('', tokenSent)
        # this handles split and hyphen corner case
        parts = re.split(' |-', parts)
        # remove empty parts from the sentence
        while '' in parts:
            parts.remove('')
        # locate window feature indicies
        windowFeatures = []
        done = False
        while not done:
            for part in range(0, len(parts)):
                if '~~t~~' == parts[part]:
                    windowFeatures += [part]
                    parts.remove(parts[part])
                    #print('parts?: ' + str(parts))
                    break
                if part == len(parts) - 1:
                    done = True
        #print('window features: ' + str(windowFeatures))
        #print('parts: ' + str(parts))
        row = []
        # featureMapG = [[0]*300]*4
        featureMap = {}
        Nflag = 0
        for part in range(0, len(parts)):
            #thepart = regex.sub('', parts[part])
            #thepart = thepart.lower()
            thepart = parts[part].lower()
            theid = cv.index(thepart)
            #print(theid)
            #g_vec = glove_features.getGloveWord(glove_dict, parts[part])
            mindist = 999
            for wf in range(0, len(windowFeatures)):
                ##############################################################
                ## This is the distance measure for window linear distance!
                distance = abs(windowFeatures[wf] - part)
                ##############################################################
                ## This is the distance measure for dependency tree distnace!
                ## distance = spwrap.treeDistance(parts[windowFeatures[wf]], parts[part], dparse)
                ##############################################################
                if distance < mindist:
                    mindist = distance
            mindist += 1
            sentiz = senti_lexis.lexCounts(thepart)
            #for g_vi in range(0, len(g_vec)):
            #    featureMapG[0][g_vi] += g_vec[g_vi]  # - mindist/10.0
            #    featureMapG[1][g_vi] += g_vec[g_vi]  # - mindist/10.0
            #    featureMapG[2][g_vi] += g_vec[g_vi]  # - mindist/10.0
            #    featureMapG[3][g_vi] += g_vec[g_vi]  # - mindist/10.0
            if theid in featureMap:
                # 1.0 - mindist / 10.0 worked well for the first distance measure...
                # featureMap[theid] += 1.0 / mindist
                featureMap[theid][0] += 1.0 - mindist / 10.0
                featureMap[theid][1] += (1.0 - mindist / 10.0) * sentiz[0]
                featureMap[theid][2] += (1.0 - mindist / 10.0) * sentiz[1]
                featureMap[theid][3] += (1.0 - mindist / 10.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            else:
                # featureMap[theid] = 1.0 / mindist
                # count, positive, negative, neutral, negate
                featureMap[theid] = [0, 0, 0, 0, 0]
                featureMap[theid][0] = 1.0 - mindist / 10.0
                featureMap[theid][1] = (1.0 - mindist / 10.0) * sentiz[0]
                featureMap[theid][2] = (1.0 - mindist / 10.0) * sentiz[1]
                featureMap[theid][3] = (1.0 - mindist / 10.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            if Nflag > 0:
                Nflag -= 1
            if senti_lexis.lexNegate(thepart):
                Nflag = 2
        for i in range(0, len(cv)):
            if i in featureMap:
                row.extend(featureMap[i])
            else:
                row.extend([0, 0, 0, 0, 0])
        # add on the glove features
        # for a in range(0, len(featureMapG)):
        #     temp_vec = []
        #     for a_a in range(0, len(featureMapG[a])):
        #         temp_vec.append(featureMapG[a][a_a]*1.0/len(parts))
        #     row.extend(temp_vec)
        xtc.append(row)
    #instead read the data from splits file
    fsplits = open('splits')
    lines = fsplits.readlines()
    splits = list()
    for i in range(0, len(lines)):
        parts = lines[i].strip().split(':')
        train = list()
        test = list()
        for s in parts[0][1:-1].split(', '):
            train.append(int(s))
        for s in parts[1][1:-1].split(', '):
            test.append(int(s))
        splits.append((train, test))
    fsplits.close()
    #test print the first split
    #print(splits[0][0])
    #print(splits[0][1])
    #do gridsearch + evaluation
    fscores = open('scores_sentiment', 'w')
    bestsplit = -1
    BSscore = 0
    for i in range(0, len(splits)):
        bestC = 0
        bestGamma = 0
        bestScore = 0
        xtest = list()
        xtrain = list()
        ytest = list()
        ytrain = list()
        # add the utterance set generation here for senti_set
        # senti_utters = list()
        # for j in range(0, len(splits[i][0])):
        #     senti_utters.append(utterances[splits[i][0][j]])
        #likesMatrix, slist = leastSquares.getMatrix(senti_utters)
        # do train-test split
        for j in range(0, len(splits[i][0])):
            #speaker = senti_set.getSpeaker(utterances[splits[i][0][j]][0])
            #cossim = leastSquares.consineUser(likesMatrix, slist.index(speaker))
            #print('\n' + speaker + ': ' + utterances[splits[i][0][j]][0].strip())
            # VECTOR is 38 x 141 -> 264 total
            for LL in range(0, len(sent_to_xtc[splits[i][0][j]])):
                #fvector = likesMatrix[slist.index(speaker)]
                #fvector = fvector.tolist()[0]
                fvector = xtc[sent_to_xtc[splits[i][0][j]][LL]]
                #fvector.append(slist.index(speaker))
                ##############################################################
                #entity = tagset[sent_to_xtc[splits[i][0][j]][LL]]
                #entity = tagset2entity(entity)
                #gscore = leastSquares.getGuess(likesMatrix, entity, slist.index(speaker))
                #gscore = leastSquares.getWeightedGuess(cossim, likesMatrix, entity)
                #print('speaker: ' + str(speaker) + ' - ' + str(slist.index(speaker)))
                #fvector.append(gscore)
                ########fvector = [gscore]
                ##############################################################
                xtrain.append(fvector)
                ytrain.append(targets[sent_to_xtc[splits[i][0][j]][LL]])
        for j in range(0, len(splits[i][1])):
            #speaker = senti_set.getSpeaker(utterances[splits[i][1][j]][0])
            #cossim = leastSquares.consineUser(likesMatrix, slist.index(speaker))
            for LL in range(0, len(sent_to_xtc[splits[i][1][j]])):
                #fvector = likesMatrix[slist.index(speaker)]
                #fvector = fvector.tolist()[0]
                fvector = xtc[sent_to_xtc[splits[i][1][j]][LL]]
                #fvector.append(slist.index(speaker))
                ##############################################################
                #entity = tagset[sent_to_xtc[splits[i][1][j]][LL]]
                #entity = tagset2entity(entity)
                #gscore = leastSquares.getGuess(likesMatrix, entity, slist.index(speaker))
                #gscore = leastSquares.getWeightedGuess(cossim, likesMatrix, entity)
                #fvector.append(gscore)
                ########fvector = [gscore]
                ##############################################################
                xtest.append(fvector)
                ytest.append(targets[sent_to_xtc[splits[i][1][j]][LL]])
        score = 0
        for gamma in numpy.linspace(0.0001, 0.05, 10):  # 10 steps
            for C in numpy.linspace(0.1, 10, 10):  # 10 steps
                # 2 fold
                x1 = xtrain[len(xtrain) / 2:]
                x2 = xtrain[:len(xtrain) / 2]
                y1 = ytrain[len(ytrain) / 2:]
                y2 = ytrain[:len(ytrain) / 2]
                x11 = csr_matrix(x1)
                x22 = csr_matrix(x2)
                clf = svm.SVC(gamma=gamma, C=C)
                testout = clf.fit(x1, y1)
                score = clf.score(x2, y2)
                clf = svm.SVC(gamma=gamma, C=C)
                testout = clf.fit(x2, y2)
                score += clf.score(x1, y1)
                score /= 2
                if score > bestScore:
                    bestC = C
                    bestGamma = gamma
                    bestScore = score
                    cprint('Cross Validation Score: ' + str(score))
                    cprint('Gamma = ' + str(gamma) + ' and C = ' + str(C))
        ################ THIS IS FOR CvI EVALUATION ################
        #Ixtest = list()
        #Iytest = list()
        #Cxtest = list()
        #Cytest = list()
        #for j in range(0, len(splits[i][1])):
        #    for LL in range(0, len(sent_to_xtc[splits[i][1][j]])):
        #        fvector = xtc[sent_to_xtc[splits[i][1][j]][LL]]
        #        if coriset[sent_to_xtc[splits[i][1][j]][LL]]:
        #            Cxtest.append(fvector)
        #            Cytest.append(targets[sent_to_xtc[splits[i][1][j]][LL]])
        #        else:
        #            Ixtest.append(fvector)
        #            Iytest.append(targets[sent_to_xtc[splits[i][1][j]][LL]])
        #xtrain = csr_matrix(xtrain)
        #Cxtest = csr_matrix(Cxtest)
        #Ixtest = csr_matrix(Ixtest)
        #clf = svm.SVC(gamma=bestGamma, C=bestC)
        #testout = clf.fit(xtrain, ytrain)
        #CBscore = clf.score(Cxtest, Cytest)
        #IBscore = clf.score(Ixtest, Iytest)
        #cprint('Actual Score: ' + str(CBscore) + ':' + str(IBscore))
        #fscores.write(str(CBscore) + ':' + str(IBscore) + '\n')
        #fscores.flush()
        ###############################################################
        ################ THIS IS FOR NORMAL EVALUATION ################
        xtrain = csr_matrix(xtrain)
        xtest = csr_matrix(xtest)
        clf = svm.SVC(gamma=bestGamma, C=bestC)
        testout = clf.fit(xtrain, ytrain)
        bestScore = clf.score(xtest, ytest)
        cprint('Actual Score: ' + str(bestScore))
        fscores.write(str(bestScore) + '\n')
        ###############################################################
        # save best classifier per fold
        cString = pickle.dumps(clf)
        fsave1 = open('classifiers/sentiment_classifier' + str(i), 'w')
        fsave1.write(cString)
        fsave1.close()
    fscores.close()
    # save feature dictionary
    cvString = pickle.dumps(cv)
    fsave2 = open('sentiment_dictionary', 'w')
    fsave2.write(cvString)
    fsave2.close()
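# Hedged sketch (not in the original file): the loop above pickles one classifier
# per fold plus the shared feature dictionary, so they can later be reloaded
# roughly like this. File names match the ones written above; the text-mode
# reads mirror the Python 2-style text pickling used there.
def _load_fold_classifier_sketch(fold):
    with open('classifiers/sentiment_classifier' + str(fold)) as f:
        clf = pickle.loads(f.read())
    with open('sentiment_dictionary') as f:
        cv = pickle.loads(f.read())
    return clf, cv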
def main(): fo = open("EECS_annotated_samples_anonymized", "r") lines = fo.readlines() utterances = NLU.getUtterances(lines) mode = False sents = list() targets = list() lastTaken = "" lastSent = "" isclass = False tagset = list() coriset = list() lastTagset = list() index = 0 # to make cross validation work after sentences are duplicated for entities sent_to_xtc = dict() sent_to_xtc[0] = list() for i in range(len(lines)): data = lines[i].strip() if "" == data: index += 1 sent_to_xtc[index] = list() if data.startswith("<class") or data.startswith("<instructor"): mode = True lastTaken = "" lastTagset = list() if data.startswith("<class"): isclass = True if mode and data.startswith("sentiment="): lastTaken = data[10:] if lastTaken.endswith(">"): lastTaken = lastTaken[:-1] if mode and data.startswith("name="): temp = data[5:] if temp.endswith(">"): temp = temp[:-1] lastTagset.append(temp) if mode and data.startswith("id="): temp = data[3:] if temp.endswith(">"): temp = temp[:-1] lastTagset.append(temp) if mode and data.startswith("department="): temp = data[11:] if temp.endswith(">"): temp = temp[:-1] lastTagset.append(temp) if not mode and "" != data: lastSent = data if data.endswith(">"): mode = False coriset.append(isclass) isclass = False sents.append(lastSent) tagset.append(lastTagset) sent_to_xtc[index].append(len(sents) - 1) if lastTaken == "": targets.append("neutral") else: targets.append(lastTaken) # This will print out mapping from sentences to entity vectors (XTC) #foutest = open("outtestJ", "w"); #for key in sent_to_xtc: # foutest.write(str(key) + " : " + str(sent_to_xtc[key]) + "\n"); #foutest.flush(); #foutest.close(); #randomly sample utterances #testdata = random.sample(range(0, index), index/5); print("number of utterances: " + str(index)) print("length of lines: " + str(len(sents))) print("length of targets: " + str(len(targets))) print("sent 2: " + str(sents[2])) print("tagset 2: " + str(tagset[2])) cv = set() regex = re.compile(r"[^a-zA-Z0-9_\~\- ]+") for sent in range(0, len(sents)): parts = sents[sent].split(" ") for part in range(0, len(parts)): thepart = regex.sub("", parts[part]) # corner case for hyphens hps = thepart.split("-") if len(hps) > 1: for hi in range(0, len(hps)): cv.add(hps[hi].lower()) # end corner case for hyphens thepart = thepart.lower() cv.add(thepart) cv = list(cv) cv.append("452") #bug? 
print("vocabulary size: " + str(len(cv))) print("index of I: " + str(cv.index("i"))) xtc = [] for sent in range(0, len(sents)): print("sentence: " + str(sent)) print("s1: " + str(sents[sent])) #print(sents[sent] + " - with tagset - " + str(tagset[sent])); #dparse = spwrap.parse(sents[sent]); #print("DPARSE: " + dparse); # add token boundaries to the sentence tokenSent = sents[sent] for tag in range(0, len(tagset[sent])): tokenSent = tokenSent.replace(tagset[sent][tag], " ~~t~~ " + tagset[sent][tag]) print(tokenSent) parts = regex.sub("", tokenSent) # this handles split and hyphen corner case parts = re.split(" |-", parts) # remove empty parts from the sentence while "" in parts: parts.remove("") # locate window feature indicies windowFeatures = [] done = False while not done: for part in range(0, len(parts)): if "~~t~~" == parts[part]: windowFeatures += [part] parts.remove(parts[part]) print("parts?: " + str(parts)) break if part == len(parts) - 1: done = True print("window features: " + str(windowFeatures)) print("parts: " + str(parts)) row = [] featureMap = {} Nflag = 0 for part in range(0, len(parts)): #thepart = regex.sub("", parts[part]); #thepart = thepart.lower(); thepart = parts[part].lower() theid = cv.index(thepart) print(theid) mindist = 999 for wf in range(0, len(windowFeatures)): ############################################################## ## This is the distance measure for window linear distance! distance = abs(windowFeatures[wf] - part) ############################################################## ## This is the distance measure for dependency tree distnace! ## distance = spwrap.treeDistance(parts[windowFeatures[wf]], parts[part], dparse); ############################################################## if distance < mindist: mindist = distance mindist += 1 sentiz = senti_lexis.lexCounts(thepart) if theid in featureMap: # 2.0 - mindist / 7.0 worked well for the first distance measure... 
# featureMap[theid] += 1.0 / mindist; featureMap[theid][0] += 2.0 - mindist / 7.0 featureMap[theid][1] += (2.0 - mindist / 7.0) * sentiz[0] featureMap[theid][2] += (2.0 - mindist / 7.0) * sentiz[1] featureMap[theid][3] += (2.0 - mindist / 7.0) * sentiz[2] if Nflag > 0: featureMap[theid][4] = 1.0 else: # featureMap[theid] = 1.0 / mindist; # count, positive, negative, neutral, negate featureMap[theid] = [0, 0, 0, 0, 0] featureMap[theid][0] = 2.0 - mindist / 7.0 featureMap[theid][1] = (2.0 - mindist / 7.0) * sentiz[0] featureMap[theid][2] = (2.0 - mindist / 7.0) * sentiz[1] featureMap[theid][3] = (2.0 - mindist / 7.0) * sentiz[2] if Nflag > 0: featureMap[theid][4] = 1.0 if Nflag > 0: Nflag -= 1 if senti_lexis.lexNegate(thepart): Nflag = 2 for i in range(0, len(cv)): if i in featureMap: row.extend(featureMap[i]) else: row.extend([0, 0, 0, 0, 0]) xtc.append(row) #instead read the data from splits file fsplits = open("splits") lines = fsplits.readlines() splits = list() for i in range(0, len(lines)): parts = lines[i].strip().split(":") train = list() test = list() for s in parts[0][1:-1].split(", "): train.append(int(s)) for s in parts[1][1:-1].split(", "): test.append(int(s)) splits.append((train, test)) fsplits.close() #test print the first split #print(splits[0][0]); #print(splits[0][1]); bestsplit = -1 BSscore = 0 for i in range(0, len(splits)): bestC = 0 bestGamma = 0 bestScore = 0 xtest = list() xtrain = list() ytest = list() ytrain = list() # add the utterance set generation here for senti_set senti_utters = list() for j in range(0, len(splits[i][0])): senti_utters.append(utterances[splits[i][0][j]]) likesMatrix, slist = leastSquares.getMatrix(senti_utters) # do train-test split csims = np.array([0.0] * 38) totz = 0 #for j in range(0, len(splits[i][0])): # speaker = senti_set.getSpeaker(utterances[splits[i][0][j]][0]); # cossim = leastSquares.cosineUserWE(likesMatrix, slist.index(speaker)); # np.add(csims, cossim); # totz += 1; for j in range(0, len(splits[i][1])): speaker = senti_set.getSpeaker(utterances[splits[i][1][j]][0]) cossim = leastSquares.cosineUserWE(likesMatrix, slist.index(speaker)) cossim = np.array(cossim) csims = np.add(csims, cossim) totz += 1 for j in range(0, len(csims)): csims[j] /= totz print(csims.tolist())
def main(): fo = open("EECS_annotated_samples_anonymized", "r") lines = fo.readlines() utterances = NLU.getUtterances(lines) mode = False sents = list() targets = list() lastTaken = "" lastSent = "" isclass = False tagset = list() lastTagset = list() index = 0 # to make cross validation work after sentences are duplicated for entities sent_to_xtc = dict() sent_to_xtc[0] = list() for i in range(len(lines)): data = lines[i].strip() if "" == data: index += 1 sent_to_xtc[index] = list() if data.startswith("<class") or data.startswith("<instructor"): mode = True lastTaken = "" lastTagset = list() if data.startswith("<class"): isclass = True if mode and data.startswith("sentiment="): lastTaken = data[10:] if lastTaken.endswith(">"): lastTaken = lastTaken[:-1] if mode and data.startswith("name="): temp = data[5:] if temp.endswith(">"): temp = temp[:-1] lastTagset.append(temp) if mode and data.startswith("id="): temp = data[3:] if temp.endswith(">"): temp = temp[:-1] lastTagset.append(temp) if mode and data.startswith("department="): temp = data[11:] if temp.endswith(">"): temp = temp[:-1] lastTagset.append(temp) if not mode and "" != data: lastSent = data if data.endswith(">"): mode = False isclass = False sents.append(lastSent) tagset.append(lastTagset) sent_to_xtc[index].append(len(sents) - 1) if lastTaken == "": targets.append("neutral") else: targets.append(lastTaken) # This will print out mapping from sentences to entity vectors (XTC) #foutest = open("outtestJ", "w"); #for key in sent_to_xtc: # foutest.write(str(key) + " : " + str(sent_to_xtc[key]) + "\n"); #foutest.flush(); #foutest.close(); #randomly sample utterances #testdata = random.sample(range(0, index), index/5); print("number of utterances: " + str(index)) print("length of lines: " + str(len(sents))) print("length of targets: " + str(len(targets))) print("sent 2: " + str(sents[2])) print("tagset 2: " + str(tagset[2])) cv = set() regex = re.compile(r"[^a-zA-Z0-9_\~\- ]+") for sent in range(0, len(sents)): parts = sents[sent].split(" ") for part in range(0, len(parts)): thepart = regex.sub("", parts[part]) # corner case for hyphens hps = thepart.split("-") if len(hps) > 1: for hi in range(0, len(hps)): cv.add(hps[hi].lower()) # end corner case for hyphens thepart = thepart.lower() cv.add(thepart) cv = list(cv) cv.append("452") #bug? 
print("vocabulary size: " + str(len(cv))) print("index of I: " + str(cv.index("i"))) xtc = [] for sent in range(0, len(sents)): print("sentence: " + str(sent)) print("s1: " + str(sents[sent])) #print(sents[sent] + " - with tagset - " + str(tagset[sent])); #dparse = spwrap.parse(sents[sent]); #print("DPARSE: " + dparse); # add token boundaries to the sentence tokenSent = sents[sent] for tag in range(0, len(tagset[sent])): tokenSent = tokenSent.replace(tagset[sent][tag], " ~~t~~ " + tagset[sent][tag]) print(tokenSent) parts = regex.sub("", tokenSent) # this handles split and hyphen corner case parts = re.split(" |-", parts) # remove empty parts from the sentence while "" in parts: parts.remove("") # locate window feature indicies windowFeatures = [] done = False while not done: for part in range(0, len(parts)): if "~~t~~" == parts[part]: windowFeatures += [part] parts.remove(parts[part]) print("parts?: " + str(parts)) break if part == len(parts) - 1: done = True print("window features: " + str(windowFeatures)) print("parts: " + str(parts)) row = [] featureMap = {} Nflag = 0 for part in range(0, len(parts)): #thepart = regex.sub("", parts[part]); #thepart = thepart.lower(); thepart = parts[part].lower() theid = cv.index(thepart) print(theid) mindist = 999 for wf in range(0, len(windowFeatures)): ############################################################## ## This is the distance measure for window linear distance! distance = abs(windowFeatures[wf] - part) ############################################################## ## This is the distance measure for dependency tree distnace! ## distance = spwrap.treeDistance(parts[windowFeatures[wf]], parts[part], dparse); ############################################################## if distance < mindist: mindist = distance mindist += 1 sentiz = senti_lexis.lexCounts(thepart) if theid in featureMap: # 2.0 - mindist / 7.0 worked well for the first distance measure... 
# featureMap[theid] += 1.0 / mindist; featureMap[theid][0] += 2.0 - mindist / 7.0 featureMap[theid][1] += (2.0 - mindist / 7.0) * sentiz[0] featureMap[theid][2] += (2.0 - mindist / 7.0) * sentiz[1] featureMap[theid][3] += (2.0 - mindist / 7.0) * sentiz[2] if Nflag > 0: featureMap[theid][4] = 1.0 else: # featureMap[theid] = 1.0 / mindist; # count, positive, negative, neutral, negate featureMap[theid] = [0, 0, 0, 0, 0] featureMap[theid][0] = 2.0 - mindist / 7.0 featureMap[theid][1] = (2.0 - mindist / 7.0) * sentiz[0] featureMap[theid][2] = (2.0 - mindist / 7.0) * sentiz[1] featureMap[theid][3] = (2.0 - mindist / 7.0) * sentiz[2] if Nflag > 0: featureMap[theid][4] = 1.0 if Nflag > 0: Nflag -= 1 if senti_lexis.lexNegate(thepart): Nflag = 2 for i in range(0, len(cv)): if i in featureMap: row.extend(featureMap[i]) else: row.extend([0, 0, 0, 0, 0]) xtc.append(row) #cv = CountVectorizer(); #xtc = cv.fit_transform(sents); #examining data structures here #parts = sents[0].split(" "); #for part in range(0, len(parts)): # print("PART: " + parts[part]); #print("WORD TAKE: " + str(cv.vocabulary_.get(u'i'))); #print("WORD TAKE: " + str(cv.vocabulary_.get(u'took'))); #print("WORD DONT: " + str(cv.vocabulary_.get(u'don'))); #print("WORD DONT: " + str(cv.vocabulary_.get(u't'))); #print("WORD TAKE: " + str(cv.vocabulary_.get(u'183'))); #print(str(xtc.shape)); #print("ROW0"); #print(xtc[0]); #print("ROW1"); #print(xtc[1]); print("ROW2") print(xtc[2]) print(len(xtc[2])) #print(type(xtc[0])); #print(type(xtc)); #print(str(len(sents))); #endtest #xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(xtc, targets, test_size=0.2, random_state=0); #use this section of code to do cross validation. #shuffle and split into Nfolds parts. #testdata = range(0, index); #random.shuffle(testdata); #folds = list(); #Nfolds = 10; #fsavef = open("folds", "w"); #for i in range(0, Nfolds): # print("i = " + str(i)); # nthfold = testdata[i*index/Nfolds:(i+1)*index/Nfolds]; # folds.append(nthfold); # fsavef.write(str(nthfold) + "\n"); # print("fold(" + str(i) + "): " + str(nthfold)); #fsavef.flush(); #fsavef.close(); #instead read the data from splits file fsplits = open("splits") lines = fsplits.readlines() splits = list() for i in range(0, len(lines)): parts = lines[i].strip().split(":") train = list() test = list() for s in parts[0][1:-1].split(", "): train.append(int(s)) for s in parts[1][1:-1].split(", "): test.append(int(s)) splits.append((train, test)) fsplits.close() #test print the first split #print(splits[0][0]); #print(splits[0][1]); #do gridsearch + evaluation fscores = open("baseline_scores", "w") for i in range(0, len(splits)): bestC = 0 bestGamma = 0 bestScore = 0 xtest = list() xtrain = list() ytest = list() ytrain = list() # do train-test split for j in range(0, len(splits[i][0])): # VECTOR is 38 x 141 -> 264 total for LL in range(0, len(sent_to_xtc[splits[i][0][j]])): ytrain.append(targets[sent_to_xtc[splits[i][0][j]][LL]]) for j in range(0, len(splits[i][1])): for LL in range(0, len(sent_to_xtc[splits[i][1][j]])): ytest.append(targets[sent_to_xtc[splits[i][1][j]][LL]]) score = ytrain.count("neutral") * 1.0 / len(ytrain) print("Actual Score: " + str(score)) fscores.write(str(score) + "\n") fscores.flush() fscores.close()
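# Hedged sketch (not in the original): a compact majority-class rate for any
# label list, e.g. to compare the neutral-class rate computed above on the
# training labels against the held-out labels as well.
from collections import Counter

def _majority_class_rate_sketch(labels):
    if not labels:
        return 0.0
    _, count = Counter(labels).most_common(1)[0]
    return count * 1.0 / len(labels)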