def main(originalFile, w2vFile, w2vDimension, topicModelFile, topicModelDimension, infoInstance, tfidfInstance, hasUrlInstance, ansProInstance):
    """Build one feature list per comment found under originalFile.

    originalFile must contain one sub-directory per question. Inside each
    sub-directory the file named like the directory is the question text and
    every other file is a comment on that question (the file name is the
    comment id).

    Parameters:
        originalFile: root directory holding the question sub-directories.
        w2vFile, w2vDimension: forwarded to W2V(...).
        topicModelFile, topicModelDimension: forwarded to TopicModel(...).
        infoInstance: provides cidToCuserid(cid), cidToQuserid(cid) and
            qidToCategory(qid).
        tfidfInstance: provides getTfidfScore(cid).
        hasUrlInstance: provides isExistUrl(cid).
        ansProInstance: provides getCategoryPro(category), a sequence of
            values appended one by one to the feature list.

    Returns:
        dict mapping comment id to the list
        [bow cosine, w2v cosine, topic-model cosine, same-user flag,
         <category values...>, tfidf score, url flag].
    """
    # Score stored for all three similarities when the question or the
    # comment text is empty (cosine cannot be computed on empty text).
    emptyScore = 0.000000000001

    bowDict = {}
    w2vDict = {}
    tmDict = {}
    cuserComQuser = {}  # cid, 0 or 1, compared with quserid
    ansProDict = {}     # cid, category_cgold probability
    tfidfDict = {}      # cid, tfidfScore
    urlDict = {}
    resultDict = {}

    utility = Utility()
    w2v = W2V(w2vFile, w2vDimension)
    tm = TopicModel(topicModelFile, topicModelDimension)

    files = [f for f in listdir(originalFile) if isdir(join(originalFile, f))]
    for directory in files:
        # join() instead of plain concatenation so the path is valid whether
        # or not originalFile ends with a separator.
        path = join(originalFile, directory)
        fileList = [f for f in listdir(path) if isfile(join(path, f))]

        # question file
        with open(join(path, directory), "r") as fin:
            s1 = fin.read()
        vec1 = w2v.sentenceVector(s1)
        t1 = tm.getProbability(directory)

        # comment files
        for each in fileList:
            if each == directory:
                continue
            qid = directory
            cid = each

            cuserid = infoInstance.cidToCuserid(cid)
            quserid = infoInstance.cidToQuserid(cid)
            qcategory = infoInstance.qidToCategory(qid)
            cuserComQuser[cid] = 1.0 if cuserid == quserid else 0.0

            # NOTE: the per-category values must be recorded from the train
            # set first. To do that, temporarily replace the next statement
            # with
            #     ansProDict[cid] = infoInstance.getCategoryAnsPro(qcategory)
            # and append one line per comment to "categoryAnsProTrain.txt":
            # qcategory, then every value of ansProDict[cid], each followed
            # by a tab, terminated by a newline. After that,
            # ansProInstance.getCategoryPro(qcategory) can be used for the
            # train, dev and test sets.
            ansProDict[cid] = ansProInstance.getCategoryPro(qcategory)
            tfidfDict[cid] = tfidfInstance.getTfidfScore(cid)
            urlDict[cid] = hasUrlInstance.isExistUrl(cid)

            with open(join(path, each), "r") as fin:
                s2 = fin.read()

            # some questions & comments are empty after preProcessing
            if not s1 or not s2:
                bowDict[each] = emptyScore
                w2vDict[each] = emptyScore
                tmDict[each] = emptyScore
                continue

            bow = BOW(s1, s2)
            v1, v2 = bow.getVector()
            bowDict[each] = utility.cosine(v1, v2)

            vec2 = w2v.sentenceVector(s2)
            w2vDict[each] = utility.cosine(vec1, vec2)

            t2 = tm.getProbability(each)
            tmDict[each] = utility.cosine(t1, t2)

    # Single-argument parenthesised form prints the same text under
    # Python 2 and Python 3.
    print("bowDict, w2vDict, tmDict done!")

    for key in bowDict:
        features = [bowDict[key], w2vDict[key], tmDict[key], cuserComQuser[key]]
        features.extend(ansProDict[key])
        features.append(tfidfDict[key])
        features.append(urlDict[key])
        resultDict[key] = features

    print("resultDict done!")
    return resultDict
def main(originalFile, w2vFile, w2vDimension, topicModelFile, topicModelDimension, infoInstance, tfidfInstance, hasUrlInstance, ansProInstance, ynInstance):
    """Build one feature list per comment whose id is listed by ynInstance.

    originalFile must contain one sub-directory per question. Inside each
    sub-directory the file named like the directory is the question text and
    every other file is a comment on that question (the file name is the
    comment id). Comment files whose name is not returned by
    ynInstance.getCidList() are ignored.

    Parameters:
        originalFile: root directory holding the question sub-directories.
        w2vFile, w2vDimension: forwarded to W2V(...).
        topicModelFile, topicModelDimension: forwarded to TopicModel(...).
        infoInstance: provides cidToCuserid(cid), cidToQuserid(cid) and
            qidToCategory(qid).
        tfidfInstance: provides getTfidfScore(cid).
        hasUrlInstance: provides isExistUrl(cid).
        ansProInstance: provides getCategoryPro(category), a sequence of
            values appended one by one to the feature list.
        ynInstance: provides getCidList(), the comment ids to keep.

    Returns:
        dict mapping comment id to the list
        [bow cosine, w2v cosine, topic-model cosine, same-user flag,
         <category values...>, tfidf score, url flag].
    """
    # Score stored for all three similarities when the question or the
    # comment text is empty (cosine cannot be computed on empty text).
    emptyScore = 0.000000000001

    # Set for O(1) membership tests on the comment ids to keep.
    wantedCids = set(ynInstance.getCidList())

    bowDict = {}
    w2vDict = {}
    tmDict = {}
    cuserComQuser = {}  # cid, 0 or 1, compared with quserid
    ansProDict = {}     # cid, category_cgold probability
    tfidfDict = {}      # cid, tfidfScore
    urlDict = {}
    resultDict = {}

    utility = Utility()
    w2v = W2V(w2vFile, w2vDimension)
    tm = TopicModel(topicModelFile, topicModelDimension)

    files = [f for f in listdir(originalFile) if isdir(join(originalFile, f))]
    for directory in files:
        # join() instead of plain concatenation so the path is valid whether
        # or not originalFile ends with a separator.
        path = join(originalFile, directory)
        fileList = [f for f in listdir(path) if isfile(join(path, f))]

        # question file
        with open(join(path, directory), "r") as fin:
            s1 = fin.read()
        vec1 = w2v.sentenceVector(s1)
        t1 = tm.getProbability(directory)

        # comment files
        for each in fileList:
            if each == directory:
                continue
            # Skip only this file: listdir() returns entries in arbitrary
            # order, so abandoning the whole directory at the first unlisted
            # file would drop listed comments unpredictably.
            if each not in wantedCids:
                continue
            qid = directory
            cid = each

            cuserid = infoInstance.cidToCuserid(cid)
            quserid = infoInstance.cidToQuserid(cid)
            qcategory = infoInstance.qidToCategory(qid)
            cuserComQuser[cid] = 1.0 if cuserid == quserid else 0.0

            ansProDict[cid] = ansProInstance.getCategoryPro(qcategory)
            tfidfDict[cid] = tfidfInstance.getTfidfScore(cid)
            urlDict[cid] = hasUrlInstance.isExistUrl(cid)

            with open(join(path, each), "r") as fin:
                s2 = fin.read()

            # some questions & comments are empty after preProcessing
            if not s1 or not s2:
                bowDict[each] = emptyScore
                w2vDict[each] = emptyScore
                tmDict[each] = emptyScore
                continue

            bow = BOW(s1, s2)
            v1, v2 = bow.getVector()
            bowDict[each] = utility.cosine(v1, v2)

            vec2 = w2v.sentenceVector(s2)
            w2vDict[each] = utility.cosine(vec1, vec2)

            t2 = tm.getProbability(each)
            tmDict[each] = utility.cosine(t1, t2)

    # Debug aid: print bowDict, w2vDict and tmDict here to inspect the raw
    # similarity scores.

    # Single-argument parenthesised form prints the same text under
    # Python 2 and Python 3.
    print("bowDict, w2vDict, tmDict done!")

    for key in bowDict:
        features = [bowDict[key], w2vDict[key], tmDict[key], cuserComQuser[key]]
        features.extend(ansProDict[key])
        features.append(tfidfDict[key])
        features.append(urlDict[key])
        resultDict[key] = features

    print("resultDict done!")
    return resultDict