def TLM(question, answer, word_dict, translation):
    """Translation-based language-model score of `answer` for `question`.

    For each question word, estimates the probability the answer "translates"
    into it (sum of translation similarity times answer term frequency),
    smooths that with the answer's own unigram probability (weight `beta`),
    interpolates with a collection probability (weight `lbd`), and multiplies
    the per-word probabilities into one score.

    Parameters
    ----------
    question, answer : token sequences accepted by `tranfer_dict_vect`.
    word_dict : dict mapping word -> [doc_freq, collection_prob].
    translation : object with `similarity(word_a, word_b)` returning a
        translation weight; non-positive values are ignored.

    Returns
    -------
    float product score; higher means a better match.
    """
    q = tranfer_dict_vect(question)
    d = tranfer_dict_vect(answer)
    Pd = {}
    Pc = {}
    lbd = 0.2    # interpolation weight of the collection (background) model
    beta = 0.8   # weight of the translation part vs. the literal unigram part
    s = 1.0
    for word in q:
        # Expected probability of `word` being generated by translating
        # answer words: sum over answer terms of similarity * normalized tf.
        Pd[word] = 0
        for w in d:
            t = translation.similarity(word, w)
            if t > 0:
                Pd[word] += t * float(d[w]) / len(answer)
        # Mix in the answer's own unigram probability when the word occurs
        # literally in the answer.  (`word in d` replaces the quadratic
        # `word in d.keys()` list scan of the original.)
        if word in d:
            Pd[word] = Pd[word] * beta + (1 - beta) * float(d[word]) / len(answer)
        else:
            Pd[word] = Pd[word] * beta
        # Collection probability, with a small default for out-of-vocabulary
        # words (1/80000 ~ uniform over an assumed 80k-word vocabulary).
        if word in word_dict:
            Pc[word] = word_dict[word][1]
        else:
            Pc[word] = 1.0 / 80000.0
        s *= (1 - lbd) * Pd[word] + lbd * Pc[word]
    return s
def VSM(question, answer, word_dict, translation):
    """Vector-space (cosine) similarity between question and answer.

    Question words get IDF-style weights log(1 + Narticle / df); answer
    words get log-scaled term-frequency weights 1 + log(tf).  The score is
    the dot product over shared words divided by both vector norms.

    Parameters
    ----------
    question, answer : token sequences accepted by `tranfer_dict_vect`.
    word_dict : dict mapping word -> [doc_freq, collection_prob]; words not
        present are dropped from the question vector.
    translation : unused here; kept so all scorers share one signature.

    Relies on the module-level global `Narticle` (total document count —
    assumed defined elsewhere in this file).
    """
    q = tranfer_dict_vect(question)
    d = tranfer_dict_vect(answer)
    # IDF weights for question words known to the vocabulary.
    wq = {}
    Wq = 0.0
    for word in q:
        if word in word_dict:
            wq[word] = numpy.log(1 + Narticle / float(word_dict[word][0]))
            Wq += wq[word] * wq[word]
    Wq = numpy.sqrt(Wq)
    # Log-scaled term-frequency weights for answer words.
    wd = {}
    Wd = 0
    for word in d:
        wd[word] = 1 + numpy.log(d[word])
        Wd += wd[word] * wd[word]
    Wd = numpy.sqrt(Wd)
    # Cosine: dot product over the shared vocabulary, normalized by norms.
    # (`word in wd` replaces the linear `word in wd.keys()` scan.)
    s = 0.0
    for word in wq:
        if word in wd:
            s += wq[word] * wd[word]
    s = s / Wq / Wd
    return s
def LM(question, answer, word_dict, translation):
    """Query-likelihood unigram language model with Jelinek-Mercer smoothing.

    Each question word contributes (1 - lbd) * P(word | answer) +
    lbd * P(word | collection); the per-word probabilities are multiplied
    into a single score.

    Parameters
    ----------
    question, answer : token sequences accepted by `tranfer_dict_vect`.
    word_dict : dict mapping word -> [doc_freq, collection_prob].
    translation : unused here; kept so all scorers share one signature.

    Returns
    -------
    float product score; higher means a better match.
    """
    q = tranfer_dict_vect(question)
    d = tranfer_dict_vect(answer)
    lbd = 0.2  # interpolation weight of the collection (background) model
    s = 1.0
    for word in q:
        # Maximum-likelihood estimate from the answer itself
        # (`word in d` replaces the linear `word in d.keys()` scan).
        if word in d:
            Pd = float(d[word]) / len(answer)
        else:
            Pd = 0.0
        # Collection probability, with a uniform 1/80000 default for
        # out-of-vocabulary words.
        if word in word_dict:
            Pc = word_dict[word][1]
        else:
            Pc = 1.0 / 80000.0
        s *= (1 - lbd) * Pd + lbd * Pc
    return s
def Okapi(question, answer, word_dict, translation):
    """Okapi BM25-style relevance score of `answer` for `question`.

    Question words get Robertson-Sparck-Jones IDF weights scaled by query
    term frequency; answer words get saturated term-frequency weights
    (k1 + 1) * tf / (K + tf).  The score is the dot product over shared
    words.

    Parameters
    ----------
    question, answer : token sequences accepted by `tranfer_dict_vect`.
    word_dict : dict mapping word -> [doc_freq, collection_prob].
    translation : unused here; kept so all scorers share one signature.

    Relies on the module-level global `Narticle` (total document count —
    assumed defined elsewhere in this file).
    """
    q = tranfer_dict_vect(question)
    d = tranfer_dict_vect(answer)
    k1 = 1.2  # term-frequency saturation parameter
    b = 0.75  # length-normalization strength
    # RSJ/IDF weight times query term frequency, vocabulary words only
    # (`word in word_dict` replaces the deprecated `has_key`).
    wq = {}
    for word in q:
        if word in word_dict:
            idf = numpy.log((Narticle - word_dict[word][0] + 0.5)
                            / (word_dict[word][0] + 0.5))
            wq[word] = idf * q[word]
    # Document-length normalizer.
    # NOTE(review): classic BM25 uses k1 * ((1 - b) + b * len(doc) / avgdl)
    # with the *answer* length; `len(question) * 27` here looks suspect —
    # confirm the intended formula before relying on absolute scores.
    Kd = k1 * ((1 - b) + b * len(question) * 27)
    wd = {}
    for word in d:
        wd[word] = (k1 + 1) * d[word] / (Kd + d[word])
    # Accumulate the dot product over shared words.
    s = 0.0
    for word in wq:
        if word in wd:
            s += wq[word] * wd[word]
    return s
# Build raw co-occurrence counts between question words and answer words
# from the shelved QA corpus.  NOTE(review): this fragment ends mid-loop
# (the `if` below has no visible `else`, and `word_count2` is initialized
# but never written here) — the remainder continues past this view.
db = shelve.open('database_30_50.dat')
table = db['Table']  # index keys into the `qa` mapping
qa = db['qa']        # index -> (question_tokens, answer_tokens)
translation = {}     # word_in_question -> {word_in_answer: count}
word_count1 = {}     # per-question occurrence counts of question words
word_count2 = {}     # unused in the visible fragment
n = 0
for index in table:
    n += 1
    print n  # progress indicator (Python 2 print statement)
    if n > 90000:
        # Cap the number of processed QA pairs.
        break
    q, a = qa[index]
    # Distinct words of each side (keys of the term-frequency dicts).
    qq = tranfer_dict_vect(q).keys()
    aa = tranfer_dict_vect(a).keys()
    # Count every (question word, answer word) co-occurrence once per pair.
    for word0 in qq:
        for word1 in aa:
            if translation.has_key(word0):
                if translation[word0].has_key(word1):
                    translation[word0][word1] += 1.0
                else:
                    translation[word0][word1] = 1.0
            else:
                translation[word0] = {}
                translation[word0][word1] = 1.0
    # Tally how many questions each word appears in.
    for word in qq:
        if word_count1.has_key(word):
            word_count1[word] += 1
# Fragment of a corpus-preparation loop (the enclosing `for` is outside this
# view; `article`, `question`, `index`, `model`, `word_dict`, `nword`, `la`,
# `table50`, `qa`, `words` are defined elsewhere).  Segments the answer text,
# keeps only words known to `model`, updates global word statistics, and
# records QA pairs within the 30/50 length limits.
try:
    # Segment the raw answer string with jieba; skip the article entirely
    # if the answer field is missing/malformed.  NOTE(review): bare except
    # hides all errors — intentional best-effort, but worth narrowing.
    answer = list(jieba.cut(article.answer[0][1]))
except:
    continue
tanswer = []   # embedding vectors of kept answer words (unused below)
tanswer1 = []  # utf-8 encoded answer words that exist in `model`
for j in range(len(answer)):
    try:
        # `model[...]` raises for out-of-vocabulary words; presumably a
        # word-embedding lookup — TODO confirm against `model`'s definition.
        tanswer.append(model[answer[j].encode('utf-8')])
        tanswer1.append(answer[j].encode('utf-8'))
    except:
        pass
answer = tanswer1
la += len(answer)  # running total of kept answer lengths
# Accumulate per-word document frequency ([0]) and total occurrences ([1]).
word_vect = tranfer_dict_vect(question + answer)
for word in word_vect.keys():
    if word_dict.has_key(word):
        word_dict[word][0] += 1
        word_dict[word][1] += word_vect[word]
    else:
        word_dict[word] = [1, word_vect[word]]
    nword += word_vect[word]
# Keep only non-empty pairs within the 30-word question / 50-word answer cap.
if (len(question) == 0) or (len(answer) == 0):
    continue
if (len(question) > 30) or (len(answer) > 50):
    continue
table50.append(index)
qa[index] = (question, answer)
words.extend(question)