def qbe(self, s, v, o, a):
    vectorizer = TfidfVectorizer(min_df=1)
    res = {}
    qst = remove_keys_of_empty_value(self.documents[self.q].get_subj_syn())
    len_sq = len(qst.keys())
    qvt = remove_keys_of_empty_value(self.documents[self.q].get_verb_syn())
    len_vq = len(qvt.keys())
    qot = remove_keys_of_empty_value(self.documents[self.q].get_obj_syn())
    len_oq = len(qot.keys())
    qat = remove_keys_of_empty_value(self.documents[self.q].get_adv_syn())
    len_aq = len(qat.keys())
    for d in self.documents:
        # subj
        dst = remove_keys_of_empty_value(self.documents[d].get_subj_syn())
        Xs = vconcat(qst, dst)
        if len_sq < 1 or len(dst.keys()) < 1:
            subj_sim = 0.0
        else:
            Xs_vec = vectorizer.fit_transform(Xs)
            subj_sim = np.average(csim(Xs_vec[0:len_sq], Xs_vec[len_sq:]))
        # verb
        dvt = remove_keys_of_empty_value(self.documents[d].get_verb_syn())
        Xv = vconcat(qvt, dvt)
        if len_vq < 1 or len(dvt.keys()) < 1:
            verb_sim = 0.0
        else:
            Xv_vec = vectorizer.fit_transform(Xv)
            verb_sim = np.average(csim(Xv_vec[0:len_vq], Xv_vec[len_vq:]))
        # obj
        dot = remove_keys_of_empty_value(self.documents[d].get_obj_syn())
        Xo = vconcat(qot, dot)
        if len_oq < 1 or len(dot.keys()) < 1:
            obj_sim = 0.0
        else:
            Xo_vec = vectorizer.fit_transform(Xo)
            obj_sim = np.average(csim(Xo_vec[0:len_oq], Xo_vec[len_oq:]))
        # adv
        dat = remove_keys_of_empty_value(self.documents[d].get_adv_syn())
        Xa = vconcat(qat, dat)
        if len_aq < 1 or len(dat.keys()) < 1:
            adv_sim = 0.0
        else:
            Xa_vec = vectorizer.fit_transform(Xa)
            adv_sim = np.average(csim(Xa_vec[0:len_aq], Xa_vec[len_aq:]))
        res[d] = s * subj_sim + v * verb_sim + o * obj_sim + a * adv_sim
    if self.norm:
        answer = norm_dic(res)
    else:
        answer = sort_dic_desc(res)
    return answer
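# The helpers used by qbe() (remove_keys_of_empty_value, vconcat, csim,
# norm_dic, sort_dic_desc) are defined elsewhere in the project. Below is a
# minimal sketch of the first two, inferred only from how qbe() calls them;
# it is an assumption, not the project's actual implementation. csim is
# assumed to be scikit-learn's cosine similarity, e.g.
# `from sklearn.metrics.pairwise import cosine_similarity as csim`.

def remove_keys_of_empty_value(d):
    # Drop entries whose value is empty ('' or an empty collection).
    return {k: v for k, v in d.items() if v}

def vconcat(q_dict, d_dict):
    # Stack the query pattern strings in front of the document pattern strings
    # so both sides share a single TF-IDF vocabulary; the first len(q_dict)
    # rows of the resulting matrix belong to the query.
    return list(q_dict.values()) + list(d_dict.values())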
def tfidf(self, path="./bench/"):
    # This function is used for comparison with qbe.
    vectorizer = TfidfVectorizer(stop_words='english')
    lst_files = []
    doc_dict = {}
    inv_doc_dict = {}
    i = 0
    for f in os.listdir(path):
        if f.endswith(".txt"):
            with open(path + f, errors='ignore') as d:
                cont = d.read()
            lst_files.append(cont)
            doc_dict[f[:-4]] = i  # [:-4] trims '.txt' from the filename
            inv_doc_dict[i] = f[:-4]
            i += 1
    tfidf_bow = vectorizer.fit_transform(lst_files)
    search = csim(tfidf_bow[doc_dict[self.q]], tfidf_bow)
    search = list(search[0])
    ans = {}
    for i, score in enumerate(search):
        ans[inv_doc_dict[i]] = score
    return norm_dic(ans)
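# norm_dic() and sort_dic_desc() are further project helpers that are not
# shown in this file. A plausible sketch, assuming norm_dic() min-max rescales
# the scores and both return the dictionary ordered by score, highest first
# (hypothetical implementations):

def sort_dic_desc(dic):
    # Order a score dictionary by value, highest first.
    return dict(sorted(dic.items(), key=lambda kv: kv[1], reverse=True))

def norm_dic(dic):
    # Min-max normalise the scores into [0, 1], then order them.
    lo, hi = min(dic.values()), max(dic.values())
    span = (hi - lo) or 1.0
    return sort_dic_desc({k: (v - lo) / span for k, v in dic.items()})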
def pos_patterns_sim(query_id, documents, vectorizer, pos='subj'):
    if pos == 'subj':
        qt = remove_keys_of_empty_value(documents[query_id].get_subj_syn())
    elif pos == 'verb':
        qt = remove_keys_of_empty_value(documents[query_id].get_verb_syn())
    elif pos == 'obj':
        qt = remove_keys_of_empty_value(documents[query_id].get_obj_syn())
    elif pos == 'adv':
        qt = remove_keys_of_empty_value(documents[query_id].get_adv_syn())
    else:
        raise ValueError('The given pos value is not one of (subj, verb, obj, adv)')
    len_q = len(qt.keys())
    for d in documents:
        if pos == 'subj':
            dt = remove_keys_of_empty_value(documents[d].get_subj_syn())
        elif pos == 'verb':
            dt = remove_keys_of_empty_value(documents[d].get_verb_syn())
        elif pos == 'obj':
            dt = remove_keys_of_empty_value(documents[d].get_obj_syn())
        elif pos == 'adv':
            dt = remove_keys_of_empty_value(documents[d].get_adv_syn())
        else:
            raise ValueError('The given pos value is not one of (subj, verb, obj, adv)')
        X = vconcat(qt, dt)
        if len(np.unique(X)) == 1 and np.unique(X)[0] == ' ':
            pos_sim = 0.0
        else:
            X_vec = vectorizer.fit_transform(X)
            pos_sim = np.average(csim(X_vec[0:len_q], X_vec[len_q:]))
    return pos_sim
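# A toy usage sketch for pos_patterns_sim(). The _ToyDoc class and the
# document collection below are hypothetical stand-ins that only provide the
# get_*_syn() accessors the functions above rely on. (TfidfVectorizer, csim
# and np are assumed to be imported at module level, as in the code above.)

class _ToyDoc(object):
    def __init__(self, subj, verb, obj, adv):
        self._pat = {'subj': subj, 'verb': verb, 'obj': obj, 'adv': adv}
    def get_subj_syn(self):
        return self._pat['subj']
    def get_verb_syn(self):
        return self._pat['verb']
    def get_obj_syn(self):
        return self._pat['obj']
    def get_adv_syn(self):
        return self._pat['adv']

toy_docs = {
    'q':  _ToyDoc({'s1': 'team player'}, {'v1': 'win'}, {'o1': 'match game'}, {'a1': 'quickly'}),
    'd1': _ToyDoc({'s1': 'player squad'}, {'v1': 'win triumph'}, {'o1': 'game'}, {'a1': 'fast quickly'}),
}
print(pos_patterns_sim('q', toy_docs, TfidfVectorizer(min_df=1), pos='subj'))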
def qbe(q, documents, s, v, o, a, vectorizer):
    res = {}
    len_sq = len(q.get_subj_syn().keys())
    len_vq = len(q.get_verb_syn().keys())
    len_oq = len(q.get_obj_syn().keys())
    len_aq = len(q.get_adv_syn().keys())
    for d in documents:
        Xs = vconcat(q.get_subj_syn(), documents[d].get_subj_syn())
        Xs_vec = vectorizer.fit_transform(Xs)
        subj_sim = np.average(csim(Xs_vec[0:len_sq], Xs_vec[len_sq:]))
        Xv = vconcat(q.get_verb_syn(), documents[d].get_verb_syn())
        Xv_vec = vectorizer.fit_transform(Xv)
        verb_sim = np.average(csim(Xv_vec[0:len_vq], Xv_vec[len_vq:]))
        Xo = vconcat(q.get_obj_syn(), documents[d].get_obj_syn())
        Xo_vec = vectorizer.fit_transform(Xo)
        obj_sim = np.average(csim(Xo_vec[0:len_oq], Xo_vec[len_oq:]))
        Xa = vconcat(q.get_adv_syn(), documents[d].get_adv_syn())
        Xa_vec = vectorizer.fit_transform(Xa)
        adv_sim = np.average(csim(Xa_vec[0:len_aq], Xa_vec[len_aq:]))
        # Weighted combination of the per-role similarities.
        res[d] = s * subj_sim + v * verb_sim + o * obj_sim + a * adv_sim
    return norm_dic(res)
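# The ranking score is a weighted sum of the four per-role similarities, so
# the caller can emphasise, say, subjects over adverbs. A toy illustration of
# the combination with made-up similarity values and weights:
subj_sim, verb_sim, obj_sim, adv_sim = 0.8, 0.4, 0.6, 0.1  # made-up scores
s, v, o, a = 0.4, 0.2, 0.3, 0.1                             # role weights summing to 1
score = s * subj_sim + v * verb_sim + o * obj_sim + a * adv_sim
print(score)  # ~0.59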
'''
print("Cluster %d titles:" % i, end='')
for title in frame.ix[i]['title'].values.tolist():
    print(' %s,' % title, end='')
print()  # add whitespace
print()  # add whitespace
'''
print()
print()

# Specifying random_state so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

# Compute the distance matrix from the cosine similarities.
distm = 1 - csim(nmatrix)

pos = mds.fit_transform(distm)  # shape (n_samples, n_components)
xs, ys = pos[:, 0], pos[:, 1]

# Create a data frame that has the result of the MDS plus the cluster numbers and titles.
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=names1))

# Group by cluster.
groups = df.groupby('label')

# Set up the plot.
fig, ax = plt.subplots(figsize=(17, 9))  # set size
ax.margins(0.05)  # optional, just adds 5% padding to the autoscaling
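# A plausible continuation of the plot set-up above, sketched as an
# assumption (the original snippet stops here): draw each cluster as its own
# scatter group, label the points with their titles, and show the figure.
# The cluster_colors lookup below is hypothetical and simply maps cluster ids
# to matplotlib's tab10 colours.
cluster_colors = {i: c for i, c in enumerate(plt.cm.tab10.colors)}
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
            color=cluster_colors[name], label='cluster %d' % name, mec='none')
ax.legend(numpoints=1)  # one marker per legend entry
for i in range(len(df)):
    ax.text(df.loc[i, 'x'], df.loc[i, 'y'], df.loc[i, 'title'], size=8)
plt.show()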