import copy

import ProcDoc

def feedback(query_docs_point_dict, query_model, doc_unigram, doc_wordcount,
             general_model, background_model, topN):
    # interpolation weights for the background, feedback, and original query models
    lambda_bg = 0.1
    lambda_fb = 0.8
    lambda_ir_fb = 0.2  # only used by the commented-out irrelevance feedback below
    lambda_q = 0.1
    specific_model = {}
    for q_key, docs_point_list in query_docs_point_dict.items():
        feedback_doc = {}
        feedback_doc_wc = {}
        # extract the topN pseudo-relevant feedback documents
        for doc_name in docs_point_list[0:topN]:
            feedback_doc[doc_name] = copy.deepcopy(doc_unigram[doc_name])
            feedback_doc_wc[doc_name] = copy.deepcopy(doc_wordcount[doc_name])
        # generate specific model
        specific_model = specific_modeling(dict(feedback_doc))
        # generate significant model
        significant_model = significant_modeling(general_model, specific_model,
                                                 feedback_doc, feedback_doc_wc)
        '''
        ir_feedback_doc = {}
        ir_feedback_doc_wc = {}
        # extract the bottom topN (irrelevant) feedback documents
        for doc_name in docs_point_list[len(docs_point_list) - topN:]:
            ir_feedback_doc[doc_name] = doc_unigram[doc_name]
            ir_feedback_doc_wc[doc_name] = doc_wordcount[doc_name]
        # generate specific model
        ir_specific_model = specific_modeling(dict(ir_feedback_doc))
        # generate significant model
        ir_significant_model = significant_modeling(general_model, ir_specific_model,
                                                    ir_feedback_doc, ir_feedback_doc_wc)
        '''
        # update the query unigram: interpolate the original query model,
        # the feedback (significant) model, and the background model
        for word, fb_w_prob in significant_model.items():
            original_prob = query_model[q_key].get(word, 0.0)
            query_model[q_key][word] = (lambda_q * original_prob
                                        + lambda_fb * fb_w_prob
                                        + lambda_bg * background_model[word])
        '''
        for word, ir_fb_w_prob in ir_significant_model.items():
            if word in query_model[q_key]:
                query_model[q_key][word] = ((1 - lambda_ir_fb) * query_model[q_key][word]
                                            + lambda_ir_fb * ir_fb_w_prob)
        '''
        # renormalize the updated query model
        query_model[q_key] = ProcDoc.softmax(dict(query_model[q_key]))
    query_model, query_IDs = ProcDoc.dict2np(query_model)
    # plot_diagram.plotModel(general_model, specific_model, significant_model, feedback_doc_wc, feedback_doc)
    return [query_model, query_IDs]
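# The snippet below is a minimal, self-contained sketch of the query-update
# rule in feedback() on toy dictionaries: the original query, feedback, and
# background models are mixed with the same lambda weights and renormalized.
# The toy probabilities and the softmax_norm helper are illustrative
# stand-ins for the repo's data and ProcDoc.softmax, not original code.
import math

lambda_q_demo, lambda_fb_demo, lambda_bg_demo = 0.1, 0.8, 0.1
toy_query = {"bank": 0.6, "loan": 0.4}                        # p(w|q)
toy_feedback = {"loan": 0.5, "interest": 0.5}                 # significant model p(w|fb)
toy_background = {"bank": 0.2, "loan": 0.1, "interest": 0.05}  # p(w|BG)

def softmax_norm(model):
    # exponentiate and normalize so the values form a distribution
    denom = sum(math.exp(v) for v in model.values())
    return dict((w, math.exp(v) / denom) for w, v in model.items())

for word, fb_prob in toy_feedback.items():
    original = toy_query.get(word, 0.0)
    toy_query[word] = (lambda_q_demo * original
                       + lambda_fb_demo * fb_prob
                       + lambda_bg_demo * toy_background[word])
updated_toy_query = softmax_norm(toy_query)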
query_lambda = 0
doc_lambda = 0.9
#remove_list = ["update_embedded_query_expansion_ci.pkl", "update_embedded_query_expansion_qi.pkl",
#               "collection_embedded.pkl", "query_embedded.pkl", "collection_total_similarity.pkl"]
remove_list = []

document_path = "../Corpus/TDT2/SPLIT_DOC_WDID_NEW"
query_path = "../Corpus/TDT2/QUERY_WDID_NEW_middle"
word_emb_path = "data/word2vec_dict.pkl"
relevance_path = "../Corpus/TDT2/AssessmentTrainSet/AssessmentTrainSet.txt"

# document model
data = ProcDoc.read_file(document_path)
doc_wordcount = ProcDoc.doc_preprocess(data)
doc_unigram = ProcDoc.unigram(dict(doc_wordcount))
doc_mdl, doc_IDs = ProcDoc.dict2np(doc_unigram)

# background model
background_model = ProcDoc.read_background_dict()
background_model_np = ProcDoc.read_background_np()

# document smoothing: interpolate each document model with the background model
for doc_idx in xrange(doc_mdl.shape[0]):
    doc_vec = doc_mdl[doc_idx]
    doc_mdl[doc_idx] = (1 - doc_lambda) * doc_vec + doc_lambda * background_model_np

# general model
collection = {}
collection_total_similarity = {}
for key, value in doc_wordcount.items():
    for word, count in value.items():
        # accumulate collection-level term counts (assumed loop body)
        collection[word] = collection.get(word, 0) + count
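# A small self-contained illustration of the document-smoothing loop above:
# each row of a toy document-unigram matrix is interpolated with a shared
# background distribution via numpy broadcasting. The arrays are made-up
# examples, not corpus data.
import numpy as np

doc_lambda_demo = 0.9
toy_doc_mdl = np.array([[0.5, 0.5, 0.0],
                        [0.0, 0.2, 0.8]])   # p(w|d), one row per document
toy_background_np = np.array([0.4, 0.4, 0.2])  # p(w|BG), shared by all rows
smoothed = ((1 - doc_lambda_demo) * toy_doc_mdl
            + doc_lambda_demo * toy_background_np)
# each row of `smoothed` still sums to one, since it mixes two distributions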
query_lambda = 0.36
doc_lambda = 0.82
remove_list = ["update_embedded_query_expansion_ci.pkl", "update_embedded_query_expansion_qi.pkl",
               "collection_embedded.pkl", "query_embedded.pkl", "collection_total_similarity.pkl"]

document_path = "../Corpus/TDT2/SPLIT_DOC_WDID_NEW"
query_path = "../Corpus/TDT2/QUERY_WDID_NEW"

# document model
data = ProcDoc.read_file(document_path)
doc_wordcount = ProcDoc.doc_preprocess(data)
doc_unigram = ProcDoc.unigram(dict(doc_wordcount))
doc_mdl, doc_IDs = ProcDoc.dict2np(doc_unigram)

# background model
background_model = ProcDoc.read_background_dict()
background_model_np = ProcDoc.read_background_np()

# document smoothing: interpolate each document model with the background model
for doc_idx in xrange(doc_mdl.shape[0]):
    doc_vec = doc_mdl[doc_idx]
    doc_mdl[doc_idx] = (1 - doc_lambda) * doc_vec + doc_lambda * background_model_np

# general model
collection = {}
collection_total_similarity = {}
for key, value in doc_wordcount.items():
    for word, count in value.items():
        # accumulate collection-level term counts (assumed loop body)
        collection[word] = collection.get(word, 0) + count
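# Hedged sketch: one plausible way to turn the accumulated collection counts
# above into a collection-level unigram (the "general model"). The maximum-
# likelihood normalization and the name general_model_sketch are assumptions
# for illustration, not code recovered from the original file.
total_count = float(sum(collection.values()))
general_model_sketch = dict((word, count / total_count)
                            for word, count in collection.items())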