Exemplo n.º 1
0
 def embedded_query_expansion_qi(self, interpolated_aplpha, m):
     """Expand every query model with its top ``m`` embedding-similar words.

     For each query, every collection word that does not occur in the query
     is scored by summing, over the query words that have an embedding, the
     word2vec similarity (divided by ``collection_total_similarity`` of the
     query word) weighted by the query word's relative frequency.  The
     scores go through ``ProcDoc.softmax``, are sorted in descending order
     and cached in ``model/update_embedded_query_expansion_qi.pkl``.

     The query model read from ``model/query_model.pkl`` is then updated:
     the top ``m`` expansion words become
     ``alpha * original + (1 - alpha) * expansion`` and every other word is
     scaled by ``alpha``; finally ``ProcDoc.softmax`` is applied per query.

     Args:
         interpolated_aplpha: weight kept for the original query model.
         m: number of top-ranked expansion words merged into each query.

     Returns:
         dict mapping each query to its expanded query model.
     """
     query_embedded = self.query_embedded
     query_wordcount = self.query_wordcount
     collection = self.collection
     collection_total_similarity = self.collection_total_similarity
     word2vec = self.word2vec
     cache_path = "model/update_embedded_query_expansion_qi.pkl"

     # NOTE: unpickling can execute arbitrary code; these files are assumed
     # to be trusted, locally generated artefacts.
     with open("model/query_model.pkl", "rb") as model_file:
         query_model = Pickle.load(model_file)

     if os.path.isfile(cache_path):
         # reuse the ranking computed by a previous run
         with open(cache_path, "rb") as cache_file:
             update_embedded_query_expansion = Pickle.load(cache_file)
     else:
         update_embedded_query_expansion = {}
         for query, query_word_count_dict in query_wordcount.items():
             # the query length does not depend on the candidate word,
             # so it is computed once per query
             query_length = ProcDoc.word_sum(query_word_count_dict) * 1.0
             top_prob_dict = {}
             for word in collection.keys():
                 # p(w|q); words already in the query keep a score of 0
                 p_w_q = 0
                 if word not in query_word_count_dict:
                     for word_sq, word_sq_count in query_word_count_dict.items():
                         if word_sq not in query_embedded:
                             continue
                         total_probability = collection_total_similarity[word_sq]
                         cur_word_similarity = word2vec.getWordSimilarity(
                             collection[word], query_embedded[word_sq])
                         p_w_q += (cur_word_similarity / total_probability) * (
                             word_sq_count / query_length)
                 top_prob_dict[word] = p_w_q
             top_prob_dict = ProcDoc.softmax(top_prob_dict)
             # candidates sorted by probability, best first
             update_embedded_query_expansion[query] = sorted(
                 top_prob_dict.items(),
                 key=operator.itemgetter(1),
                 reverse=True)
         with open(cache_path, "wb") as cache_file:
             Pickle.dump(update_embedded_query_expansion, cache_file, True)

     # update query model
     for update_query, update_query_word_list in update_embedded_query_expansion.items():
         filepath = "visual/" + update_query + "_qi.png"
         if not os.path.isfile(filepath):
             visualization.visualization(collection, update_query_word_list, filepath)

         cur_model = query_model[update_query]
         # Remember which words were interpolated so that the alpha scaling
         # below is applied only to the words that were NOT expanded.  (The
         # previous code aliased the model instead of copying it and thereby
         # scaled the freshly interpolated words a second time.)
         expanded_words = set()
         for update_word, update in update_query_word_list[:m]:
             origin = cur_model.get(update_word, 0)
             cur_model[update_word] = (interpolated_aplpha * origin +
                                       (1 - interpolated_aplpha) * update)
             expanded_words.add(update_word)

         for un_changed_word in cur_model:
             if un_changed_word not in expanded_words:
                 cur_model[un_changed_word] *= interpolated_aplpha
         # softmax
         query_model[update_query] = ProcDoc.softmax(cur_model)
     return query_model
    def __init__(self,
                 num_of_homo_feats=10,
                 max_qry_length=1794,
                 max_doc_length=2907,
                 query_path=None,
                 document_path=None,
                 corpus="TDT2"):
        """Load documents, queries and relevance data, then build features.

        Args:
            num_of_homo_feats: stored on the instance and forwarded to
                ``__genFeature``.
            max_qry_length: stored as ``self.max_qry_length``.
            max_doc_length: stored as ``self.max_doc_length``.
            query_path: query file; when ``None`` a default below
                ``../Corpus/<corpus>/Train/`` is used.
            document_path: document file; when ``None`` a default below
                ``../Corpus/<corpus>/`` is used.
            corpus: corpus directory name used to build the default paths.
        """
        # flags handed to the ProcDoc preprocessors ("reserve position" and
        # string-to-int conversion, judging by their names)
        res_pos = True
        str2int = True
        self.num_vocab = 51253
        self.max_qry_length = max_qry_length
        self.max_doc_length = max_doc_length
        self.num_of_homo_feats = num_of_homo_feats
        # identity test is the correct way to detect "argument not given"
        if query_path is None:
            query_path = "../Corpus/" + corpus + "/Train/XinTrainQryTDT2/QUERY_WDID_NEW"
        if document_path is None:
            document_path = "../Corpus/" + corpus + "/SPLIT_DOC_WDID_NEW"
        # read document, reserve position
        doc = ProcDoc.read_file(document_path)
        self.doc = ProcDoc.doc_preprocess(doc, res_pos, str2int)

        # read query, reserve position
        qry = ProcDoc.read_file(query_path)
        self.qry = ProcDoc.query_preprocess(qry, res_pos, str2int)

        # HMMTrainingSet (relevance information)
        self.hmm_training_set = ProcDoc.read_relevance_dict()
        # per-query features; needs self.qry and self.doc set above
        self.homo_feats = self.__genFeature(num_of_homo_feats)
Exemplo n.º 3
0
def specific_modeling(feedback_doc):
    """Build the "specific" word model of a set of feedback documents.

    The document models are inverted with ``ProcDoc.inverted_word_doc``
    (presumably to word -> {document: probability}; confirm in ProcDoc) and
    each word's per-document values are passed through ``ProcDoc.softmax``.
    For every word the quantity ``sum(-p * log(p))`` over its documents is
    computed; the word's score is ``sigmoid(1 / (0.5 + that sum))``, so a
    word spread over many documents receives a lower score.

    Args:
        feedback_doc: mapping of document name to that document's word model.

    Returns:
        The word scores after a final ``ProcDoc.softmax``.
    """
    word_to_docs = ProcDoc.inverted_word_doc(dict(feedback_doc))

    # normalise every word's distribution over the feedback documents
    for word, doc_unigram in word_to_docs.items():
        word_to_docs[word] = ProcDoc.softmax(dict(doc_unigram))

    specific_model = {}
    for word, doc_probs in word_to_docs.items():
        # entropy-style spread of the word over the documents
        # (an earlier product-of-complements variant is no longer used)
        spread = sum(-1 * prob * log(prob) for prob in doc_probs.values())
        specific_model[word] = sigmoid(1.0 / (0.5 + spread))

    return ProcDoc.softmax(dict(specific_model))
Exemplo n.º 4
0
def feedback(query_docs_point_dict, query_model, doc_unigram, doc_wordcount,
             general_model, background_model, topN):
    """Update every query model with a significant-words feedback model.

    For each query the first ``topN`` entries of its document list are taken
    as feedback documents.  A specific model and a significant model are
    estimated from them, and every word of the significant model is mixed
    into the query model as
    ``0.1 * original + 0.8 * significant + 0.1 * background``.
    Each query model is then passed through ``ProcDoc.softmax``.

    ``query_model`` is modified in place before being converted with
    ``ProcDoc.dict2np``.  Feedback from irrelevant documents is disabled.

    Returns:
        ``[query_model, query_IDs]`` as produced by ``ProcDoc.dict2np``.
    """
    lambda_q = 0.1   # weight of the original query probability
    lambda_fb = 0.8  # weight of the significant (feedback) model
    lambda_bg = 0.1  # weight of the background model

    for q_key, docs_point_list in query_docs_point_dict.items():
        # collect private copies of the feedback documents
        feedback_doc = {}
        feedback_doc_wc = {}
        for doc_name in docs_point_list[0:topN]:
            feedback_doc[doc_name] = copy.deepcopy(doc_unigram[doc_name])
            feedback_doc_wc[doc_name] = copy.deepcopy(doc_wordcount[doc_name])

        specific_model = specific_modeling(dict(feedback_doc))
        significant_model = significant_modeling(general_model, specific_model,
                                                 feedback_doc, feedback_doc_wc)

        # interpolate the feedback model into the query unigram
        for word, fb_w_prob in significant_model.items():
            q_model = query_model[q_key]
            original_prob = q_model[word] if word in q_model else 0.0
            q_model[word] = (lambda_q * original_prob) + (
                lambda_fb * fb_w_prob) + (lambda_bg * background_model[word])

        query_model[q_key] = ProcDoc.softmax(dict(query_model[q_key]))

    query_model, query_IDs = ProcDoc.dict2np(query_model)
    return [query_model, query_IDs]
Exemplo n.º 5
0
def run():
	INIT_PROBABILITY = 1.0 / 60
	topic_word_prob_dict = ProcDoc.read_clusters()									# read cluster P(W|T), {T: {W:Prob}}
	doc_topic_prob_dict = defaultdict(dict)														# P(T|D),{D:{T:Prob}} 
	doc_word_topic_prob_dict = defaultdict(dict)									# P(T| w, D), {D: {word:{T:prob}}}
	doc_wc_dict = ProcDoc.read_doc_dict()  											# read document (Doc No.,Doc content)  
	doc_wc_dict = ProcDoc.doc_preprocess(doc_wc_dict)
	# calculate word of the background
	# convert (Doc No.,Doc content) to (Doc_No, {word, count})
	for docName, content in doc_wc_dict.items():
		temp_dict = ProcDoc.word_count(content, {})
		doc_wc_dict[docName] = temp_dict

	# initialize P(T|D)
	print "Initialize P(T|D)"
	for docName, wordCount in doc_wc_dict.items():
		topic_prob = {}
		for topic, wordProb in topic_word_prob_dict.items():
			doc_topic_prob_dict[docName][topic] = INIT_PROBABILITY
			
	'''
	print "Initialize P(T| w, D)"
	for docName, wordCount in doc_wc_dict.items():	
		word_list = {}
		for word, frequency in wordCount.items():	
			topic_prob = {}
			for topic, wordProb in topic_word_prob_dict.items():
				topic_prob[topic] = 0.0
			word_list[word] = topic_prob
		doc_word_topic_prob_dict[docName] = word_list
	'''
	print "start PLSA"
	[topic_word_prob_dict, doc_topic_prob_dict] = PLSA.Probability_LSA(doc_wc_dict, doc_topic_prob_dict, topic_word_prob_dict, doc_word_topic_prob_dict)
	print "end PLSA"
	
	p_plsa = {}			# PLSA P(W|D) {D: {W : Prob}}
	for doc, topic_prob_list in doc_topic_prob_dict.items():
		p_plsa_word = {}
		for topic, doc_prob in topic_prob_list.items():
			for word, word_prob in topic_word_prob_dict[topic].items():
				print word, word_prob
				if word in p_plsa_word:
					p_plsa_word[word] += word_prob * doc_prob
				else:
					p_plsa_word[word] = word_prob * doc_prob
			
		p_plsa[doc] = p_plsa_word

	return p_plsa
    def __genFeature(self, num_of_homo_feats):
        """Compute four summary statistics of the per-term SCQ values.

        For every query in ``self.qry`` the values ``self.scq(df, term)`` of
        its terms are reduced to ``[sum, max, min, mean]``.

        Args:
            num_of_homo_feats: currently unused by this implementation.

        Returns:
            dict mapping query id to a numpy array with the four features.
        """
        ###################### TODO ######################
        print("generate h features")
        queries = self.qry
        df = ProcDoc.docFreq(self.doc)

        homo_feats = {}
        for q_id, q_terms in queries.items():
            scq_values = np.asarray([self.scq(df, term) for term in q_terms])
            features = [
                np.sum(scq_values),
                np.amax(scq_values),
                np.amin(scq_values),
                np.mean(scq_values),
            ]
            homo_feats[q_id] = np.asarray(features)

        # Further candidate features (not implemented yet):
        #   geometric mean  a.prod() ** (1.0 / len(a))
        #   harmonic mean   len(a) / np.sum(1.0 / a)
        #   variation       var = variation(a, axis=0); idmax = np.argmax(var)

        return homo_feats
Exemplo n.º 7
0
def significant_modeling(general_model, specific_model, feedback_doc,
                         feedback_doc_wc):
    """Estimate the significant-words model of a feedback set with EM.

    Every word occurrence in the feedback documents is modelled as a
    three-component mixture with fixed weights: significant (0.1), specific
    (0.2) and general (0.7).  Only the significant component is
    re-estimated, for 100 EM iterations, starting from a uniform
    distribution over the feedback vocabulary.

    Args:
        general_model: word -> probability under the general model.
        specific_model: word -> probability under the specific model.
        feedback_doc: unused here; kept for interface compatibility.
        feedback_doc_wc: document name -> {word: count}.

    Returns:
        word -> value of the significant model after the last iteration
        (as returned by ``ProcDoc.softmax``).
    """
    lambda_sw = 0.1
    lambda_s = 0.2
    lambda_g = 0.7

    # Feedback vocabulary in first-seen order; the companion set keeps the
    # membership test O(1) instead of scanning the list.
    feedback_word = []
    seen_words = set()
    for word_count in feedback_doc_wc.values():
        for word in word_count:
            if word not in seen_words:
                seen_words.add(word)
                feedback_word.append(word)

    # uniform initialisation
    significant_model = {}
    for s_word in feedback_word:
        significant_model[s_word] = 1.0 / len(feedback_word)

    hidden_significant_doc_word = {}
    # objective value per iteration; only kept for diagnostics / plotting
    objective_value_list = []
    # EM training
    for step in range(100):
        # E step: posterior that an occurrence of `word` in `doc_name` was
        # generated by the significant component
        for doc_name, word_count in feedback_doc_wc.items():
            hidden_word_variable = {}
            for word in word_count:
                weighted_sw = lambda_sw * significant_model[word]
                denominator = (weighted_sw +
                               lambda_s * specific_model[word] +
                               lambda_g * general_model[word])
                hidden_word_variable[word] = weighted_sw / denominator
            hidden_significant_doc_word[doc_name] = hidden_word_variable

        # M step: expected counts per word, normalised by their total
        denominator = 0.0
        for word in list(significant_model.keys()):
            word_sum = 0
            for doc_name, word_count in feedback_doc_wc.items():
                if word in word_count:
                    word_sum += word_count[word] * hidden_significant_doc_word[
                        doc_name][word]
            # Add each word's total exactly once.  Accumulating inside the
            # document loop would add partial running sums repeatedly and
            # inflate the normaliser.
            denominator += word_sum
            significant_model[word] = word_sum

        significant_model = {
            word: word_sum / denominator
            for word, word_sum in significant_model.items()
        }

        # softmax
        significant_model = ProcDoc.softmax(dict(significant_model))

        # objective function (log-likelihood of the feedback documents)
        objective_value = 0.0
        for doc_name, word_count in feedback_doc_wc.items():
            for word, count in word_count.items():
                objective_value += count * log(lambda_sw *
                                               significant_model[word] +
                                               lambda_g * general_model[word] +
                                               lambda_s * specific_model[word])
        objective_value_list.append(objective_value)
    #plot_diagram.plotList(objective_value_list)
    return significant_model
def main():
    """Compute pairwise cosine similarities of TF-IDF document vectors.

    Documents are read through ``ProcDoc.read_doc``, lower-cased and split
    on whitespace.  A dense TF-IDF matrix is built, split into four row
    blocks and each block is compared against the whole matrix by
    ``my_cosine_similarity`` in its own process.  The partial results are
    collected from a queue, sorted and concatenated.

    Returns:
        The similarities as a ``sparse.csr_matrix``.
    """
    documents = ProcDoc.read_doc()
    texts = [document.lower().split() for document in documents]
    total_docs = len(texts) * 1.0

    # term frequency per document and document frequency per term
    term_freq = []
    doc_freq = {}
    for text in texts:
        cur_term_freq = {}
        for token in text:
            if token in cur_term_freq:
                cur_term_freq[token] += 1
            else:
                cur_term_freq[token] = 1
                # first occurrence in this document
                doc_freq[token] = doc_freq.get(token, 0) + 1
        term_freq.append(cur_term_freq)

    tfidf = []
    for doc_tf in term_freq:
        doc_tfidf = {}
        for term, tf in doc_tf.items():
            idf = log(1 + total_docs / doc_freq[term])
            # TF-IDF is the product of tf and idf; the previous code
            # divided by idf, which favours the most common terms.
            doc_tfidf[term] = tf * idf
        tfidf.append(doc_tfidf)

    # dense matrix: one row per document, one column per vocabulary term
    vocabulary = list(doc_freq.keys())
    _tfidf = np.array([[doc_tfidf.get(token, 0) for token in vocabulary]
                       for doc_tfidf in tfidf])

    output = Queue()
    # explicit floor division keeps the slice bounds integral on every
    # Python version
    bounds = [len(_tfidf) * part // 4 for part in range(5)]
    pipeline = [_tfidf[bounds[part]:bounds[part + 1]] for part in range(4)]

    processes = [Process(target=my_cosine_similarity, args=(output, x_func, _tfidf))
                 for x_func in pipeline]

    for p in processes:
        p.start()

    # Drain the queue before joining: a worker cannot terminate while its
    # queued result has not been consumed.
    result = [output.get() for _ in processes]
    for p in processes:
        p.join()

    result.sort()
    cosine_sim = []
    for r in result:  # was `results`, an undefined name
        cosine_sim += r[1]

    cosine_sim = sparse.csr_matrix(cosine_sim)
    print(cosine_sim)
    return cosine_sim
Exemplo n.º 9
0
def feedback(query_docs_point_dict, query_model, doc_unigram, doc_wordcount,
             general_model, background_model, topN):
    """Update every query model with feedback and pickle the result.

    For each query the first ``topN`` ``(doc_name, point)`` entries of its
    ranking are used as feedback documents (all of them when ``topN`` is
    ``None``).  A specific and a significant model are estimated from them
    and every word of the significant model is mixed into the query model as
    ``0.1 * original + 0.8 * significant + 0.1 * background``; each query
    model is then passed through ``ProcDoc.softmax``.

    ``query_model`` is modified in place and written to
    ``rel_supervised_swlm_entropy_s.pkl`` when ``topN`` is ``None``,
    otherwise to ``rel_swlm_entropy_S_<topN>.pkl``.

    Returns:
        The updated ``query_model``.
    """
    lambda_q = 0.1   # weight of the original query probability
    lambda_fb = 0.8  # weight of the significant (feedback) model
    lambda_bg = 0.1  # weight of the background model

    for q_key, docs_point_list in query_docs_point_dict.items():
        feedback_doc = {}
        feedback_doc_wc = {}
        # Extract feedback documents (private copies)
        for doc_name, point in docs_point_list[0:topN]:
            feedback_doc[doc_name] = copy.deepcopy(doc_unigram[doc_name])
            feedback_doc_wc[doc_name] = copy.deepcopy(doc_wordcount[doc_name])
        # generate specific model
        specific_model = specific_modeling(dict(feedback_doc))
        # generate significant model
        significant_model = significant_modeling(general_model, specific_model,
                                                 feedback_doc, feedback_doc_wc)

        # update query unigram (feedback from irrelevant documents is
        # disabled)
        cur_query_model = query_model[q_key]
        for word, fb_w_prob in significant_model.items():
            original_prob = cur_query_model.get(word, 0.0)
            cur_query_model[word] = (lambda_q * original_prob) + (
                lambda_fb * fb_w_prob) + (lambda_bg * background_model[word])

        query_model[q_key] = ProcDoc.softmax(dict(cur_query_model))

    if topN is None:
        dump_path = "rel_supervised_swlm_entropy_s.pkl"
    else:
        dump_path = "rel_swlm_entropy_S_" + str(topN) + ".pkl"
    with open(dump_path, "wb") as dump_file:
        Pickle.dump(query_model, dump_file, True)

    return query_model
Exemplo n.º 10
0
def calculate(pred_relevance, split_idx):
    """Return the best mAP over a sweep of query/relevance mixing weights.

    The pickled document model is smoothed with the background model (using
    the externally defined ``doc_lambda``) and log-transformed.  For eleven
    evenly spaced weights in [0, 1] the pickled query model (first
    ``split_idx`` rows) is interpolated with ``pred_relevance``, documents
    are ranked by the dot product of query and log document vectors, and
    the mean average precision of the ranking is evaluated.

    Args:
        pred_relevance: relevance query model mixed into the query model.
        split_idx: number of leading queries to evaluate.

    Returns:
        The maximum mAP observed over the sweep.
    """
    rel_query_model = pred_relevance
    print(type(rel_query_model))
    print(rel_query_model.shape.eval())

    # NOTE: this first read is superseded by the reload inside the sweep
    with open("query_model.pkl", "rb") as qry_model_file:
        query_model = Pickle.load(qry_model_file)[:split_idx]
    with open("query_list.pkl", "rb") as qry_list_file:
        query_list = Pickle.load(qry_list_file)[:split_idx]
    with open("doc_model.pkl", "rb") as doc_model_file:
        doc_model = Pickle.load(doc_model_file)
    with open("doc_list.pkl", "rb") as doc_list_file:
        doc_list = Pickle.load(doc_list_file)

    background_model = ProcDoc.read_background_dict()
    qry_eval = evaluate.evaluate_model(True)

    # document smoothing, row by row, in place
    for doc_idx in range(doc_model.shape[0]):
        doc_vec = doc_model[doc_idx]
        doc_model[doc_idx] = (1 - doc_lambda) * doc_vec + doc_lambda * background_model

    doc_model = np.log(doc_model)

    mAP_list = []
    for rel_qry_lambda in np.linspace(0, 1., num=11):
        # query smoothing: start from a freshly loaded query model
        with open("query_model.pkl", "rb") as qry_model_file:
            query_model = Pickle.load(qry_model_file)[:split_idx]
        X = T.matrix()
        Y = (1 - rel_qry_lambda) * X + rel_qry_lambda * rel_query_model
        blend = theano.function([X], Y)
        query_model = blend(query_model)

        # rank all documents for all queries at once
        result = np.argsort(-np.dot(query_model, doc_model.T), axis=1)
        query_docs_ranking = {}
        for q_idx, q_key in enumerate(query_list):
            query_docs_ranking[q_key] = [doc_list[doc_idx]
                                         for doc_idx in result[q_idx]]

        mAP_list.append(qry_eval.mean_average_precision(query_docs_ranking))
    return max(mAP_list)
def plotModel(general_model, specific_model, significant_model,
              feedback_doc_wc, feedback_doc_unigram):
    """Plot feedback word counts against the scaled specific model.

    Word counts are summed over all feedback documents and sorted in
    descending order.  The "general" curve shows those counts; the
    "specific" curve shows ``specific_model`` scaled by the total word
    count.  The significant and unigram series are computed but their plot
    calls are disabled.  The function blocks on ``raw_input()`` after
    showing the figure.

    Args:
        general_model: not used by this function.
        specific_model: word -> probability.
        significant_model: word -> probability.
        feedback_doc_wc: document -> {word: count}.
        feedback_doc_unigram: document -> {word: probability}.
    """
    word_counts = {}
    word_unigram_mass = {}
    for doc, wc in feedback_doc_wc.items():
        doc_length = ProcDoc.word_sum(wc)
        for word, count in wc.items():
            scaled = doc_length * feedback_doc_unigram[doc][word]
            if word not in word_counts:
                word_counts[word] = count
                word_unigram_mass[word] = scaled
            else:
                word_counts[word] += count
                word_unigram_mass[word] += scaled

    # most frequent words first
    ranked = sorted(word_counts.items(),
                    key=operator.itemgetter(1),
                    reverse=True)
    total_word_sum = ProcDoc.word_sum(dict(ranked))

    general_list = [count for _, count in ranked]
    specific_list = [total_word_sum * specific_model[word] for word, _ in ranked]
    significant_list = [total_word_sum * significant_model[word] for word, _ in ranked]
    unigram_list = [word_unigram_mass[word] for word, _ in ranked]

    import matplotlib.pyplot as plt
    plt.figure(8)
    plt.plot(range(len(general_list)), general_list, label='general')
    plt.plot(range(len(specific_list)), specific_list, label='specific')
    # plt.plot(range(len(significant_list)), significant_list, label='significant')
    # plt.plot(range(len(unigram_list)), unigram_list, label='unigram')
    plt.title('Loss')
    plt.legend(loc='upper left')
    # the second title call replaces the first one
    plt.title('Accuracy')
    plt.show()
    # keep the process alive until the user presses enter
    raw_input()
Exemplo n.º 12
0
    def __init__(self,
                 qry_path=None,
                 rel_path=None,
                 isTraining=True,
                 doc_path=None):
        """Load queries, documents and relevance data as TF-IDF matrices.

        Args:
            qry_path: query file; defaults to the TDT2 training queries.
            rel_path: relevance file; defaults to the TDT2 training
                relevance set.
            isTraining: forwarded to ``ProcDoc.readRELdict`` and
                ``EvaluateModel``.
            doc_path: document file; defaults to the TDT2 documents.
        """
        # default training step
        if qry_path is None:
            qry_path = "../Corpus/TDT2/Train/XinTrainQryTDT2/QUERY_WDID_NEW"
        if doc_path is None:
            doc_path = "../Corpus/TDT2/SPLIT_DOC_WDID_NEW"
        if rel_path is None:
            rel_path = "../Corpus/TDT2/Train/QDRelevanceTDT2_forHMMOutSideTrain"
        self.vocab_size = 51253
        # relevance set
        self.rel_set = ProcDoc.readRELdict(rel_path, isTraining)
        self.evaluate_model = EvaluateModel(rel_path, isTraining)

        # read documents
        doc = ProcDoc.readFile(doc_path)
        self.doc = ProcDoc.docPreproc(doc)
        self.doc_len = Statistical.compLenAcc(self.doc)

        # read queries
        qry = ProcDoc.readFile(qry_path)
        self.qry_tf = ProcDoc.qryPreproc(qry, self.rel_set)
        self.qry_len = Statistical.compLenAcc(self.qry_tf)
        # TF-IDF weighting of queries and documents
        [self.qry, self.doc] = Statistical.TFIDF(self.qry_tf, self.doc,
                                                 self.qry_len, self.doc_len)

        # dict to numpy; the TF-IDF queries reuse the ID order of the raw
        # term-frequency queries
        self.qry_tf, self.qry_tf_IDs = self.__dict2np(self.qry_tf)
        self.qry, self.qry_IDs = self.__dict2np(self.qry, self.qry_tf_IDs)
        self.doc, self.doc_IDs = self.__dict2np(self.doc)

        # normalise the document matrix once (Statistical.l2Normalize)
        self.doc = Statistical.l2Normalize(self.doc)
Exemplo n.º 13
0
    def __init__(self, query_model, isSpoken=False):
        """Load the pickled lists and document model used for retrieval.

        Args:
            query_model: query model; a deep copy is stored on the instance.
            isSpoken: when true the document model is read from
                ``doc_model_wc_s.pkl`` instead of ``doc_model_wc.pkl``.
        """
        with open("test_query_list.pkl", "rb") as qry_list_file:
            self.query_list = pickle.load(qry_list_file)
        with open("doc_list.pkl", "rb") as doc_list_file:
            self.doc_list = pickle.load(doc_list_file)

        doc_model_path = "doc_model_wc_s.pkl" if isSpoken else "doc_model_wc.pkl"
        with open(doc_model_path, "rb") as doc_model_file:
            doc_model = pickle.load(doc_model_file)

        background_model = ProcDoc.read_background_dict()
        self.query_model = copy.deepcopy(query_model)
        self.vocabulary_size = 51253
        # The freshly unpickled object is owned by this instance, so it is
        # stored directly; the former deep copy was overwritten right away.
        self.doc_model = doc_model
        self.background_model = background_model
    def __init__(self, query_model):
        """Load retrieval data and smooth the document model.

        A deep copy of ``query_model`` is stored.  The query list, document
        list and document model are unpickled, and every row of the
        document model is interpolated with the background model using a
        weight of 0.1 for the background.
        """
        self.query_model = copy.deepcopy(query_model)
        self.vocabulary_size = 51253
        smoothing = 0.1

        with open("test_query_list.pkl", "rb") as qry_list_file:
            self.query_list = pickle.load(qry_list_file)
        with open("doc_list.pkl", "rb") as doc_list_file:
            self.doc_list = pickle.load(doc_list_file)
        with open("doc_model.pkl", "rb") as doc_model_file:
            doc_model = pickle.load(doc_model_file)

        self.background_model = ProcDoc.read_background_dict()

        # smoothing: mix each document row with the background model
        for d_idx, doc_vec in enumerate(doc_model):
            smoothed = (1 - smoothing) * doc_vec + smoothing * self.background_model
            doc_model[d_idx] = smoothed

        self.doc_model = doc_model
    def __genFeature(self, num_of_homo_feats):
        """Compute seven summary statistics of the per-term SCQ values.

        For every query in ``self.qry`` the values ``self.__scq(df, term)``
        of its terms are reduced to
        ``[std, sum, max, min, mean, harmonic mean, geometric mean]``.

        Args:
            num_of_homo_feats: currently unused by this implementation.

        Returns:
            dict mapping query id to a numpy array with the seven features.
        """
        print("generate h features")
        queries = self.qry
        df = ProcDoc.docFreq(self.doc)

        homo_feats = {}
        for q_id, q_terms in queries.items():
            scq_values = np.asarray([self.__scq(df, term) for term in q_terms])
            harm_mean = self.__harm_mean(scq_values)
            geo_mean = self.__geo_mean(scq_values)
            features = [
                np.std(scq_values),
                np.sum(scq_values),
                np.amax(scq_values),
                np.amin(scq_values),
                np.mean(scq_values),
                harm_mean,
                geo_mean,
            ]
            homo_feats[q_id] = np.asarray(features)
        return homo_feats
Exemplo n.º 16
0
def MStep(doc_wc_dict, doc_topic_prob_dict, topic_word_prob_dict,
          doc_word_topic_prob_dict):
    """PLSA M-step: re-estimate P(w|T) and P(T|D) in place.

    Args:
        doc_wc_dict: ``{doc: {word: count}}``.
        doc_topic_prob_dict: ``{doc: {topic: P(T|D)}}``; updated in place.
        topic_word_prob_dict: ``{topic: {word: P(w|T)}}``; updated in place.
        doc_word_topic_prob_dict: ``{doc: {word: {topic: P(T|w,D)}}}`` from
            the E-step.

    Document/word/topic combinations missing from the inputs contribute
    nothing.  A topic whose normaliser is zero keeps its previous P(w|T).
    """
    # P(w | T)
    for tp, w_prob_list in topic_word_prob_dict.items():
        # The normaliser depends only on the topic, so it is computed once
        # per topic rather than once per word.
        denominator = 0.0
        for w in w_prob_list:
            for doc_name, doc_wc_list in doc_wc_dict.items():
                try:
                    d_w_c = doc_wc_list[w]
                    d_w_t_p = doc_word_topic_prob_dict[doc_name][w][tp]
                    denominator += d_w_c * d_w_t_p
                except KeyError:
                    # word absent from this document (or no posterior)
                    pass

        if denominator == 0.0:
            continue

        for word in w_prob_list:
            numerator = 0.0
            for doc_name, doc_wc_list in doc_wc_dict.items():
                try:
                    d_w_c = doc_wc_list[word]
                    d_w_t_p = doc_word_topic_prob_dict[doc_name][word][tp]
                    numerator += d_w_c * d_w_t_p
                except KeyError:
                    pass
            topic_word_prob_dict[tp][word] = numerator / denominator

    # P(T | D)
    for doc_name, topic_list in doc_topic_prob_dict.items():
        # document length
        denominator = ProcDoc.word_sum(doc_wc_dict[doc_name]) * 1.0
        for tp in topic_list:
            numerator = 0.0
            for d_w, d_w_c in doc_wc_dict[doc_name].items():
                try:
                    d_w_t_p = doc_word_topic_prob_dict[doc_name][d_w][tp]
                    numerator += d_w_c * d_w_t_p / denominator
                except KeyError:
                    pass
            doc_topic_prob_dict[doc_name][tp] = numerator
Exemplo n.º 17
0
import os  # needed for os.path.isfile below; it was used without an import
import sys
sys.path.append("../Tools")

import numpy as np
import cPickle as pickle
import ProcDoc
from PLSA_class import pLSA
from Clustering import ClusterModel

# fixed seed for reproducible runs
np.random.seed(1337)
corpus = "TDT2"
doc_path = "../Corpus/" + corpus + "/SPLIT_DOC_WDID_NEW"
cluster_dir = "Topic"
num_of_topic = 4
iterations = 20
doc = ProcDoc.readFile(doc_path)
doc_dict = ProcDoc.docPreproc(doc)

# general model: collection-wide count of every word
collection = {}
for doc_ID, word_count in doc_dict.items():
    for word, count in word_count.items():
        if word in collection:
            collection[word] += count
        else:
            collection[word] = count

# Build and save the cluster model only when no cached pwz_list exists;
# the word IDs are stored alongside it.
if not os.path.isfile(cluster_dir + "/pwz_list.pkl"):
    with open("exp/w_IDs.pkl", "wb") as wIDs_file:
        pickle.dump(collection.keys(), wIDs_file, True)
    cluster_mdl = ClusterModel(doc_dict, collection.keys(), num_of_topic)
    cluster_mdl.save(cluster_dir)
Exemplo n.º 18
0
import cPickle as Pickle
import os

# Module-level containers; `data` and `query` are overwritten further down.
data = {}				# content of document (doc, content)
background_model = {}	# word count of 2265 document (word, number of words)
general_model = {}
query = {}				# query
# NOTE(review): `np` and `ProcDoc` are not imported in the visible part of
# this script -- confirm they are imported elsewhere.
vocabulary = np.zeros(51253)

#document_path = "../Corpus/Spoken_Doc"
document_path = "../Corpus/SPLIT_DOC_WDID_NEW"	
query_path = "../Corpus/Train/XinTrainQryTDT2/QUERY_WDID_NEW"


# read and preprocess the documents
data = ProcDoc.read_file(document_path)
doc_wordcount = ProcDoc.doc_preprocess(data)

# HMM training set (relevance information)
HMMTraingSetDict = ProcDoc.read_relevance_dict()
query_relevance = {}

# read and preprocess the queries
query = ProcDoc.read_file(query_path)
query = ProcDoc.query_preprocess(query)
query_wordcount = {}

# word counts per query
for q, q_content in query.items():
	query_wordcount[q] = ProcDoc.word_count(q_content, {})

# query unigram model derived from the word counts
query_unigram = ProcDoc.unigram(query_wordcount)
Exemplo n.º 19
0
# Resolve all corpus-dependent file locations through CommonPath.
# NOTE(review): is_training, is_short and is_spoken are defined outside the
# visible part of this script.
path = CommonPath(is_training, is_short, is_spoken)
log_filename = path.getLogFilename()
qry_path = path.getQryPath()
doc_path = path.getDocPath()
rel_path = path.getRelPath()

dict_path = path.getDictPath()
bg_path = path.getBGPath()

print("Vector-Space-Model")
# read relevant set for queries and documents
eval_mdl = Evaluate.EvaluateModel(rel_path, is_training)
rel_set = eval_mdl.getAset()

# Preprocess for queries and documents
qry_file = ProcDoc.readFile(qry_path)
doc_file = ProcDoc.readFile(doc_path)

# Term Frequency
qry_mdl_dict = ProcDoc.qryPreproc(qry_file, rel_set)
doc_mdl_dict = ProcDoc.docPreproc(doc_file)

# Convert dictionary to numpy array (feasible to compute)
qry_mdl_np_, qry_IDs = ProcDoc.dict2npSparse(qry_mdl_dict)
doc_mdl_np_, doc_IDs = ProcDoc.dict2npSparse(doc_mdl_dict)

# TF-IDF; the dict selects the weighting variants for queries and documents
# (meaning of [3, 3] is defined in Statistical.TFIDF -- TODO confirm)
print("TF-IDF")
[qry_mdl_np, doc_mdl_np] = Statistical.TFIDF(qry_mdl_np_, doc_mdl_np_, {"qry":[3, 3], "doc": [3, 3]})

# Cosine Similarity
Exemplo n.º 20
0

# Load the pickled document, training-query and test-query ID lists.
# NOTE(review): model_path, document_path and query_path are defined outside
# the visible part of this script.
with open(model_path + "doc_list.pkl", "rb") as f:
    doc_list = Pickle.load(f)
with open(model_path + "query_list.pkl", "rb") as f:
    qry_list = Pickle.load(f)
with open(model_path + "test_query_list.pkl", "rb") as f:
    tstQry_list = Pickle.load(f)

# word2vec wrapper: embeddings and vocabulary size
wordModel = word2vec_model.word2vec_model()
wordVec = wordModel.getWord2Vec()
vocab_length = wordModel.vocabulary_length
print vocab_length

# document
doc = ProcDoc.read_file(document_path)
doc = ProcDoc.doc_preprocess(doc)
# (conversion of the documents to embeddings / ID lists is disabled)
#[docTmpList, docEmbList] = content2Emb(doc, wordVec, 100)
#doc_emb = rePermute(docTmpList, docEmbList, doc_list)
#doc_emb = content2List(doc, doc_list)
#doc_emb = np.asarray(doc_emb)
#print doc_emb.shape
#np.save(model_path + "doc_id_fix_pad.npy", doc_emb)

# train query
query = ProcDoc.read_file(query_path)
query = ProcDoc.query_preprocess(query)
#[qryTmpList, qryEmbList] = content2Emb(query, wordVec, 100)
#qry_emb = rePermute(qryTmpList, qryEmbList, qry_list)
# queries as lists ordered like qry_list, then as a numpy array
qry_emb = content2List(query, qry_list)
qry_emb = np.asarray(qry_emb)
Exemplo n.º 21
0
type_feat = "sparse"  # or embeddings
query_path = None
document_path = None
QDrel_file_path = None

corpus = "TDT2"

# qry and doc
if query_path == None:
    query_path = "../Corpus/" + corpus + "/Train/XinTrainQryTDT2/QUERY_WDID_NEW"
if document_path == None:
    document_path = "../Corpus/" + corpus + "/SPLIT_DOC_WDID_NEW"
if QDrel_file_path == None:
    QDrel_file_path = "../Significant-Words-Language-Models/train-qry-results-0.675969697596.txt"
# relevancy set
hmm_training_set = ProcDoc.readRELdict()

# read document, reserve position
doc = ProcDoc.readFile(document_path)
doc = ProcDoc.docPreproc(doc, RES_POS)

# read query, reserve position
qry = ProcDoc.readFile(query_path)
qry = ProcDoc.qryPreproc(qry, hmm_training_set, RES_POS)
QDrel = RelPrep.readQDRel(QDrel_file_path)

print len(qry), len(doc)
print len(QDrel)
NRMprep.getTrainAndValidation(qry, doc, QDrel, NUM_VOCAB, type_rank, type_feat)
# (pointwise or pairwise) and (sparse or embeddings)
# prepare data and label
Exemplo n.º 22
0
# Vocabulary lookup: line index of the dictionary file -> last
# space-separated field of that line (filled further down in this script).
ID_map = {}


def ID2Word(proc_dict, ID_map):
    """Replace every ID in the value sequences of ``proc_dict`` in place.

    Args:
        proc_dict: mapping whose values are mutable sequences of IDs.
        ID_map: lookup from ID to its replacement.

    Returns:
        The same ``proc_dict`` object, with its sequences rewritten.

    Raises:
        KeyError: if an ID is missing from ``ID_map``.
    """
    for token_ids in proc_dict.values():
        for position, token_id in enumerate(token_ids):
            token_ids[position] = ID_map[token_id]
    return proc_dict


# read relevant set for queries and documents
# NOTE(review): rel_path, is_training, qry_path, doc_path and dict_path are
# defined outside the visible part of this script.
eval_mdl = Evaluate.EvaluateModel(rel_path, is_training)
rel_set = eval_mdl.getAset()

# read queries and documents
qry_file = ProcDoc.readFile(qry_path)
doc_file = ProcDoc.readFile(doc_path)

# preprocess + reserve position information
qry_mdl_dict = ProcDoc.qryPreproc(qry_file, rel_set, True)
doc_mdl_dict = ProcDoc.docPreproc(doc_file, True)

# read dictionary (ID, Word): the ID is the line index, the word is the last
# space-separated field of the line (anything after "\r\n" is dropped)
import codecs
with codecs.open(dict_path, 'r', encoding='utf-8') as rf:
    for idx, line in enumerate(rf.readlines()):
        info = line.split("\r\n")[0].split(" ")
        ID_map[idx] = info[-1]

# translate the ID sequences of queries and documents into words (in place)
qry_mdl_dict = ID2Word(qry_mdl_dict, ID_map)
doc_mdl_dict = ID2Word(doc_mdl_dict, ID_map)
Exemplo n.º 23
0
import numpy as np
import ProcDoc
import cPickle as Pickle

corpus = "TDT2"
model_path = "../Corpus/model/" + corpus + "/UM/"

# Load the pickled query and document models.
# NOTE(review): unpickling executes arbitrary code; only use trusted files.
with open(model_path + "query_model.pkl", "rb") as f:
    qry_model = Pickle.load(f)
with open(model_path + "doc_model.pkl", "rb") as f:
    doc_model = Pickle.load(f)

# Read the background model a single time; it used to be read twice, once for
# each of the two names below.  Both names are kept and refer to the same
# object, which this script never modifies.
background = ProcDoc.read_background_dict()
background_model = background
qry_smooth_alpha = 0.
doc_smooth_alpha = 0.8

print(background_model.shape)

# Smooth every document model by linear interpolation with the background.
for idx, vec in enumerate(doc_model):
    doc_model[idx] = (1 -
                      doc_smooth_alpha) * vec + doc_smooth_alpha * background

# Same interpolation for the queries; with qry_smooth_alpha = 0 the background
# term carries zero weight.
for idx, vec in enumerate(qry_model):
    qry_model[idx] = (1 -
                      qry_smooth_alpha) * vec + qry_smooth_alpha * background

# Score matrix: query models times the transposed log of the document models.
# NOTE(review): assumes both models are 2-D (items x vocabulary) -- confirm.
LM_score = np.dot(qry_model, np.log(doc_model).T)
with open("LM_score.pkl", "wb") as f:
    # The third argument is the pickle protocol; True is treated as 1.
    Pickle.dump(LM_score, f, True)
Exemplo n.º 24
0
# -*- coding: utf-8 -*-
import ProcDoc
from gensim import corpora, models, matutils
from sklearn.cluster import KMeans
# Cluster the documents with k-means over their TF-IDF vectors.
documents = ProcDoc.read_doc()
documents = ProcDoc.doc_preprocess(documents)

# Lower-case and whitespace-tokenize every document.  No stop-word ("common
# word") filtering is performed here.
texts = [document.lower().split() for document in documents]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

print("TFIDF:")
# corpus2csc stores documents as columns; transpose so that each row is a
# document, which is the layout KMeans expects.
corpus_tfidf = matutils.corpus2csc(corpus_tfidf).transpose()
print(corpus_tfidf)
print("__________________________________________")

num_of_clusters = 64
kmeans = KMeans(n_clusters=num_of_clusters)
doc_cluster = kmeans.fit_predict(corpus_tfidf)
clusters = [[] for _ in range(num_of_clusters)]

# Group document indices by the cluster label assigned to each document.
for doc_index, cluster in enumerate(doc_cluster):
    clusters[cluster].append(doc_index)
Exemplo n.º 25
0
import cPickle as Pickle
import plot_diagram

# Placeholders for the structures built below.  data and background_model are
# reassigned further down in this section before they are read.
data = {}  # content of document (doc, content)
background_model = {}  # word count of 2265 document (word, number of words)
general_model = {}
query = {}  # query
# Interpolation weights.  NOTE(review): presumably the smoothing weights for
# the query and document models -- they are not used in this section.
query_lambda = 0.4
doc_lambda = 0.8

# Input locations (TDT2 corpus).
document_path = "../Corpus/TDT2/SPLIT_DOC_WDID_NEW"
query_path = "../Corpus/TDT2/QUERY_WDID_NEW_middle"
# Alternative (disabled) query set:
#query_path = "../Corpus/Train/XinTrainQryTDT2/QUERY_WDID_NEW"

# document model: read the documents, preprocess them, then build the
# unigram model from the preprocessed result
data = ProcDoc.read_file(document_path)
doc_wordcount = ProcDoc.doc_preprocess(data)
doc_unigram = ProcDoc.unigram(doc_wordcount)

# Disabled: inverse document frequency of the document word counts.
#word_idf = ProcDoc.inverse_document_frequency(doc_wordcount)

# background_model: replaces the empty placeholder assigned above
background_model = ProcDoc.read_background_dict()

# general model
collection = {}
for key, value in doc_wordcount.items():
    for word, count in value.items():
        if word in collection:
            collection[word] += count
        else:
Exemplo n.º 26
0
import ProcDoc
import Expansion
import timeit
import evaluate
import cPickle as Pickle

# Placeholders: document contents (doc, content), queries, document
# frequencies.  All three are rebound below.
data, query, doc_freq = {}, {}, {}

document_path = "../../Corpus/TDT2/Spoken_Doc"
query_path = "../../Corpus/TDT2/QUERY_WDID_NEW"
#with open("HMMTraingSetDict.pkl", "rb") as file: HMMTraingSetDict = Pickle.load(file)

# Document side: raw text -> preprocessed documents -> TF-IDF model plus
# document frequencies.  total_docs is the document count as a float.
data = ProcDoc.read_file(document_path)
doc_wordcount = ProcDoc.doc_preprocess(data)
total_docs = float(len(doc_wordcount))
doc_model, doc_freq = ProcDoc.compute_TFIDF(doc_wordcount)

# Query side: raw text -> preprocessed queries -> per-query word counts,
# each accumulated into a fresh dictionary.
query = ProcDoc.query_preprocess(ProcDoc.read_file(query_path))
query_wordcount = {
    q_key: ProcDoc.word_count(q_content, {})
    for q_key, q_content in query.items()
}

# Per-query model, created lazily as an empty dict for each new query key.
query_model = defaultdict(dict)
for q_key, word_count_dict in query_wordcount.items():
    max_freq = np.max(np.array(word_count_dict.values()), axis=0)
Exemplo n.º 27
0
    doc_path = "../Corpus/TDT2/SPLIT_DOC_WDID_NEW"
    nn_method += ".h5"
    results_file += ".txt"
    rel_lambda = 0.5

# Lexicon and background-model locations (not read in this section).
dict_path = "../Corpus/TDT2/LDC_Lexicon.txt"
bg_path = "../Corpus/background"

# read relevant set for queries and documents
# (rel_path and is_training are set outside this section)
eval_mdl = Evaluate.EvaluateModel(rel_path, is_training)
rel_set = eval_mdl.getAset()

# NOTE(review): alpha and beta are not used in this section; presumably
# interpolation weights consumed further down -- confirm.
alpha = 0.8
beta = 0.4

# Raw query and document files.
qry_file = ProcDoc.readFile(qry_path)
doc_file = ProcDoc.readFile(doc_path)

# Preprocess queries (with the relevance set) and documents.
qry_mdl_dict = ProcDoc.qryPreproc(qry_file, rel_set)
doc_mdl_dict = ProcDoc.docPreproc(doc_file)

# Unigram models built from the preprocessed queries and documents.
qry_unimdl_dict = ProcDoc.unigram(qry_mdl_dict)
doc_unimdl_dict = ProcDoc.unigram(doc_mdl_dict)

# origin query model
qry_mdl_np, qry_IDs = ProcDoc.dict2npSparse(qry_unimdl_dict)
# refine query model: converted a second time from the same unigram dict, so
# it starts out equal to the origin model.
# NOTE(review): presumably converted twice to obtain an independent array.
ref_qry_mdl_np, qry_IDs = ProcDoc.dict2npSparse(qry_unimdl_dict)
doc_mdl_np, doc_IDs = ProcDoc.dict2npSparse(doc_unimdl_dict)

# Run the network identified by nn_method on the origin query model.
NRM_mdl_np = nn_model.predict(nn_method, qry_mdl_np)
Exemplo n.º 28
0
def embedded_query_expansion_ci(query_embedded, query_wordcount, collection,
                                collection_total_similarity, word2vec,
                                interpolated_aplpha, m):
    """Expand each query model with its top-m embedding-ranked words.

    The ranked expansion candidates of every query are computed once and
    cached in "model/update_embedded_query_expansion_ci.pkl"; when that file
    exists it is loaded instead of being recomputed.  The query model read
    from "model/query_model.pkl" is then interpolated with the first ``m``
    candidates of each query and passed through ``ProcDoc.softmax``.

    Args:
        query_embedded: mapping from query term to its embedding; terms
            missing from it are ignored when scoring.
        query_wordcount: mapping from query id to that query's
            {word: count} dictionary.
        collection: mapping from collection word to its embedding.
        collection_total_similarity: mapping from collection word to the
            value used both as the initial score and as the normaliser.
        word2vec: object providing ``getWordSimilarity(vec_a, vec_b)``.
        interpolated_aplpha: weight of the original query-model probability;
            ``1 - interpolated_aplpha`` weights the expansion probability.
        m: number of top-ranked expansion words applied per query.

    Returns:
        The query model loaded from disk, with the entry of every expanded
        query replaced by its interpolated, softmax-ed word distribution.
    """
    cache_path = "model/update_embedded_query_expansion_ci.pkl"

    # load query model
    # NOTE(review): unpickling executes arbitrary code; trusted files only.
    with open("model/query_model.pkl", "rb") as model_file:
        query_model = Pickle.load(model_file)

    if os.path.isfile(cache_path):
        # reuse the cached ranking
        with open(cache_path, "rb") as cache_file:
            update_embedded_query_expansion = Pickle.load(cache_file)
    else:
        update_embedded_query_expansion = {}
        # calculate every query
        for query, query_word_count_dict in query_wordcount.items():
            top_prob_dict = {}
            # calculate every word in collection
            for word in collection:
                total_probability = collection_total_similarity[word]
                p_w_q = 0
                # words already in the query keep a score of 0
                if word not in query_word_count_dict:
                    p_w_q = total_probability  # p(w|q)
                    # total probability theory (for every query term)
                    # NOTE(review): a zero total_probability would divide by
                    # zero here -- confirm it is always positive.
                    for query_term in query_word_count_dict:
                        if query_term in query_embedded:
                            cur_word_similarity = word2vec.getWordSimilarity(
                                query_embedded[query_term], collection[word])
                            p_w_q *= (cur_word_similarity / total_probability)
                # storage probability
                top_prob_dict[word] = p_w_q
            # softmax
            top_prob_dict = ProcDoc.softmax(top_prob_dict)
            # (word, probability) pairs sorted by descending probability
            update_embedded_query_expansion[query] = sorted(
                top_prob_dict.items(),
                key=operator.itemgetter(1),
                reverse=True)
        # storage update expansion
        with open(cache_path, "wb") as cache_file:
            Pickle.dump(update_embedded_query_expansion, cache_file, True)

    # update query model
    for update_query, update_query_word_list in (
            update_embedded_query_expansion.items()):
        filepath = "visual/" + update_query + "_ci.png"
        if not os.path.isfile(filepath):
            visualization.visualization(collection, update_query_word_list,
                                        filepath)

        origin_model = query_model[update_query]
        # Build the expanded distribution in a separate dict.  Previously the
        # "expansion" and the query model were the same object, so the words
        # just interpolated were scaled by alpha a second time by the
        # unchanged-word pass.
        expanded_model = {}
        for word, probability in origin_model.items():
            # default for words that are not among the top-m candidates
            expanded_model[word] = probability * interpolated_aplpha
        for update_word, update_probability in update_query_word_list[:m]:
            origin = origin_model.get(update_word, 0)
            expanded_model[update_word] = (
                interpolated_aplpha * origin +
                (1 - interpolated_aplpha) * update_probability)

        # softmax
        query_model[update_query] = ProcDoc.softmax(expanded_model)
    return query_model
Exemplo n.º 29
0
                 query_path = None, document_path = None, corpus = "TDT2"):
        #ranks = ["pointwise", "pairwise"]
        #feats = ["spare", "emb"]
        res_pos = True
        self.num_vocab = 51253
        self.num_feats = len_feats
        self.type_rank = type_rank
        self.type_feat = type_feat
        # qry and doc
        if query_path == None: 
            query_path = "../Corpus/" + corpus + "/Train/XinTrainQryTDT2/QUERY_WDID_NEW"
        if document_path == None:
            document_path = "../Corpus/" + corpus + "/SPLIT_DOC_WDID_NEW"
        
        # relevancy set
        self.hmm_training_set = ProcDoc.readRELdict()
        
	# read document, reserve position
        doc = ProcDoc.readFile(document_path)
        self.doc = ProcDoc.docPreproc(doc, res_pos)
		
        # read query, reserve position
        qry = ProcDoc.readFile(query_path)
        self.qry = ProcDoc.qryPreproc(qry, self.hmm_training_set, res_pos)        
        
        # generate h features
        self.input_feats = self.__genFeature(self.num_feats)
        
    def genTrainValidSet(self, percent = None, isTest = False):
        print "generate training set and validation set"
        if percent == None: percent = 80
Exemplo n.º 30
0
# Placeholders; background_model is reassigned further down in this section.
background_model = {}  # word count of 2265 document (word, number of words)
general_model = {}
query = {}  # query

# Interpolation weights: doc_lambda is the background weight used in the
# document smoothing loop below; query_lambda is not used in this section.
query_lambda = 0
doc_lambda = 0.9
# Disabled list of cached model files.  NOTE(review): presumably files to be
# deleted to force recomputation -- confirm where remove_list is consumed.
#remove_list = ["update_embedded_query_expansion_ci.pkl", "update_embedded_query_expansion_qi.pkl", "collection_embedded.pkl", "query_embedded.pkl", "collection_total_similarity.pkl"]
remove_list = []

# Input locations.
document_path = "../Corpus/TDT2/SPLIT_DOC_WDID_NEW"
query_path = "../Corpus/TDT2/QUERY_WDID_NEW_middle"
word_emb_path = "data/word2vec_dict.pkl"
relevance_path = "../Corpus/TDT2/AssessmentTrainSet/AssessmentTrainSet.txt"

# document model: read, preprocess, build the unigram model from a shallow
# copy of the word counts, then convert it to an array plus document IDs
data = ProcDoc.read_file(document_path)
doc_wordcount = ProcDoc.doc_preprocess(data)
doc_unigram = ProcDoc.unigram(dict(doc_wordcount))
doc_mdl, doc_IDs = ProcDoc.dict2np(doc_unigram)
# background_model, once via read_background_dict and once via
# read_background_np (the latter is used for smoothing below)
background_model = ProcDoc.read_background_dict()
background_model_np = ProcDoc.read_background_np()

# document smoothing: overwrite each row of doc_mdl with
# (1 - doc_lambda) * row + doc_lambda * background, one row at a time
for doc_idx in xrange(doc_mdl.shape[0]):
    doc_vec = doc_mdl[doc_idx]
    doc_mdl[doc_idx] = (
        1 - doc_lambda) * doc_vec + doc_lambda * background_model_np

# general model
collection = {}