import os
import pickle

# Helper functions and classes used below (get_orinal_query_vec, get_topic_words,
# isGoodWord, read_subtopic_from_mining, WordEmbeddingFramework,
# get_document_relate_subtopicID, Document) are assumed to be defined elsewhere
# in this project.


def read_subtopic_from_files(basepath, topicID):
    """
    Get the subtopics of the original query from the subtopic file.

    ------------
    return:
    ------------
    subtopic_list: a list of (ID, label, words, frequency, urls) tuples
    topic_word_set: the set of words in the original topic
    query_probability_dict: the term-probability vector of the original query
    """
    subtopic_path = basepath + "subtopic/"
    topic_path = basepath + "topic/"
    f = open(subtopic_path + topicID)
    lines = f.readlines()
    f.close()
    query_probability_dict = get_orinal_query_vec(basepath, topicID)
    subtopic_list = []
    urlset = set()
    topic_word_set = get_topic_words(topic_path + topicID)
    for line in lines:
        # Each line: label||word/pos<TAB>...||frequency||url<TAB>...
        elements = line.strip().split('||')
        if len(elements) != 4:
            continue
        label = int(elements[0])
        words = elements[1].split("\t")
        subtopic_frequency = int(elements[2])
        if len(elements[3]) > 1:
            urls = elements[3].split("\t")
        else:
            urls = [elements[3].strip()]
        # Collect the URLs; "0" marks a missing URL (the original compared
        # the string against the int 0, which is never true).
        for url in urls:
            if url == "0":
                continue
            if url not in urlset:
                urlset.add(url)
        terms = []
        subtopicID = "".join([word.split("/")[0] for word in words])
        for word in words:
            # Keep content-bearing terms that are not already topic words.
            if (not isGoodWord(word)) or word in topic_word_set:
                continue
            term, pos = word.split("/")
            terms.append(term)
        if len(terms) == 0:
            continue
        subtopic_list.append(
            (subtopicID, label, terms, subtopic_frequency, urls))
    return subtopic_list, topic_word_set, query_probability_dict
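# Format of each line in basepath + "subtopic/" + topicID, reconstructed from
# the parsing above (illustrative, not an authoritative spec):
#
#     label||word1/pos<TAB>word2/pos||frequency||url1<TAB>url2
#
# e.g. "1||apple/n\tprice/n||3||http://a.example\thttp://b.example" yields
# label=1, terms drawn from the word/pos pairs (minus topic words and
# non-good words), subtopic_frequency=3, and two URLs; the field value "0"
# stands for a missing URL.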
def subtopic_expansion(basepath, topicID, expansion_method, mining_method):
    """
    Provide prepared information of subtopics, expanding each subtopic's
    terms with embedding neighbours according to expansion_method, and
    caching the result as a pickle per (topic, mining_method, method).
    """
    print topicID, "subtopic expansion", expansion_method, mining_method
    pickle_dir = basepath + "/pickle_expansion/" + topicID + "-" + mining_method + "/"
    if not os.path.exists(pickle_dir):
        os.mkdir(pickle_dir)
    pickle_file = pickle_dir + expansion_method
    if os.path.exists(pickle_file) and os.path.isfile(pickle_file):
        # Reuse cached expansion results.
        infile = open(pickle_file, "rb")
        subtopic_list = pickle.load(infile)
        word_details = pickle.load(infile)
        topic_words = pickle.load(infile)
        query_dict = pickle.load(infile)
        infile.close()
    else:
        if mining_method == "1":
            subtopic_candidates, topic_words, query_dict = \
                read_subtopic_from_files(basepath, topicID)
        else:
            subtopic_candidates, topic_words, query_dict = \
                read_subtopic_from_mining(basepath, topicID)
        word_embedding = WordEmbeddingFramework()
        subtopic_list = []  # items: [subtopicID, label, words, frequency, urls]
        word_details = {}   # {word: [embedding vector, frequency]}
        for subtopic_str, label, terms, subtopic_frequency, urls in subtopic_candidates:
            term_for_expansion = []
            subtopic_words = []
            if expansion_method.startswith("E"):
                # The E2, E4 and E6 methods also use the topic (Q+A) words
                # as expansion seeds.
                if expansion_method in ("E2", "E4", "E6"):
                    for word in topic_words:
                        vec = word_embedding.get_embedding_vector(word)
                        if vec is not None:
                            term_for_expansion.append(word)
                for word in terms:
                    vec = word_embedding.get_embedding_vector(word)
                    if vec is None:
                        continue
                    if word not in word_details:
                        word_details[word] = [vec, subtopic_frequency]
                    term_for_expansion.append(word)
                    subtopic_words.append(word)
                if len(term_for_expansion) == 0:
                    print subtopic_str
                    continue
                sim_word_weight = word_embedding.get_expansion_words(
                    term_for_expansion)
                if sim_word_weight is None:
                    continue
                i = 0
                # E1 and E2 replace the original words with expansion words.
                if expansion_method in ("E1", "E2"):
                    subtopic_words = []
                for s_word, weight in sim_word_weight:
                    if not isGoodWord(s_word + "/n"):
                        continue
                    i += 1
                    if weight < 0.6:
                        continue
                    if s_word in word_details:
                        word_details[s_word][1] += subtopic_frequency
                    else:
                        vec = word_embedding.get_embedding_vector(s_word)
                        if vec is None:
                            continue
                        word_details[s_word] = [vec, subtopic_frequency]
                    # E5 and E6 add the expansion words on top of the
                    # original subtopic words.
                    if expansion_method in ("E5", "E6"):
                        subtopic_words.append(s_word)
                        if i > 3:
                            break
                    if expansion_method in ("E1", "E2"):
                        subtopic_words.append(s_word)
                        if i > 5:
                            break
            else:
                # No expansion: keep only terms that have an embedding vector.
                for word in terms:
                    vec = word_embedding.get_embedding_vector(word)
                    if vec is None:
                        continue
                    if word not in word_details:
                        word_details[word] = [vec, subtopic_frequency]
                    subtopic_words.append(word)
            subtopic_list.append(
                [subtopic_str, label, subtopic_words, subtopic_frequency, urls])
        outfile = open(pickle_file, "wb")
        pickle.dump(subtopic_list, outfile)
        pickle.dump(word_details, outfile)
        pickle.dump(topic_words, outfile)
        pickle.dump(query_dict, outfile)
        outfile.close()
    return subtopic_list, word_details, topic_words, query_dict
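# The expansion methods, as read from the branches of subtopic_expansion above:
#   E1/E2: replace the subtopic's own words with up to 6 expansion words
#          (E2 additionally seeds the expansion with the topic Q+A words);
#   E5/E6: keep the subtopic's own words and add up to 4 expansion words
#          (E6 additionally seeds with the topic words);
#   E4:    seeds with the topic words but keeps only the original words,
#          using the expansion words just to update word_details;
#   any method not starting with "E": no expansion at all.
#
# Hedged usage sketch; "data/" and "0001" are hypothetical placeholders for a
# real corpus layout:
#
#     subtopics, word_details, topic_words, query_dict = \
#         subtopic_expansion("data/", "0001", "E2", "1")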
def get_subtopic_candidate(basepath, topicID):
    """
    Get the subtopics of the original query and build a word dict with
    per-term statistics over the subtopic collection.

    ------------
    return:
    ------------
    subtopic_list: a list of (ID, label, words, frequency, urls) tuples
    urlset: the set of URLs attached to the subtopics
    word2id_weight: {term: [word id, total frequency, [[subtopicID, frequency], ...]]}
    subtopic_length_average: average subtopic length, weighted by frequency
    subtopic_term_number_all: total frequency-weighted number of term occurrences
    topic_word_set: the set of words in the original topic
    query_probability_dict: the term-probability vector of the original query
    subtopic_number: total subtopic count, weighted by frequency
    """
    subtopic_path = basepath + "subtopic/"
    topic_path = basepath + "topic/"
    f = open(subtopic_path + topicID)
    lines = f.readlines()
    f.close()
    query_probability_dict = get_orinal_query_vec(basepath, topicID)
    subtopic_term_number_all = 0.0
    subtopic_number = 0
    subtopic_length_sum = 0.0
    word2id_weight = {}
    subtopic_list = []
    urlset = set()
    wid = 0
    topic_word_set = get_topic_words(topic_path + topicID)
    for line in lines:
        elements = line.strip().split('||')
        if len(elements) != 4:
            continue
        label = int(elements[0])
        words = elements[1].split("\t")
        subtopic_frequency = int(elements[2])
        if len(elements[3]) > 1:
            urls = elements[3].split("\t")
        else:
            urls = [elements[3].strip()]
        # Collect the URLs; "0" marks a missing URL (the original compared
        # the string against the int 0, which is never true).
        for url in urls:
            if url == "0":
                continue
            if url not in urlset:
                urlset.add(url)
        terms = []
        subtopicID = "".join([word.split("/")[0] for word in words])
        for word in words:
            if (not isGoodWord(word)) or word in topic_word_set:
                continue
            term, pos = word.split("/")
            terms.append(term)
            if term in word2id_weight:
                word2id_weight[term][1] += subtopic_frequency
                subtopic_term_number_all += subtopic_frequency
                word2id_weight[term][2].append(
                    [subtopicID, subtopic_frequency])
            else:
                subtopic_term_number_all += subtopic_frequency
                word2id_weight[term] = [
                    wid, subtopic_frequency, [[subtopicID, subtopic_frequency]]]
                wid += 1
        if len(terms) == 0:
            continue
        subtopic_length_sum += subtopic_frequency * len(terms)
        subtopic_number += subtopic_frequency
        subtopic_list.append(
            (subtopicID, label, terms, subtopic_frequency, urls))
    subtopic_length_average = subtopic_length_sum / subtopic_number
    return (subtopic_list, urlset, word2id_weight, subtopic_length_average,
            subtopic_term_number_all, topic_word_set, query_probability_dict,
            subtopic_number)
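# Shape of word2id_weight built by get_subtopic_candidate, with illustrative
# (hypothetical) values for a term that got id 0 and appears in two subtopics
# with frequencies 3 and 1:
#
#     {"term": [0, 4, [["subtopicA", 3], ["subtopicB", 1]]]}
#
# i.e. [word id, frequency summed over subtopics, [subtopicID, frequency] pairs].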
def doc_preprocessing(basepath, topicID, word2id_weight, topic_words):
    """
    Read the segmented documents of a topic and build per-term statistics.

    ------------
    return:
    ------------
    document_list: a list of Document objects for the topic
    document_term_frequency: {term: [collection frequency, [doc IDs containing term]]}
    average_document_len: average number of good words per document
    topic_word_count: number of topic-word occurrences across all documents
    """
    document_path = basepath + "documents_seg/" + topicID + "/"
    documents = os.listdir(document_path)
    doc_subtopic_path = basepath + "doc_subtopic_relation/"
    # For each term in the subtopic word set, store the document IDs in which
    # the term appears and its frequency in the collection.
    document_term_frequency = {}
    document_list = []
    sum_document_len = 0.0
    topic_word_count = 0.0
    # true_rank = get_document_true_rank(basepath, topicID)
    doc_sid = get_document_relate_subtopicID(doc_subtopic_path, topicID)
    for document in documents:
        # if not true_rank.has_key(document):
        #     continue
        o = open(document_path + document)
        lines = o.readlines()
        o.close()
        terms = []
        terms_pos = []
        for line in lines:
            line = line.replace("\n", "")
            words = line.split("\t")
            for windex, word in enumerate(words):
                if word in topic_words:
                    topic_word_count += 1
                if isGoodWord(word):
                    sum_document_len += 1
                else:
                    continue
                term, pos = word.split("/")
                if term in document_term_frequency:
                    document_term_frequency[term][0] += 1
                    if document_term_frequency[term][1].count(document) == 0:
                        document_term_frequency[term][1].append(document)
                else:
                    document_term_frequency[term] = [1, [document]]
                # Keep only terms that belong to the subtopic word set.
                if term in word2id_weight:
                    terms.append(term)
                    terms_pos.append(pos)
        # sum_document_len += len(terms)
        doc = Document()
        doc.set_doc_str(" ".join(terms))
        doc.set_id(document)
        doc.set_term_vec(terms)
        doc.set_pos_vec(terms_pos)
        doc.set_true_rank("1")
        doc.set_related_sid(doc_sid[document])
        document_list.append(doc)
    average_document_len = sum_document_len / len(document_list)
    return document_list, document_term_frequency, average_document_len, topic_word_count
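# End-to-end sketch of the pipeline in this module. The basepath layout
# ("subtopic/", "topic/", "documents_seg/", "doc_subtopic_relation/") is taken
# from the functions above; the concrete path and topic ID here are
# hypothetical placeholders.
if __name__ == "__main__":
    basepath, topicID = "data/", "0001"
    (subtopics, urlset, word2id_weight, avg_subtopic_len, term_total,
     topic_words, query_dict, n_subtopics) = get_subtopic_candidate(
        basepath, topicID)
    docs, term_freq, avg_doc_len, topic_word_count = doc_preprocessing(
        basepath, topicID, word2id_weight, topic_words)
    print len(subtopics), "subtopics,", len(docs), "documents,", \
        "average document length", avg_doc_len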