def __init__(self, num_of_homo_feats=10, max_qry_length=1794, max_doc_length=2907, query_path=None, document_path=None, corpus="TDT2"):
    res_pos = True
    str2int = True
    self.num_vocab = 51253
    self.max_qry_length = max_qry_length
    self.max_doc_length = max_doc_length
    self.num_of_homo_feats = num_of_homo_feats
    if query_path is None:
        query_path = "../Corpus/" + corpus + "/Train/XinTrainQryTDT2/QUERY_WDID_NEW"
    if document_path is None:
        document_path = "../Corpus/" + corpus + "/SPLIT_DOC_WDID_NEW"
    # read documents, preserving word positions
    doc = ProcDoc.read_file(document_path)
    self.doc = ProcDoc.doc_preprocess(doc, res_pos, str2int)
    # read queries, preserving word positions
    qry = ProcDoc.read_file(query_path)
    self.qry = ProcDoc.query_preprocess(qry, res_pos, str2int)
    # HMM training set (query-document relevance judgments)
    self.hmm_training_set = ProcDoc.read_relevance_dict()
    # homogeneous features
    self.homo_feats = self.__genFeature(num_of_homo_feats)
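# Usage sketch (illustrative only): how this reader might be instantiated with
# the default TDT2 layout. "DataReader" is a hypothetical stand-in for whatever
# class this __init__ belongs to; it is not a name from the original source.
#
#   reader = DataReader(num_of_homo_feats=10, corpus="TDT2")
#   print(len(reader.qry), len(reader.doc))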
import numpy as np

import ProcDoc

#document_path = "../Corpus/Spoken_Doc"
document_path = "../Corpus/SPLIT_DOC_WDID_NEW"
query_path = "../Corpus/Train/XinTrainQryTDT2/QUERY_WDID_NEW"

# read documents
data = ProcDoc.read_file(document_path)
doc_wordcount = ProcDoc.doc_preprocess(data)

# HMM training set (query-document relevance judgments)
HMMTraingSetDict = ProcDoc.read_relevance_dict()
query_relevance = {}

# read queries and count words per query
query = ProcDoc.read_file(query_path)
query = ProcDoc.query_preprocess(query)
query_wordcount = {}
for q, q_content in query.items():
    query_wordcount[q] = ProcDoc.word_count(q_content, {})
query_unigram = ProcDoc.unigram(query_wordcount)

# create outside query model
query_model = []
q_list = query_unigram.keys()
for q, w_uni in query_unigram.items():
    if q in HMMTraingSetDict:
        vocabulary = np.zeros(51253)
        for w, uni in w_uni.items():
            vocabulary[int(w)] = uni  # assumed completion: the original snippet is truncated here
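# Self-contained sketch of the dense-vector scatter above, on toy data so it
# runs without ProcDoc or the corpus (all names and values are illustrative):
# each query's unigram distribution {word id: probability} is spread into a
# fixed-size vocabulary vector.
toy_unigram = {"toy_q": {3: 0.25, 7: 0.75}}  # word id -> unigram probability
toy_vector = np.zeros(10)                    # toy vocabulary of 10 word ids
for w, uni in toy_unigram["toy_q"].items():
    toy_vector[w] = uni                      # toy_vector[3] == 0.25, toy_vector[7] == 0.75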
import os
import pickle as Pickle

import ProcDoc

# general model: collection-wide maximum-likelihood unigram (background model)
collection = {}
collection_total_similarity = {}
for key, value in doc_wordcount.items():
    for word, count in value.items():
        if word in collection:
            collection[word] += count
        else:
            collection[word] = count
collection_word_sum = 1.0 * ProcDoc.word_sum(collection)
general_model = {k: v / collection_word_sum for k, v in collection.items()}

# query model: per-query maximum-likelihood unigram
query = ProcDoc.read_file(query_path)
query = ProcDoc.query_preprocess(query)
query_wordcount = {}
for q, q_content in query.items():
    query_wordcount[q] = ProcDoc.word_count(q_content, {})
query_unigram = ProcDoc.unigram(dict(query_wordcount))
query_model = query_unigram

with open("model/query_model.pkl", "wb") as model_file:
    Pickle.dump(query_model, model_file, True)

# remove temporary files ("remove_list" is defined elsewhere in the original script)
for rm_file in remove_list:
    if os.path.isfile("model/" + rm_file):
        os.remove("model/" + rm_file)

# Embedded Query Expansion
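# Before the query-expansion step, a quick self-contained check of the
# background-model computation above, using toy counts instead of doc_wordcount
# (names here are illustrative): the general model is just term counts
# normalized by the collection length, so its probabilities sum to 1.
toy_docs = {"d1": {1: 2, 2: 1}, "d2": {2: 3}}
toy_collection = {}
for counts in toy_docs.values():
    for word, count in counts.items():
        toy_collection[word] = toy_collection.get(word, 0) + count
toy_total = 1.0 * sum(toy_collection.values())
toy_general = {w: c / toy_total for w, c in toy_collection.items()}
assert abs(sum(toy_general.values()) - 1.0) < 1e-9  # here: {1: 2/6, 2: 4/6}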
import numpy as np

import ProcDoc

LAYERS = 1
TRAINING_SIZE = 800
EPOCHS = 200
BATCH_SIZE = 50
# all word ids, with 0 reserved for padding
VOCAB_SIZE = 51253
corpus = "TDT2"
ENCODE_LENGTH = len('{0:016b}'.format(VOCAB_SIZE))  # 16-bit encoding width
qry_path = "../Corpus/" + corpus + "/Train/XinTrainQryTDT2/QUERY_WDID_NEW"
res_pos = True
str2int = True
MAXLEN = 1794  # assumed: padded query length, matching max_qry_length used elsewhere in this repo

# every word id (plus 0 for padding) is a "character" of the sequence model;
# CharacterTable is assumed to be defined earlier in this script
chars = list(range(VOCAB_SIZE + 1))
ctable = CharacterTable(chars, ENCODE_LENGTH)

qry = ProcDoc.read_file(qry_path)
qry = ProcDoc.query_preprocess(qry, res_pos, str2int)
TRAINING_SIZE = len(qry.keys())

questions = []
expected = []
count = 0
print('Generating data...')
for q_name, q_cont in qry.items():
    #a = ' '.join(str(np.random.choice(chars)) for i in range(np.random.randint(1, MAXLEN)))
    # Shift ids by one (0 is the padding id) and left-pad with '0'
    # so that every query is exactly MAXLEN tokens long.
    q = [str(e + 1) for e in q_cont]
    for x in range(MAXLEN - len(q)):
        q.insert(0, '0')
    #print(q)
    questions.append(q)
    count += 1
    print(str(count) + "/" + str(TRAINING_SIZE), end='\r')
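# ENCODE_LENGTH is 16 bits, which suggests each word id is encoded as its
# binary expansion rather than a one-hot vector (a 51254-way one-hot per token
# would be enormous). A minimal sketch under that assumption; "encode_binary"
# is a hypothetical helper, not part of the original CharacterTable:
def encode_binary(word_id, encode_length=16):
    bits = '{0:0{1}b}'.format(word_id, encode_length)
    return np.array([int(b) for b in bits], dtype=np.float32)

# e.g. encode_binary(5) -> [0., 0., ..., 0., 1., 0., 1.]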