def __init__(self, num_of_homo_feats=10, max_qry_length=1794, max_doc_length=2907, query_path=None, document_path=None, corpus="TDT2"):
    """Load queries and documents (word positions preserved) and build homogeneous features.

    Args:
        num_of_homo_feats: number of homogeneous features generated by __genFeature.
        max_qry_length: maximum query length kept (in tokens).
        max_doc_length: maximum document length kept (in tokens).
        query_path: path to the query file; defaults to the corpus' training-query layout.
        document_path: path to the document file; defaults to the corpus' split-doc layout.
        corpus: corpus name used to build the default paths (e.g. "TDT2").
    """
    res_pos = True  # preserve token positions during preprocessing
    self.num_vocab = 51253  # fixed vocabulary size of the word-ID corpus
    self.max_qry_length = max_qry_length
    self.max_doc_length = max_doc_length
    self.num_of_homo_feats = num_of_homo_feats
    # Fall back to the conventional corpus layout when paths are not supplied.
    # NOTE: use `is None` for singleton comparison (PEP 8), not `== None`.
    if query_path is None:
        query_path = "../Corpus/" + corpus + "/Train/XinTrainQryTDT2/QUERY_WDID_NEW"
    if document_path is None:
        document_path = "../Corpus/" + corpus + "/SPLIT_DOC_WDID_NEW"
    # relevance set (query -> relevant-document mapping used for HMM training)
    self.hmm_training_set = ProcDoc.readRELdict()
    # read documents, reserving position info; 200 is passed straight to docPreproc
    # (presumably a length cutoff — confirm against ProcDoc.docPreproc)
    doc = ProcDoc.readFile(document_path)
    self.doc = ProcDoc.docPreproc(doc, res_pos, 200)
    # read queries, reserving position info, restricted to the relevance set
    qry = ProcDoc.readFile(query_path)
    self.qry = ProcDoc.qryPreproc(qry, self.hmm_training_set, res_pos, 200)
    # generate homogeneous features
    self.homo_feats = self.__genFeature(num_of_homo_feats)
def __init__(self, qry_path=None, rel_path=None, isTraining=True, doc_path=None):
    """Load queries/documents, compute TF-IDF vectors, and prepare numpy arrays.

    Args:
        qry_path: path to the query file; defaults to the TDT2 training queries.
        rel_path: path to the relevance-judgment file; defaults to the TDT2
            HMM outside-train judgments.
        isTraining: whether to read the relevance set in training mode
            (forwarded to ProcDoc.readRELdict and EvaluateModel).
        doc_path: path to the document file; defaults to the TDT2 split docs.
    """
    # Default to the training-step corpus layout when paths are not supplied.
    # NOTE: use `is None` for singleton comparison (PEP 8), not `== None`.
    if qry_path is None:
        qry_path = "../Corpus/TDT2/Train/XinTrainQryTDT2/QUERY_WDID_NEW"
    if doc_path is None:
        doc_path = "../Corpus/TDT2/SPLIT_DOC_WDID_NEW"
    if rel_path is None:
        rel_path = "../Corpus/TDT2/Train/QDRelevanceTDT2_forHMMOutSideTrain"
    self.vocab_size = 51253  # fixed vocabulary size of the word-ID corpus
    # relevance set and matching evaluation model
    self.rel_set = ProcDoc.readRELdict(rel_path, isTraining)
    self.evaluate_model = EvaluateModel(rel_path, isTraining)
    # read documents and accumulate their lengths
    doc = ProcDoc.readFile(doc_path)
    self.doc = ProcDoc.docPreproc(doc)
    self.doc_len = Statistical.compLenAcc(self.doc)
    # read queries (restricted to the relevance set) and accumulate lengths
    qry = ProcDoc.readFile(qry_path)
    self.qry_tf = ProcDoc.qryPreproc(qry, self.rel_set)
    self.qry_len = Statistical.compLenAcc(self.qry_tf)
    # weight raw term frequencies into TF-IDF vectors
    [self.qry, self.doc] = Statistical.TFIDF(self.qry_tf, self.doc, self.qry_len, self.doc_len)
    # convert dict representations to numpy arrays; query TF-IDF reuses the
    # ID ordering of the raw-TF conversion so rows stay aligned
    self.qry_tf, self.qry_tf_IDs = self.__dict2np(self.qry_tf)
    self.qry, self.qry_IDs = self.__dict2np(self.qry, self.qry_tf_IDs)
    self.doc, self.doc_IDs = self.__dict2np(self.doc)
    # precompute L2-normalized document vectors
    self.doc = Statistical.l2Normalize(self.doc)
type_feat = "sparse" # or embeddings query_path = None document_path = None QDrel_file_path = None corpus = "TDT2" # qry and doc if query_path == None: query_path = "../Corpus/" + corpus + "/Train/XinTrainQryTDT2/QUERY_WDID_NEW" if document_path == None: document_path = "../Corpus/" + corpus + "/SPLIT_DOC_WDID_NEW" if QDrel_file_path == None: QDrel_file_path = "../Significant-Words-Language-Models/train-qry-results-0.675969697596.txt" # relevancy set hmm_training_set = ProcDoc.readRELdict() # read document, reserve position doc = ProcDoc.readFile(document_path) doc = ProcDoc.docPreproc(doc, RES_POS) # read query, reserve position qry = ProcDoc.readFile(query_path) qry = ProcDoc.qryPreproc(qry, hmm_training_set, RES_POS) QDrel = RelPrep.readQDRel(QDrel_file_path) print len(qry), len(doc) print len(QDrel) NRMprep.getTrainAndValidation(qry, doc, QDrel, NUM_VOCAB, type_rank, type_feat) # (pointwise or pairwise) and (sparse or embeddings) # prepare data and label