def test_generator(data_generator):
    """Turn batches from ``test_gen(data_generator)`` into model-ready inputs.

    Each incoming batch is a triple ``(ids, query, docs)`` where ``docs[k]``
    holds the documents of the k-th query. For every document the query id
    and the padded query tokens are repeated once, so the query-side lists
    line up one-to-one with the document-side lists.

    Yields:
        A 4-tuple ``(query_ids, inputs, docs_ids, None)`` where ``inputs`` is
        ``[query_tokens, doc_tokens, doc_masks]`` as numpy arrays.

    NOTE(review): ``test_gen``, ``maybe_padding``, ``pad_tokens``, ``cfg`` and
    ``flat_list`` come from the enclosing scope; ``maybe_padding`` presumably
    pads ``doc`` in place before its fields are read - confirm.
    """
    for ids, query, docs in test_gen(data_generator):
        doc_ids, doc_tokens, doc_masks = [], [], []
        per_query_tokens, per_query_ids = [], []

        for position, q_id in enumerate(ids):
            q_docs = docs[position]

            for doc in q_docs:
                # pad docs, use cache here
                maybe_padding(doc)
                doc_tokens.append(doc["tokens"])
                doc_masks.append(doc["sentences_mask"])
                doc_ids.append(doc["id"])

            n_docs = len(q_docs)

            # one padded copy of the query per document of this query
            padded_query = pad_tokens([query[position]], cfg["max_q_terms"])[0]
            per_query_tokens.append([padded_query] * n_docs)
            per_query_ids.append([q_id] * n_docs)

        flat_ids = flat_list(per_query_ids)
        model_inputs = [
            np.array(flat_list(per_query_tokens)),
            np.array(doc_tokens),
            np.array(doc_masks),
        ]

        yield flat_ids, model_inputs, doc_ids, None
def _generate(self, **kwargs):
    """Yield a ``(train_collection, test_collection)`` pair for every fold.

    Fold ``k`` is held out for testing; the query lists, gold standards and
    query-doc mappings of all remaining folds are combined for training.
    The test collection also receives ``train_collection.skipped_queries``.
    ``kwargs`` is accepted but not used here.

    NOTE(review): ``flat_list``, ``merge_dicts``, ``TrainCollection`` and
    ``TestCollection`` are defined outside this block; their exact contracts
    should be confirmed there.
    """

    def all_but(per_fold, held_out):
        # concatenation of every fold entry except the held-out one
        return per_fold[:held_out] + per_fold[held_out + 1:]

    for fold, held_out_queries in enumerate(self.folds_query_list):
        # held-out fold -> test side
        held_out_trec_file = self.folds_goldstandard_trec_file[fold]
        held_out_query_docs = self.folds_query_docs[fold]

        # remaining folds -> train side
        train_collection = TrainCollection(
            flat_list(all_but(self.folds_query_list, fold)),
            merge_dicts(all_but(self.folds_goldstandard, fold)),
            merge_dicts(all_but(self.folds_query_docs, fold)),
        )

        test_collection = TestCollection(
            held_out_queries,
            held_out_trec_file,
            held_out_query_docs,
            self.trec_script_eval_path,
            train_collection.skipped_queries,
        )

        yield train_collection, test_collection
def _generate(self, **kwargs):
    """Yield test batches holding at most ``self.b_size`` documents each.

    Queries are taken from ``self.query_list`` in order. Queries whose id is
    in ``self.skipped_queries`` are ignored, and queries without an entry in
    ``self.query_docs`` are reported with a warning and ignored. The
    documents of a query are packed into the current batch; when they do not
    fit, they are split and the query id/text is repeated in the following
    batch(es) together with the remaining documents.

    Yields:
        ``(query_ids, queries, query_docs)`` - three parallel lists where
        ``query_docs[k]`` is the slice of documents that belongs to
        ``query_ids[k]`` / ``queries[k]`` inside this batch. The final batch
        may contain fewer than ``self.b_size`` documents.

    Raises:
        ValueError: if ``self.b_size`` is smaller than 1 (no batch could
            ever make progress).

    ``kwargs`` is accepted but not used here.
    """
    batch_size = self.b_size
    if batch_size < 1:
        raise ValueError("b_size must be a positive integer, got %r" % (batch_size,))

    query_ids = []
    queries = []
    query_docs = []
    # running number of documents buffered in the current batch; avoids
    # re-flattening query_docs on every iteration
    buffered = 0

    for query_data in self.query_list:
        query_id = query_data["id"]

        if query_id in self.skipped_queries:
            continue

        if query_id not in self.query_docs:
            print("[WARNING] -", query_id, "does not have docs, so it will be skipped")
            continue

        docs = self.query_docs[query_id]
        start = 0

        # do-while: the body runs at least once so that a query with an
        # empty document list still gets an entry in the batch
        while True:
            # take as many of the remaining docs as still fit in the batch
            chunk = docs[start:start + (batch_size - buffered)]
            start += len(chunk)
            buffered += len(chunk)

            query_ids.append(query_id)
            queries.append(query_data["query"])
            query_docs.append(chunk)

            # output according to the batch size
            if buffered >= batch_size:
                yield query_ids, queries, query_docs
                # reset vars
                query_ids = []
                queries = []
                query_docs = []
                buffered = 0

            # all docs of this query were consumed (an exact fit ends here
            # as well, so no empty leftover entry is created)
            if start >= len(docs):
                break

    # last batch: flush whatever did not fill a complete batch
    if query_ids:
        yield query_ids, queries, query_docs