def load_from_db_or_from_galago(table_name, key, galago_fn):
    # Return the cached value if present; otherwise run the Galago call,
    # cache its result, and return it.
    if has_key(table_name, key):
        return load(table_name, key)
    r = galago_fn()
    if not has_key(table_name, key):
        save(table_name, key, r)
        flush()
    return r
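# Usage sketch (illustrative, not from this repo): wrap a Galago call so its
# result is cached under QueryResult, keyed the same way as in get_doc_list
# below. run_galago_query_for is a hypothetical helper standing in for the
# actual Galago invocation.
def get_ranked_list_cached(query_id: str, query_text: str):
    def run_query():
        return run_galago_query_for(query_text)  # hypothetical Galago call

    q_res_id = "{}_{}".format(query_id, q_config_id)
    return load_from_db_or_from_galago(QueryResult, q_res_id, run_query)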
def get_doc_list(query_id: str):
    q_res_id: str = "{}_{}".format(query_id, q_config_id)
    ticker.tick()
    if has_key(QueryResult, q_res_id):
        r: List[SimpleRankedListEntry] = load(QueryResult, q_res_id)
        for entry in r:
            doc_id, rank, score = entry
            doc_list.add(doc_id)
def enum_paragraph(step_size,
                   subword_len,
                   subword_tokenize: Callable[[str], List[Subword]],
                   doc: SimpleRankedListEntry) -> Iterable[Paragraph]:
    # load tokens and BERT subword tokens
    tokens = load(TokenizedCluewebDoc, doc.doc_id)
    subword_tokens: List[List[Subword]] = lmap(subword_tokenize, tokens)
    cursor = 0
    while cursor < len(subword_tokens):
        cursor_ed = move_cursor(subword_tokens, cursor, subword_len)
        yield Paragraph(doc_id=doc.doc_id,
                        doc_rank=doc.rank,
                        doc_score=doc.score,
                        subword_tokens=list(flatten(subword_tokens[cursor:cursor_ed])),
                        tokens=tokens[cursor:cursor_ed])
        cursor += step_size
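# Usage sketch: drive enum_paragraph with the BERT tokenizer returned by
# get_tokenizer() (used in write_tfrecord below). The window/step values are
# illustrative assumptions, and Subword is assumed to be a plain subword string.
def enum_doc_paragraphs(doc: SimpleRankedListEntry) -> Iterable[Paragraph]:
    tokenizer = get_tokenizer()
    return enum_paragraph(step_size=64,
                          subword_len=256,
                          subword_tokenize=tokenizer.tokenize,
                          doc=doc)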
def write_tfrecord(ranked_list_d: RankedListDict,
                   queries: List[Query],
                   q_rels: Dict[str, List[str]],
                   save_path):
    max_seq_length = 512
    tokenizer = get_tokenizer()
    encoder = AllSegmentAsDoc(max_seq_length)
    writer = RecordWriterWrap(save_path)
    data_id = 0
    data_info = []
    for query in queries:
        if query.qid not in ranked_list_d:
            print("Warning query {} not found".format(query.qid))
            continue
        print(query.qid)
        ranked_list = ranked_list_d[query.qid]
        doc_ids = [doc_entry.doc_id for doc_entry in ranked_list]
        preload_man.preload(BertTokenizedCluewebDoc, doc_ids)
        q_tokens = tokenizer.tokenize(query.text)
        for doc_entry in ranked_list:
            try:
                tokens_list: List[List[str]] = load(BertTokenizedCluewebDoc, doc_entry.doc_id)
                tokens = flatten(tokens_list)
                insts: List[Tuple[List, List]] = encoder.encode(q_tokens, tokens)
                for inst in insts:
                    label = doc_entry.doc_id in q_rels[query.qid]
                    input_tokens, segment_ids = inst
                    feature = get_basic_input_feature(tokenizer, max_seq_length, input_tokens, segment_ids)
                    feature["label_ids"] = create_int_feature([int(label)])
                    feature["data_id"] = create_int_feature([int(data_id)])
                    writer.write_feature(feature)
                    data_info.append((data_id, query.qid, doc_entry.doc_id))
                    data_id += 1
            except KeyError:
                print("doc {} not found".format(doc_entry.doc_id))
    return data_info
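# Usage sketch: write_tfrecord takes a ranked list per qid, the query objects,
# and qrels (qid -> relevant doc ids), and returns (data_id, qid, doc_id)
# triples for the examples it wrote. The three loaders below are hypothetical
# placeholders for whatever the pipeline actually uses.
def make_training_tfrecord(save_path: str):
    ranked_list_d: RankedListDict = load_ranked_list_dict()  # hypothetical loader
    queries: List[Query] = load_query_list()                 # hypothetical loader
    q_rels: Dict[str, List[str]] = load_qrels()              # hypothetical loader
    data_info = write_tfrecord(ranked_list_d, queries, q_rels, save_path)
    return data_info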
def get_instances(self, cid, data_id_manager, entries):
    doc_ids = lmap(lambda x: x.doc_id, entries)
    preload_man.preload(BertTokenizedCluewebDoc, doc_ids)
    n_doc_not_found = 0
    for entry in entries[:self.top_n]:
        try:
            tokens: List[List[str]] = load(BertTokenizedCluewebDoc, entry.doc_id)
            for sent_idx, sent in enumerate(tokens[:self.num_sent]):
                for pid in self.pid_dict[int(cid)]:
                    info = {
                        'cid': cid,
                        'pid': pid,
                        'doc_id': entry.doc_id,
                        'sent_idx': sent_idx,
                    }
                    yield Instance(pid, sent, data_id_manager.assign(info))
        except KeyError:
            n_doc_not_found += 1
    if n_doc_not_found:
        print("{} of {} docs not found".format(n_doc_not_found, len(doc_ids)))
def generate(claim_lm: ClaimLM, ranked_list: List[SimpleRankedListEntry]) -> List[Record]:
    claim_text = claim_lm.claim
    claim_tokens = bert_tokenizer.tokenize(claim_text)
    claim_token_len = len(claim_tokens)
    log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
    log_odd: Counter = subtract(log_topic_lm, log_bg_lm)
    doc_ids = lmap(lambda x: x.doc_id, ranked_list[:top_n])
    print("loading docs")
    preload_man.preload(BertTokenizedCluewebDoc, doc_ids)
    window_size = max_seq_length - claim_token_len - 3
    step_size = max_seq_length - 112
    enum_paragraph = enum_paragraph_functor(step_size, window_size)

    def get_record(tokens) -> Record:
        scores, masks = get_target_labels(tokens, log_odd, stopwords, fail_logger)
        return Record(claim_tokens, tokens, scores, masks)

    tokens_list: List[List[str]] = []
    not_found = 0
    for doc_id in doc_ids:
        try:
            tokens: List[str] = list(flatten(load(BertTokenizedCluewebDoc, doc_id)))
            tokens_list.append(tokens)
        except KeyError:
            not_found += 1
    print("{} of {} docs not found".format(not_found, len(doc_ids)))
    paragraph_list: Iterable[List[str]] = enum_paragraph(tokens_list)
    records: List[Record] = lmap(get_record, paragraph_list)
    return records
def fetch_from_q_res_id(self, query_res_id: str) -> List[SimpleRankedListEntry]:
    def translate_structure(raw_data) -> List[SimpleRankedListEntry]:
        # Older cache entries store plain (doc_id, rank, score) tuples;
        # convert them to SimpleRankedListEntry when needed.
        try:
            dummy = raw_data[0].doc_id
            r = raw_data
        except AttributeError:
            def tuple_to_ranked_entry(tup) -> SimpleRankedListEntry:
                doc_id, rank, score = tup
                return SimpleRankedListEntry(doc_id=doc_id, rank=rank, score=score)

            r = lmap(tuple_to_ranked_entry, raw_data)
        return r

    try:
        raw_data = load(QueryResult, query_res_id)
        data = translate_structure(raw_data)
        return data
    except KeyError:
        print(query_res_id)
        raise
def get_tokens(doc_id) -> List[str]:
    return load(TokenizedCluewebDoc, doc_id)


def get_db_item_or_make(self, table_name, doc_id):
    if has_key(table_name, doc_id):
        return load(table_name, doc_id)
    print("doc_id not found:", doc_id)
    self.launch_doc_processor(doc_id)
    return load(table_name, doc_id)


def load_tf(doc_id):
    return load(CluewebDocTF, doc_id)


def load_doc(doc_id):
    return load(TokenizedCluewebDoc, doc_id)
def main():
    doc_id = "clueweb12-0005wb-96-30750"
    doc = load(BertTokenizedCluewebDoc, doc_id)
    print("doc has {} lines".format(len(doc)))
    print("last line:", pretty_tokens(doc[-1], True))