def tokenize(self, text):
    # Lowercase, tokenize, and optionally drop stopwords before cleaning.
    tokens = nltk.tokenize.word_tokenize(text.lower())
    if self.drop_stopwords:
        tokens = drop_words(tokens, self.stopword)
    terms = clean_query(tokens)
    # Strip apostrophes and drop any term that becomes empty as a result.
    out_terms = []
    for t in terms:
        t = t.replace("'", "")
        if t:
            out_terms.append(t)
    return out_terms
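
# drop_words and clean_query are project helpers not shown in this file.
# A minimal sketch of plausible implementations, assuming clean_query simply
# filters out punctuation-only tokens; the real definitions may differ.
def drop_words(tokens, stopwords):
    # Remove every token that appears in the stopword collection.
    return [t for t in tokens if t not in stopwords]


def clean_query(tokens):
    # Keep only tokens containing at least one alphanumeric character,
    # discarding punctuation-only tokens produced by the tokenizer.
    return [t for t in tokens if any(ch.isalnum() for ch in t)]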
def write_claim_as_query():
    # Load the training claims and convert each claim text into a BM25 query.
    d_ids = list(load_train_claim_ids())
    claims = get_claims_from_ids(d_ids)
    queries = []
    for c in claims:
        cid = c["cId"]
        claim_text = c["text"]
        tokens = claim_text.split()
        query_text = clean_query(tokens)
        print(query_text)
        q_entry = get_query_entry_bm25_anseri(cid, query_text)
        queries.append(q_entry)
    out_path = os.path.join(output_path, "perspective_dev_claim_query.json")
    save_queries_to_file(queries, out_path)
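
# get_query_entry_bm25_anseri and save_queries_to_file are defined elsewhere
# in the repo. A hedged sketch, under the assumption that queries are written
# as a JSON object in an Anserini/Galago-style batch format; the field names
# ("number", "text") and the {"queries": [...]} wrapper are guesses.
import json


def get_query_entry_bm25_anseri(qid, tokens):
    # Hypothetical: pair the query id with the space-joined query terms.
    return {"number": str(qid), "text": " ".join(tokens)}


def save_queries_to_file(queries, out_path):
    with open(out_path, "w") as f:
        json.dump({"queries": queries}, f, indent=1)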
def main():
    print("Start")
    spr = StreamPickleReader("robust_candi_query_")
    query_per_task = 1000 * 10  # write at most 10,000 queries per output file
    out_idx = 0
    while spr.has_next():
        # Drain up to one file's worth of (query id, query) pairs.
        queries = []
        for i in range(query_per_task):
            if not spr.has_next():
                break
            q_id, query = spr.get_item()
            query = clean_query(query)
            queries.append(get_query_entry(q_id, query))
        out_path = os.path.join(cpath.output_path, "query",
                                "g_query_{}.json".format(out_idx))
        save_queries_to_file(queries, out_path)
        out_idx += 1
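
# StreamPickleReader is project code not shown here. A rough sketch of the
# reading protocol main() relies on: sequentially draining pickle shards that
# share a filename prefix. The shard naming and on-disk layout are assumptions.
import os
import pickle


class StreamPickleReader:
    def __init__(self, prefix):
        self.prefix = prefix
        self.buffer = []   # items remaining in the currently loaded shard
        self.shard_idx = 0

    def _shard_path(self):
        return self.prefix + str(self.shard_idx)

    def has_next(self):
        # Refill the buffer from the next shard once the current one drains.
        while not self.buffer and os.path.exists(self._shard_path()):
            with open(self._shard_path(), "rb") as f:
                self.buffer = pickle.load(f)
            self.shard_idx += 1
        return bool(self.buffer)

    def get_item(self):
        return self.buffer.pop(0)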
def transform(q: Query) -> Dict:
    tokens = word_tokenize(q.text)
    tokens = clean_query(tokens)
    return format_query_bm25(q.qid, tokens)
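
# format_query_bm25 is defined elsewhere; a minimal sketch, assuming it emits
# the same entry shape as the BM25 query writers above (field names guessed).
from typing import Dict, List


def format_query_bm25(qid: str, tokens: List[str]) -> Dict:
    return {"number": qid, "text": " ".join(tokens)}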
def clean_tokenize_str_to_tokens(raw_str: str) -> List[str]:
    terms = clean_query(nltk.word_tokenize(raw_str))
    terms = [t.lower() for t in terms]
    return terms
def clean_text_for_query(raw_str: str) -> str:
    terms = clean_query(nltk.word_tokenize(raw_str))
    terms = " ".join([t.lower() for t in terms])
    return terms
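
# The two helpers above differ only in return type: a token list versus a
# single space-joined string. Example, assuming the clean_query sketch above
# (which drops punctuation-only tokens):
#
#   clean_tokenize_str_to_tokens("What's BM25?")  -> ["what", "'s", "bm25"]
#   clean_text_for_query("What's BM25?")          -> "what 's bm25"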