Example #1
    def tokenize(self, text):
        # Lowercase the text and split it into word tokens.
        tokens = nltk.tokenize.word_tokenize(text.lower())
        # Optionally remove stopwords.
        if self.drop_stopwords:
            tokens = drop_words(tokens, self.stopword)

        terms = clean_query(tokens)
        # Strip apostrophes and drop any terms that become empty.
        out_terms = []
        for t in terms:
            t = t.replace("'", "")
            if t:
                out_terms.append(t)
        return out_terms
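
clean_query itself is not defined in any of these snippets. From its call sites it appears to take a list of tokens and return a filtered list. A minimal sketch consistent with that usage (the exact filtering rules are an assumption, not the real implementation):

def clean_query(tokens):
    # Hypothetical sketch: keep tokens that contain at least one
    # alphanumeric character, dropping pure punctuation. The real
    # filtering rules are not shown in these examples.
    out = []
    for t in tokens:
        if any(c.isalnum() for c in t):
            out.append(t)
    return out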
Example #2
def write_claim_as_query():
    # Load claim ids and fetch the corresponding claim records.
    # NOTE: the ids come from the train split, while the output
    # filename below says "dev".
    d_ids = list(load_train_claim_ids())
    claims = get_claims_from_ids(d_ids)
    queries = []
    for c in claims:
        cid = c["cId"]
        claim_text = c["text"]
        # Whitespace-tokenize the claim text and clean it for querying.
        tokens = claim_text.split()
        query_text = clean_query(tokens)
        print(query_text)
        # Wrap the cleaned tokens as a BM25 (Anserini) query entry.
        q_entry = get_query_entry_bm25_anseri(cid, query_text)
        queries.append(q_entry)

    out_path = os.path.join(output_path, "perspective_dev_claim_query.json")
    save_queries_to_file(queries, out_path)
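
get_query_entry_bm25_anseri and save_queries_to_file are also not shown. A plausible minimal sketch, assuming a simple JSON layout with an id and a text field per query (both the structure and the field names are assumptions):

import json

def get_query_entry_bm25_anseri(qid, tokens):
    # Hypothetical: package a query id and its cleaned tokens as one entry.
    return {"qid": str(qid), "query": " ".join(tokens)}

def save_queries_to_file(queries, out_path):
    # Hypothetical: dump all query entries into a single JSON file.
    with open(out_path, "w") as f:
        json.dump({"queries": queries}, f, indent=2)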
Example #3
def main():
    print("Start")
    # Stream (query id, query) pairs from pickled shards.
    spr = StreamPickleReader("robust_candi_query_")
    query_per_task = 1000 * 10  # queries per output file
    out_idx = 0
    while spr.has_next():
        queries = []
        for _ in range(query_per_task):
            if not spr.has_next():
                break
            q_id, query = spr.get_item()
            query = clean_query(query)
            queries.append(get_query_entry(q_id, query))

        # Write this batch as one numbered JSON file.
        out_path = os.path.join(cpath.output_path, "query",
                                "g_query_{}.json".format(out_idx))
        save_queries_to_file(queries, out_path)
        out_idx += 1
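
StreamPickleReader is not defined here either. A minimal sketch of one way it might work, assuming it streams items from sequentially numbered pickle shards that share a filename prefix (the shard naming scheme and directory are assumptions):

import os
import pickle

class StreamPickleReader:
    def __init__(self, prefix, base_dir="."):
        self.prefix = prefix
        self.base_dir = base_dir
        self.file_idx = 0
        self.items = []
        self.item_idx = 0
        self._load_next_shard()

    def _load_next_shard(self):
        # Advance to the next non-empty shard; stop when a shard is missing.
        while True:
            path = os.path.join(self.base_dir,
                                "{}{}".format(self.prefix, self.file_idx))
            if not os.path.exists(path):
                self.items = []
                return
            with open(path, "rb") as f:
                self.items = pickle.load(f)
            self.item_idx = 0
            self.file_idx += 1
            if self.items:
                return

    def has_next(self):
        return self.item_idx < len(self.items)

    def get_item(self):
        item = self.items[self.item_idx]
        self.item_idx += 1
        if self.item_idx >= len(self.items):
            self._load_next_shard()
        return item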
Example #4
def transform(q: Query) -> Dict:
    # Tokenize the query text, clean it, and format it as a BM25 query.
    tokens = word_tokenize(q.text)
    tokens = clean_query(tokens)
    return format_query_bm25(q.qid, tokens)
Example #5
def clean_tokenize_str_to_tokens(raw_str: str) -> List[str]:
    # NLTK-tokenize, clean, and lowercase into a list of terms.
    terms = clean_query(nltk.word_tokenize(raw_str))
    terms = [t.lower() for t in terms]
    return terms
Example #6
def clean_text_for_query(raw_str: str) -> str:
    # Same pipeline as above, but join the lowercased terms into one string.
    terms = clean_query(nltk.word_tokenize(raw_str))
    terms = " ".join([t.lower() for t in terms])
    return terms
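
For reference, how the two helpers above might behave on a sample string, assuming a clean_query like the sketch under Example #1 (so punctuation tokens are dropped):

>>> clean_tokenize_str_to_tokens("The Quick, Brown Fox!")
['the', 'quick', 'brown', 'fox']
>>> clean_text_for_query("The Quick, Brown Fox!")
'the quick brown fox'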