Example #1
def get_softcopy_ids(input, output, sent_length, topN, related_words_dict=None):
    """
    Given input and output text, we get a padded soft copy target
    numpy array with a pre-calculated related_words_dict dict.
    """
    if isinstance(input, str):
        input = NLP(input)
    if isinstance(output, str):
        output = NLP(output)

    input_tokens = set([token.text.lower() for token in input])
    input_lemmas = set([token.lemma_ for token in input])
    input_synonyms = set()
    input_antonyms = set()
    input_semantic_related = set()
    for token in input:
        if related_words_dict is not None:
            synonyms = set(related_words_dict[token.text]["synonyms"])
            antonyms = set(related_words_dict[token.text]["antonyms"])
            semantic_related = set(related_words_dict[token.text]["semantic_related"][:topN])
        else:
            synonyms = set(get_synonyms(token.text))
            antonyms = set(get_antonyms(token.text))
            semantic_related = set(get_semantic_related_words(token.text, topN))
        input_synonyms |= synonyms
        input_antonyms |= antonyms
        input_semantic_related |= semantic_related

    is_copy_padded = np.zeros([sent_length], dtype=np.float32)
    i = 0
    for token in output:
        if token.text.lower() in input_tokens:  # hard copy
            is_copy_padded[i] = 1
        elif token.lemma_ in input_lemmas:  # variant form
            is_copy_padded[i] = 2
        elif token.text.lower() in input_synonyms:  # synonym
            is_copy_padded[i] = 3
        elif token.text.lower() in input_antonyms:  # antonym
            is_copy_padded[i] = 4
        elif token.text.lower() in input_semantic_related or token.text in input_semantic_related:  # semantic related
            is_copy_padded[i] = 5

        i += 1
        if i >= sent_length:
            break

    return is_copy_padded
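A minimal usage sketch, assuming NLP is the module's spaCy pipeline (e.g. spacy.load("en_core_web_sm")) and passing a small hand-built related_words_dict so the synonym/antonym/embedding helpers are not needed:

empty = {"synonyms": [], "antonyms": [], "semantic_related": []}
related = {tok: dict(empty) for tok in ["The", "big", "dog", "barked", "."]}
related["big"] = {"synonyms": ["large"], "antonyms": ["small"], "semantic_related": ["huge"]}

ids = get_softcopy_ids("The big dog barked.", "A large dog was barking.",
                       sent_length=10, topN=1, related_words_dict=related)
# ids holds one label per output token, zero-padded to sent_length:
# 0 = no copy, 1 = exact copy, 2 = same lemma, 3 = synonym, 4 = antonym, 5 = semantically related.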
Example #2
def text2tokens(sentence):
    """
    Transform a text string into a list of token texts.
    :param sentence: input text
    :return: list of token texts
    """
    doc = NLP(sentence)
    return [token.text for token in doc]
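A quick sanity check (exact token boundaries depend on the loaded spaCy model):

print(text2tokens("Mary has lived in England."))
# expected: ['Mary', 'has', 'lived', 'in', 'England', '.']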
Example #3
def get_answer_ner_tag(context, answer_text, processed_by_spacy=False):
    """
    Return the NER label of the entity that matches (or contains) answer_text
    in the given context, or 'UNK' if no entity matches.
    """
    label = 'UNK'
    if not processed_by_spacy:
        doc = NLP(context)
    else:
        doc = context
    for ent in doc.ents:
        if ent.text == answer_text or answer_text in ent.text:
            return ent.label_
    return label
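A small usage sketch, again assuming the module-level NLP pipeline includes an English NER component:

print(get_answer_ner_tag("Barack Obama was born in Hawaii.", "Hawaii"))
# a standard English NER model typically labels "Hawaii" as GPE; 'UNK' means no entity matched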
Example #4
def get_content_ids(text, function_words_list, sent_length):
    """
    Get a padded binary numpy array indicating which tokens of the input
    text are content (rather than function) words.
    """
    if isinstance(text, str):
        text = NLP(text)

    is_content = separate_content_function_words(text, function_words_list)
    is_content_padded = np.zeros([sent_length], dtype=np.float32)
    is_content_padded[:min(len(is_content), sent_length)] = is_content[:sent_length]

    return is_content_padded
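A usage sketch, assuming FUNCTION_WORDS_LIST is the module's function-word list (the same list used in Example #13 below):

mask = get_content_ids("Mary has lived in England for ten years.", FUNCTION_WORDS_LIST, sent_length=15)
# mask[i] == 1 for content words ("Mary", "lived", "England", ...), 0 for function words and padding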
Example #5
def select_clues(context, answer, answer_bio_ids, max_dependency_distance, processed_by_spacy=False):
    """
    Select clue chunks given context and answer.
    """
    # Returns a list of {"clue_text": ..., "clue_binary_ids": ...} dicts.
    answer_start = answer_bio_ids.index('B')
    try:
        answer_end = list(reversed(answer_bio_ids)).index('I')
        answer_end = len(answer_bio_ids) - 1 - answer_end
    except ValueError:  # single-token answer: no 'I' tag
        answer_end = answer_start
    if not processed_by_spacy:
        doc = NLP(context)
    else:
        doc = context

    doc_token_list = [token for token in doc]
    # text_str = ' '.join([tk.text for tk in doc])

    idx2token, idx2related, context_tokens = get_all_related(doc, doc_token_list)
    clue_flags = [0] * len(doc)
    for aid in range(answer_start, answer_end + 1):
        sort_related = idx2related[aid]
        for tk_id, path in sort_related:
            if (tk_id < answer_start or tk_id > answer_end) and len(path) <= max_dependency_distance:
                cur_clue = idx2token[tk_id]
                if cur_clue.pos_ not in ['ADP', 'DET', 'ADV', 'PUNCT', 'PART']:
                    clue_flags[tk_id] = 1
    clues = []
    i = 0
    while i < len(clue_flags):
        if clue_flags[i] == 0:
            i += 1
            continue
        j = i
        while j < len(clue_flags):
            if clue_flags[j] == 1:
                j += 1
            else:
                break
        clue_text = ' '.join(context_tokens[i:j])
        clue_binary_ids = [0] * len(clue_flags)
        clue_binary_ids[i:j] = [1] * (j - i)
        clues.append({"clue_text": clue_text, "clue_binary_ids": clue_binary_ids})
        i = j
    return clues
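A usage sketch for select_clues, assuming the module's NLP pipeline and the get_all_related dependency helper; the BIO tags mark "John" as the answer span:

context = "John had eaten lunch before his colleague arrived."
answer_bio_ids = ['B'] + ['O'] * 8  # one tag per spaCy token; "John" is the answer
clues = select_clues(context, "John", answer_bio_ids, max_dependency_distance=4)
for clue in clues:
    print(clue["clue_text"], clue["clue_binary_ids"])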
Example #6
def get_chunks(sentence, doc=None):
    """
    Given a sentence, return a list of its chunks as (ner_tag, pos_tag, leaves_without_position, st, ed)
    tuples, e.g. ('PERSON', 'NP', ['Beyoncé', 'Giselle', 'Knowles-Carter'], 0, 2).
    """
    tree = None
    try:
        # NOTICE: parsing may fail when the sentence is too long.
        tree = PARSER.parse(sentence)
    except Exception:
        pass
    if doc is None:
        doc = NLP(sentence)
    max_depth, node_num, orig_chunklist = _navigate(tree)

    chunklist = []
    # parse the result of _navigate
    for chunk in orig_chunklist:
        try:
            if chunk[0] == 'word':
                continue
            chunk_pos_tag, leaves = chunk
            leaves_without_position = []
            position_list = []
            for v in leaves:
                tmp = v.split('___')
                wd = tmp[0]
                index = int(tmp[1])
                leaves_without_position.append(wd)
                position_list.append(index)
            st = position_list[0]
            ed = position_list[-1]

            chunk_ner_tag = "UNK"
            chunk_text = " ".join(leaves_without_position)
            for ent in doc.ents:
                if ent.text == chunk_text or chunk_text in ent.text:
                    chunk_ner_tag = ent.label_

            chunklist.append((chunk_ner_tag, chunk_pos_tag,
                              leaves_without_position, st, ed))
        except Exception:  # skip malformed chunks
            continue
    return chunklist, tree, doc
Example #7
def select_answers(context, processed_by_spacy=False):
    """
    Given a context, select which spans of its words can serve as answers.
    """
    # Returns a list of (answer_text, char_st, char_ed, answer_bio_ids, label) tuples.
    tree = None
    try:
        # TODO: parsing may fail when the context is too long.
        tree = PARSER.parse(context)
    except Exception:
        pass
    if not processed_by_spacy:
        doc = NLP(context)
    else:
        doc = context
    token2idx, idx2token = get_token2char(doc)
    max_depth, node_num, chunklist = _navigate(tree)
    answer_chunks = _post(chunklist)
    answers = []
    for chunk in answer_chunks:
        label, leaves, st, ed = chunk
        # print('leaves={}\tst={}\ted={}\tlabel={}'.format(leaves, st, ed, label))
        try:
            char_st, char_ed = str_find(context, leaves)
            if char_st < 0:
                continue
            answer_text = context[char_st:char_ed + 1]
            st = idx2token[char_st]
            ed = idx2token[char_ed]
            answer_bio_ids = ['O'] * len(doc)
            answer_bio_ids[st:ed + 1] = ['I'] * (ed - st + 1)
            answer_bio_ids[st] = 'B'
            char_st = token2idx[st][0]
            char_ed = token2idx[ed][1]
            # print('answer_text={}\tchar_st={}\tchar_ed={}\tst={}\ted={}'.format(answer_text, char_st, char_ed, st, ed))
        except Exception:  # skip answers that cannot be aligned back to the context
            continue
        answers.append((answer_text, char_st, char_ed, answer_bio_ids, label))

    return answers
Example #8
def get_spacy_processed_examples(config,
                                 augmented_sentences,
                                 debug=False,
                                 debug_length=20,
                                 shuffle=False):
    print("Start transform augmented sentences to spaCy processed examples...")
    start = datetime.now()
    examples = []
    i = 0
    for s in tqdm(augmented_sentences):
        if "ans_sent_doc" not in s:
            s["ans_sent_doc"] = NLP(s["context"])
        s["ans_sent_tokens"] = [token.text for token in s["ans_sent_doc"]]
        examples.append(s)
        i += 1
        if debug and i >= debug_length:
            break
    if shuffle:
        random.shuffle(examples)

    print(("Time of get spaCy processed examples: {}").format(datetime.now() -
                                                              start))
    print("Number of spaCy processed examples: ", len(examples))
    return examples
Example #9
def get_spacy_processed_examples(config,
                                 raw_examples,
                                 debug=False,
                                 debug_length=20,
                                 shuffle=False):
    """
    Get a list of spaCy processed examples given raw examples.
    """
    print("Start transform raw examples to spaCy processed examples...")
    start = datetime.now()
    examples = []
    eval_examples = []
    meta = {}
    meta["num_q"] = 0

    for t in QUESTION_TYPES:
        meta[t] = 0

    for e in tqdm(raw_examples):
        meta["num_q"] += 1

        ans_sent = normalize_text(e["ans_sent"])
        ans_sent_doc = NLP(ans_sent)
        ans_sent_tokens = [token.text for token in ans_sent_doc]
        ans_sent_chars = [list(token) for token in ans_sent_tokens]
        spans = get_token_char_level_spans(ans_sent, ans_sent_tokens)
        ans_sent_syntactic_edges = get_dependency_tree_edges(ans_sent_doc)

        ques = normalize_text(e["question"])
        ques = "<sos> " + ques + " <eos>"  # notice: this is important for QG
        ques_doc = NLP(ques)
        ques_tokens = [token.text for token in ques_doc]
        ques_chars = [list(token) for token in ques_tokens]
        ques_type, ques_type_id = get_question_type(e["question"])
        meta[ques_type] += 1

        answer_text = normalize_text(e["answer_text"])
        answer_start = e["answer_start"]
        answer_end = answer_start + len(answer_text)
        answer_span = []

        for idx, span in enumerate(spans):
            if not (answer_end <= span[0] or answer_start >= span[1]):
                answer_span.append(idx)
        y1_in_sent, y2_in_sent = answer_span[0], answer_span[-1]
        answer_in_sent = " ".join(ans_sent_tokens[y1_in_sent:y2_in_sent + 1])

        example = {
            "question": ques,
            "ques_doc": ques_doc,
            "ques_tokens": ques_tokens,
            "ques_chars": ques_chars,
            "ques_type": ques_type,  # string type
            "ques_type_id": ques_type_id,  # type id number
            "ans_sent": ans_sent,
            "ans_sent_doc": ans_sent_doc,
            "ans_sent_tokens": ans_sent_tokens,
            "ans_sent_chars": ans_sent_chars,
            "ans_sent_syntactic_edges": ans_sent_syntactic_edges,
            "answer": answer_in_sent,
            "y1_in_sent": y1_in_sent,
            "y2_in_sent": y2_in_sent,
            "id": meta["num_q"]
        }
        examples.append(example)

        if debug and meta["num_q"] >= debug_length:
            break

    if shuffle:
        random.shuffle(examples)

    print(("Time of get spaCy processed examples: {}").format(datetime.now() -
                                                              start))
    print("Number of spaCy processed examples: ", len(examples))
    return examples, meta, eval_examples
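A hypothetical raw example, with field names taken from the keys this function reads; config is accepted but not used in the body shown above:

raw_examples = [{
    "ans_sent": "Mary has lived in England for ten years.",
    "question": "How long has Mary lived in England?",
    "answer_text": "ten years",
    "answer_start": 30,  # character offset of "ten years" in ans_sent
}]
examples, meta, _ = get_spacy_processed_examples(config=None, raw_examples=raw_examples)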
Example #10
def get_dependency_tree_edges(doc):
    edges = []
    for token in doc:
        e = [token.head.i, token.i, token.dep_]
        edges.append(e)
    return edges


if __name__ == "__main__":
    text = "<sos> How are you? <eos>"
    text2 = "Apple is how are you three"
    counters = {}
    counters["word"] = Counter()
    counters["char"] = Counter()
    counters["pos"] = Counter()
    counters["ner"] = Counter()
    counters["iob"] = Counter()
    counters["dep"] = Counter()
    spacy_doc = NLP(text)
    spacy_doc2 = NLP(text2)
    print(spacydoc2bpe(spacy_doc2))
    for token in spacy_doc:
        counters["word"][token.text] += 1
        for char in token.text:
            counters["char"][char] += 1
        counters["pos"][token.tag_] += 1
        counters["ner"][token.ent_type_] += 1
        counters["iob"][token.ent_iob_] += 1
        counters["dep"][token.dep_] += 1
    emb_mats = {}
    emb_dicts = {}
    emb_mats["word"], emb_dicts["word"] = get_embedding(counters["word"],
                                                        "word",
                                                        vec_size=3)
Example #11
def get_clue_info(question, sentence, answer, answer_start,
                  chunklist=None, y1_in_sent=None, doc=None, ques_doc=None, sent_limit=100):
    """
    Select the clue chunk in the sentence that best matches the question and return
    its POS/NER tags, length, dependency distance to the answer, and a padded
    binary mask over the sentence tokens.
    """
    example = {
        "question": question,
        "ans_sent": sentence,
        "answer_text": answer}

    if doc is None:
        doc = NLP(sentence)

    if ques_doc is None:
        ques_doc = NLP(question)

    if chunklist is None:
        chunklist, _, _ = get_chunks(sentence, doc)

    example["ans_sent_tokens"] = [token.text for token in doc]
    example["ques_tokens"] = [token.text for token in ques_doc]
    example["ans_sent_doc"] = doc

    if y1_in_sent is None:
        spans = get_token_char_level_spans(sentence, example["ans_sent_tokens"])
        answer_end = answer_start + len(answer)
        answer_span = []
        for idx, span in enumerate(spans):
            if not (answer_end <= span[0] or
                    answer_start >= span[1]):
                answer_span.append(idx)
        y1_in_sent = answer_span[0]

    answer_start = y1_in_sent

    doc_token_list = [token for token in doc]
    idx2token, idx2related, context_tokens = get_all_related(doc, doc_token_list)

    clue_rank_scores = []
    for chunk in chunklist:
        candidate_clue = chunk[2]  # list of chunk words

        ques_lower = " ".join(example["ques_tokens"]).lower()
        candidate_clue_text = " ".join(candidate_clue).lower()
        ques_lemmas = [t.lemma_ for t in ques_doc]
        sent_lemmas = [t.lemma_ for t in doc]
        ques_tokens = [t.lower() for t in example["ques_tokens"]]
        candidate_clue_is_content = [int(w.lower() not in FUNCTION_WORDS_LIST) for w in candidate_clue]
        candidate_clue_lemmas = sent_lemmas[chunk[3]:chunk[4] + 1]
        candidate_clue_content_lemmas = [candidate_clue_lemmas[i] for i in range(len(candidate_clue_lemmas)) if
                                         candidate_clue_is_content[i] == 1]
        candidate_clue_lemmas_in_ques = [candidate_clue_lemmas[i] for i in range(len(candidate_clue_lemmas)) if
                                         candidate_clue_lemmas[i] in ques_lemmas]
        candidate_clue_content_lemmas_in_ques = [candidate_clue_content_lemmas[i] for i in
                                                 range(len(candidate_clue_content_lemmas)) if
                                                 candidate_clue_content_lemmas[i] in ques_lemmas]

        candidate_clue_tokens_in_ques = [candidate_clue[i] for i in range(len(candidate_clue)) if
                                         candidate_clue[i].lower() in ques_tokens]
        candidate_clue_content_tokens = [candidate_clue[i] for i in range(len(candidate_clue)) if
                                         candidate_clue_is_content[i] == 1]
        candidate_clue_content_tokens_in_ques = [candidate_clue_content_tokens[i] for i in
                                                 range(len(candidate_clue_content_tokens)) if
                                                 candidate_clue_content_tokens[i].lower() in ques_tokens]
        candidate_clue_content_tokens_in_ques_soft = candidate_clue_content_tokens_in_ques  # TODO: implement soft (approximate) matching

        score = 0
        if (len(candidate_clue_lemmas_in_ques) == len(candidate_clue_lemmas) or len(
                candidate_clue_tokens_in_ques) == len(candidate_clue)) and \
                sum(candidate_clue_is_content) > 0 and \
                candidate_clue[0].lower() not in NOT_BEGIN_TOKENS_FOR_ANSWER_CLUE:
            score += len(candidate_clue_content_lemmas_in_ques)
            score += len(candidate_clue_content_tokens_in_ques)
            score += len(candidate_clue_content_tokens_in_ques_soft)
            score += int(candidate_clue_text in ques_lower)

        clue_rank_scores.append(score)
        # print("______".join([str(score), " ".join(chunk[2])]))  #!!!!!!!!! for debug

    if len(clue_rank_scores) == 0 or max(clue_rank_scores) == 0:
        clue_chunk = None
        clue_pos_tag = "UNK"
        clue_ner_tag = "UNK"
        clue_length = 0

        clue_answer_dep_path_len = -1

        selected_clue_binary_ids_padded = np.zeros([sent_limit], dtype=np.float32)
    else:
        clue_chunk = chunklist[clue_rank_scores.index(max(clue_rank_scores))]
        clue_pos_tag = clue_chunk[1]
        clue_ner_tag = clue_chunk[0]
        clue_length = clue_chunk[4] - clue_chunk[3] + 1

        clue_start = clue_chunk[3]
        clue_end = clue_chunk[4]
        clue_answer_dep_path_len = abs(clue_start - answer_start)
        answer_related = idx2related[answer_start]
        for tk_id, path in answer_related:
            if tk_id == clue_start:
                clue_answer_dep_path_len = len(path)

        selected_clue_binary_ids_padded = np.zeros([sent_limit], dtype=np.float32)
        if clue_start < sent_limit and clue_end < sent_limit:
            selected_clue_binary_ids_padded[clue_start:clue_end + 1] = 1

    clue_info = {
        "clue_pos_tag": clue_pos_tag,
        "clue_ner_tag": clue_ner_tag,
        "clue_length": clue_length,
        "clue_chunk": clue_chunk,
        "clue_answer_dep_path_len": clue_answer_dep_path_len,
        "selected_clue_binary_ids_padded": selected_clue_binary_ids_padded,
    }
    return clue_info
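A usage sketch for get_clue_info; it relies on the module's NLP pipeline plus the chunking and dependency helpers (get_chunks, get_token_char_level_spans, get_all_related) and word lists:

info = get_clue_info(question="How long has Mary lived in England?",
                     sentence="Mary has lived in England for ten years.",
                     answer="ten years",
                     answer_start=30)
print(info["clue_chunk"], info["clue_answer_dep_path_len"])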
Example #12
def get_processed_examples(raw_examples,
                           debug=False,
                           debug_length=20,
                           shuffle=False):
    """
    Get a list of processed examples (with clue info) given raw examples.
    """
    print("Start transforming raw examples into processed examples...")
    start = datetime.now()
    examples = []
    meta = {}
    meta["num_q"] = 0
    num_spans_len_error = 0
    num_not_match_error = 0
    for e in tqdm(raw_examples):
        # paragraph info (here is sentence)
        ans_sent = normalize_text(e["ans_sent"])
        ans_sent_doc = NLP(ans_sent)
        ans_sent_tokens = [token.text for token in ans_sent_doc]
        spans = get_token_char_level_spans(ans_sent, ans_sent_tokens)

        # question info
        ques = normalize_text(e["question"])
        # ques = "<sos> " + ques + " <eos>"  # notice: this is important for QG
        ques_doc = NLP(ques)
        ques_type, ques_type_id = get_question_type(e["question"])

        # answer info
        answer_text = normalize_text(e["answer_text"])
        answer_start = e["answer_start"]
        answer_end = answer_start + len(answer_text)
        answer_span = []

        for idx, span in enumerate(spans):
            if not (answer_end <= span[0] or answer_start >= span[1]):
                answer_span.append(idx)
        y1_in_sent, y2_in_sent = answer_span[0], answer_span[-1]
        answer_in_sent = " ".join(ans_sent_tokens[y1_in_sent:y2_in_sent + 1])

        # clue info
        clue_info = FQG_data_augmentor.get_clue_info(ques,
                                                     ans_sent,
                                                     answer_in_sent,
                                                     None,
                                                     chunklist=None,
                                                     y1_in_sent=y1_in_sent,
                                                     doc=ans_sent_doc,
                                                     ques_doc=ques_doc,
                                                     sent_limit=100)
        ans_sent_is_clue = clue_info["selected_clue_binary_ids_padded"]
        clue_token_position = np.where(ans_sent_is_clue == 1)[0]
        if clue_info["clue_chunk"] is not None:
            clue_tokenized_text = " ".join(clue_info["clue_chunk"][2])
        else:
            clue_tokenized_text = None
        if len(clue_token_position) > 0 and clue_tokenized_text is not None:
            clue_start_token_idx = clue_token_position[0]
            clue_end_token_idx = clue_token_position[-1]

            # NOTICE: sometimes len(spans) <= clue_end_token_idx; needs further debugging.
            if len(spans) > clue_end_token_idx:
                clue_start = spans[clue_start_token_idx][0]
                clue_end = spans[clue_end_token_idx][1]  # span end is exclusive: [start, end)
                clue_text = ans_sent[clue_start:clue_end]
            else:
                num_spans_len_error += 1
                clue_text = None

            if clue_text != clue_tokenized_text:
                if clue_text is not None:
                    num_not_match_error += 1
                clue_start = ans_sent.find(clue_tokenized_text)
                clue_end = clue_start + len(clue_tokenized_text)
                if clue_start >= 0:  # find() returns -1 when the clue is not found
                    clue_text = clue_tokenized_text
                    # print("clue_text revised: ", clue_text)
                    # print("clue_start revised: ", clue_start)
                else:
                    continue
        else:
            clue_text = None
            clue_start = None

        # final example
        example = {
            "paragraph": ans_sent,
            "question": ques,
            "ques_type": ques_type,  # string type
            "answer": answer_text,
            "answer_start": answer_start,
            "clue": clue_text,
            "clue_start": clue_start,
            "para_id": meta["num_q"]
        }
        examples.append(example)

        meta["num_q"] += 1

        if debug and meta["num_q"] >= debug_length:
            break

    if shuffle:
        random.shuffle(examples)

    print("num_not_match_error: ", num_not_match_error)
    print("num_spans_len_error: ", num_spans_len_error)
    print("Time of get processed examples: {}".format(datetime.now() - start))
    print("Number of processed examples: ", len(examples))
    return examples
Example #13
    sentences = [
        "Mary has lived in England for ten years.",
        "He's going to fly to Chicago next week.",
        "I don't understand this chapter of the book.",
        "The children will be swimming in the ocean at five o'clock.",
        "John had eaten lunch before his colleague arrived.",
        "The best time to study is early in the morning or late in the evening.",
        "The trees along the river are beginning to blossom.",
        "Our friends called us yesterday and asked if we'd like to visit them next month.",
        "You'll be happy to know that she's decided to take the position.",
        "I won't give away your secret."
    ]

    for sentence in sentences:
        print(sentence)
        spacy_doc = NLP(sentence)
        print(separate_content_function_words(spacy_doc, FUNCTION_WORDS_LIST))

# [[2, 5]]
# Mary has lived in England for ten years.
# [1. 0. 1. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0.]
# He's going to fly to Chicago next week.
# [0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
# I don't understand this chapter of the book.
# [0. 0. 0. 1. 1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0.]
# The children will be swimming in the ocean at five o'clock.
# [1. 1. 0. 0. 1. 0. 1. 1. 0. 1. 1. 0. 0. 0. 0.]
# John had eaten lunch before his colleague arrived.
# [1. 0. 1. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0.]
# The best time to study is early in the morning or late in the evening.
# [1. 1. 1. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 1. 1.]