Пример #1
0
class SentenceTaggerPredictorNER(Predictor):
    """
    Predictor for models that consume one sentence and produce a single
    sequence of tags for it, such as the
    [`CrfTagger`](../models/crf_tagger.md) and the
    [`SimpleTagger`](../models/simple_tagger.md) models.

    ``P.S.``: words are tokenized with ``JustSpacesWordSplitter`` from ``word_splitter``.
    """
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        space_splitter = JustSpacesWordSplitter()
        self._tokenizer = WordTokenizer(word_splitter=space_splitter)

    def predict(self, sentence: str) -> JsonDict:
        """Wrap ``sentence`` as ``{"sentence": ...}`` and forward it to ``predict_json``."""
        payload = {"sentence": sentence}
        return self.predict_json(payload)

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like `{"sentence": "..."}`.
        Tokenizes the sentence and passes the tokens to the dataset reader's
        `text_to_instance`.
        """
        words = self._tokenizer.tokenize(json_dict["sentence"])
        return self._dataset_reader.text_to_instance(words)
Пример #2
0
 def test_passes_through_correctly(self):
     """Punctuation, brackets and quotes must come out as separate tokens."""
     sentence = "this (sentence) has 'crazy' \"punctuation\"."
     expected_tokens = [
         "this", "(", "sentence", ")", "has",
         "'", "crazy", "'",
         "\"", "punctuation", "\"", ".",
     ]
     tokenizer = WordTokenizer()
     # NOTE(review): this compares the tokenizer's raw output with plain
     # strings - confirm that `tokenize` returns strings in the targeted version.
     assert tokenizer.tokenize(sentence) == expected_tokens
 def _json_to_instance(self, json_dict: JsonDict) -> Instance:
     """
     Expects JSON that looks like ``{"sentence": "..."}``.

     The sentence is handed to the dataset reader's ``text_to_instance``.
     When the reader has neither a ``tokenizer`` nor a ``_tokenizer``
     attribute, the sentence is first split into a list of token strings
     with a default ``WordTokenizer``; otherwise the raw string is passed on.
     """
     text = json_dict["sentence"]
     reader = self._dataset_reader
     reader_has_tokenizer = hasattr(reader, "tokenizer") or hasattr(
         reader, "_tokenizer")
     if reader_has_tokenizer:
         return reader.text_to_instance(text)

     fallback_tokenizer = WordTokenizer()
     words = [str(token) for token in fallback_tokenizer.tokenize(text)]
     return reader.text_to_instance(words)
Пример #4
0
    def _read(self, file_path: str) -> Iterator[Instance]:
        """Yield one instance per entry of the ``documents`` list in a JSON file.

        Each document contributes its ``pmid`` as the name and the tokenized
        concatenation of ``title`` and ``abstractText`` as the text.
        """
        word_tokenizer = WordTokenizer(
            word_splitter=SpacyWordSplitter('en_core_web_sm', True, True, True))
        with open(file_path, 'r') as handle:
            corpus = json.load(handle)

        for document in corpus['documents']:
            pmid = document['pmid']
            # Title and abstract are joined with a single space before tokenizing.
            full_text = document['title'] + " " + document['abstractText']
            yield self.text_to_instance(pmid, word_tokenizer.tokenize(full_text))
Пример #5
0
    def _read(self, file_path: str) -> Iterator[Instance]:
        """Yield one tagged instance per ``<sentence>`` element of an XML file.

        Every token receives a tag derived from the sentence's ``aspectTerm``
        annotations (character offsets ``from``/``to``):

        * ``'I'`` - the token starts a term while no term is open, or it lies
          before the ``to`` offset of the currently open term;
        * ``'B'`` - the token starts a term while the previous one is still open;
        * ``'O'`` - anything else.
        """
        tokenizer = WordTokenizer(
            word_splitter=SpacyWordSplitter('en_core_web_sm', True, True, True))
        sentences = ElementTree.parse(file_path).getroot().findall("./sentence")

        for sentence in tqdm(sentences):
            text = sentence.find("text").text
            term_group = sentence.find('aspectTerms')
            terms = [] if term_group is None else term_group.findall("aspectTerm")

            # Process the annotations in order of their start character.
            terms.sort(key=lambda term: int(term.get('from')))

            tokens = tokenizer.tokenize(text)

            tags = []
            term_cursor = 0   # index of the next annotation that has not started yet
            open_term = None  # annotation the walk is currently inside of, if any
            for token in tokens:
                # Does the next pending annotation begin somewhere inside this token?
                begins_term = False
                if term_cursor < len(terms):
                    term_start = int(terms[term_cursor].get('from'))
                    begins_term = (token.idx <= term_start
                                   < token.idx + len(token.text))

                if begins_term:
                    tags.append('B' if open_term is not None else 'I')
                    open_term = terms[term_cursor]
                    term_cursor += 1
                elif open_term is None:
                    tags.append('O')
                elif token.idx < int(open_term.get('to')):
                    tags.append('I')
                else:
                    # The token starts at or after the end of the open term.
                    tags.append('O')
                    open_term = None

            yield self.text_to_instance(sentence.get('id'), tokens, tags)
Пример #6
0
class DocumentOracleDerivation(object):
    def __init__(self,
                 min_combination_num: int = 3,
                 max_combination_num: int = 5,
                 rm_stop_word: bool = True,
                 synonyms: bool = True,
                 stem: bool = False,
                 tokenization: bool = True,
                 beam_sz: int = 5,
                 candidate_percent: float = 1.0):
        self.min_combination_num = min_combination_num
        self.max_combination_num = max_combination_num
        self.rm_stop_word = rm_stop_word
        self.stem = stem
        self.tokenization = tokenization
        self.beam_sz = beam_sz
        self.candidate_percent = candidate_percent
        if self.stem:
            self.stemmer = PorterStemmer().stem_word
        else:
            self.stemmer = lambda x: x
        self.synonyms = synonyms
        if self.tokenization:
            from allennlp.data.tokenizers.word_tokenizer import WordTokenizer
            self.tokenizer = WordTokenizer()
        if self.rm_stop_word:
            self.stop_words = list(set(stopwords.words('english'))) + [x for x in string.punctuation] + ['``', '\'\'']
        else:
            self.stop_words = []

    def get_rouge_w_annotation_ready_to_use(self, gold_tokens: List[str],
                                            pred_tokens: List[str]):
        """Score predicted tokens against gold tokens with a synonym-aware unigram overlap.

        Gold tokens are lower-cased, de-duplicated, filtered against
        ``self.stop_words``, passed through ``replace_w_morphy`` and stemmed.
        A gold word that matches the prediction directly (as a word or as a
        stem) weighs 1. Otherwise its thesaurus synonyms are used instead and
        the weight 1 is split evenly between them.

        Args:
            gold_tokens: reference tokens.
            pred_tokens: candidate tokens.

        Returns:
            A tuple ``(customed_recall, f1, key_index)``:
            ``customed_recall`` is ``recall + 0.01 * precision - 0.01``,
            ``f1`` is the harmonic mean of recall and precision (0 when both
            are 0), and ``key_index`` lists the positions in the processed
            prediction list whose token matched a gold entry.
        """
        def _cached_synonyms(term):
            # Lookups are memoised in the shared `cache_for_th`; a failed lookup
            # is stored as "no synonyms" so it is not retried. Only ordinary
            # errors are absorbed, so Ctrl-C / SystemExit still propagate.
            if term not in cache_for_th:
                try:
                    cache_for_th[term] = flatten(th.Word(term).synonyms('all', relevance=[3]))
                except Exception:
                    cache_for_th[term] = []
            return cache_for_th[term]

        gold_lower = list(set([x.lower() for x in gold_tokens]))
        gold_wo_stop = [x for x in gold_lower if x not in self.stop_words]  # change of index
        gold_wo_stop = replace_w_morphy(gold_wo_stop)
        gold_stem = [ps.stem(x) for x in gold_wo_stop]

        pred_lower = list([x.lower() for x in pred_tokens])
        pred_lower = replace_w_morphy(pred_lower)
        pred_lower = remove_duplicate_tok(pred_lower)

        pred_stem = [ps.stem(x) for x in pred_lower]
        pred_stem = remove_duplicate_tok(pred_stem)
        size_of_gold = len(gold_stem)
        size_of_pred = len(pred_stem)

        # gold_key[i] is a matchable surface form, gold_value[i] its weight.
        gold_key, gold_value = [], []
        for idx, word in enumerate(gold_wo_stop):
            stemmed = gold_stem[idx]
            if word in pred_lower or word in pred_stem:
                direct_match = word
            elif stemmed in pred_lower or stemmed in pred_stem:
                direct_match = stemmed
            else:
                direct_match = None

            # If the word or its stem matches, there is no need to search synonyms.
            if direct_match is not None:
                gold_key.append(direct_match)
                gold_value.append(1)
                continue

            word_synonyms = _cached_synonyms(word)
            stem_synonyms = _cached_synonyms(stemmed)
            syn = list(set(word_synonyms + stem_synonyms))
            l_syn = len(syn)
            if l_syn != 0:
                gold_key += syn
                gold_value += [float(1 / l_syn)] * l_syn

        gold_key_stem = [ps.stem(x) for x in gold_key]

        # Accumulate the weight of every prediction token that hits a gold entry.
        vs = 0
        key_index = []
        for p_idx in range(len(pred_lower)):
            p_word = pred_lower[p_idx]
            # NOTE(review): assumes `pred_stem` stays index-aligned with
            # `pred_lower` after `remove_duplicate_tok` - confirm.
            p_stem_word = pred_stem[p_idx]

            if p_word in gold_key:
                vs += gold_value[gold_key.index(p_word)]
                key_index.append(p_idx)
            elif p_stem_word in gold_key_stem:
                vs += gold_value[gold_key_stem.index(p_stem_word)]
                key_index.append(p_idx)

        rouge_recall_1 = 0
        if size_of_gold != 0:
            rouge_recall_1 = vs / float(size_of_gold)
        rouge_pre_1 = 0
        if size_of_pred != 0:
            rouge_pre_1 = vs / float(size_of_pred)

        # Rare random sample of the scores, printed for eyeballing during long runs.
        if random.random() < 0.00001:
            print("Recall: {}\tPre: {}".format(rouge_recall_1, rouge_pre_1))
            print(pred_tokens)
        customed_recall = rouge_recall_1 + rouge_pre_1 * 0.01 - 0.01
        f1 = 0 if (rouge_recall_1 + rouge_pre_1 == 0) else 2 * (rouge_recall_1 * rouge_pre_1) / (
                rouge_recall_1 + rouge_pre_1)
        return customed_recall, f1, key_index

    def comp_num_seg_out_of_p_sent_beam(self, _filtered_doc_list,
                                        num_sent_in_combination,
                                        target_ref_sum_list,
                                        map_from_new_to_ori_idx) -> dict:
        beam: List[dict] = []
        if len(_filtered_doc_list) < num_sent_in_combination:
            return {"nlabel": num_sent_in_combination,
                    "data": {},
                    "best": None
                    }

        combs = list(range(0, len(_filtered_doc_list)))
        # _num_edu seq_len
        cur_beam = {
            "in": [],
            "todo": combs,
            "val": 0
        }
        beam.append(cur_beam)
        for t in range(num_sent_in_combination):
            dict_pattern = {}
            # compute top beam_sz for every beam
            global_board = []
            for b in beam:
                already_in_beam = b['in']
                todo = b['todo']

                leaderboard = {}
                for to_add in todo:
                    after_add = already_in_beam + [to_add]
                    candidate_doc_list = list(itertools.chain.from_iterable([_filtered_doc_list[i] for i in after_add]))
                    # average_f_score = self.get_approximate_rouge(target_ref_sum_list, candidate_doc_list)
                    _, average_f_score, _ = self.get_rouge_w_annotation_ready_to_use(gold_tokens=target_ref_sum_list,
                                                                                     pred_tokens=candidate_doc_list)
                    leaderboard[to_add] = average_f_score
                sorted_beam = [(k, leaderboard[k]) for k in sorted(leaderboard, key=leaderboard.get, reverse=True)]

                for it in sorted_beam:
                    new_in = already_in_beam + [it[0]]
                    new_in.sort()
                    str_new_in = [str(x) for x in new_in]
                    if '_'.join(str_new_in) in dict_pattern:
                        continue
                    else:
                        dict_pattern['_'.join(str_new_in)] = True
                    new_list = todo.copy()
                    new_list.remove(it[0])
                    _beam = {
                        "in": new_in,
                        "todo": new_list,
                        "val": it[1]
                    }
                    global_board.append(_beam)
            # merge and get the top beam_sz among all

            sorted_global_board = sorted(global_board, key=lambda x: x["val"], reverse=True)

            _cnt = 0
            check_dict = []
            beam_waitlist = []
            for it in sorted_global_board:
                str_in = sorted(it['in'])
                str_in = [str(x) for x in str_in]
                _tmp_key = '_'.join(str_in)
                if _tmp_key in check_dict:
                    continue
                else:
                    beam_waitlist.append(it)
                    check_dict.append(_tmp_key)
                _cnt += 1
                if _cnt >= self.beam_sz:
                    break
            beam = beam_waitlist
        # if len(beam) < 2:
        #     print(len(_filtered_doc_list))
        #     print(_num_edu)
        # Write oracle to a string like: 0.4 0.3 0.4
        _comb_bag = {}
        for it in beam:
            n_comb = it['in']
            n_comb.sort()
            n_comb_original = [map_from_new_to_ori_idx[a] for a in n_comb]
            n_comb_original.sort()  # json label
            n_comb_original = [int(x) for x in n_comb_original]
            candidate_doc_list = list(itertools.chain.from_iterable([_filtered_doc_list[i] for i in n_comb]))
            # f1 = self.get_approximate_rouge(target_ref_sum_list, candidate_doc_list)
            _, f1, _ = self.get_rouge_w_annotation_ready_to_use(target_ref_sum_list, candidate_doc_list)

            # f_avg = (f1 + f2 + fl) / 3
            _comb_bag[f1] = {"label": n_comb_original,
                             "R1": f1,
                             "nlabel": num_sent_in_combination}
        # print(len(_comb_bag))
        if len(_comb_bag) == 0:
            return {"nlabel": num_sent_in_combination,
                    "data": {},
                    "best": None
                    }
        else:
            best_key = sorted(_comb_bag.keys(), reverse=True)[0]
            rt_dict = {"nlabel": num_sent_in_combination,
                       "data": _comb_bag,
                       "best": _comb_bag[best_key]
                       }
            return rt_dict

    def derive_doc_oracle(self, doc_list: List[str],
                          ref_sum: str,
                          prefix_summary: str = ""
                          ):
        processed_doc_list, processed_ref_sum_str, processed_prefix_sum_str = [], '', ''
        if self.tokenization:
            token_doc_list = self.tokenizer.batch_tokenize(doc_list)
            for doc in token_doc_list:
                processed_doc_list.append([word.text for word in doc])
            processed_ref_sum_list = [w.text for w in self.tokenizer.tokenize(ref_sum)]
            processed_prefix_sum_list = [w.text for w in self.tokenizer.tokenize(prefix_summary)]
        else:
            processed_doc_list = [d.split(" ") for d in doc_list]
            processed_ref_sum_list = ref_sum.split(" ")
            processed_prefix_sum_list = prefix_summary.split(" ")
        processed_doc_list = [[x.lower() for x in sent] for sent in processed_doc_list]
        processed_ref_sum_list = [x.lower() for x in processed_ref_sum_list]
        processed_prefix_sum_list = [x.lower() for x in processed_prefix_sum_list]
        if self.rm_stop_word:
            processed_doc_list = [[x for x in sent if x not in self.stop_words] for sent in processed_doc_list]
            processed_ref_sum_list = [x for x in processed_ref_sum_list if x not in self.stop_words]
            processed_prefix_sum_list = [x for x in processed_prefix_sum_list if x not in self.stop_words]

        target_ref_sum_list = [x for x in processed_ref_sum_list if x not in processed_prefix_sum_list]

        # preprocessing finished
        filtered_doc_list, map_from_new_to_ori_idx = self.pre_prune(processed_doc_list, target_ref_sum_list)

        combination_data_dict = {}

        for num_sent_in_combination in range(self.min_combination_num, self.max_combination_num):
            combination_data = self.comp_num_seg_out_of_p_sent_beam(_filtered_doc_list=filtered_doc_list,
                                                                    num_sent_in_combination=num_sent_in_combination,
                                                                    target_ref_sum_list=target_ref_sum_list,
                                                                    map_from_new_to_ori_idx=map_from_new_to_ori_idx)
            combination_data_dict = {**combination_data_dict, **combination_data['data']}
            combination_data_dict[num_sent_in_combination] = combination_data
        return combination_data_dict

    def pre_prune(self, list_of_doc: List[List[str]],
                  ref_sum: List[str]
                  ):
        keep_candidate_num = math.ceil(len(list_of_doc) * self.candidate_percent)
        # f_score_list = [self.get_approximate_rouge(ref_sum, x) for x in list_of_doc]
        f_score_list = [self.get_rouge_w_annotation_ready_to_use(ref_sum, x)[1] for x in list_of_doc]
        top_p_sent_idx = numpy.argsort(f_score_list)[-keep_candidate_num:]

        map_from_new_to_ori_idx = []
        # filter
        filtered_doc_list = []
        for i in range(len(top_p_sent_idx)):
            filtered_doc_list.append(list_of_doc[top_p_sent_idx[i]])
            map_from_new_to_ori_idx.append(top_p_sent_idx[i])
        return filtered_doc_list, map_from_new_to_ori_idx
Пример #7
0
# Scratch script: index one sentence with a BERT word-piece indexer, print the
# pieces back as text and feed the ids to the pretrained embedder.

# Indexer built from a local vocabulary file.
# NOTE(review): the vocabulary file name says "uncased" while
# do_lowercase=False - confirm this combination is intended.
token_indexer = bert_indexer.PretrainedBertIndexer(
    '../TransformerCoqa/bert-base-uncased-vocab.txt',
    do_lowercase=False,
    max_pieces=8,
    doc_stride=3)
# Embedder loaded from a local model archive.
token_embedder = PretrainedBertEmbedder(
    '../TransformerCoqa/bert-base-uncased.tar.gz')

# with open(args.input_file, 'w') as f:
#     data = json.load(f)['data']
#
# for article in data:
#     story = article['story']

# Sample sentence; `tokenizer` is not defined in this snippet and must come
# from elsewhere in the file.
a = "the man went to the store and bought a gallon of milk"
b = tokenizer.tokenize(a)
print(b)

# Index the tokens under the 'bert' key; the (initially empty) vocabulary is
# passed to the indexer.
bert_vocab = Vocabulary()
c = token_indexer.tokens_to_indices(b, bert_vocab, 'bert')
print(c)

# Map the ids back to tokens through the vocabulary and print them.
# NOTE(review): iterating each `input_id` assumes c['bert'] is a sequence of
# id sequences (one per window?) - confirm against the indexer.
input_ids = c['bert']
for input_id in input_ids:
    tokens = [
        bert_vocab.get_token_from_index(index=idx, namespace='bert')
        for idx in input_id
    ]
    print(tokens)

# Run the embedder on the ids.
d = token_embedder(torch.LongTensor(c['bert']))
Пример #8
0
class DocumentOracleDerivation(object):
    def __init__(self,
                 mixed_combination: bool,
                 min_combination_num: int = 1,
                 max_combination_num: int = 8,
                 rm_stop_word: bool = True,
                 stem: bool = False,
                 morphy: bool = False,
                 tokenization: bool = True,
                 beam_sz: int = 5,
                 prune_candidate_percent: float = 0.4):
        self.mixed_combination = mixed_combination
        self.min_combination_num = min_combination_num
        self.max_combination_num = max_combination_num
        self.rm_stop_word = rm_stop_word
        self.stem = stem
        self.tokenization = tokenization
        self.beam_sz = beam_sz
        self.prune_candidate_percent = prune_candidate_percent
        if self.stem:
            self.stemmer = PorterStemmer().stem_word
        else:
            self.stemmer = lambda x: x

        self.morphy = morphy

        if self.tokenization:
            from allennlp.data.tokenizers.word_tokenizer import WordTokenizer
            self.tokenizer = WordTokenizer()
        if self.rm_stop_word:
            self.stop_words = list(set(stopwords.words('english'))) + [
                x for x in string.punctuation
            ] + ['``', '\'\'']
        else:
            self.stop_words = []

    def derive_doc_oracle(
        self,
        doc_list: List[str],
        ref_sum: str,
        prefix_summary: str = "",
    ):
        # return a dict where key=rouge-f1 and value= [0,0,0,1,0,1,0,...] same size as doc_list
        # processed_doc_list, processed_ref_sum_str, processed_prefix_sum_str = [], '', ''
        len_of_doc = len(doc_list)
        processed_doc_list = [self._rouge_clean(x) for x in doc_list]
        processed_ref_sum_str = self._rouge_clean(ref_sum)
        processed_prefix_sum_str = self._rouge_clean(prefix_summary)
        if self.tokenization:
            new_processed_doc_list = []
            token_doc_list = self.tokenizer.batch_tokenize(processed_doc_list)
            for doc in token_doc_list:
                new_processed_doc_list.append([word.text for word in doc])
            processed_doc_list = new_processed_doc_list
            processed_ref_sum_list = [
                w.text for w in self.tokenizer.tokenize(processed_ref_sum_str)
            ]
            processed_prefix_sum_list = [
                w.text
                for w in self.tokenizer.tokenize(processed_prefix_sum_str)
            ]
        else:
            processed_doc_list = [d.split(" ") for d in processed_doc_list]
            processed_ref_sum_list = processed_ref_sum_str.split(" ")
            processed_prefix_sum_list = processed_prefix_sum_str.split(" ")

        # must do lower
        processed_doc_list = [[x.lower() for x in sent]
                              for sent in processed_doc_list]
        processed_ref_sum_list = [x.lower() for x in processed_ref_sum_list]
        processed_prefix_sum_list = [
            x.lower() for x in processed_prefix_sum_list
        ]

        # if self.rm_stop_word:
        #     processed_doc_list = [[x for x in sent if x not in self.stop_words] for sent in processed_doc_list]
        #     processed_ref_sum_list = [x for x in processed_ref_sum_list if x not in self.stop_words]
        #     processed_prefix_sum_list = [x for x in processed_prefix_sum_list if x not in self.stop_words]

        target_ref_sum_list = [
            x for x in processed_ref_sum_list
            if x not in processed_prefix_sum_list
        ]

        # TODO
        f_score_list, score_matrix = self.iter_rouge(processed_doc_list,
                                                     target_ref_sum_list)

        # preprocessing finished
        filtered_doc_list, map_from_new_to_ori_idx = self.pre_prune(
            processed_doc_list, target_ref_sum_list)
        combination_data_dict = {}
        for num_sent_in_combination in range(self.min_combination_num,
                                             self.max_combination_num):
            combination_data = self.comp_num_seg_out_of_p_sent_beam(
                _filtered_doc_list=filtered_doc_list,
                num_sent_in_combination=num_sent_in_combination,
                target_ref_sum_list=target_ref_sum_list,
                map_from_new_to_ori_idx=map_from_new_to_ori_idx)
            if combination_data['best'] is None:
                break
            best_rouge_of_this_batch = combination_data['best']['R1']
            if len(combination_data_dict) >= self.beam_sz:
                rouge_in_bag = [
                    float(k) for k, v in combination_data_dict.items()
                ]
                if best_rouge_of_this_batch < min(rouge_in_bag):
                    break

            combination_data_dict = {
                **combination_data_dict,
                **combination_data['data']
            }
            combination_data_dict = collections.OrderedDict(
                sorted(combination_data_dict.items(), reverse=True))
            sliced = islice(combination_data_dict.items(), self.beam_sz)
            combination_data_dict = collections.OrderedDict(sliced)
            # combination_data_dict[num_sent_in_combination] = combination_data

        # prepare return data
        return_dict = {}
        for k, v in combination_data_dict.items():
            # tmp_list = [0 for _ in range(len_of_doc)]
            # for i in v['label']:
            #     tmp_list[i] = 1
            return_dict[k] = v['label']
        return return_dict

    def iter_rouge(self, list_of_doc, ref_sum):
        f_score_list = [
            self.get_rouge_ready_to_use(ref_sum, x) for x in list_of_doc
        ]
        # score_matrix_delta = [[0 for _ in range(len(list_of_doc))] for _ in range(len(list_of_doc))]
        score_matrix = [[0 for _ in range(len(list_of_doc))]
                        for _ in range(len(list_of_doc))]
        input = []
        for idx, x in enumerate(list_of_doc):
            for jdx, y in enumerate(list_of_doc):
                input.append((idx, jdx, ref_sum, x + y))
                s = self.get_rouge_ready_to_use(ref_sum, x + y)
                score_matrix[idx][jdx] = s
                # if f_score_list[idx] < 0.01:
                #
                #     score_matrix_delta[idx][jdx] = 0
                # else:
                #     score_matrix_delta[idx][jdx] = min(s / (f_score_list[idx] + 0.001), 2)
        # import numpy as np
        # np.set_printoptions(precision=2)
        # import seaborn as sns
        # sns.set()
        # f_score_list = np.asarray([f_score_list, f_score_list])
        # bx = sns.heatmap(f_score_list)
        # fig = bx.get_figure()
        # fig.savefig("individual_output.png")
        # print('-' * 30)
        # print(np.asarray(score_matrix))
        # score_matrix_delta = np.asarray(score_matrix_delta)
        # ax = sns.heatmap(score_matrix_delta)
        # fig = ax.get_figure()
        # fig.savefig("output.png")

        # ncpu=multiprocessing.cpu_count()
        # pool = multiprocessing.Pool(processes=ncpu)
        # results = pool.starmap(self.get_rouge_ready_to_use, input)
        # for r in results:
        #     score, idx,jdx = r
        #     score_matrix[idx][jdx] = score
        return f_score_list, score_matrix

    def comp_num_seg_out_of_p_sent_beam(self, _filtered_doc_list,
                                        num_sent_in_combination,
                                        target_ref_sum_list,
                                        map_from_new_to_ori_idx) -> dict:
        beam: List[dict] = []
        if len(_filtered_doc_list) < num_sent_in_combination:
            return {
                "nlabel": num_sent_in_combination,
                "data": {},
                "best": None
            }

        combs = list(range(0, len(_filtered_doc_list)))
        # _num_edu seq_len
        cur_beam = {"in": [], "todo": combs, "val": 0}
        beam.append(cur_beam)
        for t in range(num_sent_in_combination):
            dict_pattern = {}
            # compute top beam_sz for every beam
            global_board = []
            for b in beam:
                already_in_beam = b['in']
                todo = b['todo']

                leaderboard = {}
                for to_add in todo:
                    after_add = already_in_beam + [to_add]
                    candidate_doc_list = list(
                        itertools.chain.from_iterable(
                            [_filtered_doc_list[i] for i in after_add]))
                    # average_f_score = self.get_approximate_rouge(target_ref_sum_list, candidate_doc_list)
                    average_f_score = self.get_rouge_ready_to_use(
                        gold_tokens=target_ref_sum_list,
                        pred_tokens=candidate_doc_list)
                    leaderboard[to_add] = average_f_score
                sorted_beam = [(k, leaderboard[k]) for k in sorted(
                    leaderboard, key=leaderboard.get, reverse=True)]

                for it in sorted_beam:
                    new_in = already_in_beam + [it[0]]
                    new_in.sort()
                    str_new_in = [str(x) for x in new_in]
                    if '_'.join(str_new_in) in dict_pattern:
                        continue
                    else:
                        dict_pattern['_'.join(str_new_in)] = True
                    new_list = todo.copy()
                    new_list.remove(it[0])
                    _beam = {"in": new_in, "todo": new_list, "val": it[1]}
                    global_board.append(_beam)
            # merge and get the top beam_sz among all

            sorted_global_board = sorted(global_board,
                                         key=lambda x: x["val"],
                                         reverse=True)

            _cnt = 0
            check_dict = []
            beam_waitlist = []
            for it in sorted_global_board:
                str_in = sorted(it['in'])
                str_in = [str(x) for x in str_in]
                _tmp_key = '_'.join(str_in)
                if _tmp_key in check_dict:
                    continue
                else:
                    beam_waitlist.append(it)
                    check_dict.append(_tmp_key)
                _cnt += 1
                if _cnt >= self.beam_sz:
                    break
            beam = beam_waitlist
        # if len(beam) < 2:
        #     print(len(_filtered_doc_list))
        #     print(_num_edu)
        # Write oracle to a string like: 0.4 0.3 0.4
        _comb_bag = {}
        for it in beam:
            n_comb = it['in']
            n_comb.sort()
            n_comb_original = [map_from_new_to_ori_idx[a] for a in n_comb]
            n_comb_original.sort()  # json label
            n_comb_original = [int(x) for x in n_comb_original]
            candidate_doc_list = list(
                itertools.chain.from_iterable(
                    [_filtered_doc_list[i] for i in n_comb]))
            # f1 = self.get_approximate_rouge(target_ref_sum_list, candidate_doc_list)
            f1 = self.get_rouge_ready_to_use(target_ref_sum_list,
                                             candidate_doc_list)

            # f_avg = (f1 + f2 + fl) / 3
            _comb_bag[f1] = {
                "label": n_comb_original,
                "R1": f1,
                "nlabel": num_sent_in_combination
            }
        # print(len(_comb_bag))
        if len(_comb_bag) == 0:
            return {
                "nlabel": num_sent_in_combination,
                "data": {},
                "best": None
            }
        else:
            best_key = sorted(_comb_bag.keys(), reverse=True)[0]
            rt_dict = {
                "nlabel": num_sent_in_combination,
                "data": _comb_bag,
                "best": _comb_bag[best_key]
            }
            return rt_dict

    @staticmethod
    def _rouge_clean(s):
        return re.sub(r'[^a-zA-Z0-9 ]', '', s)

    def get_rouge_ready_to_use_w_index(self, gold_tokens: List[str],
                                       pred_tokens: List[str], idx, jdx):
        return self.get_rouge_ready_to_use(gold_tokens, pred_tokens), idx, jdx

    # Standard version without synonym matching

    def get_rouge_ready_to_use(self, gold_tokens: List[str],
                               pred_tokens: List[str]):
        """Return the mean of the unigram and bigram ``'f'`` scores from ``cal_rouge``.

        Unigrams are compared as sets, with ``self.stop_words`` removed first
        when ``rm_stop_word`` is set. Bigrams come from ``_get_ngrams`` on the
        unfiltered tokens. Both ``cal_rouge`` calls receive the raw token
        counts of the prediction and the reference.
        """
        n_gold, n_pred = len(gold_tokens), len(pred_tokens)

        gold_bigram = _get_ngrams(2, gold_tokens)
        pred_bigram = _get_ngrams(2, pred_tokens)

        def unigram_set(tokens):
            # Stop words only affect the unigram comparison.
            if self.rm_stop_word:
                return {tok for tok in tokens if tok not in self.stop_words}
            return set(tokens)

        gold_unigram = unigram_set(gold_tokens)
        pred_unigram = unigram_set(pred_tokens)

        rouge_1 = cal_rouge(pred_unigram, gold_unigram, n_pred, n_gold)['f']
        rouge_2 = cal_rouge(pred_bigram, gold_bigram, n_pred, n_gold)['f']
        return (rouge_1 + rouge_2) / 2

    def pre_prune(self, list_of_doc: List[List[str]], ref_sum: List[str]):
        keep_candidate_num = math.ceil(
            len(list_of_doc) * self.prune_candidate_percent)
        # f_score_list = [self.get_approximate_rouge(ref_sum, x) for x in list_of_doc]
        f_score_list = [
            self.get_rouge_ready_to_use(ref_sum, x) for x in list_of_doc
        ]
        top_p_sent_idx = numpy.argsort(f_score_list)[-keep_candidate_num:]

        map_from_new_to_ori_idx = []
        # filter
        filtered_doc_list = []
        for i in range(len(top_p_sent_idx)):
            filtered_doc_list.append(list_of_doc[top_p_sent_idx[i]])
            map_from_new_to_ori_idx.append(top_p_sent_idx[i])
        return filtered_doc_list, map_from_new_to_ori_idx
Пример #9
0
def main(args):
    """Build a balanced paraphrase dataset from an NLI dataset and its mirror.

    Reads the attributes ``original``, ``mirror``, ``prediction``,
    ``confidence_threshold`` and ``output`` from ``args``.

    A pair is a positive sample when the original instance is labelled
    ``entailment`` and the mirrored instance (premise and hypothesis swapped)
    is predicted ``entailment`` with at least ``confidence_threshold``
    confidence; every other pair is a negative. Negatives are down-sampled to
    the number of positives, and the shuffled, tokenized pairs are written as
    tab-separated lines with an MRPC-style header.
    """
    print('Reading original dataset...')
    original_data = []
    # First pass only counts the lines so the progress bar has a total.
    with open(args.original) as f:
        total = sum((1 for _ in f))
    with open(args.original) as f:
        for line in tqdm(f, total=total):
            sample = json.loads(line)
            # Instances labelled '-' are skipped.
            if sample['gold_label'] != '-':
                original_data.append({
                    'sentence1': sample['sentence1'],
                    'sentence2': sample['sentence2'],
                    'gold_label': sample['gold_label']
                })

    print(f'Read {len(original_data)} original instances.')
    print('-' * 100)
    print('Reading mirror instance...')
    mirror_data = []
    with open(args.mirror) as mf:
        total = sum((1 for _ in mf))
    with open(args.mirror) as mf, open(args.prediction) as pf:
        for instance, prediction in tqdm(zip(mf, pf), total=total):
            ins = json.loads(instance)
            pred = json.loads(prediction)
            # The mirror's label is the model prediction, not an annotation.
            mirror_data.append({
                'sentence1': ins['sentence1'],
                'sentence2': ins['sentence2'],
                'gold_label': pred['label'],
                'confidence': max(pred['label_probs'])
            })
    print(f'From {total} mirror instances.')

    print('-' * 100)
    print('Finding paraphrase samples...')
    assert len(original_data) == len(mirror_data),\
        'original dataset size != mirror dataset size'
    positive_samples, negative_samples = [], []

    for original, mirror in tqdm(zip(original_data, mirror_data),
                                 total=len(original_data)):
        # The mirror must be the same pair with the two sentences swapped.
        assert original['sentence1'] == mirror['sentence2']
        assert original['sentence2'] == mirror['sentence1']
        is_paraphrase = (original['gold_label'] == 'entailment'
                         and mirror['gold_label'] == 'entailment'
                         and mirror['confidence'] >= args.confidence_threshold)
        bucket = positive_samples if is_paraphrase else negative_samples
        bucket.append({
            'sentence1': original['sentence1'],
            'sentence2': original['sentence2'],
            'label': 1 if is_paraphrase else 0
        })

    print('-' * 100)
    print('Tokenize and write into output')
    # random.sample raises ValueError when asked for more items than exist, so
    # the number of negatives drawn is capped at what is available.
    num_negative = min(len(positive_samples), len(negative_samples))
    negative_samples = random.sample(negative_samples, num_negative)
    samples = positive_samples + negative_samples
    random.shuffle(samples)

    tokenizer = WordTokenizer()
    with open(args.output, 'w') as outf:
        # MRPC format
        outf.write('Quality\t#1 ID\t#2 ID\t#1 String\t#2 String\n')

        for sample in tqdm(samples, total=len(samples)):
            label = sample['label']
            sentence1, sentence2 = sample['sentence1'], sample['sentence2']
            s1_tokens = ' '.join(
                (t.text for t in tokenizer.tokenize(sentence1)))
            s2_tokens = ' '.join(
                (t.text for t in tokenizer.tokenize(sentence2)))
            # The two ID columns hold fixed placeholder strings.
            outf.write(
                f'{label}\tsentence1\tsentence2\t{s1_tokens}\t{s2_tokens}\n')

    print(f'Written {len(samples)} pairs of paraphrase into {args.output}')