def _get_torchtext_data_iterator(include_lengths=False):
    text_field = torchtext.data.Field(
        sequential=True,
        pad_first=False,
        init_token="<s>",
        eos_token="</s>",
        include_lengths=include_lengths
    )

    example1 = Example.fromdict({"text": "a b c a c"}, {"text": ("text", text_field)})
    example2 = Example.fromdict({"text": "b c a a"}, {"text": ("text", text_field)})
    example3 = Example.fromdict({"text": "c b a"}, {"text": ("text", text_field)})

    dataset = torchtext.data.Dataset(
        [example1, example2, example3],
        {"text": text_field},
    )
    text_field.build_vocab(dataset)

    iterator = torchtext.data.Iterator(
        dataset,
        batch_size=3,
        sort_key=None,
        device=None,
        batch_size_fn=None,
        train=True,
        repeat=False,
        shuffle=None,
        sort=None,
        sort_within_batch=None
    )
    return iterator, text_field
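
A minimal usage sketch, assuming the legacy torchtext (<=0.8) Field/Example/Iterator API used above; it consumes a single batch from the returned iterator.

iterator, text_field = _get_torchtext_data_iterator(include_lengths=True)
for batch in iterator:
    # With include_lengths=True, batch.text is a (token_ids, lengths) pair;
    # without it, batch.text is just the padded token-id tensor.
    token_ids, lengths = batch.text
    print(token_ids.shape, lengths)
    break
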
    def __init__(self, path, fields, **kwargs):
        """Create a ConllXDataset given a path and field dict.

        Arguments:
            path (str): Path to the data file.
            fields (dict[str: tuple(str, Field)]):
                The keys should be a subset of the columns, and the
                values should be tuples of (name, field).
                Keys not present in the input dictionary are ignored.
        """
        self.n_tokens = 0
        with io.open(os.path.expanduser(path), encoding="utf8") as f:
            examples = []
            for d in conllx_reader(f):
                # Skip overly long sentences (>= 70 tokens) in the training split.
                if len(d["form"]) >= 70 and "train" in path:
                    continue
                # Build each Example once and reuse it for counting and storage.
                example = Example.fromdict(d, fields)
                self.n_tokens += len(example.form)
                examples.append(example)

        if isinstance(fields, dict):
            fields, field_dict = [], fields
            for field in field_dict.values():
                if isinstance(field, list):
                    fields.extend(field)
                else:
                    fields.append(field)

        super(ConllXDataset, self).__init__(examples, fields, **kwargs)
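
A hedged construction sketch for ConllXDataset; the "upostag" column key and the file path are illustrative assumptions (the "form" key is taken from the code above).

FORM = torchtext.data.Field(lower=True)
POS = torchtext.data.Field()
fields = {"form": ("form", FORM), "upostag": ("pos", POS)}  # keys must match conllx_reader's column names
train = ConllXDataset("train.conllx", fields)
FORM.build_vocab(train)
POS.build_vocab(train)
print(train.n_tokens)  # number of word tokens kept after length filtering
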
Example #3
    def __init__(self, paragraph_path: str, label_path: str, fields: dict, split_sentences: bool, train: bool,
                 max_chars: int=1000, level: str="char",
                 **kwargs):
        """
        Create a WiLIDataset given paths to the raw text and to the labels, plus a field dict.
        """

        self.level = level

        with io.open(os.path.expanduser(paragraph_path), encoding="utf8") as f_par, \
                io.open(os.path.expanduser(label_path), encoding="utf8") as f_lab:

            examples = []
            for d in data_reader(f_par, f_lab, train, split_sentences, max_chars, level):
                for sentence in d:
                    examples.append(Example.fromdict(sentence, fields))

        if isinstance(fields, dict):
            fields, field_dict = [], fields
            for field in field_dict.values():
                if isinstance(field, list):
                    fields.extend(field)
                else:
                    fields.append(field)

        super(WiLIDataset, self).__init__(examples, fields, **kwargs)
Example #4
    def __init__(self, paragraph_path: str, label_path: str, switch_path: str,
                 fields: dict, level: str, **kwargs):
        """
        Create a WiLIDataset given paths to the raw text, the labels, and the switches, plus a field dict.
        """

        self.level = level

        with io.open(os.path.expanduser(paragraph_path), encoding="utf8") as f_par, \
                io.open(os.path.expanduser(label_path), encoding="utf8") as f_lang, \
                io.open(os.path.expanduser(switch_path), encoding="utf8") as f_switch:

            examples = []
            for d in data_reader(f_par, f_lang, f_switch):
                if d is None: continue
                examples.append(Example.fromdict(d, fields))

        if isinstance(fields, dict):
            fields, field_dict = [], fields
            for field in field_dict.values():
                if isinstance(field, list):
                    fields.extend(field)
                else:
                    fields.append(field)

        super(WiLIDataset, self).__init__(examples, fields, **kwargs)
Example #5
    def __init__(self, fields, path, extension='.txt', **kwargs):
        examples = []

        num_sequences = len(fields)

        data_files = glob.glob(os.path.join(path, '*' + extension))
        for data_file in data_files:
            # Read the file line by line, and create examples from series
            # of num_sequences consecutive lines
            with io.open(os.path.expanduser(data_file), encoding="utf8") as f:
                line_buffer = []
                for line in f:
                    if len(line_buffer) == num_sequences:
                        # Make a new example
                        example = Example.fromlist(line_buffer, fields)
                        examples.append(example)

                        # Remove the first sentence
                        line_buffer.pop(0)
                    line_buffer.append(line)

        print('Found %d examples' % (len(examples)))
        super(StoryDataset, self).__init__(examples, fields, **kwargs)

        def sort_key_fn(x):
            # Sort by the length of each of the example's sequences, in field order.
            sort_keys = []
            for i in range(len(fields)):
                example = getattr(x, fields[i][0])
                sort_keys.append(len(example))
            return sort_keys

        self.sort_key = sort_key_fn
    def __init__(self, path, load_ext_feats=False):
        """
        Create a Castor dataset involving pairs of texts
        """
        fields = [('id', self.ID_FIELD), ('sentence_1', self.TEXT_FIELD), ('sentence_2', self.TEXT_FIELD),
                  ('ext_feats', self.EXT_FEATS_FIELD), ('label', self.LABEL_FIELD),
                  ('aid', self.AID_FIELD), ('sentence_1_raw', self.RAW_TEXT_FIELD), ('sentence_2_raw', self.RAW_TEXT_FIELD)]

        examples = []
        with open(os.path.join(path, 'a.toks'), 'r') as f1, open(os.path.join(path, 'b.toks'), 'r') as f2:
            sent_list_1 = [l.rstrip('.\n').split(' ') for l in f1]
            sent_list_2 = [l.rstrip('.\n').split(' ') for l in f2]

        word_to_doc_cnt = get_pairwise_word_to_doc_freq(sent_list_1, sent_list_2)
        self.word_to_doc_cnt = word_to_doc_cnt

        if not load_ext_feats:
            overlap_feats = get_pairwise_overlap_features(sent_list_1, sent_list_2, word_to_doc_cnt)
        else:
            overlap_feats = np.loadtxt(os.path.join(path, 'overlap_feats.txt'))

        with open(os.path.join(path, 'id.txt'), 'r') as id_file, open(os.path.join(path, 'sim.txt'), 'r') as label_file:
            for i, (pair_id, l1, l2, ext_feats, label) in enumerate(zip(id_file, sent_list_1, sent_list_2, overlap_feats, label_file)):
                pair_id = pair_id.rstrip('.\n')
                label = label.rstrip('.\n')
                example_list = [pair_id, l1, l2, ext_feats, label, i + 1, ' '.join(l1), ' '.join(l2)]
                example = Example.fromlist(example_list, fields)
                examples.append(example)

        super(CastorPairDataset, self).__init__(examples, fields)
Example #7
    def __init__(self, path):
        """
        Create a MSRP dataset instance
        """
        fields = [('id', self.ID_FIELD), ('sentence_1', self.TEXT_FIELD), ('sentence_2', self.TEXT_FIELD), ('ext_feats', self.EXT_FEATS_FIELD),
                ('label', self.LABEL_FIELD), ('sentence_1_raw', self.RAW_TEXT_FIELD), ('sentence_2_raw', self.RAW_TEXT_FIELD)]

        examples = []
        with open(os.path.join(path, 'a.toks'), 'r') as f1, open(os.path.join(path, 'b.toks'), 'r') as f2:
            sent_list_1 = [l.rstrip('.\n').split(' ') for l in f1]
            sent_list_2 = [l.rstrip('.\n').split(' ') for l in f2]

        word_to_doc_cnt = get_pairwise_word_to_doc_freq(sent_list_1, sent_list_2)
        self.word_to_doc_cnt = word_to_doc_cnt

        with open(os.path.join(path, 'id.txt'), 'r') as id_file, open(os.path.join(path, 'sim.txt'), 'r') as label_file:
            for pair_id, l1, l2, label in zip(id_file, sent_list_1, sent_list_2, label_file):
                pair_id = pair_id.rstrip('.\n')
                label = label.rstrip('.\n')
                ext_feats = []

                # Number features
                sent1_nums, sent2_nums = [], []
                match = self.NUMBER_PATTERN.search(' '.join(l1))
                if match:
                    for g in match.groups():
                        if g is not None:
                            sent1_nums.append(g)

                match = self.NUMBER_PATTERN.search(' '.join(l2))
                if match:
                    for g in match.groups():
                        if g is not None:
                            sent2_nums.append(g)

                sent1_nums = set(sent1_nums)
                sent2_nums = set(sent2_nums)
                exact = int(sent1_nums == sent2_nums)
                superset = int(sent1_nums.issuperset(sent2_nums) or sent2_nums.issuperset(sent1_nums))
                ext_feats.append(1 if (exact or (len(sent1_nums) == 0 and len(sent2_nums) == 0)) else 0)
                ext_feats.append(exact)
                ext_feats.append(superset)

                # Length difference
                ext_feats.append(len(l2) - len(l1))

                # Overlap
                overlap = len(set(l1) & set(l2))
                ext_feats.append(overlap / len(l1))
                ext_feats.append(overlap / len(l2))

                example = Example.fromlist([pair_id, l1, l2, ext_feats, label, ' '.join(l1), ' '.join(l2)], fields)
                examples.append(example)

        super(MSRP, self).__init__(examples, fields)
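
For clarity, a standalone recomputation of the hand-crafted features above on two made-up token lists (number match/superset flags, length difference, and the two overlap ratios).

l1 = "the cat sat on 3 mats".split()
l2 = "a cat sat on 3 big mats".split()
sent1_nums, sent2_nums = {"3"}, {"3"}
exact = int(sent1_nums == sent2_nums)                                 # 1
superset = int(sent1_nums >= sent2_nums or sent2_nums >= sent1_nums)  # 1
overlap = len(set(l1) & set(l2))                                      # 5 shared tokens
ext_feats = [
    1 if (exact or (len(sent1_nums) == 0 and len(sent2_nums) == 0)) else 0,
    exact,
    superset,
    len(l2) - len(l1),   # 1
    overlap / len(l1),   # 5/6
    overlap / len(l2),   # 5/7
]
print(ext_feats)
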
Example #8
    def __init__(self, path: str, text_field: Field, label_field: Field, **kwargs) -> None:
        fields = [('text', text_field), ('label', label_field)]
        examples = []

        with open(path) as f:
            for line in f.readlines():
                line = line.strip()
                label = line[-1]
                text = line[:-2]
                examples.append(Example.fromlist([text, label], fields))

        super().__init__(examples, fields, **kwargs)
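
A hedged sketch of the line format this reader assumes: a single-character label at the end of each line, separated from the text by one character.

line = "what a wonderful movie 1".strip()
label = line[-1]   # '1'
text = line[:-2]   # 'what a wonderful movie'
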
Example #9
    def __init__(self, path, fields, **kwargs):
        """Create a ConllUDataset given a path and field dict.

        Arguments:
            path (str): Path to the data file.
            fields (dict[str: tuple(str, Field)]):
                The keys should be a subset of the columns, and the
                values should be tuples of (name, field).
                Keys not present in the input dictionary are ignored.
        """

        with io.open(os.path.expanduser(path), encoding="utf8") as f:
            if "train" in path:
                # Drop overly long training sentences (> 70 tokens).
                examples = []
                for d in conllu_reader(f):
                    example = Example.fromdict(d, fields)
                    if len(example.form) <= 70:
                        examples.append(example)
            else:
                examples = [
                    Example.fromdict(d, fields) for d in conllu_reader(f)
                ]

        if isinstance(fields, dict):
            fields, field_dict = [], fields
            for field in field_dict.values():
                if isinstance(field, list):
                    fields.extend(field)
                else:
                    fields.append(field)

        super(ConllUDataset, self).__init__(examples, fields, **kwargs)
Example #10
def augment(data_source, aug_algo, encoder_model, sim_measure, labeled_examples, unlabeled_examples, train_ds, test_ds, text_field, label_field, num_classes, sigma=None):
    res = encode_data_with_pretrained(data_source, train_ds, test_ds, text_field, encoder_model, labeled_examples, unlabeled_examples)
    x_l, y_l, x_u, y_u, xs_u_unencoded = res

    if aug_algo.startswith("knn"):
        algo_version = aug_algo.split('_')[1]
        if algo_version == 'base':
            classifications, indices = knn_classify(x_l, y_l, x_u, n=1, weights='uniform')
            frac_used = 1
        elif algo_version == 'threshold':
            classifications, indices = knn_classify(x_l, y_l, x_u, n=2, threshold=0.99)
            y_u = y_u[indices]
            xs_u_unencoded = [xs_u_unencoded[idx] for idx in indices]
            frac_used = float(len(xs_u_unencoded)/len(x_u))
    elif aug_algo.startswith("kmeans"):
        algo_version = aug_algo.split('_')[1]
        if algo_version == "base":
            classifications = kmeans(x_l, x_u, y_l, n_clusters=num_classes)
        elif algo_version == "recursive":
            classifications = recursive_kmeans(x_l, x_u, y_l, n_clusters=num_classes)
        frac_used = 1
    elif aug_algo.startswith("lp"):
        algo_version = aug_algo.split('_')[1]
        lp = LabelProp(x_l, y_l, x_u, y_u, num_classes, data_source=data_source, sigma=sigma)
        if algo_version == 'base':
            lp.propagate()
            classifications, indices = lp.classify(threshold=False)
            y_u = y_u[indices]
            xs_u_unencoded = [xs_u_unencoded[idx] for idx in indices]
            frac_used = float(len(xs_u_unencoded)/len(x_u))
        elif algo_version == "threshold":
            lp.propagate()
            classifications, indices = lp.classify(threshold=True)
            y_u = y_u[indices]
            xs_u_unencoded = [xs_u_unencoded[idx] for idx in indices]
            frac_used = float(len(xs_u_unencoded)/len(x_u))
        elif algo_version == "recursive":
            classifications, indices = lp.recursive(x_l, y_l, x_u, y_u)
            y_u = y_u[indices]
            xs_u_unencoded = [xs_u_unencoded[idx] for idx in indices]
            frac_used = float(len(xs_u_unencoded)/len(x_u))
    
    num_correct = np.sum(classifications == y_u)
    aug_acc = 0 if len(classifications) == 0 else float(num_correct/len(classifications))
    new_labeled_data = [{'x': x, 'y': classifications[i]} for i, x in enumerate(xs_u_unencoded)]
    example_fields = {'x': ('x', text_field), 'y': ('y', label_field)}
    new_examples = [Example.fromdict(x, example_fields) for x in new_labeled_data]

    return labeled_examples + new_examples, aug_acc, frac_used
Example #11
    def __init__(self, src_path, tgt_path, src_field, tgt_field, **kwargs):
        fields = {"src": ("src", src_field), "tgt": ("tgt", tgt_field)}
        examples = []
        # Read the parallel files in lockstep, one sentence pair per line.
        with open(src_path) as src_file, open(tgt_path) as tgt_file:
            for src, tgt in zip(src_file, tgt_file):
                example = Example.fromdict({
                    "src": src.strip(),
                    "tgt": tgt.strip()
                }, fields)
                examples.append(example)

        if isinstance(fields, dict):
            fields, field_dict = [], fields
            for field in field_dict.values():
                if isinstance(field, list):
                    fields.extend(field)
                else:
                    fields.append(field)

        super(BilingualDataset, self).__init__(examples, fields, **kwargs)
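
A minimal usage sketch for BilingualDataset; the file names and Field settings are illustrative, assuming the legacy torchtext Field API.

SRC = torchtext.data.Field(tokenize=str.split, init_token="<s>", eos_token="</s>")
TGT = torchtext.data.Field(tokenize=str.split, init_token="<s>", eos_token="</s>")
parallel = BilingualDataset("train.src", "train.tgt", SRC, TGT)
SRC.build_vocab(parallel, max_size=50000)
TGT.build_vocab(parallel, max_size=50000)
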
Example #12
    def seg(self, sentences):
        examples = []
        fields = [('unigram', self.unigram_field), ('fwd_bigram', self.bigram_field), ('back_bigram', self.bigram_field)]
        for sent in sentences:
            columns = [[], [], []]
            chars = ['<BOS>'] + list(sent) + ['<EOS>']
            for c, f_bi, b_bi in zip(chars[1:-1], zip(chars, chars[1:]), zip(chars[1:], chars[2:])):
                fwd_bi = ''.join(f_bi)
                back_bi = ''.join(b_bi)
                columns[0].append(c)
                columns[1].append(fwd_bi)
                columns[2].append(back_bi)
            examples.append(Example.fromlist(columns, fields))

        dataset = data.Dataset(examples, fields)
        data_iter = data.BucketIterator(dataset, batch_size=64, train=False, shuffle=False, sort=False, device=device)

        decoded = self.model.decode(data_iter)
        segmented_sentence = self.BMSE2seg(sentences, decoded)
        return segmented_sentence
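
To illustrate the character windowing inside seg, this standalone snippet prints the unigram/forward-bigram/backward-bigram triples produced for a short input.

sent = "abc"
chars = ['<BOS>'] + list(sent) + ['<EOS>']
rows = [(c, ''.join(f_bi), ''.join(b_bi))
        for c, f_bi, b_bi in zip(chars[1:-1], zip(chars, chars[1:]), zip(chars[1:], chars[2:]))]
print(rows)
# [('a', '<BOS>a', 'ab'), ('b', 'ab', 'bc'), ('c', 'bc', 'c<EOS>')]
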
Example #13
    def __init__(self, path):
        """
        Create a Castor dataset involving pairs of texts
        """
        fields = [('id', self.ID_FIELD), ('sentence_1', self.TEXT_FIELD),
                  ('sentence_2', self.TEXT_FIELD),
                  ('ext_feats', self.EXT_FEATS_FIELD),
                  ('label', self.LABEL_FIELD),
                  ('sentence_1_raw', self.RAW_TEXT_FIELD),
                  ('sentence_2_raw', self.RAW_TEXT_FIELD)]

        examples = []

        ids, labels, sent_list_1, sent_list_2 = [], [], [], []
        with open(path) as f:
            for line in f:
                content = json.loads(line)
                sent_list_1.append(content['question'])
                sent_list_2.append(content['qaquestion'])

        word_to_doc_cnt = get_pairwise_word_to_doc_freq(
            sent_list_1, sent_list_2)
        overlap_feats = get_pairwise_overlap_features(sent_list_1, sent_list_2,
                                                      word_to_doc_cnt)
        self.word_to_doc_cnt = word_to_doc_cnt

        with open(path) as f:
            for line in f:
                content = json.loads(line)
                ids.append(content['qid'])
                labels.append(content['qarel'])

        for pair_id, l1, l2, ext_feats, label in zip(ids, sent_list_1,
                                                     sent_list_2,
                                                     overlap_feats, labels):
            example = Example.fromlist([
                pair_id, l1, l2, ext_feats, label, ' '.join(l1), ' '.join(l2)
            ], fields)
            examples.append(example)

        super(SemevalDataset, self).__init__(examples, fields)
Example #14
    def create_example_objs(self, hard_training_instances: list) -> list:
        """
        Create `Example` objects from the list of hard training instances
        This method will return a list of `Example` objects that will
        be used to extend the Data Iterator

        Arguments:
            hard_training_instances: List of hard training instances across all batches

        Returns:
            A list of `Example` torchtext objects
        """

        example_objs = []
        for i in range(len(hard_training_instances)):
            example = Example()
            setattr(example, "src", list(hard_training_instances[i][0][0]))
            setattr(example, "trg", list(hard_training_instances[i][0][1]))
            example_objs.append(example)

        return example_objs
Example #15
    def __init__(self,
                 path,
                 qnum_field,
                 sent_field,
                 page_field,
                 confidence_field,
                 text_field,
                 unigram_field,
                 bigram_field,
                 trigram_field,
                 example_mode='sentence',
                 use_wiki=False,
                 n_wiki_sentences=3,
                 replace_title_mentions='',
                 **kwargs):
        from unidecode import unidecode

        if use_wiki and 'train' in path:
            base_path = os.path.dirname(path)
            filename = os.path.basename(s3_wiki)
            output_file = os.path.join(base_path, filename)
            if not os.path.exists(output_file):
                download_from_url(s3_wiki, output_file)
            with open(output_file) as f:
                self.wiki_lookup = json.load(f)
        else:
            self.wiki_lookup = {}
        self.path = path
        self.example_mode = example_mode

        text_dependent_fields = []
        if text_field is not None:
            text_dependent_fields.append(('text', text_field))
        if unigram_field is not None:
            text_dependent_fields.append(('unigram', unigram_field))
        if bigram_field is not None:
            text_dependent_fields.append(('bigram', bigram_field))
        if trigram_field is not None:
            text_dependent_fields.append(('trigram', trigram_field))

        example_fields = {
            'qnum': [('qnum', qnum_field)],
            'sent': [('sent', sent_field)],
            'page': [('page', page_field)],
            'confidence': [('confidence', confidence_field)],
            'text': text_dependent_fields
        }

        examples = []
        answer_set = set()
        with open(path) as f:
            for ex in json.load(f)['questions']:
                if example_mode == 'sentence':
                    sentences = ex['sentences']
                    confidences = ex['confidences']
                    for i, s in enumerate(sentences):
                        if (len(confidences[i]) != len(s)):
                            raise ValueError(str(len(confidences[i])),
                                             str(len(s)), ex['qnum'])

                        examples.append(
                            Example.fromdict(
                                {
                                    'qnum': ex['qnum'],
                                    'sent': i,
                                    'text': s,
                                    'page': ex['page'],
                                    'confidence': confidences[i]
                                }, example_fields))
                        answer_set.add(ex['page'])
                elif example_mode == 'question':
                    raise NotImplementedError(
                        'Question tokenization is not implemented yet, submit a PR!'
                    )
                elif example_mode == 'runs':
                    raise NotImplementedError(
                        'Run tokenization is not implemented yet, submit a PR!'
                    )
                else:
                    raise ValueError(
                        f"Valid modes are 'sentence', 'question', and 'runs', but '{example_mode}' was given"
                    )

        if use_wiki and n_wiki_sentences > 0 and 'train' in path:
            for page in answer_set:
                if page in self.wiki_lookup:
                    sentences = extract_wiki_sentences(
                        page,
                        self.wiki_lookup[page]['text'],
                        n_wiki_sentences,
                        replace_title_mentions=replace_title_mentions)
                    for i, s in enumerate(sentences):
                        examples.append(
                            Example.fromdict(
                                {
                                    'qnum': -1,
                                    'sent': i,
                                    'text': s,
                                    'page': page
                                }, example_fields))

        dataset_fields = {
            'qnum': qnum_field,
            'sent': sent_field,
            'page': page_field,
            'confidence': confidence_field,
        }
        if text_field is not None:
            dataset_fields['text'] = text_field
        if unigram_field is not None:
            dataset_fields['unigram'] = unigram_field
        if bigram_field is not None:
            dataset_fields['bigram'] = bigram_field
        if trigram_field is not None:
            dataset_fields['trigram'] = trigram_field

        super(QuizBowl, self).__init__(examples, dataset_fields, **kwargs)
Example #16
File: dataset.py  Project: NPSDC/qb
    def __init__(
        self,
        path,
        qanta_id_field,
        sent_field,
        page_field,
        text_field,
        unigram_field,
        bigram_field,
        trigram_field,
        example_mode="sentence",
        use_wiki=False,
        n_wiki_sentences=3,
        replace_title_mentions="",
        **kwargs,
    ):
        from unidecode import unidecode

        if use_wiki and "train" in path:
            base_path = os.path.dirname(path)
            filename = os.path.basename(s3_wiki)
            output_file = os.path.join(base_path, filename)
            if not os.path.exists(output_file):
                download_from_url(s3_wiki, output_file)
            with open(output_file) as f:
                self.wiki_lookup = json.load(f)
        else:
            self.wiki_lookup = {}
        self.path = path
        self.example_mode = example_mode

        text_dependent_fields = []
        if text_field is not None:
            text_dependent_fields.append(("text", text_field))
        if unigram_field is not None:
            text_dependent_fields.append(("unigram", unigram_field))
        if bigram_field is not None:
            text_dependent_fields.append(("bigram", bigram_field))
        if trigram_field is not None:
            text_dependent_fields.append(("trigram", trigram_field))

        example_fields = {
            "qanta_id": [("qanta_id", qanta_id_field)],
            "sent": [("sent", sent_field)],
            "page": [("page", page_field)],
            "text": text_dependent_fields,
        }

        examples = []
        answer_set = set()
        with open(path) as f:
            for ex in json.load(f)["questions"]:
                if example_mode == "sentence":
                    sentences = [
                        ex["text"][start:end]
                        for start, end in ex["tokenizations"]
                    ]
                    for i, s in enumerate(sentences):
                        examples.append(
                            Example.fromdict(
                                {
                                    "qanta_id": ex["qanta_id"],
                                    "sent": i,
                                    "text": unidecode(s),
                                    "page": ex["page"],
                                },
                                example_fields,
                            ))
                        answer_set.add(ex["page"])
                elif example_mode == "question":
                    examples.append(
                        Example.fromdict(
                            {
                                "qanta_id": ex["qanta_id"],
                                "sent": -1,
                                "text": unidecode(ex["text"]),
                                "page": ex["page"],
                            },
                            example_fields,
                        ))
                    answer_set.add(ex["page"])
                else:
                    raise ValueError(
                        f"Valid modes are 'sentence' and 'question', but '{example_mode}' was given"
                    )

        if use_wiki and n_wiki_sentences > 0 and "train" in path:
            print("Loading wikipedia")
            pages = [(p, self.wiki_lookup[p]["text"]) for p in answer_set
                     if p in self.wiki_lookup]

            def extract(args):
                title, text = args
                sentences = extract_wiki_sentences(
                    title,
                    text,
                    n_wiki_sentences,
                    replace_title_mentions=replace_title_mentions,
                )
                return title, sentences

            for page, sentences in pseq(pages).map(extract).list():
                for i, s in enumerate(sentences):
                    examples.append(
                        Example.fromdict(
                            {
                                "qanta_id": -1,
                                "sent": i,
                                "text": s,
                                "page": page
                            },
                            example_fields,
                        ))

        dataset_fields = {
            "qanta_id": qanta_id_field,
            "sent": sent_field,
            "page": page_field,
        }
        if text_field is not None:
            dataset_fields["text"] = text_field
        if unigram_field is not None:
            dataset_fields["unigram"] = unigram_field
        if bigram_field is not None:
            dataset_fields["bigram"] = bigram_field
        if trigram_field is not None:
            dataset_fields["trigram"] = trigram_field

        super(QuizBowl, self).__init__(examples, dataset_fields, **kwargs)
Example #17
def repeat_augment_and_train(dir_to_save, iter_func, model_wrapper, data_source, aug_algo, encoder_model, sim_measure, datasets, text_field, label_field, frac, num_classes, classifier_params, k, learning_type):
    """
    Runs k trials of augmentation & repeat-classification for a given fraction of labeled training data.
    Args:
        dir_to_save (str): directory to save models created/loaded during this process
        aug_algo (str): which augmentation algorithm to use
        encoder_model (str): encoder model to use for augmentation (with a similarity measure between these encodings)
        sim_measure (str): which similarity measure to use
        datasets (list(Dataset)): train/val/test torchtext datasets
        text_field (Field): torchtext field for sentences
        label_field (LabelField): torchtext LabelField for class labels
        frac (float): Fraction of labeled training data to use
        classifier_params (dict): params for intent classifier to use on augmented data.
        k (int): Number of times to repeat augmentation-classifier training process
        learning_type (str): inductive|transductive
    Returns:
        8 statistical measures of the results of these trials
    """
    train_ds, val_ds, test_ds = datasets
    class_accs, aug_accs, aug_fracs = [], [], []
    ps, rs, fs = [], [], []

    # FOR ENTROPY HEURISTIC
    # mst_sigmas, entropies, sigmas, accs, fracs = [], [], [], [], []

    # # ABLATION STUDY
    # sigmas, f1_means, f1_stds, aug_acc_means, aug_acc_stds, frac_used_means, frac_used_stds = [],[],[],[],[],[],[]
    # for sigma in np.arange(0.035, 0.155, 0.005):
    #     sigmas.append(sigma)

    for i in tqdm(range(k), total=k):
        examples = train_ds.examples
        np.random.shuffle(examples)
        cutoff = int(frac*len(examples))
        if learning_type == "transductive":
            labeled_examples = train_ds.examples
            unlabeled_examples = test_ds.examples
        elif frac == 0: # 1 labeled eg from each class
            classes_seen = {i: 0 for i in range(num_classes)}
            labeled_examples, unlabeled_examples = [], []
            for eg in examples:
                if classes_seen[eg.y] == 0:
                    labeled_examples.append(eg)
                    classes_seen[eg.y] += 1
                else:
                    unlabeled_examples.append(eg)
        else: # at least one labeled eg from each class
            while True:
                labeled_examples = examples[:cutoff]
                unlabeled_examples = examples[cutoff:]
                if len(set([eg.y for eg in labeled_examples])) == num_classes:
                    break
                np.random.shuffle(examples)

        ##################################################################################################################
        # PROPAGATION PROCESS VISUALISATION (FOR DEMO)
        # from matplotlib import pyplot as plt
        # from pandas import DataFrame
        # from sklearn.decomposition import PCA
        # from sklearn.manifold import TSNE
        # import matplotlib.transforms as transforms

        # # EXTRACT DATA & COMPUTE DIM_REDUCED EMBEDDINGS
        # pickle.dump(labeled_examples, Path(f'./paper/{frac}_labeled_egs.pkl').open('wb'))
        # pickle.dump(unlabeled_examples, Path(f'./paper/{frac}_unlabeled_egs.pkl').open('wb'))
        # labeled_examples = pickle.load(Path(f'./paper/{frac}_labeled_egs.pkl').open('rb'))
        # unlabeled_examples = pickle.load(Path(f'./paper/{frac}_unlabeled_egs.pkl').open('rb'))
        # intents = pickle.load(Path(f'./data/ic/{data_source}/intents.pkl').open('rb'))
        # res = encode_data_with_pretrained(data_source, train_ds, test_ds, text_field, encoder_model, labeled_examples, unlabeled_examples)
        # x_l, y_l, x_u, y_u, _ = res
        # X = np.concatenate([x_l, x_u])
        # Y = np.concatenate([y_l, y_u])
        # pca = PCA(n_components=100)
        # pca_res = pca.fit_transform(X)
        # tsne = TSNE(n_components=2, verbose=0, perplexity=30, n_iter=1000)
        # tsne_pca_res = tsne.fit_transform(pca_res)
        # ts1, ts2 = tsne_pca_res[:,0], tsne_pca_res[:,1]
        # df_tsne_pca = DataFrame([{
        #     'intent': intents[y],
        #     'x-tsne-pca': t1,
        #     'y-tsne-pca': t2,
        #     'og_idx': idx
        # } for idx, (y,t1,t2) in enumerate(zip(Y,ts1,ts2))])
        # df_tsne_pca.to_pickle(f'./paper/{frac}_dataframe.pkl')
        # df_tsne_pca = pd.read_pickle(f'./paper/{frac}_dataframe.pkl')

        # # PLOT INITIAL DATASET
        # fig, ax = plt.subplots()
        # n_l = len(labeled_examples)
        # for idx, intent in enumerate(set(df_tsne_pca['intent'].values)):
        #     values = [v for v in df_tsne_pca.loc[df_tsne_pca['intent']==intent].drop(columns=['intent']).values]
        #     for i, v in enumerate(values):
        #         if v[0] < n_l:
        #             ax.scatter(v[1], v[2], color=f'C{idx}', s=100, alpha=1, label=intent)
        #         else:
        #             ax.scatter(v[1], v[2], color='black', s=100, alpha=0.2)
        # title = 'propagation_initial_labeled_only'
        # for idx, intent in enumerate(set(df_tsne_pca['intent'].values)):
        #     values = [v for v in df_tsne_pca.loc[df_tsne_pca['intent']==intent].drop(columns=['intent']).values]
        #     ax.scatter([v[1] for v in values], [v[2] for v in values], color=f'C{idx}', s=100, alpha=1, label=intent)
        # title = 'propagation_initial_all'
        # ax.grid(b=False)
        # ax.set_ylim(-7.6, 12.5)
        # ax.set_xlim(-10.5, 5.2)
        # fig.set_size_inches(15, 10)
        # plt.legend(loc='lower right', frameon=True, fancybox=True, shadow=True, fontsize='large')
        # plt.savefig(f'./paper/{title}.pdf', format='pdf', dpi=100)
        # plt.show()
        # assert(False)
        
        # # PRELIMINARY DATA FOR MAIN PLOT
        # dim_reduced_points = [0 for _ in range(100)]
        # for idx, intent in enumerate(set(df_tsne_pca['intent'].values)):
        #     values = [v for v in df_tsne_pca.loc[df_tsne_pca['intent']==intent].drop(columns=['intent']).values]
        #     for v in values:
        #         dim_reduced_points[int(v[0])] = (v[1:],intent)
        # data = pickle.load(Path('./paper/propagation_data.pkl').open('rb'))
        # indices = pickle.load(Path('./paper/indices_data.pkl').open('rb'))
        # classifications = pickle.load(Path('./paper/classifications_data.pkl').open('rb'))
        # colors = {'findconnection': 'C1', 'departuretime': 'C0'}
        # intent_map = {0: 'findconnection', 1: 'departuretime'}
        # classified_indices = [0, 1]
        # classified_true_labels = ['findconnection', 'departuretime']
        # classified_intents = ['findconnection', 'departuretime']
        # classified_xs = [dim_reduced_points[i][0][0] for i in classified_indices]
        # classified_ys = [dim_reduced_points[i][0][1] for i in classified_indices]

        # # PLOT EACH RECURSION & PROPAGATION ITERATION
        # with plt.style.context('seaborn-whitegrid'):
        #     plt.rcParams['font.family'] = 'serif'
        #     plt.rcParams['mathtext.fontset'] = 'dejavuserif'

        #     # starting point plot
        #     title = '0_final'
        #     fig, ax = plt.subplots()
        #     unclassified_indices = [i for i in range(100) if i not in classified_indices]
        #     unclassified_xs = [dim_reduced_points[i][0][0] for i in unclassified_indices]
        #     unclassified_ys = [dim_reduced_points[i][0][1] for i in unclassified_indices]
        #     ax.scatter(unclassified_xs, unclassified_ys, color='black', s=100, alpha=0.2)
        #     ax.scatter(classified_xs[1], classified_ys[1], color=colors[classified_intents[1]], marker='s', s=200, alpha=1, label=classified_intents[1])
        #     ax.scatter(classified_xs[0], classified_ys[0], color=colors[classified_intents[0]], marker='s', s=200, alpha=1, label=classified_intents[0])
        #     ax.text(2, 10, 'Recursion 0 -- complete', fontsize=15, color='black', ha="center", va="center")
        #     ax.grid(b=False)
        #     ax.set_ylim(-7.6, 12.5)
        #     ax.set_xlim(-10.5, 5.2)
        #     fig.set_size_inches(15, 10)
        #     plt.legend(loc='lower right', frameon=True, fancybox=True, shadow=True, fontsize='large')
        #     plt.savefig(f'./paper/prop_plots_2/{title}.png', format='png', dpi=150)
        #     plt.close()

        #     for recursion_idx, prop_data in tqdm(enumerate(data), total=len(data)):
        #         # plot results during propagation
        #         Y_us = [prop_data[0]] if len(prop_data) == 1 else np.array(prop_data)[range(0, len(prop_data), 100)]
        #         for prop_idx, Y_u in enumerate(Y_us):
        #             title = f'{recursion_idx+1}_{(prop_idx+1)*100}'
        #             fig, ax = plt.subplots()
        #             for idx, row in enumerate(Y_u):
        #                 color = colors[intent_map[np.argmax(row)]]
        #                 prob = np.max(row)
        #                 ax.scatter(unclassified_xs[idx], unclassified_ys[idx], color=color, s=100, alpha=prob*0.75)
        #             for (x, y, intent, true_label) in zip(classified_xs[2:], classified_ys[2:], classified_intents[2:], classified_true_labels[2:]):
        #                 ax.scatter(x, y, color=colors[intent], marker='s', s=100, alpha=1)
        #                 if intent != true_label:
        #                     ax.scatter(x, y, color='black', marker='x', s=150, alpha=1)
        #             ax.scatter(classified_xs[1], classified_ys[1], color=colors[classified_intents[1]], marker='s', s=200, alpha=1, label=classified_intents[1])
        #             ax.scatter(classified_xs[0], classified_ys[0], color=colors[classified_intents[0]], marker='s', s=200, alpha=1, label=classified_intents[0])
        #             ax.text(2, 10, f'Recursion {recursion_idx+1} -- iterating...', fontsize=15, color='black', ha="center", va="center")
        #             ax.grid(b=False)
        #             ax.set_ylim(-7.6, 12.5)
        #             ax.set_xlim(-10.5, 5.2)
        #             fig.set_size_inches(15, 10)
        #             plt.legend(loc='lower right', frameon=True, fancybox=True, shadow=True, fontsize='large')
        #             plt.savefig(f'./paper/prop_plots_2/{title}.png', format='png', dpi=150)
        #             plt.close()
                
        #         # plot the end result of each recursion - i.e. new ground truth classifications
        #         classified_indices += [i + 2 for i in indices[recursion_idx]]
        #         classified_xs = [dim_reduced_points[i][0][0] for i in classified_indices]
        #         classified_ys = [dim_reduced_points[i][0][1] for i in classified_indices]
        #         classified_true_labels = [dim_reduced_points[i][1] for i in classified_indices]
        #         classified_intents += [intent_map[intent_class] for intent_class in classifications[recursion_idx]]
        #         unclassified_indices = [i for i in range(100) if i not in classified_indices]
        #         unclassified_xs = [dim_reduced_points[i][0][0] for i in unclassified_indices]
        #         unclassified_ys = [dim_reduced_points[i][0][1] for i in unclassified_indices]
        #         title = f'{recursion_idx+1}_final'
        #         fig, ax = plt.subplots()
        #         ax.scatter(unclassified_xs, unclassified_ys, color='black', s=100, alpha=0.2)
        #         for (x, y, intent, true_label) in zip(classified_xs[2:], classified_ys[2:], classified_intents[2:], classified_true_labels[2:]):
        #             ax.scatter(x, y, color=colors[intent], marker='s', s=100, alpha=1)
        #             if intent != true_label:
        #                 ax.scatter(x, y, color='black', marker='x', s=150, alpha=1)
        #         ax.scatter(classified_xs[1], classified_ys[1], color=colors[classified_intents[1]], marker='s', s=200, alpha=1, label=classified_intents[1])
        #         ax.scatter(classified_xs[0], classified_ys[0], color=colors[classified_intents[0]], marker='s', s=200, alpha=1, label=classified_intents[0])
        #         ax.text(2, 10, f'Recursion {recursion_idx+1} -- complete', fontsize=15, color='black', ha="center", va="center")
        #         ax.grid(b=False)
        #         ax.set_ylim(-7.6, 12.5)
        #         ax.set_xlim(-10.5, 5.2)
        #         fig.set_size_inches(15, 10)
        #         plt.legend(loc='lower right', frameon=True, fancybox=True, shadow=True, fontsize='large')
        #         plt.savefig(f'./paper/prop_plots_2/{title}.png', format='png', dpi=150)
        #         plt.close()
    
        # assert(False)
        ##################################################################################################################

        # # ENTROPY HEURISTIC
        # res = encode_data_with_pretrained(data_source, train_ds, test_ds, text_field, encoder_model, labeled_examples, unlabeled_examples)
        # x_l, y_l, x_u, y_u, _ = res
        # mst_sigma, entropy, sigma, acc, frac_used = sigma_fit(x_l, y_l, x_u, y_u, num_classes, data_source)
        # mst_sigmas.append(mst_sigma); entropies.append(entropy); sigmas.append(sigma); accs.append(acc); fracs.append(frac_used)
        # continue

        if aug_algo == "eda":
            x_l, y_l = [eg.x for eg in labeled_examples], [eg.y for eg in labeled_examples]
            augmented_x_l, augmented_y_l = eda_corpus(x_l, y_l)
            new_labeled_data = [{'x': x, 'y': y} for x,y in zip(augmented_x_l, augmented_y_l)]
            augmented_train_examples = [Example.fromdict(x, {'x': ('x', text_field), 'y': ('y', label_field)}) for x in new_labeled_data]
            aug_acc = 1; frac_used = 0
        elif aug_algo == "none":
            augmented_train_examples = labeled_examples
            aug_acc = 1; frac_used = 0
        elif aug_algo == "self_feed":
            sf_thresh = 0.7
            augmented_train_examples, aug_acc, frac_used = self_feed(data_source, dir_to_save, iter_func, model_wrapper, labeled_examples, unlabeled_examples, val_ds, test_ds, text_field, label_field, classifier_params, thresh=sf_thresh)
        else:
            augmented_train_examples, aug_acc, frac_used = augment(data_source, aug_algo, encoder_model, sim_measure, labeled_examples, unlabeled_examples, train_ds, test_ds, text_field, label_field, num_classes, sigma=None)
        
        aug_accs.append(aug_acc); aug_fracs.append(frac_used)
        new_train_ds = data.Dataset(augmented_train_examples, {'x': text_field, 'y': label_field})
        new_datasets = (new_train_ds, val_ds, test_ds)

        if learning_type == "inductive":
            acc, p, r, f = do_basic_train_and_classify(new_train_ds, test_ds, classifier_params, data_source)
        else: # transductive
            predictions = [eg.y for eg in augmented_train_examples[len(train_ds.examples):]]
            test_Y = [eg.y for eg in test_ds.examples]
            acc = accuracy_score(predictions, test_Y)
            avg = "macro avg" if data_source == "chat" else "weighted avg"
            report = classification_report(predictions, test_Y, output_dict=True)[avg]
            p, r, f = report['precision'], report['recall'], report['f1-score']
        
        class_accs.append(acc); ps.append(p); rs.append(r); fs.append(f)

    # # ENTROPY HEURISTIC
    # print(np.mean(entropies), np.std(entropies))
    # print(np.mean(mst_sigmas), np.std(mst_sigmas))
    # print(np.mean(sigmas), np.std(sigmas))
    # print(np.mean(accs), np.std(accs))
    # print(np.mean(fracs), np.std(fracs))
    # assert(False)

    # # ABLATION STUDY
    # print(f"SIGMA: {sigma}")
    # f1_means.append(np.mean(class_accs)); f1_stds.append(np.std(class_accs))
    # aug_acc_means.append(np.mean(aug_accs)); aug_acc_stds.append(np.std(aug_accs))
    # frac_used_means.append(np.mean(aug_fracs)); frac_used_stds.append(np.std(aug_fracs))
    # assert(False)

    print(f"FRAC '{frac}' Results Below:")
    print(f'classification acc --> mean: {np.mean(class_accs)}; std: {np.std(class_accs)}')
    print(f'augmentation acc --> mean: {np.mean(aug_accs)}; std: {np.std(aug_accs)}\t (average frac used: {np.mean(aug_fracs)})')
    print(f'p/r/f1 means --> precision mean: {np.mean(ps)}; recall mean: {np.mean(rs)}; f1 mean: {np.mean(fs)}')
    print(f'p/r/f1 stds --> precision std: {np.std(ps)}; recall std: {np.std(rs)}; f1 std: {np.std(fs)}')

    class_acc_mean, class_acc_std = np.mean(class_accs), np.std(class_accs)
    aug_acc_mean, aug_acc_std, aug_frac_mean = np.mean(aug_accs), np.std(aug_accs), np.mean(aug_fracs)
    p_mean, r_mean, f_mean = np.mean(ps), np.mean(rs), np.mean(fs)
    p_std, r_std, f_std = np.std(ps), np.std(rs), np.std(fs)
    
    # # ABLATION STUDY
    # print([round(s, 3) for s in sigmas])
    # print(f1_means)
    # print(f1_stds)
    # print(aug_acc_means)
    # print(aug_acc_stds)
    # print(frac_used_means)
    # print(frac_used_stds)
    # assert(False)

    return class_acc_mean, class_acc_std, aug_acc_mean, aug_acc_std, aug_frac_mean, p_mean, p_std, r_mean, r_std, f_mean, f_std
Example #18
    def __init__(self,
                 path,
                 qanta_id_field,
                 sent_field,
                 page_field,
                 text_field,
                 unigram_field,
                 bigram_field,
                 trigram_field,
                 example_mode='sentence',
                 use_wiki=False,
                 n_wiki_sentences=3,
                 replace_title_mentions='',
                 **kwargs):
        from unidecode import unidecode

        if use_wiki and 'train' in path:
            base_path = os.path.dirname(path)
            filename = os.path.basename(s3_wiki)
            output_file = os.path.join(base_path, filename)
            if not os.path.exists(output_file):
                download_from_url(s3_wiki, output_file)
            with open(output_file) as f:
                self.wiki_lookup = json.load(f)
        else:
            self.wiki_lookup = {}
        self.path = path
        self.example_mode = example_mode

        text_dependent_fields = []
        if text_field is not None:
            text_dependent_fields.append(('text', text_field))
        if unigram_field is not None:
            text_dependent_fields.append(('unigram', unigram_field))
        if bigram_field is not None:
            text_dependent_fields.append(('bigram', bigram_field))
        if trigram_field is not None:
            text_dependent_fields.append(('trigram', trigram_field))

        example_fields = {
            'qanta_id': [('qanta_id', qanta_id_field)],
            'sent': [('sent', sent_field)],
            'page': [('page', page_field)],
            'text': text_dependent_fields
        }

        examples = []
        answer_set = set()
        with open(path) as f:
            for ex in json.load(f)['questions']:
                if example_mode == 'sentence':
                    sentences = [
                        ex['text'][start:end]
                        for start, end in ex['tokenizations']
                    ]
                    for i, s in enumerate(sentences):
                        examples.append(
                            Example.fromdict(
                                {
                                    'qanta_id': ex['qanta_id'],
                                    'sent': i,
                                    'text': unidecode(s),
                                    'page': ex['page']
                                }, example_fields))
                        answer_set.add(ex['page'])
                elif example_mode == 'question':
                    examples.append(
                        Example.fromdict(
                            {
                                'qanta_id': ex['qanta_id'],
                                'sent': -1,
                                'text': unidecode(ex['text']),
                                'page': ex['page']
                            }, example_fields))
                    answer_set.add(ex['page'])
                else:
                    raise ValueError(
                        f"Valid modes are 'sentence' and 'question', but '{example_mode}' was given"
                    )

        if use_wiki and n_wiki_sentences > 0 and 'train' in path:
            print('Loading wikipedia')
            pages = [(p, self.wiki_lookup[p]['text']) for p in answer_set
                     if p in self.wiki_lookup]

            def extract(args):
                title, text = args
                sentences = extract_wiki_sentences(
                    title,
                    text,
                    n_wiki_sentences,
                    replace_title_mentions=replace_title_mentions)
                return title, sentences

            for page, sentences in pseq(pages).map(extract).list():
                for i, s in enumerate(sentences):
                    examples.append(
                        Example.fromdict(
                            {
                                'qanta_id': -1,
                                'sent': i,
                                'text': s,
                                'page': page
                            }, example_fields))

        dataset_fields = {
            'qanta_id': qanta_id_field,
            'sent': sent_field,
            'page': page_field,
        }
        if text_field is not None:
            dataset_fields['text'] = text_field
        if unigram_field is not None:
            dataset_fields['unigram'] = unigram_field
        if bigram_field is not None:
            dataset_fields['bigram'] = bigram_field
        if trigram_field is not None:
            dataset_fields['trigram'] = trigram_field

        super(QuizBowl, self).__init__(examples, dataset_fields, **kwargs)
Example #19
        'ENC_EMB_DIM': 256,
        'DEC_EMB_DIM': 256,
        'ENC_HID_DIM': 512,
        'DEC_HID_DIM': 512,
        'ENC_DROPOUT': 0.5,
        'DEC_DROPOUT': 0.5
    }
    network = create_seq2seq(network_params, device)
    network.load_state_dict(torch.load('weights/tut1-model.pt'))

    # sentence = input('Enter sentence in german: ')
    sentence = 'Ein Hund rennt im Schnee.'
    while sentence != 'exit':
        # Convert custom sentence to tensor

        example = Example.fromlist([sentence], [('de', src_field)])
        batch = [example.de]
        idx_input = src_field.process(batch).to(device)

        # Translate this tensor
        output_probs = network(idx_input, None, 0)
        idx_output = output_probs.squeeze(1).argmax(axis=1)
        # TODO is actually probs, not idx

        # Convert back
        output_sentence = ' '.join([trg_field.vocab.itos[idx] for idx in idx_output])

        print(output_sentence)
        sentence = input('Enter sentence in german: ')

    def _make_example(self, src, tgt):
        return Example.fromdict({
            "src": src.strip(),
            "tgt": tgt.strip()
        }, self._fields)

    def read_one(self, data_file, dataset_type="train"):
        pkl_data = pickle.load(Path(data_file).open('rb'))
        examples = [Example.fromdict(x, self.fields1) for x in pkl_data]
        dataset = data.Dataset(examples, fields=self.fields2)
        return dataset
Example #22
def fromTSV(data, fields):
    return Example.fromlist(data.split('\t'), fields)
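
A hedged usage sketch for fromTSV; the Field definitions are illustrative.

TEXT = torchtext.data.Field(tokenize=str.split)
LABEL = torchtext.data.Field(sequential=False)
ex = fromTSV("the movie was great\tpositive", [("text", TEXT), ("label", LABEL)])
print(ex.text, ex.label)  # ['the', 'movie', 'was', 'great'] positive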