Пример #1
0
    def __init__(self, path, text_field, label_field, samples=None, cap=None):
        """Load a DailyDialog-style JSON file into torchtext Examples.

        Args:
            path: path to a JSON file containing a list of example dicts.
            text_field: torchtext field for the text column.
                NOTE(review): the module-level TEXT/LABEL fields are used
                below instead of these parameters — confirm intent.
            label_field: torchtext field for the label column (see note).
            samples: optional iterable of indices; when given, only those
                entries of the data are loaded.
            cap: optional int; truncate the example list to this length.

        Raises:
            TypeError: if ``cap`` is given but is not an int.
        """
        fields = {'text': ('text', TEXT), 'label': ('label', LABEL)}
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if samples:
            examples = [Example.fromdict(data[ix], fields) for ix in samples]
        else:
            examples = [Example.fromdict(d, fields) for d in data]
        if cap:
            if not isinstance(cap, int):
                # Bug fix: the original raised a plain string, which itself
                # raises "exceptions must derive from BaseException".
                raise TypeError(
                    "cap needs to be an instance of int, got {}".format(cap))
            if cap < len(examples):
                examples = examples[:cap]
        # Example.fromdict needs the dict form above, but the Dataset
        # constructor expects a flat list of (name, field) pairs.
        if isinstance(fields, dict):
            fields, fields_dict = [], fields
            for field in fields_dict.values():
                if isinstance(field, list):
                    fields.extend(field)
                else:
                    fields.append(field)

        super(DailyDialog, self).__init__(examples, fields)
    def __init__(self, fields, readers, data, dirs, sort_key,
                 filter_pred=None):
        """Build a torchtext Dataset from parallel reader streams.

        Args:
            fields: dict mapping side names ('src', 'tgt', ...) to fields.
            readers: one reader per data stream; each yields example dicts.
            data: (name, data) pairs, one per reader.
            dirs: directories handed to the matching reader.
            sort_key: callable used for bucketing by the iterators.
            filter_pred: optional example predicate applied by the
                superclass.
        """
        self.sort_key = sort_key
        # Copy attention is only built when both auxiliary fields were
        # registered by the caller.
        can_copy = 'src_map' in fields and 'alignment' in fields

        read_iters = [r.read(dat[1], dat[0], dir_) for r, dat, dir_
                      in zip(readers, data, dirs)]

        # self.src_vocabs is used in collapse_copy_scores and Translator.py
        self.src_vocabs = []
        examples = []
        # Each example is the union of one dict from every reader stream.
        for ex_dict in starmap(_join_dicts, zip(*read_iters)):
            if can_copy:
                src_field = fields['src']
                tgt_field = fields['tgt']
                # this assumes src_field and tgt_field are both text
                src_ex_vocab, ex_dict = _dynamic_dict(
                    ex_dict, src_field.base_field, tgt_field.base_field)
                self.src_vocabs.append(src_ex_vocab)
            if "tgt" in ex_dict:
                _add_tgt_plan(ex_dict, fields['tgt'].base_field, "tgt_plan")
            _add_segment_details(ex_dict, fields['src'].base_field, "src", "segment_lengths", "segment_count")
            # Example.fromdict expects {key: [(name, field)]}.
            ex_fields = {k: [(k, v)] for k, v in fields.items() if
                         k in ex_dict}
            ex = Example.fromdict(ex_dict, ex_fields)
            examples.append(ex)

        # fields needs to have only keys that examples have as attrs
        # NOTE(review): reuses ex_fields from the last loop iteration;
        # raises NameError if the dataset is empty.
        fields = []
        for _, nf_list in ex_fields.items():
            assert len(nf_list) == 1
            fields.append(nf_list[0])

        super(Dataset, self).__init__(examples, fields, filter_pred)
Пример #3
0
def annotate(model, doc, inp_fields):
    """Predict a relation for every span pair in *doc* and attach the
    resulting Relation annotations to the document in place.

    Args:
        model: relation classifier; called with word/chars/pos tensors and
            an attention mask, returns logits over relation labels.
        doc: document providing span pairs; mutated via ``doc.add``.
        inp_fields: dict of torchtext fields used to numericalize the
            span-pair examples.
    """
    # (The original also kept an unused ``examples`` list; removed.)
    for span1, span2, rel_type in span_pair_generator(doc):
        example = process_span_pair(span1, span2, doc, rel_type)
        if example is None:
            continue
        torch_ex = Example.fromdict(example, inp_fields)
        ex_tensors = {}
        for name in inp_fields:
            tgt_fields = inp_fields[name]
            if not isinstance(tgt_fields, list):
                tgt_fields = [tgt_fields]

            data = [getattr(torch_ex, name)]
            # Bug fix: the original reused ``name`` as the inner loop
            # variable, shadowing the outer one; use a distinct name.
            for sub_name, field in tgt_fields:
                ex_tensors[sub_name] = field.process(data)
        # 'text' unpacks to (tensor, lengths) — presumably the field was
        # built with include_lengths=True; confirm against field setup.
        word, length = ex_tensors['text']
        mask = get_mask(word, ex_tensors['pos1'], ex_tensors['pos2'],
                        length)
        logits = model(word=word,
                       chars=ex_tensors['chars'],
                       pos1=ex_tensors['pos1_rel'],
                       pos2=ex_tensors['pos2_rel'],
                       mask=mask)
        _, output = torch.max(logits, dim=1)
        output = [int(x.item()) for x in output]
        relation = Relation(source='nre',
                            label=Label(value=str(output[0])),
                            annotation_from=span1,
                            annotation_to=span2,
                            doc=doc)
        doc.add(relation)
Пример #4
0
    def __init__(self, fields, path, filter_pred=None):
        """Read SIGMORPHON-style tsv file(s) into a torchtext dataset.

        Args:
            fields: dict of torchtext field associations; NOTE: mutated in
                place — 'trg' is popped when a line has no target column.
            path: glob pattern or explicit list of file paths.
            filter_pred: passed through to the Dataset superclass.
        """

        paths = glob(path) if isinstance(path, str) else path
        assert len(paths) > 0
        # Sort for a deterministic file order regardless of glob results.
        paths.sort()
        examples = []
        for p in paths:
            with open(p) as f:
                language = lang_name(p) if 'language' in fields else None

                for line in f:
                    line = line.strip()
                    if line:
                        ex_dict = dict()
                        if language is not None:
                            ex_dict["language"] = language
                        line_fields = line.strip().split('\t')
                        if len(line_fields) == 3:
                            src, trg, inflection = line_fields
                            ex_dict['trg'] = trg
                        else:
                            # Two-column (test-style) line: drop the field
                            # so Example.fromdict doesn't expect 'trg'.
                            fields.pop("trg", None)  # hmm

                        ex_dict["src"] = src
                        ex_dict["inflection"] = inflection

                        ex = Example.fromdict(ex_dict, fields)
                        examples.append(ex)

        # Flatten the field-association values into a {name: field} dict
        # for the superclass.
        fields = dict(chain.from_iterable(fields.values()))
        super(SigmorphonDataset, self).__init__(examples, fields, filter_pred)
    def __init__(self,
                 fields,
                 readers,
                 data,
                 dirs,
                 sort_key,
                 filter_pred=None):
        """Build a Dataset for question/answer -> target generation.

        Unlike the stock OpenNMT Dataset, the copy mechanism here builds
        the per-example dynamic dict from both the question ('ques') and
        answer ('ans') fields in addition to the target.
        """
        self.sort_key = sort_key
        # Copy attention needs both auxiliary fields to be registered.
        can_copy = 'src_map' in fields and 'alignment' in fields

        read_iters = [
            r.read(dat[1], dat[0], dir_)
            for r, dat, dir_ in zip(readers, data, dirs)
        ]

        # self.src_vocabs is used in collapse_copy_scores and Translator.py
        self.src_vocabs = []
        examples = []
        for ex_dict in starmap(_join_dicts, zip(*read_iters)):
            if can_copy:
                ques_field = fields['ques']
                ans_field = fields['ans']
                tgt_field = fields['tgt']
                # this assumes the fields involved are all text
                src_ex_vocab, ex_dict = _dynamic_dict(ex_dict,
                                                      ques_field.base_field,
                                                      ans_field.base_field,
                                                      tgt_field.base_field)
                self.src_vocabs.append(src_ex_vocab)

            # Example.fromdict expects {key: [(name, field)]}.
            ex_fields = {
                k: [(k, v)]
                for k, v in fields.items() if k in ex_dict
            }

            ex = Example.fromdict(ex_dict, ex_fields)
            examples.append(ex)

        # fields needs to have only keys that examples have as attrs
        # NOTE(review): reuses ex_fields from the last loop iteration;
        # raises NameError if the dataset is empty.
        fields = []
        for _, nf_list in ex_fields.items():
            assert len(nf_list) == 1
            fields.append(nf_list[0])

        super(Dataset, self).__init__(examples, fields, filter_pred)
Пример #6
0
    def __init__(self, fields, path, filter_pred=None, decompose=True):
        """Read SIGMORPHON g2p tsv file(s) into a torchtext dataset.

        Args:
            fields: dict of torchtext field associations; NOTE: mutated in
                place — 'trg' is popped when a line has no target column.
            path: glob pattern or explicit list of file paths.
            filter_pred: passed through to the Dataset superclass.
            decompose: apply Unicode NFD normalization to the source
                (decomposes e.g. Korean syllable blocks into jamo).
        """

        paths = glob(path) if isinstance(path, str) else path
        assert len(paths) > 0
        # Sort for a deterministic file order regardless of glob results.
        paths.sort()
        examples = []
        for p in paths:
            with open(p) as f:
                language = g2p_lang_name(p) if 'language' in fields else None

                for line in f:
                    line = line.strip()
                    if line:
                        ex_dict = dict()
                        if language is not None:
                            ex_dict["language"] = language
                        line_fields = line.strip().split('\t')
                        assert 0 < len(line_fields) <= 2
                        src = line_fields[0]
                        if decompose:
                            # hard-coding the Korean decomposition
                            src = unicodedata.normalize("NFD", src)
                        ex_dict["src"] = src
                        if len(line_fields) == 2:
                            ex_dict['trg'] = line_fields[1]
                        else:
                            # No target column: drop the field so
                            # Example.fromdict doesn't expect 'trg'.
                            fields.pop("trg", None)

                        ex = Example.fromdict(ex_dict, fields)
                        examples.append(ex)

        # Flatten the field-association values into a {name: field} dict
        # for the superclass.
        fields = dict(chain.from_iterable(fields.values()))
        super(SigmorphonG2PDataset, self).__init__(examples, fields,
                                                   filter_pred)
Пример #7
0
    def __init__(self, root_path, img_dir, filename, fields, train, **kwargs):
        """Build caption examples from a pickled annotation file.

        Each example pairs a caption with its precomputed image feature;
        when random cropping is enabled for training, the raw image path
        is kept so the image can be reloaded and cropped on the fly.
        """
        with open(os.path.join(root_path, filename), 'rb') as f:
            data = pickle.load(f)

        rand_crop = kwargs.get('rand_crop', False)
        self.img_transform = (
            preprocess_rc if (train and rand_crop) else preprocess_1c)
        self.train = train
        self.cap_field = fields['caption'][1]

        raw_examples = []
        for caption_idx, entry in enumerate(data['captions']):
            image_id = entry['image_id']
            full_img_path = (
                os.path.join(root_path, img_dir, entry['image_path'])
                if rand_crop else None)
            raw_examples.append({
                'image_id': image_id,
                'img_to_load': full_img_path,
                'img_1c_feat': torch.Tensor(data['features'][image_id]),
                'caption': entry['caption'],
                'caption_id': caption_idx,
            })
        examples = [Example.fromdict(d, fields) for d in raw_examples]
        super(ImageCaptionDataset, self).__init__(examples, fields.values())
Пример #8
0
    def __init__(self,
                 fields,
                 src_examples_iter,
                 tgt_examples_iter,
                 filter_pred=None):
        """Build a dataset from (src, tgt) example-dict iterators.

        Args:
            fields: {key: [(name, field)]}-style dict of torchtext fields.
            src_examples_iter: iterator of source example dicts.
            tgt_examples_iter: iterator of target example dicts, or None
                for inference-time (source-only) datasets.
            filter_pred: optional predicate applied by the superclass.
        """

        # Copy attention needs both auxiliary fields to be registered.
        dynamic_dict = 'src_map' in fields and 'alignment' in fields

        if tgt_examples_iter is not None:
            examples_iter = (self._join_dicts(
                src,
                tgt) for src, tgt in zip(src_examples_iter, tgt_examples_iter))
        else:
            examples_iter = src_examples_iter

        # self.src_vocabs is used in collapse_copy_scores and Translator.py
        self.src_vocabs = []
        examples = []
        for ex_dict in examples_iter:
            if dynamic_dict:
                # fields values are [(name, field)] lists in this variant.
                src_field = fields['src'][0][1]
                tgt_field = fields['tgt'][0][1]
                src_vocab, ex_dict = self._dynamic_dict(
                    ex_dict, src_field, tgt_field)
                self.src_vocabs.append(src_vocab)
            ex_fields = {k: v for k, v in fields.items() if k in ex_dict}
            ex = Example.fromdict(ex_dict, ex_fields)
            examples.append(ex)

        # the dataset's self.fields should have the same attributes as examples
        # NOTE(review): reuses ex_fields from the last loop iteration;
        # raises NameError if the dataset is empty.
        fields = dict(chain.from_iterable(ex_fields.values()))

        super(DatasetBase, self).__init__(examples, fields, filter_pred)
Пример #9
0
    def __init__(self, fields, readers, data, dirs, sort_key,
                 filter_pred=None):
        """Build a Dataset, applying ``filter_pred`` eagerly so that
        ``self.src_vocabs`` stays aligned with the surviving examples.

        Args:
            fields: dict mapping side names to torchtext fields.
            readers: one reader per data stream.
            data: (name, data) pairs, one per reader.
            dirs: directories handed to the matching reader.
            sort_key: callable used for bucketing by the iterators.
            filter_pred: optional predicate; failing examples are dropped
                here (and again by the superclass, which is harmless).
        """
        self.sort_key = sort_key
        can_copy = 'src_map' in fields and 'alignment' in fields
        read_iters = [r.read(dat[1], dat[0], dir_) for r, dat, dir_
                      in zip(readers, data, dirs)]
        # self.src_vocabs is used in collapse_copy_scores and Translator.py
        self.src_vocabs = []
        examples = []
        for ex_dict in starmap(_join_dicts, zip(*read_iters)):
            src_ex_vocab = None
            if can_copy:
                src_field = fields['src']
                tgt_field = fields['tgt']
                # this assumes src_field and tgt_field are both text
                src_ex_vocab, ex_dict = _dynamic_dict(
                    ex_dict, src_field.base_field, tgt_field.base_field)
            ex_fields = {k: [(k, v)] for k, v in fields.items() if
                         k in ex_dict}
            ex = Example.fromdict(ex_dict, ex_fields)
            # Equivalent to the original's
            # (filter_pred and filter_pred(ex)) or filter_pred is None.
            if filter_pred is None or filter_pred(ex):
                examples.append(ex)
                if can_copy:
                    self.src_vocabs.append(src_ex_vocab)

        # fields needs to have only keys that examples have as attrs
        fields = []
        for _, nf_list in ex_fields.items():
            assert len(nf_list) == 1
            fields.append(nf_list[0])
        # Bug fix: the original asserted this unconditionally, which fails
        # whenever can_copy is False (src_vocabs is never appended then).
        if can_copy:
            assert len(examples) == len(self.src_vocabs), \
                "example {}, src_vocabs {}".format(
                    len(examples), len(self.src_vocabs))

        super(Dataset, self).__init__(examples, fields, filter_pred)
Пример #10
0
    def from_raw(cls, fields, readers, data, dirs, sort_key, filter_pred=None):
        """Alternate constructor: build examples and per-example source
        vocabularies from raw reader streams, then delegate to ``cls``.

        Presumably decorated with @classmethod at the definition site
        (the decorator is not visible in this chunk).
        """
        # filter_pred would desynchronize examples from src_vocabs, hence
        # the hard requirement that it is None.
        assert filter_pred is None, 'filter_pred != None f***s up the data'
        can_copy = 'src_map' in fields and 'alignment' in fields

        read_iters = [
            r.read(dat[1], dat[0], dir_)
            for r, dat, dir_ in zip(readers, data, dirs)
        ]

        # self.src_vocabs is used in collapse_copy_scores and Translator.py
        src_vocabs = []
        examples = []
        for ex_dict in starmap(_join_dicts, zip(*read_iters)):
            if can_copy:
                src_field = fields['src']
                tgt_field = fields['tgt']
                # this assumes src_field and tgt_field are both text
                src_ex_vocab, ex_dict = _dynamic_dict(ex_dict,
                                                      src_field.base_field,
                                                      tgt_field.base_field)
                src_vocabs.append(src_ex_vocab)
            # Example.fromdict expects {key: [(name, field)]}.
            ex_fields = {
                k: [(k, v)]
                for k, v in fields.items() if k in ex_dict
            }
            ex = Example.fromdict(ex_dict, ex_fields)
            examples.append(ex)

        # fields needs to have only keys that examples have as attrs
        # NOTE(review): reuses ex_fields from the last loop iteration;
        # raises NameError if the dataset is empty.
        fields = []
        for _, nf_list in ex_fields.items():
            assert len(nf_list) == 1
            fields.append(nf_list[0])

        return cls(examples, fields, sort_key, src_vocabs)
Пример #11
0
    def __init__(self,
                 fields,
                 src_data,
                 tgt_data,
                 filter_func=None,
                 sort_key=None):
        """Pair up source (gloss) and target (translation) lines into
        torchtext Examples, stripping newlines from both sides."""

        self.sort_key = sort_key

        example_fields = {name: [(name, field)]
                          for name, field in fields.items()}

        self.src_vocabs = []

        examples = []
        for index, (src_line, tgt_line) in enumerate(zip(src_data, tgt_data)):
            record = {
                "indices": index,
                "src": src_line.replace("\n", ""),
                "tgt": tgt_line.replace("\n", ""),
            }
            examples.append(Example.fromdict(record, example_fields))

        dataset_fields = [pairs[0] for pairs in example_fields.values()]

        super(TextDataset, self).__init__(examples, dataset_fields,
                                          filter_func)
Пример #12
0
    def __init__(self, fields, data, sort_key, filter_pred=None):
        """Build a dataset directly from an iterable of example dicts.

        Note: in this variant ``_dynamic_dict`` returns only the updated
        dict, storing the example vocab under ex_dict['src_ex_vocab'].
        """
        self.sort_key = sort_key
        # Copy attention needs both auxiliary fields to be registered.
        can_copy = 'src_map' in fields and 'alignment' in fields

        # self.src_vocabs is used in collapse_copy_scores and Translator.py
        self.src_vocabs = []
        examples = []
        for ex_dict in data:
            if can_copy:
                src_field = fields['src']
                tgt_field = fields['tgt']
                # this assumes src_field and tgt_field are both text
                ex_dict = _dynamic_dict(ex_dict, src_field.base_field,
                                        tgt_field.base_field)
                self.src_vocabs.append(ex_dict["src_ex_vocab"])
            # Example.fromdict expects {key: [(name, field)]}.
            ex_fields = {
                k: [(k, v)]
                for k, v in fields.items() if k in ex_dict
            }
            ex = Example.fromdict(ex_dict, ex_fields)
            examples.append(ex)

        # fields needs to have only keys that examples have as attrs
        # NOTE(review): reuses ex_fields from the last loop iteration;
        # raises NameError if the dataset is empty.
        fields = []
        for _, nf_list in ex_fields.items():
            assert len(nf_list) == 1
            fields.append(nf_list[0])

        super(Dataset, self).__init__(examples, fields, filter_pred)
Пример #13
0
    def __init__(self,
                 fields,
                 readers,
                 data,
                 dirs,
                 sort_key,
                 filter_pred=None):
        """Build a torchtext Dataset from parallel reader streams
        (OpenNMT-style), optionally attaching copy-attention data.

        Args:
            fields: dict mapping side names to torchtext fields.
            readers: one reader per data stream.
            data: (name, data) pairs, one per reader.
            dirs: directories handed to the matching reader.
            sort_key: callable used for bucketing by the iterators.
            filter_pred: optional predicate applied by the superclass.
        """
        self.sort_key = sort_key
        # Copy attention needs both auxiliary fields to be registered.
        can_copy = 'src_map' in fields and 'alignment' in fields
        read_iters = [
            r.read(dat[1], dat[0], dir_)
            for r, dat, dir_ in zip(readers, data, dirs)
        ]
        # self.src_vocabs is used in collapse_copy_scores and Translator.py
        self.src_vocabs = []
        examples = []
        # Each example is the union of one dict from every reader stream.
        for ex_dict in starmap(_join_dicts, zip(*read_iters)):
            if can_copy:
                src_field = fields['src']
                tgt_field = fields['tgt']
                # this assumes src_field and tgt_field are both text
                src_ex_vocab, ex_dict = _dynamic_dict(ex_dict,
                                                      src_field.base_field,
                                                      tgt_field.base_field)
                self.src_vocabs.append(src_ex_vocab)
            # Example.fromdict expects {key: [(name, field)]}.
            ex_fields = {
                k: [(k, v)]
                for k, v in fields.items() if k in ex_dict
            }
            ex = Example.fromdict(ex_dict, ex_fields)
            examples.append(ex)

        # fields needs to have only keys that examples have as attrs
        # NOTE(review): reuses ex_fields from the last loop iteration;
        # raises NameError if the dataset is empty.
        fields = []
        for _, nf_list in ex_fields.items():
            assert len(nf_list) == 1
            fields.append(nf_list[0])

        super(Dataset, self).__init__(examples, fields, filter_pred)
Пример #14
0
def make_examples(source, cvt):
    """Convert one reading-comprehension record into a list of torchtext
    Examples, one per candidate passage.

    Args:
        source: dict with 'question', 'documents', 'answers',
            'question_id' and 'question_type' keys.
        cvt: converter exposing ``convert(question, passage, ...)`` that
            returns BERT-style 'input', 'att_mask' and 'seg' arrays.

    Relies on module-level ``args`` (max_para_num, max_query_length,
    max_seq_length) and ``FIELDS``.
    """
    assert len(source['documents']) > 0
    sq = source['question'].strip()
    passage_list = []
    for doc in source['documents']:
        # Take the most related paragraph of each document as a passage.
        passage_list.append(doc['paragraphs'][doc['most_related_para']])
    ret = []
    for passage in passage_list[:args.max_para_num]:
        sample = cvt.convert(sq,
                             passage,
                             args.max_query_length,
                             args.max_seq_length,
                             to_tensor=False)
        (input_ids, input_mask,
         segment_ids) = sample['input'], sample['att_mask'], sample['seg']
        example = {
            'question_id': source['question_id'],
            'question_text': sq,
            'question_type': source['question_type'],
            'passage': passage,
            'answers': source['answers'],
            'input_ids': input_ids,
            'input_mask': input_mask,
            'segment_ids': segment_ids
        }
        # FIELDS is presumably a list of (name, field, ...) tuples keyed
        # here by their first element — confirm against its definition.
        ret.append(Example.fromdict(example, {t[0]: t for t in FIELDS}))
    return ret
Пример #15
0
    def __init__(self, fields, readers, data, dirs, sort_key,
                 filter_pred=None):
        """Assemble a Dataset by merging one dict per example from each
        reader stream, optionally attaching copy-attention data."""
        self.sort_key = sort_key
        copy_attn = 'src_map' in fields and 'alignment' in fields

        iters = [reader.read(datum[1], datum[0], directory)
                 for reader, datum, directory in zip(readers, data, dirs)]

        # self.src_vocabs is used in collapse_copy_scores and Translator.py
        self.src_vocabs = []
        examples = []
        for ex_dict in starmap(_join_dicts, zip(*iters)):
            if copy_attn:
                base_src = fields['src'].base_field
                base_tgt = fields['tgt'].base_field
                # this assumes src and tgt fields are both text
                ex_vocab, ex_dict = _dynamic_dict(ex_dict, base_src, base_tgt)
                self.src_vocabs.append(ex_vocab)
            ex_fields = {name: [(name, field)]
                         for name, field in fields.items()
                         if name in ex_dict}
            examples.append(Example.fromdict(ex_dict, ex_fields))

        # fields needs to have only keys that examples have as attrs;
        # note this reuses ex_fields from the final loop iteration.
        fields = []
        for name_field in ex_fields.values():
            assert len(name_field) == 1
            fields.append(name_field[0])

        super(Dataset, self).__init__(examples, fields, filter_pred)
Пример #16
0
    def __init__(self, fields, readers, data, dirs, filter_pred=None):
        """Build a dataset from reader streams (no sort_key variant).

        Args:
            fields: {key: [(name, field)]}-style dict of torchtext fields.
            readers: one reader per data stream.
            data: (name, data) pairs, one per reader.
            dirs: directories handed to the matching reader.
            filter_pred: optional predicate applied by the superclass.
        """
        # Copy attention needs both auxiliary fields to be registered.
        dynamic_dict = 'src_map' in fields and 'alignment' in fields

        read_iters = [
            r.read(dat[1], dat[0], dir_)
            for r, dat, dir_ in zip(readers, data, dirs)
        ]

        # self.src_vocabs is used in collapse_copy_scores and Translator.py
        self.src_vocabs = []
        examples = []
        for ex_dict in starmap(self._join_dicts, zip(*read_iters)):
            if dynamic_dict:
                # fields values are [(name, field)] lists in this variant.
                src_field = fields['src'][0][1]
                tgt_field = fields['tgt'][0][1]
                # this assumes src_field and tgt_field are both text
                src_vocab, ex_dict = self._dynamic_dict(
                    ex_dict, src_field.base_field, tgt_field.base_field)
                self.src_vocabs.append(src_vocab)
            ex_fields = {k: v for k, v in fields.items() if k in ex_dict}
            ex = Example.fromdict(ex_dict, ex_fields)
            examples.append(ex)

        # the dataset's self.fields should have the same attributes as examples
        # NOTE(review): reuses ex_fields from the last loop iteration;
        # raises NameError if the dataset is empty.
        fields = dict(chain.from_iterable(ex_fields.values()))

        super(DatasetBase, self).__init__(examples, fields, filter_pred)
Пример #17
0
    def __init__(self,
                 fields,
                 readers,
                 data,
                 dirs,
                 sort_key,
                 filter_pred=None,
                 pointers_file=None):
        """Build a Dataset with optional per-example copy pointers and an
        optional third stream of classification labels.

        Args:
            fields: dict mapping side names to torchtext fields.
            readers: readers for the first two streams (src, tgt).
            data: (name, data) pairs; a third entry supplies labels.
            dirs: directories handed to the matching reader.
            sort_key: callable used for bucketing by the iterators.
            filter_pred: optional predicate applied by the superclass.
            pointers_file: optional file with one pointer line per
                example, forwarded to _dynamic_dict when copying.
        """
        self.sort_key = sort_key
        # Copy attention needs both auxiliary fields to be registered.
        can_copy = 'src_map' in fields and 'alignment' in fields

        # Only the first two streams go through readers.
        read_iters = [
            r.read(dat[1], dat[0], dir_)
            for r, dat, dir_ in zip(readers[:2], data[:2], dirs[:2])
        ]
        # for the label we can directly read the element
        if len(readers) == 3:
            read_iters += [[{'label': i} for i in data[2][1]]]

        if pointers_file is not None:
            with open(pointers_file) as f:
                pointers = [line.strip() for line in f]

        # self.src_vocabs is used in collapse_copy_scores and Translator.py
        self.src_vocabs = []
        examples = []
        for i, ex_dict in enumerate(starmap(_join_dicts, zip(*read_iters))):
            if can_copy:
                ex_pointers = pointers[i] if pointers_file is not None else None

                src_field = fields['src']
                tgt_field = fields['tgt']
                # this assumes src_field and tgt_field are both text
                src_ex_vocab, ex_dict = _dynamic_dict(ex_dict,
                                                      src_field.base_field,
                                                      tgt_field.base_field,
                                                      pointers=ex_pointers)
                self.src_vocabs.append(src_ex_vocab)
            # Example.fromdict expects {key: [(name, field)]}.
            ex_fields = {
                k: [(k, v)]
                for k, v in fields.items() if k in ex_dict
            }
            ex = Example.fromdict(ex_dict, ex_fields)
            examples.append(ex)

        # fields needs to have only keys that examples have as attrs
        # NOTE(review): reuses ex_fields from the last loop iteration;
        # raises NameError if the dataset is empty.
        fields = []
        for _, nf_list in ex_fields.items():
            assert len(nf_list) == 1
            fields.append(nf_list[0])

        super(Dataset, self).__init__(examples, fields, filter_pred)
Пример #18
0
    def load_dataset(self):
        """Materialize the data-dict generator into a torchtext Dataset."""
        fields = self._get_fields()

        examples = []
        for data_dict in self._data_dict_generator():
            examples.append(Example.fromdict(data_dict, fields))

        # Example.fromdict takes {key: (name, field)}; the Dataset
        # constructor wants {name: field}.
        field_dict = dict(fields.values())
        return Dataset(examples, field_dict)
Пример #19
0
def datapoint2example(datapoint, cws=False):
    """Wrap a raw datapoint dict in a torchtext Example.

    The text is numericalized with the word-segmented field (TEXT_cws)
    when ``cws`` is true, otherwise with the plain TEXT field.
    """
    text_field = TEXT_cws if cws else TEXT
    field_map = {
        "author": ("author", AUTHOR),
        "book": ("book", BOOK),
        "text": ("text", text_field),
    }
    return Example.fromdict(datapoint, fields=field_map)
Пример #20
0
 def _to_examples(self, bucket, is_train=False):
     """Convert a bucket of raw items into torchtext Examples, skipping
     items that ``_process`` rejects (returns None)."""
     converted = []
     for raw_item in bucket:
         processed = self._process(raw_item, is_train=is_train)
         if processed is None:
             continue
         enriched = self._maybe_add_dynamic_dict(
             processed, self.fields_dict)
         item_fields = {name: [(name, field)]
                        for name, field in self.fields_dict.items()
                        if name in enriched}
         converted.append(
             TorchtextExample.fromdict(enriched, item_fields))
     return converted
Пример #21
0
    def __init__(self,
                 fields,
                 readers,
                 data,
                 dirs,
                 sort_key,
                 filter_pred=None):
        """Build a Dataset, additionally splitting multi-word target
        tokens that do not occur verbatim in the source sequence.

        Args:
            fields: dict mapping side names to torchtext fields.
            readers: one reader per data stream.
            data: (name, data) pairs, one per reader.
            dirs: directories handed to the matching reader.
            sort_key: callable used for bucketing by the iterators.
            filter_pred: optional predicate applied by the superclass.
        """
        self.sort_key = sort_key
        can_copy = 'src_map' in fields and 'alignment' in fields

        read_iters = [
            r.read(dat[1], dat[0], dir_)
            for r, dat, dir_ in zip(readers, data, dirs)
        ]

        # self.src_vocabs is used in collapse_copy_scores and Translator.py
        self.src_vocabs = []
        examples = []
        for ex_dict in starmap(_join_dicts, zip(*read_iters)):
            if can_copy:
                src_field = fields['src']
                tgt_field = fields['tgt']
                # this assumes src_field and tgt_field are both text
                src_ex_vocab, ex_dict = _dynamic_dict(ex_dict,
                                                      src_field.base_field,
                                                      tgt_field.base_field)
                self.src_vocabs.append(src_ex_vocab)
            ex_fields = {
                k: [(k, v)]
                for k, v in fields.items() if k in ex_dict
            }
            ex = Example.fromdict(ex_dict, ex_fields)
            if 'tgt' in ex_dict.keys():
                # Split any target "word" containing spaces into separate
                # tokens, unless it occurs verbatim in the source.
                ex_tgt = []
                for word in getattr(ex, 'tgt')[0]:
                    if word not in getattr(ex, 'src')[0]:
                        ex_tgt += word.split(' ')
                    else:
                        ex_tgt.append(word)
                # Bug fix: the original deleted items from ex_tgt while
                # enumerating it, which skips the element following each
                # deletion (so consecutive '' tokens survived). Filter
                # out empties in one pass instead.
                ex_tgt = [word for word in ex_tgt if word != '']
                setattr(ex, 'tgt', [ex_tgt])
            examples.append(ex)

        # fields needs to have only keys that examples have as attrs
        # NOTE(review): reuses ex_fields from the last loop iteration;
        # raises NameError if the dataset is empty.
        fields = []
        for _, nf_list in ex_fields.items():
            assert len(nf_list) == 1
            fields.append(nf_list[0])

        super(Dataset, self).__init__(examples, fields, filter_pred)
Пример #22
0
    def _get_data(self, file):
        """Read a jsonlines file and wrap each record as an Example with
        context (title + passage), question and answer attributes."""
        examples = []

        with jsonlines.open(file) as json_lines:
            for record in json_lines:
                ex_dict = {
                    "context": record["title"] + ' ' + record["passage"],
                    "question": record["question"],
                    "answer": record["answer"],
                }
                ex = Example.fromdict(ex_dict, fields=self.dict_fields)
                examples.append(ex)

        return Dataset(examples, self.fields)
    def __init__(self,
                 fields,
                 readers,
                 data,
                 dirs,
                 sort_key,
                 filter_pred=None,
                 tgt_type=None):
        """Build a keyphrase-generation dataset.

        Args:
            fields: dict mapping side names to torchtext fields.
            readers: one reader per data stream.
            data: (name, data) pairs, one per reader.
            dirs: directories handed to the matching reader.
            sort_key: callable used for bucketing by the iterators.
            filter_pred: optional predicate applied by the superclass.
            tgt_type: how multiple target phrases are arranged; one of
                [one2one, original, random, verbatim] per the note below.
        """
        # this is set at line 594 in inputter.py and line 303 in translator.py
        self.tgt_type = tgt_type
        # concatenate multiple tgt sequences with <sep> or keep them separate as a list of seqs (2D tensor)
        self.concat_tgt = False
        self.sort_key = sort_key

        # will be specified before training, one of [one2one, original, random, verbatim]

        # build src_map/alignment no matter field is available
        can_copy = True

        read_iters = [
            r.read(dat[1], dat[0], dir_)
            for r, dat, dir_ in zip(readers, data, dirs)
        ]

        # self.src_vocabs is used in collapse_copy_scores and Translator.py
        self.src_vocabs = []
        examples = []
        for ex_dict in starmap(_join_dicts, zip(*read_iters)):
            if can_copy:
                src_field = fields['src']
                tgt_field = fields['tgt']
                # this assumes src_field and tgt_field are both text
                src_ex_vocab, ex_dict = _dynamic_dict(ex_dict,
                                                      src_field.base_field,
                                                      tgt_field.base_field)
                self.src_vocabs.append(src_ex_vocab)
            # Example.fromdict expects {key: [(name, field)]}.
            ex_fields = {
                k: [(k, v)]
                for k, v in fields.items() if k in ex_dict
            }
            ex = Example.fromdict(ex_dict, ex_fields)
            examples.append(ex)

        # fields needs to have only keys that examples have as attrs
        # NOTE(review): reuses ex_fields from the last loop iteration;
        # raises NameError if the dataset is empty.
        fields = []
        for _, nf_list in ex_fields.items():
            assert len(nf_list) == 1
            fields.append(nf_list[0])

        super(KeyphraseDataset, self).__init__(examples, fields, filter_pred)
Пример #24
0
    def __init__(self, fields, path, bpe_path, filter_pred=None, lang_src=False,
                 high_oversampling=1, low_oversampling=1):
        """Load a SIGMORPHON tsv file plus a parallel BPE-segmentation file.

        Args:
            fields: dict of torchtext field associations; NOTE: mutated in
                place — 'tgt' is popped when the data has no target column.
            path: data file path (or a sequence whose first element is used).
            bpe_path: file whose lines give '|'-separated BPE splits of src.
            filter_pred: passed through to the Dataset superclass.
            lang_src: if true, prepend the language token to the source.
            high_oversampling: repetition count for "high"-setting files.
            low_oversampling: repetition count for "low"-setting files.
        """
        examples = []
        if not isinstance(path, str):
            path = path[0]
        language = lang_name(path) if 'language' in fields else None
        setting = data_setting(path)

        # Bug fix: the original opened both files without ever closing
        # them; use context managers so the handles are released.
        with open(path) as f, open(bpe_path) as f_bpe:
            for line, line_bpe in zip(f, f_bpe):
                ex_dict = dict()
                if language is not None:
                    ex_dict["language"] = language
                line_fields = line.strip().split('\t')
                if len(line_fields) == 3:
                    src, tgt, inflection = line_fields
                    ex_dict['tgt'] = tgt
                else:
                    src, inflection = line_fields
                    # Drop 'tgt' so Example.fromdict doesn't expect it.
                    fields.pop("tgt", None)
                if "inflection" in fields:
                    ex_dict["src"] = src
                    ex_dict["inflection"] = inflection
                else:
                    # Fold the inflection tags into the source sequence.
                    respaced_inflection = " ".join(inflection.split(";"))
                    respaced_src = " ".join(
                        [c if c != " " else "<space>" for c in src])
                    src_seq = []
                    if language is not None and lang_src:
                        src_seq.append(language)
                    src_seq.extend([respaced_inflection, respaced_src])
                    ex_dict["src"] = " ".join(src_seq)
                bpe_src = line_bpe.strip().split('|')
                ex_dict["word_split"] = [len(split) for split in bpe_src]
                ex = Example.fromdict(ex_dict, fields)
                # Oversample by repeating the (shared) example object.
                reps = (low_oversampling if setting == "low"
                        else high_oversampling)
                examples.extend([ex] * reps)
        # Flatten the field-association values into a {name: field} dict.
        fields = dict(chain.from_iterable(fields.values()))
        super(SigmorphonDatasetBPE, self).__init__(examples, fields, filter_pred)
Пример #25
0
 def _deserialize_example(cls, serialized, fields):
     """Rebuild a torchtext Example from its serialized dict form.

     Returns:
         (example, src_vocab) where src_vocab is None when the serialized
         dict carried no 'src_vocab' entry.
     """
     src_vocab = None
     ex_dict = {}
     for key, value in serialized.items():
         if key == 'src_vocab':
             src_vocab = MiniVocab(value)
             continue
         # src_map / alignment were stored as plain lists; restore tensors.
         if key in ('src_map', 'alignment'):
             ex_dict[key] = torch.tensor(value)
         else:
             ex_dict[key] = value
     ex_fields = {key: [(key, field)]
                  for key, field in fields.items() if key in ex_dict}
     return Example.fromdict(ex_dict, ex_fields), src_vocab
Пример #26
0
def make_examples(df: pd.DataFrame):
    """Group per-token rows into sentences and return a train/valid split.

    Each sentence (rows sharing a "sent" value) becomes one Example whose
    'word' and 'tag' attributes are the lists of tokens/tags in order.
    Relies on the module-level ``tag_field`` / ``text_field`` fields.
    """
    fromdict_fields = {'tag': ('tag', tag_field),
                       'word': ('word', text_field)}

    # Collapse token-level rows into one list-valued row per sentence.
    sentences = df.groupby(["sent"]).agg({"word": list, "tag": list})

    examples = [Example.fromdict(sent_row, fromdict_fields)
                for _, sent_row in tqdm(sentences.iterrows())]

    dataset_fields = [('tag', tag_field), ('word', text_field)]
    return Dataset(examples, fields=dataset_fields).split()
Пример #27
0
def build_examples(data: List[Dict[str, str]], src_lang: str, dest_lang: str,
                   logger: Logger) -> Tuple[List[Example], Field, Field]:
    """Turn raw {src_lang: text, dest_lang: text} pairs into Examples.

    Builds one spacy-tokenized, lowercasing Field per language — the
    target side additionally gets <sos>/<eos> markers — and returns the
    examples together with both fields (still unbuilt vocabularies).
    """
    logger.info('BUILD EXAMPLES')
    src_field = Field(lower=True, tokenize='spacy',
                      tokenizer_language=src_lang, include_lengths=True)
    dest_field = Field(init_token='<sos>', eos_token='<eos>', lower=True,
                       tokenize='spacy', tokenizer_language=dest_lang,
                       include_lengths=True)
    # Map each pair's language-named keys onto the 'src'/'dest' attrs.
    pair_fields = {
        f'{src_lang}': ('src', src_field),
        f'{dest_lang}': ('dest', dest_field),
    }
    examples = [Example.fromdict(data=pair, fields=pair_fields)
                for pair in tqdm.tqdm(data)]
    logger.info(f'Number of examples: {len(examples):,}')
    return examples, src_field, dest_field
Пример #28
0
def read_data(X, y, SRC, TRG, preprocess=None, limit=1000):
    """Build a torchtext Dataset of (text, summary) pairs.

    Args:
        X: source input consumed by LineSentenceGenerator (texts).
        y: target input consumed by LineSentenceGenerator (summaries).
        SRC: torchtext field for the 'text' attribute.
        TRG: torchtext field for the 'summ' attribute.
        preprocess: optional preprocessing hook forwarded to both
            generators.
        limit: maximum number of examples to read; None reads all.

    Returns:
        Dataset whose examples carry 'text' and 'summ' attributes.
    """
    fields = {'text-tokens': ('text', SRC),
              'summ-tokens': ('summ', TRG)}

    examples = []
    # Loop variables renamed: the original loop rebound `y`, shadowing
    # the parameter of the same name.
    pairs = zip(LineSentenceGenerator(X, preprocess),
                LineSentenceGenerator(y, preprocess))
    for i, (text_tokens, summ_tokens) in enumerate(pairs):
        # Was `i > limit`, which produced limit + 1 examples (off-by-one).
        if limit is not None and i >= limit:
            break
        examples.append(Example.fromdict(
            {"text-tokens": text_tokens, "summ-tokens": summ_tokens},
            fields=fields))

    return Dataset(examples, fields=[('text', SRC), ('summ', TRG)])
Пример #29
0
    def __init__(self,
                 fields,
                 readers,
                 data,
                 dirs,
                 sort_key,
                 filter_pred=None):
        """Join the per-source reader streams into torchtext examples.

        Args:
            fields: dict mapping attribute name -> torchtext field.
            readers: one reader object per data source.
            data: list of (name, payload) pairs, one per reader.
            dirs: per-reader directory arguments.
            sort_key: callable used by torchtext for bucketing.
            filter_pred: optional predicate; failing examples are dropped.
        """
        self.sort_key = sort_key
        # The copy mechanism needs both the src_map and alignment fields
        # to have been requested.
        can_copy = 'src_map' in fields and 'alignment' in fields

        read_iters = [
            r.read(dat[1], dat[0], dir_)
            for r, dat, dir_ in zip(readers, data, dirs)
        ]

        # self.src_vocabs is used in collapse_copy_scores and Translator.py
        self.src_vocabs = []
        examples = []
        # starmap(func, seq) yields func(*seq[0]), func(*seq[1]), ...
        # e.g. starmap(pow, [(2,5), (3,2), (10,3)]) --> 32 9 1000
        # so each ex_dict merges one item from every reader stream.
        for ex_dict in starmap(_join_dicts, zip(*read_iters)):
            if can_copy:
                src_field = fields['src']
                tgt_field = fields['tgt']
                # this assumes src_field and tgt_field are both text
                src_ex_vocab, ex_dict = _dynamic_dict(ex_dict,
                                                      src_field.base_field,
                                                      tgt_field.base_field)
                self.src_vocabs.append(src_ex_vocab)
            # fromdict format: {name: [(name, field)]}, restricted to
            # attributes present in this example.
            ex_fields = {
                k: [(k, v)]
                for k, v in fields.items() if k in ex_dict
            }
            ex = Example.fromdict(ex_dict, ex_fields)
            examples.append(ex)

        # fields needs to have only keys that examples have as attrs
        # NOTE(review): ex_fields is left over from the LAST loop
        # iteration; if the readers yield no examples this raises
        # NameError — confirm callers never pass empty data.
        fields = []
        for _, nf_list in ex_fields.items():
            assert len(nf_list) == 1
            fields.append(nf_list[0])

        super(Dataset, self).__init__(examples, fields, filter_pred)
Пример #30
0
    def __init__(self,
            src_types: List[str],
            fields,
            readers: List,
            data: List[Tuple[str, Any]],
            dirs: List[str],
            sort_key,
            filter_pred=None,
            can_copy: bool = False,
    ):
        """Build a multi-source dataset: one reader stream per source
        channel, joined into a single example per item.

        Args:
            src_types: names of the source channels; each is looked up in
                `fields` under the key 'src.<type>' when can_copy is set.
            fields: dict mapping attribute name -> torchtext field.
            readers: one reader per data stream.
            data: (name, payload) pairs, one per reader.
            dirs: per-reader directory arguments.
            sort_key: callable used by torchtext for bucketing.
            filter_pred: optional predicate; failing examples are dropped.
            can_copy: enable the copy-attention dynamic dictionary.
        """
        self.sort_key = sort_key

        read_iters = [r.read(dat[1], dat[0], dir_) for r, dat, dir_
                      in zip(readers, data, dirs)]

        # self.src_vocabs is used in collapse_copy_scores and Translator.py
        self.src_vocabs = []
        examples = []
        # Each ex_dict merges one item from every reader stream.
        for ex_dict in starmap(_join_dicts, zip(*read_iters)):
            if can_copy:
                tgt_field = fields['tgt']
                # One base field per source channel feeds the dynamic dict.
                src_types_fields = {
                    src_type: fields[f"src.{src_type}"].base_field
                    for src_type in src_types
                }
                src_ex_vocab, ex_dict = _dynamic_dict(
                    ex_dict, src_types, src_types_fields, tgt_field.base_field)
                self.src_vocabs.append(src_ex_vocab)
            # end if
            # fromdict format: {name: [(name, field)]}, restricted to
            # attributes present in this example.
            ex_fields = {k: [(k, v)] for k, v in fields.items() if
                         k in ex_dict}
            ex = Example.fromdict(ex_dict, ex_fields)
            examples.append(ex)

        # fields needs to have only keys that examples have as attrs
        # NOTE(review): ex_fields is left over from the LAST loop
        # iteration; if the readers yield no examples this raises
        # NameError — confirm callers never pass empty data.
        fields = []
        for _, nf_list in ex_fields.items():
            assert len(nf_list) == 1
            fields.append(nf_list[0])

        super(MultiSourceAPDataset, self).__init__(examples, fields, filter_pred)
Пример #31
0
    def __init__(self, root_path, img_dir, filename, fields, train, **kwargs):
        """Load a pickled caption/feature dump and wrap it as a dataset.

        Expects the pickle to hold {'captions': [...], 'features': {...}};
        each caption entry becomes one Example.  Raw-image loading for
        random cropping is enabled via kwargs['rand_crop'].
        """
        pkl_path = os.path.join(root_path, filename)
        with open(pkl_path, 'rb') as f:
            # NOTE(review): pickle.load trusts whoever produced this file.
            data = pickle.load(f)

        rand_crop = bool(kwargs.get('rand_crop'))
        # Random-crop preprocessing only applies while training.
        self.img_transform = preprocess_rc if (train and rand_crop) else preprocess_1c
        self.train = train
        self.cap_field = fields['caption'][1]

        raw_examples = []
        for caption_id, entry in enumerate(data['captions']):
            image_id = entry['image_id']
            raw_examples.append({
                'image_id': image_id,
                # Only keep a loadable path when cropping needs raw images.
                'img_to_load': (os.path.join(root_path, img_dir,
                                             entry['image_path'])
                                if rand_crop else None),
                'img_1c_feat': torch.Tensor(data['features'][image_id]),
                'caption': entry['caption'],
                'caption_id': caption_id,
            })

        examples = [Example.fromdict(d, fields) for d in raw_examples]
        super(ImageCaptionDataset, self).__init__(examples, fields.values())
Пример #32
0
    def __init__(self, fields, path, filter_pred=None, lang_src=False):
        """Read one or more SIGMORPHON-style tsv files into a dataset.

        Each non-empty line is 'src<TAB>trg<TAB>inflection' (labelled) or
        'src<TAB>inflection' (covered test data).  The inflection tags and
        the character-separated src are concatenated into a single 'src'
        string, optionally prefixed with the language name.

        Args:
            fields: dict mapping attribute name -> list of (name, field)
                pairs, as expected by Example.fromdict.
            path: a single file path or a list of paths.
            filter_pred: optional predicate; failing examples are dropped.
            lang_src: prepend the language token to the source sequence.
        """
        if isinstance(path, str):
            path = [path]
        examples = []
        for p in path:
            with open(p) as f:
                # Language is inferred from the filename, but only when a
                # 'language' field was requested.
                language = lang_name(p) if 'language' in fields else None

                for line in f:
                    line = line.strip()
                    if line:
                        ex_dict = dict()
                        if language is not None:
                            ex_dict["language"] = language
                        line_fields = line.strip().split('\t')
                        if len(line_fields) == 3:
                            src, trg, inflection = line_fields
                            ex_dict['trg'] = trg
                        else:
                            # NOTE(review): mutates the caller's `fields`
                            # dict mid-loop; a 2-column line permanently
                            # drops 'trg' for all later lines and files.
                            # (Original comment: "hmm".)
                            fields.pop("trg", None)

                        # kludgey stuff for handling inflections:
                        # semicolon-separated tags become space-separated
                        # tokens.
                        respaced_inflection = " ".join(inflection.split(";"))
                        # Split src into characters, encoding real spaces
                        # as a <space> token.
                        respaced_src = " ".join(
                            [c if c != " " else "<space>" for c in src])
                        src_seq = []
                        if language is not None and lang_src:
                            src_seq.append(language)
                        src_seq.extend([respaced_inflection, respaced_src])

                        ex_dict["src"] = " ".join(src_seq)

                        ex = Example.fromdict(ex_dict, fields)
                        examples.append(ex)

        # Flatten {name: [(name, field), ...]} into {name: field} for the
        # Dataset constructor.
        fields = dict(chain.from_iterable(fields.values()))
        super(SimpleSigmorphonDataset, self).__init__(examples, fields,
                                                      filter_pred)