Example #1

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
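
A quick, hedged illustration of how these two helpers are typically used together; nn.Linear stands in for whatever model the snippet actually trains:

import time
import torch.nn as nn

model = nn.Linear(10, 2)  # hypothetical stand-in for the real model
print(f'The model has {count_parameters(model):,} trainable parameters')

start_time = time.time()
# ... run one training epoch here ...
end_time = time.time()
epoch_mins, epoch_secs = epoch_time(start_time, end_time)
print(f'Epoch time: {epoch_mins}m {epoch_secs}s')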


SRC = Field(tokenize=tokenize_en,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True,
            batch_first=True)

TRG = Field(tokenize=tokenize_en,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True,
            batch_first=True)

print('Loading Data...')
data_train, data_val, data_test = TabularDataset.splits(
    path='data/',
    format='json',
    train='cnn_dataset_train.json',
    validation='cnn_dataset_valid.json',
Example #2
    def __init__(self, boxes_and_transcripts_file: Path, image_file: Path, label_file: Path,
                 entities_list: List[str], resized_image_size: Tuple[int, int] = (480, 960),
                 iob_tagging_type: str = 'box_level', entities_file: Path = None, training: bool = True,
                 image_index=None):
        '''
        An item returned by dataset.

        :param boxes_and_transcripts_file: ground-truth or OCR results file
        :param image_file: whole image file
        :param entities_list: list of entity names
        :param resized_image_size: size (w, h) to which the whole image is resized
        :param iob_tagging_type: 'box_level', 'document_level', 'box_and_within_box_level'
        :param entities_file: JSON file with the exact entity type and entity value of each document
        :param training: True for train and validation mode, False for test mode. True will also load labels,
        and entities_file must be set.
        :param image_index: image index, used to get image file name
        '''
        # text string label converter
        self.text_segments_field = Field(sequential=True, use_vocab=True, include_lengths=True, batch_first=True)
        self.text_segments_field.vocab = vocab_cls['keys']
        # iob string label converter
        self.iob_tags_field = Field(sequential=True, is_target=True, use_vocab=True, batch_first=True)
        self.iob_tags_field.vocab = vocab_cls['iob_labels']
        self.resized_image_size = resized_image_size
        self.training = training
        assert iob_tagging_type in ['box_level', 'document_level', 'box_and_within_box_level'], \
            'iob tagging type {} is not supported'.format(iob_tagging_type)
        self.iob_tagging_type = iob_tagging_type

        # Keep the image filename around for easier debugging.
        self.image_filename = image_file.as_posix()

        try:
            # read boxes, transcripts, and entity types of boxes in one document from the boxes_and_transcripts file
            # each TSV line matches the pattern: index,x1,y1,x2,y2,x3,y3,x4,y4,transcript,type
            # data format as [(index, points, transcription, entity_type)...]
            if self.training:
                # boxes_and_transcripts_data = [(index, [x1, y1, ...], transcript, entity_type), ...]
                boxes_and_transcripts_data = read_gt_file_with_box_entity_type(boxes_and_transcripts_file.as_posix())
            else:
                boxes_and_transcripts_data = read_ocr_file_without_box_entity_type(
                    boxes_and_transcripts_file.as_posix())

            # Sort the box based on the position.
            # boxes_and_transcripts_data = sort_box_with_list(boxes_and_transcripts_data)

            # read image
            image = cv2.imread(image_file.as_posix())
            label = pd.read_csv(label_file.as_posix(), sep='\n', header=None)[0].to_list()
        except Exception as e:
            raise IOError('Error occurred in image {}: {}'.format(image_file.stem, e.args))

        boxes, transcripts, box_entity_types = [], [], []
        if self.training:
            for index, points, transcript, entity_type in boxes_and_transcripts_data:
                if len(transcript) == 0:
                    transcript = ' '
                boxes.append(points)
                transcripts.append(transcript)
                box_entity_types.append(entity_type)
        else:
            for index, points, transcript in boxes_and_transcripts_data:
                if len(transcript) == 0:
                    transcript = ' '
                boxes.append(points)
                transcripts.append(transcript)

        # Limit the number of boxes and number of transcripts to process.
        boxes_num = min(len(boxes), MAX_BOXES_NUM)
        transcript_len = min(max([len(t) for t in transcripts[:boxes_num]]), MAX_TRANSCRIPT_LEN)
        mask = np.zeros((boxes_num, transcript_len), dtype=int)

        relation_features = np.zeros((boxes_num, boxes_num, 6))

        try:

            height, width, _ = image.shape

            # resize image
            image = cv2.resize(image, self.resized_image_size, interpolation=cv2.INTER_LINEAR)
            x_scale = self.resized_image_size[0] / width
            y_scale = self.resized_image_size[1] / height

            # get the min-area rect for each (original) box, used to calculate the initial relation features
            min_area_boxes = [cv2.minAreaRect(np.array(box, dtype=np.float32).reshape(4, 2)) for box in
                              boxes[:boxes_num]]

            # calculate resized image box coordinate, and initial relation features between boxes (nodes)
            resized_boxes = []
            for i in range(boxes_num):
                box_i = boxes[i]
                transcript_i = transcripts[i]

                # get the resized image's box coordinates, used for ROIAlign in the Encoder layer
                resized_box_i = [int(np.round(pos * x_scale)) if j % 2 == 0 else int(np.round(pos * y_scale))
                                 for j, pos in enumerate(box_i)]

                # resized_rect_output_i = cv2.minAreaRect(np.array(resized_box_i, dtype=np.float32).reshape(4, 2))
                # resized_box_i = cv2.boxPoints(resized_rect_output_i)
                resized_box_i = np.array(resized_box_i).reshape((8,))
                resized_boxes.append(resized_box_i)

                # enumerate each box, calculate relation features between i and other nodes.
                # formula (9)
                self.relation_features_between_ij_nodes(boxes_num, i, min_area_boxes, relation_features, transcript_i,
                                                        transcripts)

            relation_features = normalize_relation_features(relation_features, width=width, height=height)
            # The length of texts of each segment.
            text_segments = [list(trans) for trans in transcripts[:boxes_num]]

            if self.training:
                # assign IOB labels to the input text via exact matching; this process needs entity-level labels
                if self.iob_tagging_type != 'box_level':
                    with entities_file.open() as f:
                        entities = json.load(f)

                if self.iob_tagging_type == 'box_level':
                    # convert each box's transcript to IOB labels, using the entity type of the corresponding box
                    iob_tags_label = text2iob_label_with_box_level_match(box_entity_types[:boxes_num],
                                                                         transcripts[:boxes_num],
                                                                         entities_list=entities_list)
                elif self.iob_tagging_type == 'document_level':
                    # convert transcripts to IOB labels with the document-level tagging match method; all transcripts
                    # are concatenated into one sequence
                    iob_tags_label = text2iob_label_with_document_level_exactly_match(transcripts[:boxes_num], entities,
                                                                                      entities_list=entities_list)

                elif self.iob_tagging_type == 'box_and_within_box_level':
                    # perform exact tagging within specific boxes; the listed box-level entities (here ['address']) are tagged at box level.
                    iob_tags_label = text2iob_label_with_box_and_within_box_exactly_level(box_entity_types[:boxes_num],
                                                                                          transcripts[:boxes_num],
                                                                                          entities, ['address'],
                                                                                          entities_list=entities_list)

                iob_tags_label = self.iob_tags_field.process(iob_tags_label)[:, :transcript_len].numpy()
                box_entity_types = [vocab_cls['entities'].stoi[t] for t in box_entity_types[:boxes_num]]

            # texts shape is (num_texts, max_texts_len), texts_len shape is (num_texts,)
            texts, texts_len = self.text_segments_field.process(text_segments)
            texts = texts[:, :transcript_len].numpy()
            texts_len = np.clip(texts_len.numpy(), 0, transcript_len)
            text_segments = (texts, texts_len)

            for i in range(boxes_num):
                mask[i, :texts_len[i]] = 1

            self.whole_image = RawField().preprocess(image)
            self.text_segments = self.text_segments_field.preprocess(text_segments)  # (text, texts_len)
            self.boxes_coordinate = RawField().preprocess(resized_boxes)
            self.relation_features = RawField().preprocess(relation_features)
            self.mask = RawField().preprocess(mask)
            self.boxes_num = RawField().preprocess(boxes_num)
            self.transcript_len = RawField().preprocess(transcript_len)  # max transcript len of current document
            if self.training:
                self.iob_tags_label = self.iob_tags_field.preprocess(iob_tags_label)
            else:
                self.image_index = RawField().preprocess(image_index)
                self.label = RawField().preprocess(label)

        except Exception as e:
            raise RuntimeError('Error occurred in image {}: {}'.format(boxes_and_transcripts_file.stem, e.args))
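
For orientation, a hedged sketch of constructing one of these dataset items; the class name Document, the paths, and the entity list are placeholders, not taken from the snippet:

from pathlib import Path

doc = Document(
    boxes_and_transcripts_file=Path('data/boxes_and_transcripts/0001.tsv'),
    image_file=Path('data/images/0001.jpg'),
    label_file=Path('data/labels/0001.txt'),
    entities_list=['company', 'date', 'address', 'total'],
    resized_image_size=(480, 960),
    iob_tagging_type='box_level',
    entities_file=Path('data/entities/0001.json'),
    training=True,
)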
Example #3
def test_single_gpu_batch_parse():
    trainer = Trainer(gpus=1)

    # non-transferrable types
    primitive_objects = [
        None, {}, [], 1.0, "x", [None, 2], {
            "x": (1, 2),
            "y": None
        }
    ]
    for batch in primitive_objects:
        data = trainer.accelerator.batch_to_device(batch,
                                                   torch.device('cuda:0'))
        assert data == batch

    # batch is just a tensor
    batch = torch.rand(2, 3)
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))
    assert batch.device.index == 0 and batch.type() == 'torch.cuda.FloatTensor'

    # tensor list
    batch = [torch.rand(2, 3), torch.rand(2, 3)]
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))
    assert batch[0].device.index == 0 and batch[0].type() == 'torch.cuda.FloatTensor'
    assert batch[1].device.index == 0 and batch[1].type() == 'torch.cuda.FloatTensor'

    # tensor list of lists
    batch = [[torch.rand(2, 3), torch.rand(2, 3)]]
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))
    assert batch[0][0].device.index == 0 and batch[0][0].type() == 'torch.cuda.FloatTensor'
    assert batch[0][1].device.index == 0 and batch[0][1].type() == 'torch.cuda.FloatTensor'

    # tensor dict
    batch = [{'a': torch.rand(2, 3), 'b': torch.rand(2, 3)}]
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))
    assert batch[0]['a'].device.index == 0 and batch[0]['a'].type() == 'torch.cuda.FloatTensor'
    assert batch[0]['b'].device.index == 0 and batch[0]['b'].type() == 'torch.cuda.FloatTensor'

    # tuple of tensor list and list of tensor dict
    batch = ([torch.rand(2, 3) for _ in range(2)], [{
        'a': torch.rand(2, 3),
        'b': torch.rand(2, 3)
    } for _ in range(2)])
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))
    assert batch[0][0].device.index == 0 and batch[0][0].type() == 'torch.cuda.FloatTensor'

    assert batch[1][0]['a'].device.index == 0
    assert batch[1][0]['a'].type() == 'torch.cuda.FloatTensor'

    assert batch[1][0]['b'].device.index == 0
    assert batch[1][0]['b'].type() == 'torch.cuda.FloatTensor'

    # namedtuple of tensor
    BatchType = namedtuple('BatchType', ['a', 'b'])
    batch = [
        BatchType(a=torch.rand(2, 3), b=torch.rand(2, 3)) for _ in range(2)
    ]
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))
    assert batch[0].a.device.index == 0
    assert batch[0].a.type() == 'torch.cuda.FloatTensor'

    # non-Tensor that has `.to()` defined
    class CustomBatchType:
        def __init__(self):
            self.a = torch.rand(2, 2)

        def to(self, *args, **kwargs):
            self.a = self.a.to(*args, **kwargs)
            return self

    batch = trainer.accelerator.batch_to_device(CustomBatchType(),
                                                torch.device('cuda:0'))
    assert batch.a.type() == 'torch.cuda.FloatTensor'

    # torchtext.data.Batch
    samples = [{
        'text': 'PyTorch Lightning is awesome!',
        'label': 0
    }, {
        'text': 'Please make it work with torchtext',
        'label': 1
    }]

    text_field = Field()
    label_field = LabelField()
    fields = {'text': ('text', text_field), 'label': ('label', label_field)}

    examples = [Example.fromdict(sample, fields) for sample in samples]
    dataset = Dataset(examples=examples, fields=fields.values())

    # Batch runs field.process() that numericalizes tokens, but it requires to build dictionary first
    text_field.build_vocab(dataset)
    label_field.build_vocab(dataset)

    batch = Batch(data=examples, dataset=dataset)
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))

    assert batch.text.type() == 'torch.cuda.LongTensor'
    assert batch.label.type() == 'torch.cuda.LongTensor'
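
The test above exercises Lightning's recursive batch transfer. As a framework-free sketch (an illustration under stated assumptions, not Lightning's actual implementation), the same behaviour can be reproduced with a small recursive helper:

import torch

def move_to_device(obj, device):
    """Recursively move tensors (and objects exposing .to()) to a device."""
    if isinstance(obj, torch.Tensor) or hasattr(obj, 'to'):
        return obj.to(device)
    if isinstance(obj, dict):
        return {k: move_to_device(v, device) for k, v in obj.items()}
    if isinstance(obj, tuple) and hasattr(obj, '_fields'):  # namedtuple
        return type(obj)(*(move_to_device(v, device) for v in obj))
    if isinstance(obj, (list, tuple)):
        return type(obj)(move_to_device(v, device) for v in obj)
    return obj  # non-transferable types are returned unchanged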
Example #4
parser.add_argument('--num_layers',
                    type=int,
                    default=3,
                    metavar='NL',
                    help='number of model tail layers (default: 3)')
parser.add_argument('--vectors',
                    type=str,
                    default="glove.6B.100d",
                    metavar='PV',
                    help='pretrained vectors model (default: glove.6B.100d)')

args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()

tokenize = lambda x: x.split()
TEXT = Field(sequential=True, tokenize=tokenize, lower=True)
LABEL = Field(sequential=False, use_vocab=False)

tv_datafields = [("qid", None), ("question_text", TEXT), ("target", LABEL)]

trn, vld = TabularDataset.splits(path=TRAIN_DIR,
                                 train='train.csv',
                                 validation="val.csv",
                                 format='csv',
                                 skip_header=True,
                                 fields=tv_datafields)

tst_datafields = [("qid", None), ("question_text", TEXT)]

tst = TabularDataset(path=TEST_CSV,
                     format='csv',
Example #5
def get_fields(src_data_type,
               n_src_feats,
               n_tgt_feats,
               pad='<blank>',
               bos='<s>',
               eos='</s>',
               dynamic_dict=False,
               with_align=False,
               src_truncate=None,
               tgt_truncate=None):
    """
    Args:
        src_data_type: type of the source input. Options are [text|img|audio|vec].
        n_src_feats (int): the number of source features (not counting tokens)
            to create a :class:`torchtext.data.Field` for. (If
            ``src_data_type=="text"``, these fields are stored together
            as a ``TextMultiField``).
        n_tgt_feats (int): See above.
        pad (str): Special pad symbol. Used on src and tgt side.
        bos (str): Special beginning of sequence symbol. Only relevant
            for tgt.
        eos (str): Special end of sequence symbol. Only relevant
            for tgt.
        dynamic_dict (bool): Whether or not to include source map and
            alignment fields.
        with_align (bool): Whether or not to include word align.
        src_truncate: Cut off src sequences beyond this (passed to
            ``src_data_type``'s data reader - see there for more details).
        tgt_truncate: Cut off tgt sequences beyond this (passed to
            :class:`TextDataReader` - see there for more details).

    Returns:
        A dict mapping names to fields. These names need to match
        the dataset example attributes.
    """

    assert src_data_type in ['text', 'img', 'audio', 'vec'], \
        "Data type not implemented"
    assert not dynamic_dict or src_data_type == 'text', \
        'it is not possible to use dynamic_dict with non-text input'
    fields = {}

    fields_getters = {
        "text": text_fields,
        "node": node_fields,
        "img": image_fields,
        "audio": audio_fields,
        "vec": vec_fields
    }

    src_field_kwargs = {
        "n_feats": n_src_feats,
        "include_lengths": True,
        "pad": pad,
        "bos": None,
        "eos": None,
        "truncate": src_truncate,
        "base_name": "src"
    }
    fields["src"] = fields_getters[src_data_type](**src_field_kwargs)

    tgt_field_kwargs = {
        "n_feats": n_tgt_feats,
        "include_lengths": False,
        "pad": pad,
        "bos": bos,
        "eos": eos,
        "truncate": tgt_truncate,
        "base_name": "tgt"
    }
    fields["tgt"] = fields_getters["text"](**tgt_field_kwargs)

    graph_field_kwargs = {"base_name": "graph"}
    fields["graph"] = GraphField(graph_field_kwargs)

    indices = Field(use_vocab=False, dtype=torch.long, sequential=False)
    fields["indices"] = indices

    if dynamic_dict:
        src_map = Field(use_vocab=False,
                        dtype=torch.float,
                        postprocessing=make_src,
                        sequential=False)
        fields["src_map"] = src_map

        src_ex_vocab = RawField()
        fields["src_ex_vocab"] = src_ex_vocab

        align = Field(use_vocab=False,
                      dtype=torch.long,
                      postprocessing=make_tgt,
                      sequential=False)
        fields["alignment"] = align

    if with_align:
        word_align = AlignField()
        fields["align"] = word_align

    return fields
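
A brief usage sketch, assuming the helper constructors referenced above (text_fields, GraphField, make_src, make_tgt, ...) are importable from the surrounding package:

# Build fields for plain text data with no extra word-level features,
# including the copy-attention fields enabled by dynamic_dict.
fields = get_fields('text', n_src_feats=0, n_tgt_feats=0, dynamic_dict=True)
print(sorted(fields.keys()))
# e.g. ['alignment', 'graph', 'indices', 'src', 'src_ex_vocab', 'src_map', 'tgt']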
Example #6
python -m spacy download de
"""

spacy_eng = spacy.load("en")
spacy_ger = spacy.load("de")


def tokenize_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]


def tokenize_ger(text):
    return [tok.text for tok in spacy_ger.tokenizer(text)]


english = Field(sequential=True, use_vocab=True, tokenize=tokenize_eng, lower=True)
german = Field(sequential=True, use_vocab=True, tokenize=tokenize_ger, lower=True)

fields = {"English": ("eng", english), "German": ("ger", german)}

train_data, test_data = TabularDataset.splits(
    path="", train="train.json", test="test.json", format="json", fields=fields
)

english.build_vocab(train_data, max_size=10000, min_freq=2)
german.build_vocab(train_data, max_size=10000, min_freq=2)

train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data), batch_size=32, device="cuda"
)
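
Because the fields dict maps "English" to ("eng", english) and "German" to ("ger", german), every batch exposes .eng and .ger attributes. A minimal sketch of consuming the iterator built above:

for batch in train_iterator:
    src = batch.eng  # (src_len, batch_size) LongTensor of English token ids
    trg = batch.ger  # (trg_len, batch_size) LongTensor of German token ids
    # ... feed src/trg into a seq2seq model here ...
    break  # only inspect the first batch in this sketch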
Example #7
def load_dataset(batch_size, debug=True):
    spacy_en = spacy.load('en')
    spacy_zh = Chinese()

    def tokenize_en(line):
        return [token.text for token in spacy_en.tokenizer(line)]

    def tokenize_zh(line):
        return [token.text for token in spacy_zh.tokenizer(line)]

    EN = Field(tokenize=tokenize_en,
               include_lengths=True,
               init_token='<sos>',
               eos_token='<eos>')
    ZH = Field(tokenize=tokenize_zh,
               include_lengths=True,
               init_token='<sos>',
               eos_token='<eos>')

    lines = open(train_file, 'rt', encoding='utf-8').read().splitlines()
    train_samples = [line.split('\t') for line in lines]
    train_docID, train_senID, train_en, train_zh = zip(*train_samples)

    val_docID, val_senID, val_en = extract_data_from_sgm(val_en_file, cols=3)
    val_zh, = extract_data_from_sgm(val_zh_file, cols=1)

    test_docID, test_senID, test_en = extract_data_from_sgm(test_en_file,
                                                            cols=3)

    if debug:
        debug_info_size = 10
        print('\n[Debug] First %d training examples:\n' % debug_info_size)
        for i in range(debug_info_size):
            print(train_docID[i], train_senID[i], train_en[i], train_zh[i])
        print('\n[Debug] First %d validation examples:\n' % debug_info_size)
        for i in range(debug_info_size):
            print(val_docID[i], val_senID[i], val_en[i], val_zh[i])
        print('\n[Debug] First %d test examples:\n' % debug_info_size)
        for i in range(debug_info_size):
            print(test_en[i])

    train_examples = [
        sentence_translation(train_docID[i], train_senID[i], train_en[i],
                             train_zh[i]) for i in range(len(train_docID))
    ]
    val_examples = [
        sentence_translation(val_docID[i], val_senID[i], val_en[i], val_zh[i])
        for i in range(len(val_docID))
    ]

    print("Train size = %d" % len(train_examples))
    print("Eval size = %d" % len(val_examples))

    train_dataset = Dataset(train_examples, {'src': EN, 'trg': ZH})
    val_dataset = Dataset(val_examples, {'src': EN, 'trg': ZH})
    print('Datasets Built!')

    EN.build_vocab(train_dataset.src, min_freq=2)
    ZH.build_vocab(train_dataset.trg, max_size=10000)
    print('Vocabularies Built!')

    train_iter, val_iter = BucketIterator.splits(
        (train_dataset, val_dataset),
        batch_size=batch_size,
        repeat=False,
        sort_key=lambda x: interleave_keys(len(x.src), len(x.trg)))
    print('Training Iterators Built!')
    return train_iter, val_iter, ZH, EN
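
A hedged usage sketch; train_file and the *_sgm paths are module-level globals in the original script and must point at real data for this to run:

train_iter, val_iter, ZH, EN = load_dataset(batch_size=64, debug=False)

print('EN vocab size:', len(EN.vocab), '| ZH vocab size:', len(ZH.vocab))

for batch in train_iter:
    # include_lengths=True, so each attribute is a (tensor, lengths) pair
    (src, src_lens), (trg, trg_lens) = batch.src, batch.trg
    break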
Example #8
def image_fields(base_name, **kwargs):
    img = Field(use_vocab=False,
                dtype=torch.float,
                postprocessing=batch_img,
                sequential=False)
    return [(base_name, img)]
Example #9
class IMDBHierarchical(IMDB):
    NESTING_FIELD = Field(batch_first=True, tokenize=clean_string)
    TEXT_FIELD = NestedField(NESTING_FIELD, tokenize=split_sents)
Example #10
from torch.utils.tensorboard import SummaryWriter
from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint

spacy_ger = spacy.load('de')
spacy_eng = spacy.load('en')


def tokenize_german(text):
    return [tok.text for tok in spacy_ger.tokenizer(text)]


def tokenize_english(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]


german = Field(tokenize=tokenize_german, lower=True,
               init_token='<sos>', eos_token='<eos>')

english = Field(tokenize=tokenize_english, lower=True,
                init_token='<sos>', eos_token='<eos>')

train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                    fields=(german, english))


german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)


class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super(Encoder, self).__init__()
Example #11
               num_samples: Optional[int] = None,
               add_cls: bool = False,
               random_state: int = 162,
               max_len: Optional[int] = None,
               verbose: bool = True,
               **kwargs):
        path = os.path.join(root, cls.name, 'train.csv')
        full_dataset = YahooAnswers(path=path,
                                    fields=fields,
                                    num_samples=num_samples,
                                    verbose=verbose,
                                    add_cls=add_cls,
                                    random_state=random_state,
                                    max_len=max_len,
                                    **kwargs)
        splitted_data = full_dataset.split(split_ratio=split_ratio,
                                           stratified=stratified,
                                           strata_field=strata_field)
        return splitted_data


if __name__ == '__main__':
    tokenize = lambda x: x.strip().split()
    text_field = Field(sequential=True,
                       use_vocab=True,
                       init_token='<s>',
                       eos_token='</s>',
                       tokenize=tokenize,
                       include_lengths=True)
    print(PTB.splits(fields=(('inp', text_field), ('trg', text_field))))
Example #12
def load_data(train_file, test_file, pretrain=None, save_dir=None):
    assert os.path.exists(train_file), f"{train_file} does not exist!"
    assert os.path.exists(test_file), f"{test_file} does not exist!"
    print("=" * 30 + "DATASET LOADER" + "=" * 30)
    sent_field = Field(tokenize=lambda x: x.split(),
                       unk_token='<unk>',
                       pad_token='<pad>',
                       init_token=None,
                       eos_token=None)
    doc_field = NestedField(sent_field,
                            tokenize=sent_tokenize,
                            pad_token='<pad>',
                            init_token=None,
                            eos_token=None,
                            include_lengths=True)
    label_field = LabelField()
    fields = [("raw", RawField()), ("doc", doc_field), ("label", label_field)]
    print(f"Reading {train_file} ...")
    with open(train_file, "r", encoding="utf-8") as reader:
        lines = reader.readlines()
        examples = []
        for line in lines:
            text, label = line.split('\t')
            examples.append(
                Example.fromlist([text, text.lower(), label], fields))
        train_dataset = Dataset(examples, fields)
        reader.close()
    print(f"\tNum of train examples: {len(examples)}")
    print(f"Reading {test_file} ...")
    with open(test_file, "r", encoding="utf-8") as reader:
        lines = reader.readlines()
        examples = []
        for line in lines:
            text, label = line.split('\t')
            examples.append(
                Example.fromlist([text, text.lower(), label], fields))
        test_dataset = Dataset(examples, fields)
        reader.close()
    print(f"\tNum of valid examples: {len(examples)}")
    vectors = FastText('vi')
    doc_field.build_vocab(train_dataset, test_dataset, vectors=vectors)
    label_field.build_vocab(train_dataset, test_dataset)
    print(f"Building vocabulary ...")
    num_vocab = len(doc_field.vocab)
    num_classes = len(label_field.vocab)
    pad_idx = doc_field.vocab.stoi['<pad>']
    print(f"\tNum of vocabulary: {num_vocab}")
    print(f"\tNum of classes: {num_classes}")
    if save_dir:
        with open(save_dir + "/vocab.json", "w", encoding="utf-8") as fv:
            vocabs = {
                "word": doc_field.vocab.stoi,
                "class": label_field.vocab.itos,
                'pad_idx': pad_idx
            }
            json.dump(vocabs, fv)
            fv.close()
        with open(save_dir + "/fileds.json", "w", encoding="utf-8") as ff:
            field_vocabs = {
                "doc": doc_field.vocab.freqs,
                "label": label_field.vocab.freqs
            }
            json.dump(field_vocabs, ff)
            ff.close()
    print("=" * 73)
    return train_dataset, test_dataset, num_vocab, num_classes, pad_idx, vectors.vectors
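
A hedged sketch of calling load_data and wrapping the returned datasets in a BucketIterator; the TSV paths are placeholders for files with "text<TAB>label" lines:

from torchtext.data import BucketIterator

train_ds, test_ds, num_vocab, num_classes, pad_idx, vectors = load_data(
    'train.tsv', 'test.tsv', save_dir='checkpoints')

train_iter, test_iter = BucketIterator.splits(
    (train_ds, test_ds),
    batch_size=32,
    sort_key=lambda ex: len(ex.doc),
    sort_within_batch=True)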
Example #13
                 ax=axs[h])
        plt.show()
        print("Decoder Src Layer", layer + 1)
        fig, axs = plt.subplots(1, 4, figsize=(16, 8))
        for h in range(4):
            draw(model.decoder.layers[layer].src_attn.attn[
                0, h].data[:len(tgt_sent), :len(sent)].cpu(),
                 sent,
                 tgt_sent if h == 0 else [],
                 ax=axs[h])
        plt.show()


SRC = Field(tokenize="spacy",
            tokenizer_language="de_core_news_sm",
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)

TRG = Field(tokenize="spacy",
            tokenizer_language="en_core_web_sm",
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)

MAX_LEN = 100
train_data, valid_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(SRC, TRG),
    filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and len(
        vars(x)['trg']) <= MAX_LEN)
Example #14
def translate(cfg_file, ckpt: str, output_path: str = None) -> None:
    """
    Interactive translation function.
    Loads model from checkpoint and translates either the stdin input or
    asks for input to translate interactively.
    The input has to be pre-processed according to the data that the model
    was trained on, i.e. tokenized or split into subwords.
    Translations are printed to stdout.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: optional path to a file where translations are written
    """
    def _load_line_as_data(line):
        """ Create a dataset from one line via a temporary file. """
        # write src input to temporary file
        tmp_name = "tmp"
        tmp_suffix = ".src"
        tmp_filename = tmp_name + tmp_suffix
        with open(tmp_filename, "w") as tmp_file:
            tmp_file.write("{}\n".format(line))

        test_data = MonoDataset(path=tmp_name, ext=tmp_suffix, field=src_field)

        # remove temporary file
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)

        return test_data

    def _translate_data(test_data):
        """ Translates given dataset, using parameters from outer scope. """
        # pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
        hypotheses_raw, attention_scores = validate_on_data(
            model, data=test_data, batch_size=batch_size, level=level,
            max_output_length=max_output_length, eval_metric="",
            use_cuda=use_cuda, loss_function=None, beam_size=beam_size,
            beam_alpha=beam_alpha)
        return hypotheses

    cfg = load_config(cfg_file)

    # when checkpoint is not specified, take the latest one from the model dir
    if ckpt is None:
        model_dir = cfg["training"]["model_dir"]
        ckpt = get_latest_checkpoint(model_dir)

    batch_size = cfg["training"].get("batch_size", 1)
    use_cuda = cfg["training"].get("use_cuda", False)
    level = cfg["data"]["level"]
    max_output_length = cfg["training"].get("max_output_length", None)

    # read vocabs
    src_vocab_file = cfg["training"].get(
        "src_vocab", cfg["training"]["model_dir"] + "/src_vocab.txt")
    trg_vocab_file = cfg["training"].get(
        "trg_vocab", cfg["training"]["model_dir"] + "/trg_vocab.txt")
    src_vocab = Vocabulary(file=src_vocab_file)
    trg_vocab = Vocabulary(file=trg_vocab_file)

    data_cfg = cfg["data"]
    level = data_cfg["level"]
    lowercase = data_cfg["lowercase"]

    tok_fun = lambda s: list(s) if level == "char" else s.split()

    src_field = Field(init_token=None,
                      eos_token=EOS_TOKEN,
                      pad_token=PAD_TOKEN,
                      tokenize=tok_fun,
                      batch_first=True,
                      lower=lowercase,
                      unk_token=UNK_TOKEN,
                      include_lengths=True)
    src_field.vocab = src_vocab

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.cuda()

    # whether to use beam search for decoding, 0: greedy decoding
    if "testing" in cfg.keys():
        beam_size = cfg["testing"].get("beam_size", 0)
        beam_alpha = cfg["testing"].get("alpha", -1)
    else:
        beam_size = 0
        beam_alpha = -1

    if not sys.stdin.isatty():
        # file given
        test_data = MonoDataset(path=sys.stdin, ext="", field=src_field)
        hypotheses = _translate_data(test_data)

        if output_path is not None:
            output_path_set = "{}".format(output_path)
            with open(output_path_set, mode="w", encoding="utf-8") as out_file:
                for hyp in hypotheses:
                    out_file.write(hyp + "\n")
            print("Translations saved to: {}".format(output_path_set))
        else:
            for hyp in hypotheses:
                print(hyp)

    else:
        # enter interactive mode
        batch_size = 1
        while True:
            try:
                src_input = input("\nPlease enter a source sentence "
                                  "(pre-processed): \n")
                if not src_input.strip():
                    break

                # every line has to be made into dataset
                test_data = _load_line_as_data(line=src_input)

                hypotheses = _translate_data(test_data)
                print("JoeyNMT: {}".format(hypotheses[0]))

            except (KeyboardInterrupt, EOFError):
                print("\nBye.")
                break
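
A hedged sketch of invoking this function directly from Python; the config path is a placeholder, and passing ckpt=None makes it pick the latest checkpoint in model_dir:

# Translate stdin input with the latest checkpoint, writing hypotheses to a file.
translate(cfg_file='configs/my_experiment.yaml',
          ckpt=None,
          output_path='translations.txt')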
Example #15
def main(colab_args=None):
    if colab_args:
        args = colab_args
    else:
        parser = argparse.ArgumentParser()

        parser.add_argument(
            "--output_dir",
            type=str,
            required=True,
            help=
            "The output directory where the model predictions and checkpoints will be written.",
        )
        parser.add_argument(
            "--should_continue",
            action="store_true",
            help="Whether to continue from latest checkpoint in output_dir")
        parser.add_argument(
            "--model_name_or_path",
            default=None,
            type=str,
            help=
            "The model checkpoint for weights initialization. Leave None if you want to train a model from scratch.",
        )
        parser.add_argument("--train_data_path",
                            default=None,
                            type=str,
                            help="The json file for training the model")
        parser.add_argument("--eval_data_path",
                            default=None,
                            type=str,
                            help="The json file for evaluating the model")
        parser.add_argument(
            "--config_name",
            default=None,
            type=str,
            help=
            "Optional pretrained config name or path if not the same as model_name_or_path. If both are None, initialize a new config.",
        )
        parser.add_argument(
            "--block_size",
            default=-1,
            type=int,
            help="Optional input sequence length after tokenization."
            "The training dataset will be truncated in block of this size for training."
            "Default to the model max input length for single sentence inputs (take into account special tokens).",
        )
        parser.add_argument("--per_gpu_train_batch_size",
                            default=4,
                            type=int,
                            help="Batch size per GPU/CPU for training.")
        parser.add_argument(
            "--gradient_accumulation_steps",
            type=int,
            default=1,
            help=
            "Number of updates steps to accumulate before performing a backward/update pass.",
        )
        parser.add_argument("--learning_rate",
                            default=1e-5,
                            type=float,
                            help="The initial learning rate for Adam.")
        parser.add_argument("--weight_decay",
                            default=0.0,
                            type=float,
                            help="Weight decay if we apply some.")
        parser.add_argument("--adam_epsilon",
                            default=1e-8,
                            type=float,
                            help="Epsilon for Adam optimizer.")
        parser.add_argument("--max_grad_norm",
                            default=1.0,
                            type=float,
                            help="Max gradient norm.")
        parser.add_argument("--num_train_epochs",
                            default=1.0,
                            type=float,
                            help="Total number of training epochs to perform.")
        parser.add_argument(
            "--max_steps",
            default=-1,
            type=int,
            help=
            "If > 0: set total number of training steps to perform. Override num_train_epochs.",
        )
        parser.add_argument("--log_dir",
                            default=".",
                            type=str,
                            help="Directory to store the logs.")
        parser.add_argument("--warmup_steps",
                            default=0,
                            type=int,
                            help="Linear warmup over warmup_steps.")
        parser.add_argument("--local_rank",
                            type=int,
                            default=-1,
                            help="For distributed training: local_rank")
        parser.add_argument("--logging_steps",
                            type=int,
                            default=500,
                            help="Log every X updates steps.")
        parser.add_argument("--save_steps",
                            type=int,
                            default=500,
                            help="Save checkpoint every X updates steps.")
        parser.add_argument(
            "--save_total_limit",
            type=int,
            default=None,
            help=
            "Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default",
        )
        parser.add_argument(
            "--overwrite_output_dir",
            action="store_true",
            help="Overwrite the content of the output directory")
        parser.add_argument(
            "--overwrite_cache",
            action="store_true",
            help="Overwrite the cached training and evaluation sets")
        parser.add_argument("--seed",
                            type=int,
                            default=42,
                            help="random seed for initialization")
        args = parser.parse_args()

    if args.should_continue:
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError(
                "Used --should_continue but no checkpoint was found in --output_dir."
            )
        else:
            args.model_name_or_path = sorted_checkpoints[-1]

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and not args.overwrite_output_dir and not args.should_continue):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup CUDA, GPU
    device = torch.device('cuda:{}'.format(torch.cuda.current_device(
    )) if torch.cuda.is_available() else "cpu")
    args.n_gpu = 0 if device == 'cpu' else torch.cuda.device_count()
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )

    # Set seed
    set_seed(args)

    # setup tokenizer and model
    if os.path.exists(os.path.join(args.output_dir, "tokenizer.pt")):
        new_tokenizer = False
        tokenizer = torch.load(os.path.join(args.output_dir, "tokenizer.pt"))
    else:
        new_tokenizer = True
        tokenizer = Field(tokenize=tokenize_en,
                          init_token='<sos>',
                          eos_token='<eos>',
                          lower=True,
                          batch_first=True)

    train_dataset = VideoBertDataset(tokenizer,
                                     build_tokenizer=new_tokenizer,
                                     data_path=args.train_data_path)
    eval_dataset = VideoBertDataset(train_dataset.tokenizer,
                                    build_tokenizer=False,
                                    data_path=args.eval_data_path)

    data_globals.config.vocab_size = len(
        train_dataset.tokenizer.vocab.itos) + 20736
    print("total vocab size of",
          len(train_dataset.tokenizer.vocab.itos) + 20736)

    if args.model_name_or_path is None:
        # start from inital model
        print('### LOADING INITIAL MODEL ###')
        model = VideoTransformer(config=data_globals.config, args=args)
        model.apply(initialize_weights)
    else:
        # start from checkpoint
        print('### LOADING MODEL FROM CHECKPOINT:', args.model_name_or_path,
              '###')
        model = VideoTransformer.from_pretrained(config=data_globals.config,
                                                 args=args)

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training

    if new_tokenizer:
        torch.save(train_dataset.tokenizer,
                   os.path.join(args.output_dir, "tokenizer.pt"))
        logger.info("Saving tokenizer to %s", args.output_dir)

    # Benchmark Evaluation
    # total_avg_loss, text_avg_loss, video_avg_loss, joint_avg_loss = evaluate(args, model, eval_dataset)
    # print("Benchmark Eval:\n"
    #       "Total: {}\n"
    #       "Text: {}\n"
    #       "Video: {}\n"
    #       "Joint: {}\n".format(total_avg_loss, text_avg_loss, video_avg_loss, joint_avg_loss))
    #
    # print("After Eval:")
    # print(torch.cuda.memory_summary(args.device))

    # Start Training
    model.train()
    global_step, tr_loss = train(args, model, train_dataset, eval_dataset)
    logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
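
Because main() accepts colab_args, it can be driven from a notebook without the CLI. A hedged sketch with hypothetical paths; every attribute that main() reads later must be present on the namespace:

from argparse import Namespace

colab_args = Namespace(
    output_dir='checkpoints',
    should_continue=False,
    model_name_or_path=None,
    train_data_path='data/train.json',
    eval_data_path='data/val.json',
    config_name=None,
    block_size=-1,
    per_gpu_train_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    max_grad_norm=1.0,
    num_train_epochs=1.0,
    max_steps=-1,
    log_dir='.',
    warmup_steps=0,
    local_rank=-1,
    logging_steps=500,
    save_steps=500,
    save_total_limit=None,
    overwrite_output_dir=True,
    overwrite_cache=False,
    seed=42,
)
main(colab_args=colab_args)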
Example #16
class IMDB(TabularDataset):
    NAME = 'IMDB'
    NUM_CLASSES = 10
    TEXT_FIELD = Field(batch_first=True,
                       tokenize=clean_string,
                       include_lengths=True)
    LABEL_FIELD = Field(sequential=False,
                        use_vocab=False,
                        batch_first=True,
                        preprocessing=process_labels)

    @staticmethod
    def sort_key(ex):
        return len(ex.text)

    @classmethod
    def splits(cls,
               path,
               train=os.path.join('IMDB', 'data', 'imdb_train.tsv'),
               validation=os.path.join('IMDB', 'data', 'imdb_validation.tsv'),
               test=os.path.join('IMDB', 'data', 'imdb_test.tsv'),
               **kwargs):
        return super(IMDB, cls).splits(path,
                                       train=train,
                                       validation=validation,
                                       test=test,
                                       format='tsv',
                                       fields=[('label', cls.LABEL_FIELD),
                                               ('text', cls.TEXT_FIELD)])

    @classmethod
    def iters(cls,
              path,
              vectors_name,
              vectors_cache,
              batch_size=64,
              shuffle=True,
              device=0,
              vectors=None,
              unk_init=torch.Tensor.zero_):
        """
        :param path: directory containing train, test, dev files
        :param vectors_name: name of word vectors file
        :param vectors_cache: path to directory containing word vectors file
        :param batch_size: batch size
        :param shuffle: whether to shuffle batches between epochs
        :param device: GPU device
        :param vectors: custom vectors - either predefined torchtext vectors or your own custom Vector classes
        :param unk_init: function used to generate vector for OOV words
        :return:
        """
        if vectors is None:
            vectors = Vectors(name=vectors_name,
                              cache=vectors_cache,
                              unk_init=unk_init)

        train, val, test = cls.splits(path)
        cls.TEXT_FIELD.build_vocab(train, val, test, vectors=vectors)
        return BucketIterator.splits((train, val, test),
                                     batch_size=batch_size,
                                     repeat=False,
                                     shuffle=shuffle,
                                     sort_within_batch=True,
                                     device=device)
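
A hedged usage sketch; the data directory layout and the GloVe file names are placeholders:

# `path` must contain IMDB/data/imdb_{train,validation,test}.tsv as defined in splits().
train_iter, dev_iter, test_iter = IMDB.iters(
    path='datasets',
    vectors_name='glove.6B.300d.txt',
    vectors_cache='embeddings',
    batch_size=64,
    device='cuda:0')

vocab_size = len(IMDB.TEXT_FIELD.vocab)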
Example #17
    config.ls_mode = 'origin'
    # for vae
    config.vae_struct = True
    config.vae_word_dim = 30000
    config.decoder_dataset = 'IMDB_10'
    config.decoder_channel = config.word_num_hidden * 2
    if config.vae_struct:
        assert config.word_num_hidden == config.sentence_num_hidden
    #front-end cnn
    config.frontend_cnn = False

    if args.dataset == 'Yelp2013':
        args.dataset = 'Yelp2014'
        dataset_map[args.dataset].Year = 13
    dataset_map[args.dataset].NESTING_FIELD = Field(
        batch_first=True,
        tokenize=Word_Tokenize(),
        fix_length=config.fix_length)
    dataset_map[args.dataset].TEXT_FIELD = SentenceWord_field(dataset_map[args.dataset].NESTING_FIELD,\
                                                                tokenize=Sentence_Tokenize(),\
                                                                vae_struct=config.vae_struct)

    time_tmp = time.time()
    if args.dataset not in dataset_map:
        raise ValueError('Unrecognized dataset')
    else:
        dataset_class = dataset_map[args.dataset]
        train_iter, dev_iter, test_iter = dataset_class.iters(
            args.data_dir,
            args.word_vectors_file,
            args.word_vectors_dir,
            batch_size=args.batch_size,
Example #18
def predict(invocations, result_cnt=5):

    english = Field(tokenize=tokenize_eng,
                    lower=True,
                    init_token="<sos>",
                    eos_token="<eos>")
    bash = Field(tokenize=tokenize_bash,
                 lower=True,
                 init_token="<sos>",
                 eos_token="<eos>")
    fields = {"English": ("eng", english), "Bash": ("bash", bash)}
    train_data, test_data = TabularDataset.splits(
        path="",
        train="src/submission_code/train.json",
        test="src/submission_code/test.json",
        format="json",
        fields=fields)
    english.build_vocab(train_data, max_size=10000, min_freq=2)
    bash.build_vocab(train_data, max_size=10000, min_freq=2)

    # We're ready to define everything we need for training our Seq2Seq model
    device = torch.device("cpu")
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    load_model = True
    save_model = False

    learning_rate = 1e-4

    # Model hyperparameters
    src_vocab_size = len(english.vocab)
    trg_vocab_size = len(bash.vocab)
    embedding_size = 256
    num_heads = 8
    num_encoder_layers = 8
    num_decoder_layers = 8
    dropout = 0.10
    max_len = 100
    forward_expansion = 2048
    src_pad_idx = english.vocab.stoi["<pad>"]

    model = Transformer(
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_len,
        device,
    ).to(device)

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    if load_model:
        load_checkpoint(
            torch.load("src/my_checkpoint.pth.tar", map_location='cpu'), model,
            optimizer)
    """
    Function called by the evaluation script to interface the participants model
    `predict` function accepts the natural language invocations as input, and returns
    the predicted commands along with confidences as output. For each invocation,
    `result_cnt` number of predicted commands are expected to be returned.

    Args:
        1. invocations : `list (str)` : list of `n_batch` (default 16) natural language invocations
        2. result_cnt : `int` : number of predicted commands to return for each invocation

    Returns:
        1. commands : `list [ list (str) ]` : a list of list of strings of shape (n_batch, result_cnt)
        2. confidences: `list[ list (float) ]` : confidences corresponding to the predicted commands
                                                 confidence values should be between 0.0 and 1.0.
                                                 Shape: (n_batch, result_cnt)
    """

    n_batch = len(invocations)

    # `commands` and `confidences` have shape (n_batch, result_cnt)
    commands = [[''] * result_cnt for _ in range(n_batch)]
    cf = [1.0] * (result_cnt - 1)
    cf.append(0)
    confidences = [cf for _ in range(n_batch)]

    ################################################################################################
    #     Participants should add their codes to fill predict `commands` and `confidences` here    #
    ################################################################################################
    for idx, inv in enumerate(invocations):

        # Call the translate method to retrieve translations and scores
        prediction = translate_sentence(model,
                                        inv,
                                        english,
                                        bash,
                                        device,
                                        max_length=30)[:-1]
        temp = " ".join(prediction)
        top_commands = [temp] * result_cnt
        print(top_commands)
        # For testing evalAI docker push, just fill top command - just need to check
        # if tellina imports work correctly right now
        for i in range(result_cnt):
            commands[idx][i] = top_commands[i]

    ################################################################################################
    #                               Participant code block ends                                    #
    ################################################################################################

    return commands, confidences
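
A hedged sketch of how an evaluation harness might call predict; the invocations are arbitrary natural-language strings:

invocations = [
    'list all files in the current directory',
    'count the number of lines in file.txt',
]
commands, confidences = predict(invocations, result_cnt=5)
# both return values have shape (len(invocations), 5)
print(commands[0][0], confidences[0][0])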
Example #19
spacy_fr = spacy.load('fr')
spacy_en = spacy.load('en')


# Tokenize French sentence
def tokenize_fr(text):
    return [tok.text for tok in spacy_fr.tokenizer(text)]


# Tokenize English sentence
def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]


# Split train / valid / test dataset
SRC = Field(tokenize=tokenize_fr,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)
TRG = Field(tokenize=tokenize_en,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)
train_data, valid_data, test_data = Multi30k.splits(exts=('.fr', '.en'),
                                                    fields=(SRC, TRG))

print("Number of training examples:", len(train_data.examples))
print("Number of validation examples:", len(valid_data.examples))
print("Number of testing examples:", len(test_data.examples))

print("First train data:", vars(train_data.examples[0]))
print("First valid data:", vars(valid_data.examples[0]))
print("First test data:", vars(test_data.examples[0]))
Example #20
    best_loss = 65535.0
    temp_model = None
    num = -1
    for item in file_list:
        file = p.match(item)
        if file:
            num = file.groups()[1]
            loss = file.groups()[2]
            if float(loss) < best_loss:
                best_loss = float(loss)
                temp_model = file.groups()[0]
    return num, temp_model

# Data processing
TKNIZER_PATTERN = re.compile(r"[^\w]+")

LABEL = Field(sequential=False, batch_first=True, pad_token=None, unk_token=None)
SENTENCE_FIRST = Field(sequential=True, tokenize=lambda x: TKNIZER_PATTERN.split(x)[1:-1], lower=True, unk_token='<unk>')
SENTENCE_SECOND = Field(sequential=True, tokenize=lambda x: TKNIZER_PATTERN.split(x)[1:-1], lower=True, unk_token='<unk>', init_token='<start>')

def dataset2iter(workpath=WORK_PATH, train_path=FILE_TRAIN, validation_path=FILE_VALID, test_path=FILE_TEST):
    fields = [('gold_label', LABEL),
              ('sentence1_binary_parse', SENTENCE_FIRST),
              ('sentence2_binary_parse', SENTENCE_SECOND)]

    data_train = TabularDataset(workpath+train_path, format="tsv", fields=fields, skip_header=True)
    data_valid = TabularDataset(workpath+validation_path, format="tsv", fields=fields, skip_header=True)
    data_test = TabularDataset(workpath+test_path, format="tsv", fields=fields, skip_header=True)

    pretrained_vectors = Vectors(name=GLOVE_PATH + TRAINED_VECTORS + '.txt', cache=GLOVE_PATH)
    SENTENCE_FIRST.build_vocab(data_train, vectors=pretrained_vectors,
                               unk_init=lambda x: torch.nn.init.uniform_(x, a=-0.25, b=0.25))
Example #21
# device = torch.device('cuda' if torch.cuda else "cpu")

## load data
root = "/home/cp/dataSet/text-classification-sample/"
train_path = "/home/cp/dataSet/text-classification-sample/train.csv"
valid_path = '/home/cp/dataSet/text-classification-sample/valid.csv'
test_path = '/home/cp/dataSet/text-classification-sample/test.csv'

data_train = pd.read_csv(train_path).head()
print(data_train)
print(data_train.columns)

# 1. Define Field objects to configure the text processing
tokenize = lambda x: x.split()
text = Field(sequential=True, lower=True, tokenize=tokenize, use_vocab=True)
label = Field(sequential=False, use_vocab=False)

# 2. Define Dataset objects and load the raw corpus
tv_datafields = [("id", None), ("comment_text", text), ("toxic", label),
                 ('severe_toxic', label),
                 ('obscene', label), ('threat', label), ('insult', label),
                 ("identity_hate", label)]

train, valid = TabularDataset.splits(path=root,
                                     train='train.csv',
                                     validation='valid.csv',
                                     format='csv',
                                     skip_header=True,
                                     fields=tv_datafields)
Example #22
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer

from torchtext.data import Field, BucketIterator
from torchtext.datasets import TranslationDataset

device = torch.device("cuda")

Lang1 = Field(eos_token='<eos>')

Lang2 = Field(init_token='<sos>', eos_token='<eos>')
train = TranslationDataset(path='../Datasets/MT_data/',
                           exts=('eng-fra.train.fr', 'eng-fra.train.en'),
                           fields=[('Lang1', Lang1), ('Lang2', Lang2)])

train_iter, val_iter, test_iter = BucketIterator.splits((train, train, train),
                                                        batch_size=16,
                                                        repeat=False)
Lang1.build_vocab(train)
Lang2.build_vocab(train)

# for i, train_batch in enumerate(train_iter):
#     print('Lang1  : \n', [Lang1.vocab.itos[x] for x in train_batch.Lang1[0].data[:, 0]])
#     print('Lang1  : \n', train_batch.Lang1[1].data[0])
#     print('Lang2 : \n', [Lang2.vocab.itos[x] for x in train_batch.Lang2[0].data[:, 0]])
#     print('Lang2 : \n', train_batch.Lang2[1].data[0])
Example #23
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext.data import Field, TabularDataset, BucketIterator, Iterator
import config_train as args
import pickle
from mgnn import MGNN
import numpy as np
from embeddings import GloveEmbedding
# from sklearn.metrics import classification_report, precision_recall_fscore_support
import spacy
import time

queryF = Field(sequential=True,
               batch_first=True,
               lower=True,
               include_lengths=True)
syntaxF = Field(sequential=True,
                batch_first=True,
                lower=True,
                include_lengths=True)
hierF = Field(sequential=True,
              batch_first=True,
              lower=True,
              include_lengths=True)
relF = Field(sequential=True,
             batch_first=True,
             lower=True,
             include_lengths=True)
labelF = Field(sequential=False, batch_first=True, use_vocab=False)
Exemplo n.º 24
0
import os
import random

import numpy as np
import torch


def seed_everything(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  os.environ['PYTHONHASHSEED'] = str(seed)

  if torch.cuda.is_available(): 
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False  # benchmark must be off for deterministic runs

seed_everything(SEED)

TEXT = Field(lower=True,
             use_vocab=True,
             sequential=True,
             batch_first=True,
             include_lengths=True)

LABEL = Field(lower=True,
              use_vocab=True,
              sequential=True,
              unk_token = None,
              batch_first=True)

fields = [('text', TEXT), ('tags', LABEL)]

train_data, valid_data, test_data = datasets.UDPOS.splits(fields)

TEXT.build_vocab(train_data,
                 max_size=25000,
Exemplo n.º 25
0
    def __init__(self, image, boxes_and_transcripts_data, iob_tagging_type: str = 'box_level',
                    image_index=None, resized_image_size: Tuple[int, int] = (480, 960)):
        '''
        An item returned by the dataset.

        :param image: the whole document image (numpy array, H x W x C)
        :param boxes_and_transcripts_data: list of (index, points, transcript, entity_type) tuples
        :param iob_tagging_type: 'box_level', 'document_level', 'box_and_within_box_level'
        :param image_index: image index, used to get the image file name
        :param resized_image_size: resized whole-image size, (w, h)
        '''
        # text string label converter
        self.text_segments_field = Field(sequential=True, use_vocab=True, include_lengths=True, batch_first=True)
        self.text_segments_field.vocab = vocab_cls['keys']
        # iob string label converter
        self.iob_tags_field = Field(sequential=True, is_target=True, use_vocab=True, batch_first=True)
        self.iob_tags_field.vocab = vocab_cls['iob_labels']
        assert iob_tagging_type in ['box_level', 'document_level', 'box_and_within_box_level'], \
            'iob tagging type {} is not supported'.format(iob_tagging_type)
        self.iob_tagging_type = iob_tagging_type
        self.resized_image_size = resized_image_size

        # read boxes, transcripts, and entity types of boxes in one document from the boxes_and_transcripts file
        # each line matches the pattern: index,x1,y1,x2,y2,x3,y3,x4,y4,transcript,type (tsv)
        # data format: [(index, points, transcription, entity_type), ...]
        # label = pd.read_csv(label_file.as_posix(),sep='\n',header=None)[0].to_list()

        boxes, transcripts, box_entity_types = [], [], []
        for index, points, transcript, _ in boxes_and_transcripts_data:
            if len(transcript) == 0:
                transcript = ' '
            boxes.append(points)
            transcripts.append(transcript)

        # Limit the number of boxes and number of transcripts to process.
        boxes_num = min(len(boxes), MAX_BOXES_NUM)
        transcript_len = min(max([len(t) for t in transcripts[:boxes_num]]), MAX_TRANSCRIPT_LEN)
        mask = np.zeros((boxes_num, transcript_len), dtype=int)

        relation_features = np.zeros((boxes_num, boxes_num, 6))

        height, width, _ = image.shape

        image = cv2.resize(image, self.resized_image_size, interpolation=cv2.INTER_LINEAR)
        x_scale = self.resized_image_size[0] / width
        y_scale = self.resized_image_size[1] / height

        # get the min-area rect for each (original) box, used to calculate the initial relation features
        min_area_boxes = [cv2.minAreaRect(np.array(box, dtype=np.float32).reshape(4, 2)) for box in
                          boxes[:boxes_num]]

        # calculate resized image box coordinate, and initial relation features between boxes (nodes)
        resized_boxes = []
        for i in range(boxes_num):
            box_i = boxes[i]
            transcript_i = transcripts[i]

            # get the resized image's box coordinates, used for ROIAlign in the Encoder layer
            resized_box_i = [int(np.round(pos * x_scale)) if idx % 2 == 0 else int(np.round(pos * y_scale))
                             for idx, pos in enumerate(box_i)]
            resized_box_i = np.array(resized_box_i).reshape((8,))
            resized_boxes.append(resized_box_i)

            # for each box, calculate the relation features between node i and every other node
            # (formula (9))
            self.relation_features_between_ij_nodes(boxes_num, i, min_area_boxes, relation_features, transcript_i,
                                                    transcripts)

        relation_features = normalize_relation_features(relation_features, width=width, height=height)

        # Split each transcript into a list of characters (the text segments fed to the Field below).
        text_segments = [list(trans) for trans in transcripts[:boxes_num]]

        # texts shape is (num_texts, max_texts_len), texts_len shape is (num_texts,)
        texts, texts_len = self.text_segments_field.process(text_segments)
        texts = texts[:, :transcript_len].numpy()
        texts_len = np.clip(texts_len.numpy(), 0, transcript_len)
        text_segments = (texts, texts_len)

        for i in range(boxes_num):
            mask[i, :texts_len[i]] = 1

        self.whole_image = RawField().preprocess(image)
        self.text_segments = self.text_segments_field.preprocess(text_segments)  # (text, texts_len)
        self.boxes_coordinate = RawField().preprocess(resized_boxes)
        self.relation_features = RawField().preprocess(relation_features)
        self.mask = RawField().preprocess(mask)
        self.boxes_num = RawField().preprocess(boxes_num)
        self.transcript_len = RawField().preprocess(transcript_len)  # max transcript len of current document
        self.image_index = RawField().preprocess(image_index)
Exemplo n.º 26
0
def load_naive_lm(args):
    """
    Convenience function to load pickle or dataset
    """
    src = Field(tokenize=tokenize_en,
                init_token='<sos>',
                eos_token='<eos>',
                lower=True,
                include_lengths=True)

    trg = Field(tokenize=tokenize_en,
                init_token='<sos>',
                eos_token='<eos>',
                lower=True)

    if args.expanded_dataset:
        path = ".data/stories/story_commonsense/torchtext_expanded"
    else:
        path = ".data/stories/story_commonsense/torchtext"

    train_data, valid_data, test_data = NaiveDatasetLM.splits(
        exts=(args.src_ext, args.trg_ext), fields=(src, trg), path=path)

    # Build vocabularies
    if os.path.isfile(args.prepared_data):
        # Load from pickle
        print(f"Found data pickle, loading from {args.prepared_data}")
        with open(args.prepared_data, 'rb') as p:
            d = pickle.load(p)
            src.vocab = d["src.vocab"]
            trg.vocab = d["trg.vocab"]
            combined_vocab = d["combined_vocab"]
            args.emb_dim = d["emb_dim"]
            loaded_vectors = d["loaded_vectors"]
    else:
        # Build vocabs. Will check `src` or `trg` field in `train_data`
        src.build_vocab(train_data, min_freq=2)
        trg.build_vocab(train_data, min_freq=2)
        # Build a single combined vocab shared by source and target
        combined_vocab = build_combined_vocab(src, train_data)

        # Load Glove embeddings
        str_to_idx_combined = combined_vocab.stoi  # word to idx dictionary
        str_to_idx = src.vocab.stoi  # word to idx dictionary

        # `loaded_vectors` is a dictionary of words to embeddings
        # To be sure to include entire vocab, we save the embeddings for the
        # combined vocab
        if "elmo" in args.embedding_type:
            loaded_vectors = []
            embedding_size = 1024
        elif "gpt" in args.embedding_type:
            loaded_vectors = []
            embedding_size = 1024
        else:
            loaded_vectors, embedding_size = load_text_vec(
                str_to_idx_combined, args.embeddings_path)

        args.emb_dim = embedding_size

        # Pickle Field vocab for later faster load
        with open(args.prepared_data, 'wb') as p:
            d = {}
            d["src.vocab"] = src.vocab
            d["trg.vocab"] = trg.vocab
            d["combined_vocab"] = combined_vocab
            d["emb_dim"] = args.emb_dim
            d["loaded_vectors"] = loaded_vectors
            pickle.dump(d, p, protocol=pickle.HIGHEST_PROTOCOL)
            print(
                f"Saved prepared data for future fast load to: {args.prepared_data}"
            )

    # Build single vocab for both src and trg
    if args.single_vocab:
        src.vocab = combined_vocab
        trg.vocab = combined_vocab

    print(f"Source vocab size: {len(src.vocab)}")
    print(f"Target vocab size: {len(trg.vocab)}")

    # Data iterators
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=args.batch_size,
        sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        device=args.device)

    return train_iterator, valid_iterator, test_iterator, src, trg, loaded_vectors
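
A hypothetical call site for load_naive_lm; every attribute on args mirrors one the function reads, and all values below are assumptions for illustration:

import torch
from argparse import Namespace

args = Namespace(expanded_dataset=False, src_ext='.src', trg_ext='.trg',
                 prepared_data='prepared_data.pkl', embedding_type='glove',
                 embeddings_path='glove.6B.300d.txt', single_vocab=True,
                 batch_size=32,
                 device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

train_it, valid_it, test_it, src, trg, vectors = load_naive_lm(args)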
Exemplo n.º 27
0
from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint

# load the spaCy language models
spacy_ger = spacy.load("de")
spacy_eng = spacy.load("en")

# create tokenizer function
def tokenizer_ger(text):
    return [tok.text for tok in spacy_ger.tokenizer(text)]

def tokenizer_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]

# define the Fields (vocabulary configuration) for each language
german = Field(tokenize=tokenizer_ger, lower=True, init_token="<sos>", eos_token="<eos>")
english = Field(tokenize=tokenizer_eng, lower=True, init_token="<sos>", eos_token="<eos>")

train_data, valid_data, test_data = Multi30k.splits(exts=(".de", ".en"), fields=(german, english))

# build vocab
german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)
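
# A minimal iterator sketch (not shown in this snippet); the batch size and device
# choice are assumptions for illustration.
from torchtext.data import BucketIterator

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=64,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=device)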

class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(dropout)
        self.embedding = nn.Embedding(input_size, embedding_size)
Exemplo n.º 28
0
os.environ["CUDA_VISIBLE_DEVICES"] = args.device_list
# Device setting
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# do not use a summary writer for now
writer = None

best_bleu = 999.00
start_epoch = 0

# Train with Transformer
if __name__ == '__main__':
    # Prepare data
    SRC = Field(tokenize='spacy',
                tokenizer_language='de',
                init_token='<sos>',
                eos_token='<eos>',
                lower=True)

    TRG = Field(tokenize='spacy',
                tokenizer_language='en',
                init_token='<sos>',
                eos_token='<eos>',
                lower=True)

    train_data, val_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                      fields=(SRC, TRG))
    train_iter, val_iter, test_iter = BucketIterator.splits(
        (train_data, val_data, test_data), batch_size=args.batch_size)

    print(len(train_iter))
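
    # Note: the portion shown never calls build_vocab, so actually iterating over
    # train_iter would fail at numericalisation time. A minimal fix sketch
    # (placement and min_freq are assumptions):
    SRC.build_vocab(train_data, min_freq=2)
    TRG.build_vocab(train_data, min_freq=2)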
Exemplo n.º 29
0
    def __init__(self, args, params):
        self.batch_size = params.batch_size
        self.fix_length = params.fix_length
        self.root_path = args.data_dir
        self.use_bert = args.bert

        if not self.use_bert:
            with open(args.embedding_pkl_path + '_word2idx.pkl', 'rb') as f:
                word2idx = pickle.load(f)
        else:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        def word_tokenize(sentence):
            if self.use_bert:
                # tokenized_text = tokenizer.tokenize(sentence)    # would split into sub-words and break the alignment
                tokenized_text = sentence.split(' ')
                sentence = tokenizer.convert_tokens_to_ids(tokenized_text)
                # sentence = tokenizer.add_special_tokens_single_sentence(sentence)
                sentence = [101] + sentence
            else:

                tokenized_text = sentence.split(' ')
                sentence = [word2idx.get(word, 0) for word in tokenized_text]
                sentence = [0] + sentence  # prepend a placeholder so indices line up with the BERT branch ([CLS])

            return sentence

        def pos_tokenize(posids):
            return [int(_) for _ in posids.split(' ')]

        # dtype = torch.cuda.LongTensor if args.gpu and torch.cuda.is_available() else torch.int64

        TEXT = Field(
            sequential=True,
            tokenize=word_tokenize,
            use_vocab=False,
            batch_first=True,
            fix_length=self.fix_length + 1,  # +1 for the prepended [CLS] token
            pad_token=0)
        POSITION = Field(sequential=True,
                         tokenize=pos_tokenize,
                         use_vocab=False,
                         fix_length=self.fix_length,
                         pad_token=0,
                         batch_first=True,
                         include_lengths=True)
        POSITION_NO_LEN = Field(sequential=True,
                                tokenize=pos_tokenize,
                                use_vocab=False,
                                fix_length=self.fix_length,
                                pad_token=0,
                                batch_first=True)
        LABEL = Field(sequential=False, use_vocab=False, batch_first=True)

        fields = {
            'sentence': ('words', TEXT),
            'label': ('label', LABEL),
            'e1': ('pos_e1', POSITION),
            'e2': ('pos_e2', POSITION_NO_LEN)
        }

        self.train, self.valid = TabularDataset.splits(path=self.root_path,
                                                       train='train.txt',
                                                       validation='test.txt',
                                                       format='json',
                                                       skip_header=False,
                                                       fields=fields)
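
        # A possible continuation (not part of the original): wrap the splits in batch
        # iterators. The sort key is an assumption, and BucketIterator is assumed to be
        # imported from torchtext.data.
        self.train_iter, self.valid_iter = BucketIterator.splits(
            (self.train, self.valid),
            batch_size=self.batch_size,
            sort_key=lambda ex: len(ex.words),
            sort_within_batch=True,
            repeat=False)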
Exemplo n.º 30
0

src_tokenizer = None
trg_tokenizer = None

if set_source_to == "azerbaijani" or set_source_to == "turkish":
    src_tokenizer = tokenize_custom
elif set_source_to == "english":
    src_tokenizer = tokenize_eng

if set_target_to == "azerbaijani" or set_target_to == "turkish":
    trg_tokenizer = tokenize_custom
elif set_target_to == "english":
    trg_tokenizer = tokenize_eng

source_lang = Field(tokenize=src_tokenizer, lower=True, init_token="<sos>", eos_token="<eos>")

target_lang = Field(tokenize=trg_tokenizer, lower=True, init_token="<sos>", eos_token="<eos>")



fields = {'Source_lang': ('src', source_lang), 'Target_lang': ('trg', target_lang)}

train_data, test_data = TabularDataset.splits(
        path='',
        train = train_dataset_path,
        test = test_dataset_path,
        format = 'csv',
        fields = fields
        )
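
A possible next step (not in the original snippet) builds the vocabularies and iterators; max_size, min_freq, and batch_size are assumptions, and BucketIterator is assumed to be imported from torchtext.data:

source_lang.build_vocab(train_data, max_size=10000, min_freq=2)
target_lang.build_vocab(train_data, max_size=10000, min_freq=2)

train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    batch_size=64,
    sort_key=lambda ex: len(ex.src),
    sort_within_batch=True)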