def __init__(self, data_file, vocab_file, batch_size=256):
        self.batch_size = batch_size

        smi_field = Field(sequential=True,
                          init_token='<sos>',
                          eos_token=' ',
                          pad_token=' ',
                          include_lengths=True,
                          batch_first=True,
                          tokenize=smi_tokenizer)
        property_field = Field(sequential=False, use_vocab=False)
        # load smile data
        with open(data_file, 'r') as f:
            mol_strs = f.read().strip().split('\n')
            mol_strs = [mol.replace(' ', '') for mol in mol_strs]
        mol_strs = [smi_field.preprocess(mol) for mol in mol_strs]
        smi_examples = []
        fields = [('smile', smi_field), ('property', property_field)]
        for mol in mol_strs:
            ex = Example.fromlist([mol, [1, 2, 3]], fields)
            smi_examples.append(ex)

        # load or build vocab
        if os.path.isfile(vocab_file):
            print('load vocab from:', vocab_file)
            smi_field.vocab = pickle.load(open(vocab_file, 'rb'))
        else:
            print('build and save vocab file:', vocab_file)
            smi_field.build_vocab(mol_strs)
            pickle.dump(smi_field.vocab, open(vocab_file, 'wb'), protocol=2)

        self.vocab = smi_field.vocab
        self.vocab_size = len(smi_field.vocab.itos)
        self.padding_idx = smi_field.vocab.stoi[smi_field.pad_token]
        self.sos_idx = smi_field.vocab.stoi[smi_field.init_token]
        self.eos_idx = smi_field.vocab.stoi[smi_field.eos_token]
        self.unk_idx = smi_field.vocab.stoi[smi_field.unk_token]

        self.dataset_smi = Dataset(smi_examples, fields=fields)
        self.train_smi = Dataset(smi_examples[:-5000], fields=fields)
        self.test_smi = Dataset(smi_examples[-5000:], fields=fields)
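A minimal usage sketch for the loader above, assuming the legacy torchtext.data API and that this __init__ belongs to a loader class (called SmilesLoader here purely as a placeholder name); the file paths are placeholders as well:

from torchtext.data import BucketIterator

# hypothetical instantiation; 'smiles.txt' and 'vocab.pkl' are placeholder paths
loader = SmilesLoader('smiles.txt', 'vocab.pkl', batch_size=256)

# bucket batches by SMILES length to reduce padding inside a batch
train_iter = BucketIterator(loader.train_smi,
                            batch_size=loader.batch_size,
                            sort_key=lambda ex: len(ex.smile),
                            shuffle=True)

for batch in train_iter:
    smiles_ids, lengths = batch.smile  # include_lengths=True yields (padded ids, lengths)
    break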
Example #2
class TestingDocument:
    def __init__(self, image, boxes_and_transcripts_data, iob_tagging_type: str = 'box_level',
                    image_index=None, resized_image_size: Tuple[int, int] = (480, 960)):
        '''
        An item returned by the dataset.
        :param image: the whole document image (numpy array as read by cv2)
        :param boxes_and_transcripts_data: list of (index, points, transcript, entity_type) tuples
        :param iob_tagging_type: one of 'box_level', 'document_level', 'box_and_within_box_level'
        :param image_index: image index, used to get the image file name
        :param resized_image_size: target size (w, h) the whole image is resized to
        '''
        # text string label converter
        self.text_segments_field = Field(sequential=True, use_vocab=True, include_lengths=True, batch_first=True)
        self.text_segments_field.vocab = vocab_cls['keys']
        # iob string label converter
        self.iob_tags_field = Field(sequential=True, is_target=True, use_vocab=True, batch_first=True)
        self.iob_tags_field.vocab = vocab_cls['iob_labels']
        assert iob_tagging_type in ['box_level', 'document_level', 'box_and_within_box_level'], \
            'iob tagging type {} is not supported'.format(iob_tagging_type)
        self.iob_tagging_type = iob_tagging_type
        self.resized_image_size = resized_image_size

        # read boxes, transcripts, and entity types of boxes in one document from the boxes_and_transcripts data
        # matched with the regex pattern index,x1,y1,x2,y2,x3,y3,x4,y4,transcript,type from the boxes_and_transcripts tsv file
        # data format: [(index, points, transcription, entity_type), ...]
        # label = pd.read_csv(label_file.as_posix(),sep='\n',header=None)[0].to_list()

        boxes, transcripts, box_entity_types = [], [], []
        for index, points, transcript, _ in boxes_and_transcripts_data:
            if len(transcript) == 0:
                transcript = ' '
            boxes.append(points)
            transcripts.append(transcript)

        # Limit the number of boxes and number of transcripts to process.
        boxes_num = min(len(boxes), MAX_BOXES_NUM)
        transcript_len = min(max([len(t) for t in transcripts[:boxes_num]]), MAX_TRANSCRIPT_LEN)
        mask = np.zeros((boxes_num, transcript_len), dtype=int)

        relation_features = np.zeros((boxes_num, boxes_num, 6))

        height, width, _ = image.shape

        image = cv2.resize(image, self.resized_image_size, interpolation=cv2.INTER_LINEAR)
        x_scale = self.resized_image_size[0] / width
        y_scale = self.resized_image_size[1] / height

        # get the min-area rect for each (original) box, used to calculate the initial relation features
        min_area_boxes = [cv2.minAreaRect(np.array(box, dtype=np.float32).reshape(4, 2)) for box in
                          boxes[:boxes_num]]

        # calculate resized image box coordinate, and initial relation features between boxes (nodes)
        resized_boxes = []
        for i in range(boxes_num):
            box_i = boxes[i]
            transcript_i = transcripts[i]

            # get the box coordinates on the resized image, used for ROIAlign in the Encoder layer
            resized_box_i = [int(np.round(pos * x_scale)) if idx % 2 == 0 else int(np.round(pos * y_scale))
                             for idx, pos in enumerate(box_i)]
            resized_box_i = np.array(resized_box_i).reshape((8,))
            resized_boxes.append(resized_box_i)

            # enumerate each box, calculate relation features between i and other nodes.
            # formula (9)
            self.relation_features_between_ij_nodes(boxes_num, i, min_area_boxes, relation_features, transcript_i,
                                                    transcripts)

        relation_features = normalize_relation_features(relation_features, width=width, height=height)

        # Split each transcript into a list of characters (the text segments).
        text_segments = [list(trans) for trans in transcripts[:boxes_num]]

        # texts shape is (num_texts, max_texts_len), texts_len shape is (num_texts,)
        texts, texts_len = self.text_segments_field.process(text_segments)
        texts = texts[:, :transcript_len].numpy()
        texts_len = np.clip(texts_len.numpy(), 0, transcript_len)
        text_segments = (texts, texts_len)

        for i in range(boxes_num):
            mask[i, :texts_len[i]] = 1

        self.whole_image = RawField().preprocess(image)
        self.text_segments = self.text_segments_field.preprocess(text_segments)  # (text, texts_len)
        self.boxes_coordinate = RawField().preprocess(resized_boxes)
        self.relation_features = RawField().preprocess(relation_features)
        self.mask = RawField().preprocess(mask)
        self.boxes_num = RawField().preprocess(boxes_num)
        self.transcript_len = RawField().preprocess(transcript_len)  # max transcript len of current document
        self.image_index = RawField().preprocess(image_index)


    def relation_features_between_ij_nodes(self, boxes_num, i, min_area_boxes, relation_features, transcript_i,
                                           transcripts):
        '''
        Calculate the initial relation features between node i and every other node.
        :param boxes_num: number of boxes (nodes) in the document
        :param i: index of the current box
        :param min_area_boxes: min-area rects of the (original) boxes
        :param relation_features: np.array of shape boxes_num x boxes_num x 6, filled in place
        :param transcript_i: transcripts[i]
        :param transcripts: all transcripts of the document
        :return: None; relation_features is modified in place
        '''
        for j in range(boxes_num):
            transcript_j = transcripts[j]

            rect_output_i = min_area_boxes[i]
            rect_output_j = min_area_boxes[j]

            # Centers of rect_of_box_i and rect_of_box_j.
            center_i = rect_output_i[0]
            center_j = rect_output_j[0]

            width_i, height_i = rect_output_i[1]
            width_j, height_j = rect_output_j[1]

            # Center distance of the boxes on the x-axis.
            relation_features[i, j, 0] = np.abs(center_i[0] - center_j[0])  # x_ij

            # Center distance of the boxes on the y-axis.
            relation_features[i, j, 1] = np.abs(center_i[1] - center_j[1])  # y_ij

            # Shape and transcript-length ratios; guard against zero heights/lengths.
            relation_features[i, j, 2] = width_i / height_i if height_i != 0 else -1   # w_i/h_i
            relation_features[i, j, 3] = height_j / height_i if height_i != 0 else -1  # h_j/h_i
            relation_features[i, j, 4] = width_j / height_i if height_i != 0 else -1   # w_j/h_i
            relation_features[i, j, 5] = len(transcript_j) / len(transcript_i) \
                if len(transcript_i) != 0 else -1  # T_j/T_i
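The six per-pair features filled in above are the ones referenced as formula (9) in the comments: the horizontal and vertical center distances, the aspect ratio w_i/h_i, the ratios h_j/h_i and w_j/h_i, and the transcript-length ratio T_j/T_i. A small self-contained sketch of the same computation on two toy boxes (plain (center, (width, height)) tuples in the layout cv2.minAreaRect returns), so the numbers are easy to verify by hand:

import numpy as np

# two toy "min-area rects"; the rotation angle is omitted because the code above never uses it
rect_i = ((10.0, 20.0), (40.0, 10.0))   # center (10, 20), w=40, h=10
rect_j = ((70.0, 50.0), (20.0, 20.0))   # center (70, 50), w=20, h=20
t_i, t_j = "HELLO", "HI"                # transcripts of length 5 and 2

(cx_i, cy_i), (w_i, h_i) = rect_i
(cx_j, cy_j), (w_j, h_j) = rect_j

features = np.array([
    abs(cx_i - cx_j),      # x_ij    -> 60.0
    abs(cy_i - cy_j),      # y_ij    -> 30.0
    w_i / h_i,             # w_i/h_i -> 4.0
    h_j / h_i,             # h_j/h_i -> 2.0
    w_j / h_i,             # w_j/h_i -> 2.0
    len(t_j) / len(t_i),   # T_j/T_i -> 0.4
])
print(features)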
Example #3
class Document:
    def __init__(self, boxes_and_transcripts_file: Path, image_file: Path, label_file: Path,
                 entities_list: List[str], resized_image_size: Tuple[int, int] = (480, 960),
                 iob_tagging_type: str = 'box_level', entities_file: Path = None, training: bool = True,
                 image_index=None):
        '''
        An item returned by the dataset.

        :param boxes_and_transcripts_file: ground-truth or OCR results file
        :param image_file: whole document image file
        :param label_file: file with the document labels, read line by line into a list
        :param entities_list: list of entity names
        :param resized_image_size: size (w, h) the whole image is resized to
        :param iob_tagging_type: one of 'box_level', 'document_level', 'box_and_within_box_level'
        :param entities_file: json file with the exact entity type and entity value of the document
        :param training: True for train and validation mode, False for test mode. If True, labels are also loaded
        and entities_file must be set.
        :param image_index: image index, used to get the image file name
        '''
        # text string label converter
        self.text_segments_field = Field(sequential=True, use_vocab=True, include_lengths=True, batch_first=True)
        self.text_segments_field.vocab = vocab_cls['keys']
        # iob string label converter
        self.iob_tags_field = Field(sequential=True, is_target=True, use_vocab=True, batch_first=True)
        self.iob_tags_field.vocab = vocab_cls['iob_labels']
        self.resized_image_size = resized_image_size
        self.training = training
        assert iob_tagging_type in ['box_level', 'document_level', 'box_and_within_box_level'], \
            'iob tagging type {} is not supported'.format(iob_tagging_type)
        self.iob_tagging_type = iob_tagging_type

        # For easier debugging: keep the image filename so we know which document we are processing.
        self.image_filename = image_file.as_posix()

        try:
            # read boxes, transcripts, and entity types of boxes in one document from the boxes_and_transcripts file
            # matched with the regex pattern index,x1,y1,x2,y2,x3,y3,x4,y4,transcript,type from the boxes_and_transcripts tsv file
            # data format: [(index, points, transcription, entity_type), ...]
            if self.training:
                # boxes_and_transcripts_data = [(index, [x1, y1, ...], transcript, entity_type), ...]
                boxes_and_transcripts_data = read_gt_file_with_box_entity_type(boxes_and_transcripts_file.as_posix())
            else:
                boxes_and_transcripts_data = read_ocr_file_without_box_entity_type(
                    boxes_and_transcripts_file.as_posix())

            # Sort the box based on the position.
            # boxes_and_transcripts_data = sort_box_with_list(boxes_and_transcripts_data)

            # read image
            image = cv2.imread(image_file.as_posix())
            label = pd.read_csv(label_file.as_posix(), sep='\n', header=None)[0].to_list()
        except Exception as e:
            raise IOError('Error occurs in image {}: {}'.format(image_file.stem, e.args))

        boxes, transcripts, box_entity_types = [], [], []
        if self.training:
            for index, points, transcript, entity_type in boxes_and_transcripts_data:
                if len(transcript) == 0:
                    transcript = ' '
                boxes.append(points)
                transcripts.append(transcript)
                box_entity_types.append(entity_type)
        else:
            for index, points, transcript in boxes_and_transcripts_data:
                if len(transcript) == 0:
                    transcript = ' '
                boxes.append(points)
                transcripts.append(transcript)

        # Limit the number of boxes and number of transcripts to process.
        boxes_num = min(len(boxes), MAX_BOXES_NUM)
        transcript_len = min(max([len(t) for t in transcripts[:boxes_num]]), MAX_TRANSCRIPT_LEN)
        mask = np.zeros((boxes_num, transcript_len), dtype=int)

        relation_features = np.zeros((boxes_num, boxes_num, 6))

        try:

            height, width, _ = image.shape

            # resize image
            image = cv2.resize(image, self.resized_image_size, interpolation=cv2.INTER_LINEAR)
            x_scale = self.resized_image_size[0] / width
            y_scale = self.resized_image_size[1] / height

            # get the min-area rect for each (original) box, used to calculate the initial relation features
            min_area_boxes = [cv2.minAreaRect(np.array(box, dtype=np.float32).reshape(4, 2)) for box in
                              boxes[:boxes_num]]

            # calculate resized image box coordinate, and initial relation features between boxes (nodes)
            resized_boxes = []
            for i in range(boxes_num):
                box_i = boxes[i]
                transcript_i = transcripts[i]

                # get the box coordinates on the resized image, used for ROIAlign in the Encoder layer
                resized_box_i = [int(np.round(pos * x_scale)) if idx % 2 == 0 else int(np.round(pos * y_scale))
                                 for idx, pos in enumerate(box_i)]

                # resized_rect_output_i = cv2.minAreaRect(np.array(resized_box_i, dtype=np.float32).reshape(4, 2))
                # resized_box_i = cv2.boxPoints(resized_rect_output_i)
                resized_box_i = np.array(resized_box_i).reshape((8,))
                resized_boxes.append(resized_box_i)

                # enumerate each box, calculate relation features between i and other nodes.
                # formula (9)
                self.relation_features_between_ij_nodes(boxes_num, i, min_area_boxes, relation_features, transcript_i,
                                                        transcripts)

            relation_features = normalize_relation_features(relation_features, width=width, height=height)
            # Split each transcript into a list of characters (the text segments).
            text_segments = [list(trans) for trans in transcripts[:boxes_num]]

            if self.training:
                # assign IOB labels to the input text by exact matching; this needs entity-level labels
                if self.iob_tagging_type != 'box_level':
                    with entities_file.open() as f:
                        entities = json.load(f)

                if self.iob_tagging_type == 'box_level':
                    # convert the transcript of every box to IOB labels, using the entity type of the corresponding box
                    iob_tags_label = text2iob_label_with_box_level_match(box_entity_types[:boxes_num],
                                                                         transcripts[:boxes_num],
                                                                         entities_list=entities_list)
                elif self.iob_tagging_type == 'document_level':
                    # convert transcripts to IOB labels with the document-level tagging match method; all transcripts
                    # are concatenated into one sequence
                    iob_tags_label = text2iob_label_with_document_level_exactly_match(transcripts[:boxes_num], entities,
                                                                                      entities_list=entities_list)

                elif self.iob_tagging_type == 'box_and_within_box_level':
                    # perform exact tagging within each box; the box_level_entities param (['address'] here) is tagged at box level
                    iob_tags_label = text2iob_label_with_box_and_within_box_exactly_level(box_entity_types[:boxes_num],
                                                                                          transcripts[:boxes_num],
                                                                                          entities, ['address'],
                                                                                          entities_list=entities_list)

                iob_tags_label = self.iob_tags_field.process(iob_tags_label)[:, :transcript_len].numpy()
                box_entity_types = [vocab_cls['entities'].stoi[t] for t in box_entity_types[:boxes_num]]

            # texts shape is (num_texts, max_texts_len), texts_len shape is (num_texts,)
            texts, texts_len = self.text_segments_field.process(text_segments)
            texts = texts[:, :transcript_len].numpy()
            texts_len = np.clip(texts_len.numpy(), 0, transcript_len)
            text_segments = (texts, texts_len)

            for i in range(boxes_num):
                mask[i, :texts_len[i]] = 1

            self.whole_image = RawField().preprocess(image)
            self.text_segments = self.text_segments_field.preprocess(text_segments)  # (text, texts_len)
            self.boxes_coordinate = RawField().preprocess(resized_boxes)
            self.relation_features = RawField().preprocess(relation_features)
            self.mask = RawField().preprocess(mask)
            self.boxes_num = RawField().preprocess(boxes_num)
            self.transcript_len = RawField().preprocess(transcript_len)  # max transcript len of current document
            if self.training:
                self.iob_tags_label = self.iob_tags_field.preprocess(iob_tags_label)
            else:
                self.image_index = RawField().preprocess(image_index)
                self.label = RawField().preprocess(label)

        except Exception as e:
            raise RuntimeError('Error occurs in image {}: {}'.format(boxes_and_transcripts_file.stem, e.args))

    def relation_features_between_ij_nodes(self, boxes_num, i, min_area_boxes, relation_features, transcript_i,
                                           transcripts):
        '''
        Calculate the initial relation features between node i and every other node.
        :param boxes_num: number of boxes (nodes) in the document
        :param i: index of the current box
        :param min_area_boxes: min-area rects of the (original) boxes
        :param relation_features: np.array of shape boxes_num x boxes_num x 6, filled in place
        :param transcript_i: transcripts[i]
        :param transcripts: all transcripts of the document
        :return: None; relation_features is modified in place
        '''
        for j in range(boxes_num):
            transcript_j = transcripts[j]

            rect_output_i = min_area_boxes[i]
            rect_output_j = min_area_boxes[j]

            # Centers of rect_of_box_i and rect_of_box_j.
            center_i = rect_output_i[0]
            center_j = rect_output_j[0]

            width_i, height_i = rect_output_i[1]
            width_j, height_j = rect_output_j[1]

            # Center distance of the boxes on the x-axis.
            relation_features[i, j, 0] = np.abs(center_i[0] - center_j[0])  # x_ij

            # Center distance of the boxes on the y-axis.
            relation_features[i, j, 1] = np.abs(center_i[1] - center_j[1])  # y_ij

            # Shape and transcript-length ratios; guard against zero heights/lengths.
            relation_features[i, j, 2] = width_i / height_i if height_i != 0 else -1   # w_i/h_i
            relation_features[i, j, 3] = height_j / height_i if height_i != 0 else -1  # h_j/h_i
            relation_features[i, j, 4] = width_j / height_i if height_i != 0 else -1   # w_j/h_i
            relation_features[i, j, 5] = len(transcript_j) / len(transcript_i) \
                if len(transcript_i) != 0 else -1  # T_j/T_i
Example #4
    state = torch.load("models/states/harvard_transformer2_state.pt",
                       map_location=device)
    model.load_state_dict(state["state_dict"])
    losses = state["loss"]

    test_losses = eval(test_iter, model, criterion_test)
    losses["test"].append(test_losses)
    test_loss = torch.tensor(sum(test_losses) / len(test_losses))
    print(test_loss)
    print('Perplexity:', torch.exp(test_loss))

    model.eval()

    sentence = [
        SRC.preprocess(
            "ein mann in einem blauen hemd steht auf einer leiter und putzt ein fenster"
        )
    ]
    real_translation = TRG.preprocess(
        "a man in a blue shirt is standing on a ladder and cleaning a window")

    src = SRC.process(sentence).to(device).T
    src_mask = (src != SRC.vocab.stoi["<pad>"]).unsqueeze(-2)
    out = greedy_decode(model,
                        src,
                        src_mask,
                        max_len=60,
                        start_symbol=TRG.vocab.stoi["<sos>"])
    translation = []
    for i in range(1, out.size(1)):
        sym = TRG.vocab.itos[out[0, i]]
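A typical greedy-decode readout converts the output ids back to tokens, stops at the end-of-sequence token, and joins the result. A minimal sketch of such a readout as a helper function, assuming (as an assumption, not from the snippet) that the TRG field uses '<eos>' as its end token:

def readout(out, itos, eos_token='<eos>'):
    """Convert a greedy-decode output tensor of shape (1, T) of token ids into a list of tokens."""
    tokens = []
    for i in range(1, out.size(1)):
        sym = itos[out[0, i]]
        if sym == eos_token:  # assumed end token; stop decoding here
            break
        tokens.append(sym)
    return tokens

# e.g. print(' '.join(readout(out, TRG.vocab.itos)))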
Example #5
df_train = df_train.drop(df_test.index)
df_val = df_train.groupby('label').head(0)
df_train = df_train.drop(df_val.index)

text_field = Field(sequential=True,
                   tokenize='spacy',
                   fix_length=INPUTS_LEN,
                   lower=True,
                   use_vocab=True,
                   include_lengths=True,
                   batch_first=True)
label_field = Field(sequential=False, use_vocab=False)

# we preprocess (and later build the vocab) on train only, so tokens that appear
# only in test and val will be labelled as "unknown"
preprocessed_text = df_train['text'].apply(lambda x: text_field.preprocess(x))


data_fields = [
    ('text', text_field),
    ('label', label_field),
]

trainds = DataFrameDataset(df_train, data_fields)
testds = DataFrameDataset(df_test, data_fields)
valds = DataFrameDataset(df_val, data_fields)

traindl, testdl, valdl = data.BucketIterator.splits(
    datasets=(trainds, testds,
              valds),  # the train, test and validation datasets
Example #6
class NexusMillenialTorchTextRepresenter(NexusBaseDataInventory):
    """
    This module will be used to represent feature for deep learning in Pytorch
    """
    def __init__(self,
                 tokenizer_name='nltk_wordtokenize',
                 seq_len: int = 50,
                 lowercase: bool = True,
                 batch_size: int = 64,
                 vocab_size: int = None,
                 min_freq: int = 1,
                 fit_first: str = 'fit_to_train',
                 fit_first_args=None,
                 fit_first_custom_data=None,
                 binary=False):
        """
        Torch Text iterator/dataloader creator
        Parameters
        ----------
        tokenizer_name: str
            How to split the text. Options: 'nltk_wordtokenize', 'default', 'spacy'
        seq_len: int
            Max sequence length for the input of the model
        lowercase: bool
            Whether to lowercase the text or not
        batch_size: int
            Batch size on loading the data
        vocab_size: int
            Max Vocabulary size. None if unlimited.
        min_freq: int
            Minimum frequency of a token to be added to vocabulary
        fit_first: str
            Fitting strategy. None if fitting manually on training set
        fit_first_args: dict
            Arguments for the fitting strategy.
            The 'manual_split' strategy needs data_choice_type, data_reader_type
            and data_reader_args
        fit_first_custom_data: str
            Other data NOT IMPLEMENTED YET
        binary: bool
            Sets the label dtype:
            torch.long if multiclass,
            torch.float if binary
        """
        from torchtext.data import Field
        import torch
        dtype = torch.float if binary else torch.long
        self.logger.info("Tokenizing data with {}".format(tokenizer_name))
        self.tokenizer = self.get_tokenizer(tokenizer_name)
        self.models = []
        self.seq_len = seq_len
        self.text_field = Field(sequential=True,
                                tokenize=self.tokenizer,
                                lower=lowercase,
                                init_token='<START>',
                                eos_token='<END>',
                                fix_length=seq_len)
        self.label_field = Field(sequential=False,
                                 use_vocab=False,
                                 dtype=dtype)
        self.batch_size = batch_size
        self.already_fit = False
        self.vocab_size = vocab_size
        self.min_freq = min_freq
        self.fit_first(fit_first, fit_first_args, fit_first_custom_data)
        self.label_dist = []

    def fit_first(self,
                  fit_to: str = 'fit_to_train',
                  fit_first_args=None,
                  fit_first_custom_data=None):
        """
        Fit the data by selecting variety of choice.

        fit_first_args arguments -> {
            'data_choice_type': 'manual_split',
            'data_reader_args': {},
            'data_reader_type': ...
        }
        """
        if fit_to == 'fit_all_dataset':
            if fit_first_args['data_choice_type'] == 'manual_split':
                data_reader_type = fit_first_args['data_reader_type']
                data_reader_args = fit_first_args['data_reader_args']
                reader_func = import_class(
                    *(nexus_inv_data_reader[data_reader_type]))
                x_train, _ = reader_func(**data_reader_args['train'])
                x_dev, _ = reader_func(**data_reader_args['dev'])
                x_test, _ = reader_func(**data_reader_args['test'])
                x_combined = np.concatenate([x_train, x_dev, x_test])
                logger.info(
                    "Fitting Vocabulary (Torch Text) with option {}".format(
                        fit_to))
                self.fit(x_combined)
            else:
                raise NotImplementedError('{} is not implemented yet'.format(
                    fit_first_args['data_choice_type']))

    def get_tokenizer(self, tokenizer_name='nltk_wordtokenize'):
        if tokenizer_name == 'nltk_wordtokenize':
            return word_tokenize
        elif tokenizer_name == 'default':
            return lambda x: x.split()
        elif tokenizer_name == 'spacy':
            return 'spacy'
        else:
            raise NotImplementedError(
                "Tokenizer {} is not implemented".format(tokenizer_name))

    def get_model(self):
        return self.text_field

    def _transform(self, x):
        pass

    def fit(self, x):
        """

        Parameters
        ----------
        x: list[str]
            List of string

        Returns
        -------

        """
        x = [self.text_field.preprocess(x_elem) for x_elem in x]
        self.already_fit = True
        self.text_field.build_vocab(x,
                                    max_size=self.vocab_size,
                                    min_freq=self.min_freq)

    def __call__(self, x, y, fit_to_data=False, shuffle=True, *args, **kwargs):
        from torchtext.data import BucketIterator, Iterator
        # REFACTOR THIS
        from ..nexula_inventory.representer_torchtext.torchtext_helper import DataFrameDataset

        if not self.already_fit:
            self.fit(x)

        df = pd.DataFrame(dict(x=x, y=y))
        if fit_to_data:
            self.label_dist = df.y.value_counts().to_dict()
        ds = DataFrameDataset(df, dict(x=self.text_field, y=self.label_field))
        if fit_to_data:
            iterate = BucketIterator(dataset=ds,
                                     batch_size=self.batch_size,
                                     sort_key=lambda x: len(x.text),
                                     shuffle=True)
        else:
            iterate = Iterator(dataset=ds,
                               batch_size=self.batch_size,
                               shuffle=False)
        return iterate, y

    def _tokenize(self, texts: np.array):
        self.tokenizer.fit_on_texts()
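A minimal usage sketch for the representer above, with made-up in-memory data. It assumes the class and its DataFrameDataset helper are importable from the project, so this only illustrates the call pattern rather than being a standalone script:

# toy data; in the real pipeline x and y would come from one of the project's data readers
x = ["this movie was great", "terrible plot and acting", "it was just okay"]
y = [1, 0, 1]

representer = NexusMillenialTorchTextRepresenter(tokenizer_name='default',
                                                 seq_len=50,
                                                 batch_size=2,
                                                 fit_first=None,  # fit manually on the data passed below
                                                 binary=True)
iterator, labels = representer(x, y)  # fits the vocabulary on x, then returns a torchtext Iterator
for batch in iterator:
    token_ids = batch.x  # shape (seq_len, batch); the text Field is not batch_first
    targets = batch.y
    break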
Example #7
def train():
    target_field = Field(sequential=True,
                         init_token=START_DECODING,
                         eos_token=STOP_DECODING,
                         pad_token=PAD_TOKEN,
                         batch_first=True,
                         include_lengths=True,
                         unk_token=UNKNOWN_TOKEN,
                         lower=True)

    source_field = Field(sequential=True,
                         init_token=SENTENCE_START,
                         eos_token=SENTENCE_END,
                         pad_token=PAD_TOKEN,
                         batch_first=True,
                         include_lengths=True,
                         unk_token=UNKNOWN_TOKEN,
                         lower=True)
    train_path = '../data/incar_alexa/train_public.pickle'
    dev_path = '../data/incar_alexa/dev_public.pickle'
    test_path = '../data/incar_alexa/test_public.pickle'
    path = '../data/cnn_stories_tokenized'
    summary_writer = SummaryWriter(config.summary_path)

    train_src, train_tgt, train_id = load_data(train_path)
    dev_src, dev_tgt, dev_id = load_data(dev_path)
    test_src, test_tgt, test_id = load_data(test_path)
    # train_data = prepare_data_cnn(path)
    # # print(train_data[0])
    # train_src = [dt['src'] for dt in train_data]
    # train_tgt = [dt['tgt'] for dt in train_data]
    # train_id = [dt['id'] for dt in train_data]
    # train_src, test_src, train_tgt, test_tgt = train_test_split(
    #     train_src, train_tgt, test_size=0.15, random_state=123)
    # train_id, test_id = train_test_split(
    #     train_id, test_size=0.15, random_state=123)
    # # print(f"{len(train_src)}, {len(train_tgt)}")
    # train_src, dev_src, train_tgt, dev_tgt = train_test_split(
    #     train_src, train_tgt, test_size=0.15, random_state=123)
    # train_id, dev_id = train_test_split(
    #     train_id, test_size=0.15, random_state=123)

    # print(source_field.preprocess(train_src[0]))
    # exit()
    train_src_preprocessed = [source_field.preprocess(x) for x in train_src]
    dev_src_preprocessed = [source_field.preprocess(x) for x in dev_src]
    test_src_preprocessed = [source_field.preprocess(x) for x in test_src]

    train_tgt_preprocessed = [target_field.preprocess(x) for x in train_tgt]
    dev_tgt_preprocessed = [target_field.preprocess(x) for x in dev_tgt]
    test_tgt_preprocessed = [target_field.preprocess(x) for x in test_tgt]
    # train_src_preprocessed = source_field.apply(lambda x: source_field.preprocess(x))

    vectors = Vectors(
        name='/home/binhna/Downloads/shared_resources/cc.en.300.vec',
        cache='/home/binhna/Downloads/shared_resources/')

    source_field.build_vocab([
        train_src_preprocessed, dev_src_preprocessed, train_tgt_preprocessed,
        dev_tgt_preprocessed
    ],
                             vectors=vectors)
    target_field.build_vocab([
        train_src_preprocessed, dev_src_preprocessed, train_tgt_preprocessed,
        dev_tgt_preprocessed
    ],
                             vectors=vectors)

    train_data = [{
        'src': src,
        'tgt': tgt,
        'id': id
    } for src, tgt, id in zip(train_src, train_tgt, train_id)]
    train_data = Mydataset(data=train_data,
                           fields=(('source', source_field), ('target',
                                                              target_field)))
    dev_data = [{
        'src': src,
        'tgt': tgt,
        'id': id
    } for src, tgt, id in zip(dev_src, dev_tgt, dev_id)]
    # print(dev_data[0])
    dev_data = Mydataset(data=dev_data,
                         fields=(('source', source_field), ('target',
                                                            target_field)))

    test_data = [{
        'src': src,
        'tgt': tgt,
        'id': id
    } for src, tgt, id in zip(test_src, test_tgt, test_id)]
    test_data = Mydataset(data=test_data,
                          fields=(('source', source_field), ('target',
                                                             target_field)))
    # print(train_data[10].source)
    # print(train_data[10].target)
    # print(len(target_field.vocab))
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    train_iter, test_iter, dev_iter = BucketIterator.splits(
        datasets=(train_data, test_data, dev_data),
        batch_sizes=(config.batch_size, config.batch_size, config.batch_size),
        device=device,
        sort_key=lambda x: len(x.source),
        sort_within_batch=True)

    args = ARGS()
    setattr(args, 'vectors', source_field.vocab.vectors)
    setattr(args, 'vocab_size', len(source_field.vocab.itos))
    setattr(args, 'emb_dim', vectors.dim)
    model = Model(args)

    params = list(model.encoder.parameters()) + list(
        model.decoder.parameters()) + list(model.reduce_state.parameters())
    initial_lr = config.lr_coverage if config.is_coverage else config.lr
    optimizer = Adagrad(params,
                        lr=initial_lr,
                        initial_accumulator_value=config.adagrad_init_acc)

    iter, running_avg_loss = 0, 0
    start = time.time()
    for epoch in range(500):
        print(f"Epoch: {epoch+1}")
        for i, batch in tqdm(enumerate(train_iter), total=len(train_iter)):
            # print(batch.source[0].size())
            # exit()
            batch_size = batch.batch_size
            # encoder part
            enc_padding_mask = get_mask(batch.source, device)
            enc_batch = batch.source[0]
            enc_lens = batch.source[1]
            encoder_outputs, encoder_feature, encoder_hidden = model.encoder(
                enc_batch, enc_lens)
            s_t_1 = model.reduce_state(encoder_hidden)
            coverage = Variable(torch.zeros(batch.source[0].size())).to(device)
            c_t_1 = Variable(torch.zeros(
                (batch_size, 2 * config.hidden_dim))).to(device)
            extra_zeros, enc_batch_extend_vocab, max_art_oovs = get_extra_features(
                batch.source[0], source_field.vocab)
            extra_zeros = extra_zeros.to(device)
            enc_batch_extend_vocab = enc_batch_extend_vocab.to(device)
            # decoder part
            dec_batch = batch.target[0][:, :-1]
            # print(dec_batch.size())
            target_batch = batch.target[0][:, 0:]
            dec_lens_var = batch.target[1]
            dec_padding_mask = get_mask(batch.target, device)
            max_dec_len = max(dec_lens_var)

            step_losses = []
            for di in range(min(max_dec_len, config.max_dec_steps) - 1):
                y_t_1 = dec_batch[:, di]  # Teacher forcing
                final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = model.decoder(
                    y_t_1, s_t_1, encoder_outputs, encoder_feature,
                    enc_padding_mask, c_t_1, extra_zeros,
                    enc_batch_extend_vocab, coverage, di)
                target = target_batch[:, di]
                gold_probs = torch.gather(final_dist, 1,
                                          target.unsqueeze(1)).squeeze()
                step_loss = -torch.log(gold_probs + config.eps)
                if config.is_coverage:
                    step_coverage_loss = torch.sum(
                        torch.min(attn_dist, coverage), 1)
                    step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                    coverage = next_coverage

                step_mask = dec_padding_mask[:, di]
                step_loss = step_loss * step_mask
                step_losses.append(step_loss)
            sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
            batch_avg_loss = sum_losses / dec_lens_var
            loss = torch.mean(batch_avg_loss)

            optimizer.zero_grad()
            loss.backward()

            norm = clip_grad_norm_(model.encoder.parameters(),
                                   config.max_grad_norm)
            clip_grad_norm_(model.decoder.parameters(), config.max_grad_norm)
            clip_grad_norm_(model.reduce_state.parameters(),
                            config.max_grad_norm)

            optimizer.step()

            running_avg_loss = calc_running_avg_loss(loss.item(),
                                                     running_avg_loss,
                                                     summary_writer, iter)
            iter += 1
            summary_writer.flush()
            # print_interval = 10
            # if iter % print_interval == 0:
            #     print(f'steps {iter}, batch number: {i} with {time.time() - start} seconds, loss: {loss}')
            #     start = time.time()
            if iter % 300 == 0:
                save_model(model, optimizer, running_avg_loss, iter,
                           config.model_dir)
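get_mask is not defined in this snippet. Given how it is called above (on the (padded ids, lengths) pair that a Field with include_lengths=True produces, and later multiplied element-wise with per-step losses), a plausible implementation is a 0/1 padding mask built from the lengths. This is only an assumption about the missing helper:

import torch

def get_mask(field_output, device):
    """Hypothetical helper: build a (batch, max_len) float mask with 1.0 on real
    tokens and 0.0 on padding, from torchtext's (padded_ids, lengths) tuple."""
    padded_ids, lengths = field_output
    batch_size, max_len = padded_ids.size()
    mask = torch.zeros(batch_size, max_len, device=device)
    for b, length in enumerate(lengths):
        mask[b, :int(length)] = 1.0
    return mask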
Example #8
                         pad_token=PAD_TOKEN,
                         batch_first=True,
                         include_lengths=True,
                         unk_token=UNKNOWN_TOKEN,
                         lower=True)
    train_path = '../data/incar_alexa/train_public.pickle'
    dev_path = '../data/incar_alexa/dev_public.pickle'
    test_path = '../data/incar_alexa/test_public.pickle'
    path = '../data/cnn_stories_tokenized'
    # summary_writer = SummaryWriter(config.summary_path)

    train_src, train_tgt, train_id = load_data(train_path)
    dev_src, dev_tgt, dev_id = load_data(dev_path)
    test_src, test_tgt, test_id = load_data(test_path)

    train_src_preprocessed = [source_field.preprocess(x) for x in train_src]
    dev_src_preprocessed = [source_field.preprocess(x) for x in dev_src]
    test_src_preprocessed = [source_field.preprocess(x) for x in test_src]

    train_tgt_preprocessed = [target_field.preprocess(x) for x in train_tgt]
    dev_tgt_preprocessed = [target_field.preprocess(x) for x in dev_tgt]
    test_tgt_preprocessed = [target_field.preprocess(x) for x in test_tgt]
    # train_src_preprocessed = source_field.apply(lambda x: source_field.preprocess(x))

    vectors = Vectors(
        name='/home/binhna/Downloads/shared_resources/cc.en.300.vec',
        cache='/home/binhna/Downloads/shared_resources/')

    source_field.build_vocab([
        train_src_preprocessed, dev_src_preprocessed, train_tgt_preprocessed,
        dev_tgt_preprocessed