def __init__(self, data_file, vocab_file, batch_size=256):
    self.batch_size = batch_size
    # Note: eos_token and pad_token deliberately share the ' ' token.
    smi_field = Field(sequential=True, init_token='<sos>', eos_token=' ',
                      pad_token=' ', include_lengths=True,
                      batch_first=True, tokenize=smi_tokenizer)
    property_field = Field(sequential=False, use_vocab=False)

    # load SMILES data
    with open(data_file, 'r') as f:
        mol_strs = f.read().strip().split('\n')
    mol_strs = [mol.replace(' ', '') for mol in mol_strs]
    mol_strs = [smi_field.preprocess(mol) for mol in mol_strs]

    smi_examples = []
    fields = [('smile', smi_field), ('property', property_field)]
    for mol in mol_strs:
        ex = Example.fromlist([mol, [1, 2, 3]], fields)
        smi_examples.append(ex)

    # load or build vocab
    if os.path.isfile(vocab_file):
        print('load vocab from:', vocab_file)
        smi_field.vocab = pickle.load(open(vocab_file, 'rb'))
    else:
        print('build and save vocab file:', vocab_file)
        smi_field.build_vocab(mol_strs)
        pickle.dump(smi_field.vocab, open(vocab_file, 'wb'), protocol=2)

    self.vocab = smi_field.vocab
    self.vocab_size = len(smi_field.vocab.itos)
    self.padding_idx = smi_field.vocab.stoi[smi_field.pad_token]
    self.sos_idx = smi_field.vocab.stoi[smi_field.init_token]
    self.eos_idx = smi_field.vocab.stoi[smi_field.eos_token]
    self.unk_idx = smi_field.vocab.stoi[smi_field.unk_token]

    self.dataset_smi = Dataset(smi_examples, fields=fields)
    # Hold out the last 5000 examples as the test split.
    self.train_smi = Dataset(smi_examples[:-5000], fields=fields)
    self.test_smi = Dataset(smi_examples[-5000:], fields=fields)
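# Usage sketch (illustrative only; 'SmilesDataLoader' is a hypothetical name
# for the class owning the __init__ above, and the file paths are placeholders):
from torchtext.data import BucketIterator

loader = SmilesDataLoader('data/smiles.txt', 'data/vocab.pkl', batch_size=128)
train_iter = BucketIterator(loader.train_smi, batch_size=loader.batch_size,
                            sort_key=lambda ex: len(ex.smile), shuffle=True)
for batch in train_iter:
    smiles, lengths = batch.smile  # include_lengths=True yields (tensor, lengths)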
class TestingDocument:

    def __init__(self, image, boxes_and_transcripts_data,
                 iob_tagging_type: str = 'box_level', image_index=None,
                 resized_image_size: Tuple[int, int] = (480, 960)):
        '''
        An item returned by dataset.

        :param iob_tagging_type: 'box_level', 'document_level', 'box_and_within_box_level'
        :param image_index: image index, used to get image file name
        '''
        # text string label converter
        self.text_segments_field = Field(sequential=True, use_vocab=True,
                                         include_lengths=True, batch_first=True)
        self.text_segments_field.vocab = vocab_cls['keys']
        # iob string label converter
        self.iob_tags_field = Field(sequential=True, is_target=True,
                                    use_vocab=True, batch_first=True)
        self.iob_tags_field.vocab = vocab_cls['iob_labels']

        assert iob_tagging_type in ['box_level', 'document_level', 'box_and_within_box_level'], \
            'iob tagging type {} is not supported'.format(iob_tagging_type)
        self.iob_tagging_type = iob_tagging_type
        self.resized_image_size = resized_image_size

        # Read boxes, transcripts, and entity types of boxes in one document.
        # Each row of the boxes_and_transcripts tsv file matches the pattern
        # index,x1,y1,x2,y2,x3,y3,x4,y4,transcript,type, so the data format is
        # [(index, points, transcription, entity_type), ...].
        boxes, transcripts, box_entity_types = [], [], []
        for index, points, transcript, _ in boxes_and_transcripts_data:
            if len(transcript) == 0:
                transcript = ' '
            boxes.append(points)
            transcripts.append(transcript)

        # Limit the number of boxes and the transcript length to process.
        boxes_num = min(len(boxes), MAX_BOXES_NUM)
        transcript_len = min(max([len(t) for t in transcripts[:boxes_num]]), MAX_TRANSCRIPT_LEN)
        mask = np.zeros((boxes_num, transcript_len), dtype=int)

        relation_features = np.zeros((boxes_num, boxes_num, 6))

        height, width, _ = image.shape
        image = cv2.resize(image, self.resized_image_size, interpolation=cv2.INTER_LINEAR)
        x_scale = self.resized_image_size[0] / width
        y_scale = self.resized_image_size[1] / height

        # Get the min-area rect of each (original) box, used to calculate the
        # initial relation features.
        min_area_boxes = [cv2.minAreaRect(np.array(box, dtype=np.float32).reshape(4, 2))
                          for box in boxes[:boxes_num]]

        # Calculate box coordinates on the resized image, and the initial
        # relation features between boxes (nodes).
        resized_boxes = []
        for i in range(boxes_num):
            box_i = boxes[i]
            transcript_i = transcripts[i]

            # Box coordinates on the resized image, used for ROIAlign in the
            # Encoder layer. Even positions are x coordinates, odd ones y.
            resized_box_i = [int(np.round(pos * x_scale)) if idx % 2 == 0
                             else int(np.round(pos * y_scale))
                             for idx, pos in enumerate(box_i)]
            resized_box_i = np.array(resized_box_i).reshape((8,))
            resized_boxes.append(resized_box_i)

            # Enumerate each box and calculate the relation features between
            # node i and all other nodes, formula (9).
            self.relation_features_between_ij_nodes(boxes_num, i, min_area_boxes,
                                                    relation_features, transcript_i,
                                                    transcripts)

        relation_features = normalize_relation_features(relation_features,
                                                        width=width, height=height)
        # The text of each segment, as a list of characters.
        text_segments = [list(trans) for trans in transcripts[:boxes_num]]

        # texts shape is (num_texts, max_texts_len), texts_len shape is (num_texts,)
        texts, texts_len = self.text_segments_field.process(text_segments)
        texts = texts[:, :transcript_len].numpy()
        texts_len = np.clip(texts_len.numpy(), 0, transcript_len)
        text_segments = (texts, texts_len)

        for i in range(boxes_num):
            mask[i, :texts_len[i]] = 1

        self.whole_image = RawField().preprocess(image)
        self.text_segments = self.text_segments_field.preprocess(text_segments)  # (texts, texts_len)
        self.boxes_coordinate = RawField().preprocess(resized_boxes)
        self.relation_features = RawField().preprocess(relation_features)
        self.mask = RawField().preprocess(mask)
        self.boxes_num = RawField().preprocess(boxes_num)
        self.transcript_len = RawField().preprocess(transcript_len)  # max transcript len of current document
        self.image_index = RawField().preprocess(image_index)

    def relation_features_between_ij_nodes(self, boxes_num, i, min_area_boxes,
                                           relation_features, transcript_i, transcripts):
        '''
        Calculate the initial relation features between node i and every other node.

        :param boxes_num:
        :param i:
        :param min_area_boxes: the min-area rects of the (original) boxes.
        :param relation_features: np.array, boxes_num x boxes_num x 6
        :param transcript_i: transcripts[i]
        :param transcripts:
        :return:
        '''
        for j in range(boxes_num):
            transcript_j = transcripts[j]

            rect_output_i = min_area_boxes[i]
            rect_output_j = min_area_boxes[j]

            # Centers of rect_of_box_i and rect_of_box_j.
            center_i = rect_output_i[0]
            center_j = rect_output_j[0]

            width_i, height_i = rect_output_i[1]
            width_j, height_j = rect_output_j[1]

            # Center distance of boxes on the x-axis.
            relation_features[i, j, 0] = np.abs(center_i[0] - center_j[0])  # x_ij
            # Center distance of boxes on the y-axis.
            relation_features[i, j, 1] = np.abs(center_i[1] - center_j[1])  # y_ij
            # Shape and length ratios, with a -1 fallback when the denominator is zero.
            relation_features[i, j, 2] = width_i / height_i if height_i != 0 else -1   # w_i/h_i
            relation_features[i, j, 3] = height_j / height_i if height_i != 0 else -1  # h_j/h_i
            relation_features[i, j, 4] = width_j / height_i if height_i != 0 else -1   # w_j/h_i
            relation_features[i, j, 5] = len(transcript_j) / len(transcript_i) \
                if len(transcript_i) != 0 else -1  # T_j/T_i
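# The loop above fills the six-dimensional relation feature between nodes i and
# j (the "formula (9)" the comments reference). With (w, h) taken from
# cv2.minAreaRect and T the transcript length, the feature vector is:
#
#   r_ij = [ x_ij,  y_ij,  w_i/h_i,  h_j/h_i,  w_j/h_i,  T_j/T_i ]
#
# where x_ij = |cx_i - cx_j| and y_ij = |cy_i - cy_j| are the center distances;
# entries whose denominator is zero fall back to -1, and the whole tensor is
# then scaled by normalize_relation_features using the original image size.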
class Document:

    def __init__(self, boxes_and_transcripts_file: Path, image_file: Path,
                 label_file: Path, entities_list: List[str],
                 resized_image_size: Tuple[int, int] = (480, 960),
                 iob_tagging_type: str = 'box_level', entities_file: Path = None,
                 training: bool = True, image_index=None):
        '''
        An item returned by dataset.

        :param boxes_and_transcripts_file: gt or ocr results file
        :param image_file: whole image file
        :param entities_list: list of entities
        :param resized_image_size: size to resize the whole image to, (w, h)
        :param iob_tagging_type: 'box_level', 'document_level', 'box_and_within_box_level'
        :param entities_file: exact entity type and entity value of documents, json file
        :param training: True for train and validation mode, False for test mode.
                         True will also load labels, and entities_file must be set.
        :param image_index: image index, used to get image file name
        '''
        # text string label converter
        self.text_segments_field = Field(sequential=True, use_vocab=True,
                                         include_lengths=True, batch_first=True)
        self.text_segments_field.vocab = vocab_cls['keys']
        # iob string label converter
        self.iob_tags_field = Field(sequential=True, is_target=True,
                                    use_vocab=True, batch_first=True)
        self.iob_tags_field.vocab = vocab_cls['iob_labels']

        self.resized_image_size = resized_image_size
        self.training = training

        assert iob_tagging_type in ['box_level', 'document_level', 'box_and_within_box_level'], \
            'iob tagging type {} is not supported'.format(iob_tagging_type)
        self.iob_tagging_type = iob_tagging_type

        # For easier debugging: we will know what we are running on.
        self.image_filename = image_file.as_posix()

        try:
            # Read boxes, transcripts, and entity types of boxes in one document.
            # Each row of the boxes_and_transcripts tsv file matches the pattern
            # index,x1,y1,x2,y2,x3,y3,x4,y4,transcript,type, so the data format
            # is [(index, points, transcription, entity_type), ...].
            if self.training:
                # boxes_and_transcripts_data = [(index, [x1, y1, ...], transcript, entity_type), ...]
                boxes_and_transcripts_data = read_gt_file_with_box_entity_type(
                    boxes_and_transcripts_file.as_posix())
            else:
                boxes_and_transcripts_data = read_ocr_file_without_box_entity_type(
                    boxes_and_transcripts_file.as_posix())

            # Sort the boxes based on position.
            # boxes_and_transcripts_data = sort_box_with_list(boxes_and_transcripts_data)

            # read image
            image = cv2.imread(image_file.as_posix())
            label = pd.read_csv(label_file.as_posix(), sep='\n', header=None)[0].to_list()
        except Exception as e:
            raise IOError('Error occurs in image {}: {}'.format(image_file.stem, e.args))

        boxes, transcripts, box_entity_types = [], [], []
        if self.training:
            for index, points, transcript, entity_type in boxes_and_transcripts_data:
                if len(transcript) == 0:
                    transcript = ' '
                boxes.append(points)
                transcripts.append(transcript)
                box_entity_types.append(entity_type)
        else:
            for index, points, transcript in boxes_and_transcripts_data:
                if len(transcript) == 0:
                    transcript = ' '
                boxes.append(points)
                transcripts.append(transcript)

        # Limit the number of boxes and the transcript length to process.
        boxes_num = min(len(boxes), MAX_BOXES_NUM)
        transcript_len = min(max([len(t) for t in transcripts[:boxes_num]]), MAX_TRANSCRIPT_LEN)
        mask = np.zeros((boxes_num, transcript_len), dtype=int)

        relation_features = np.zeros((boxes_num, boxes_num, 6))

        try:
            height, width, _ = image.shape

            # resize image
            image = cv2.resize(image, self.resized_image_size, interpolation=cv2.INTER_LINEAR)
            x_scale = self.resized_image_size[0] / width
            y_scale = self.resized_image_size[1] / height

            # Get the min-area rect of each (original) box, used to calculate
            # the initial relation features.
            min_area_boxes = [cv2.minAreaRect(np.array(box, dtype=np.float32).reshape(4, 2))
                              for box in boxes[:boxes_num]]

            # Calculate box coordinates on the resized image, and the initial
            # relation features between boxes (nodes).
            resized_boxes = []
            for i in range(boxes_num):
                box_i = boxes[i]
                transcript_i = transcripts[i]

                # Box coordinates on the resized image, used for ROIAlign in the
                # Encoder layer. Even positions are x coordinates, odd ones y.
                resized_box_i = [int(np.round(pos * x_scale)) if idx % 2 == 0
                                 else int(np.round(pos * y_scale))
                                 for idx, pos in enumerate(box_i)]
                # resized_rect_output_i = cv2.minAreaRect(np.array(resized_box_i, dtype=np.float32).reshape(4, 2))
                # resized_box_i = cv2.boxPoints(resized_rect_output_i)
                resized_box_i = np.array(resized_box_i).reshape((8,))
                resized_boxes.append(resized_box_i)

                # Enumerate each box and calculate the relation features between
                # node i and all other nodes, formula (9).
                self.relation_features_between_ij_nodes(boxes_num, i, min_area_boxes,
                                                        relation_features, transcript_i,
                                                        transcripts)

            relation_features = normalize_relation_features(relation_features,
                                                            width=width, height=height)
            # The text of each segment, as a list of characters.
            text_segments = [list(trans) for trans in transcripts[:boxes_num]]

            if self.training:
                # Assign iob labels to the input text through exact matching;
                # this process needs entity-level labels.
                if self.iob_tagging_type != 'box_level':
                    with entities_file.open() as f:
                        entities = json.load(f)

                if self.iob_tagging_type == 'box_level':
                    # Convert the transcript of every box to an iob label, using
                    # the entity type of the corresponding box.
                    iob_tags_label = text2iob_label_with_box_level_match(
                        box_entity_types[:boxes_num], transcripts[:boxes_num],
                        entities_list=entities_list)
                elif self.iob_tagging_type == 'document_level':
                    # Convert transcripts to iob labels using the document-level
                    # tagging match method; all transcripts are concatenated
                    # into one sequence.
                    iob_tags_label = text2iob_label_with_document_level_exactly_match(
                        transcripts[:boxes_num], entities, entities_list=entities_list)
                elif self.iob_tagging_type == 'box_and_within_box_level':
                    # Perform exact tagging within specific boxes; the box-level
                    # entities parameter performs box-level tagging.
                    iob_tags_label = text2iob_label_with_box_and_within_box_exactly_level(
                        box_entity_types[:boxes_num], transcripts[:boxes_num],
                        entities, ['address'], entities_list=entities_list)

                iob_tags_label = self.iob_tags_field.process(iob_tags_label)[:, :transcript_len].numpy()
                box_entity_types = [vocab_cls['entities'].stoi[t] for t in box_entity_types[:boxes_num]]

            # texts shape is (num_texts, max_texts_len), texts_len shape is (num_texts,)
            texts, texts_len = self.text_segments_field.process(text_segments)
            texts = texts[:, :transcript_len].numpy()
            texts_len = np.clip(texts_len.numpy(), 0, transcript_len)
            text_segments = (texts, texts_len)

            for i in range(boxes_num):
                mask[i, :texts_len[i]] = 1

            self.whole_image = RawField().preprocess(image)
            self.text_segments = self.text_segments_field.preprocess(text_segments)  # (texts, texts_len)
            self.boxes_coordinate = RawField().preprocess(resized_boxes)
            self.relation_features = RawField().preprocess(relation_features)
            self.mask = RawField().preprocess(mask)
            self.boxes_num = RawField().preprocess(boxes_num)
            self.transcript_len = RawField().preprocess(transcript_len)  # max transcript len of current document
            if self.training:
                self.iob_tags_label = self.iob_tags_field.preprocess(iob_tags_label)
            else:
                self.image_index = RawField().preprocess(image_index)
            self.label = RawField().preprocess(label)

        except Exception as e:
            raise RuntimeError('Error occurs in image {}: {}'.format(boxes_and_transcripts_file.stem, e.args))

    def relation_features_between_ij_nodes(self, boxes_num, i, min_area_boxes,
                                           relation_features, transcript_i, transcripts):
        '''
        Calculate the initial relation features between node i and every other node.

        :param boxes_num:
        :param i:
        :param min_area_boxes: the min-area rects of the (original) boxes.
        :param relation_features: np.array, boxes_num x boxes_num x 6
        :param transcript_i: transcripts[i]
        :param transcripts:
        :return:
        '''
        for j in range(boxes_num):
            transcript_j = transcripts[j]

            rect_output_i = min_area_boxes[i]
            rect_output_j = min_area_boxes[j]

            # Centers of rect_of_box_i and rect_of_box_j.
            center_i = rect_output_i[0]
            center_j = rect_output_j[0]

            width_i, height_i = rect_output_i[1]
            width_j, height_j = rect_output_j[1]

            # Center distance of boxes on the x-axis.
            relation_features[i, j, 0] = np.abs(center_i[0] - center_j[0])  # x_ij
            # Center distance of boxes on the y-axis.
            relation_features[i, j, 1] = np.abs(center_i[1] - center_j[1])  # y_ij
            # Shape and length ratios, with a -1 fallback when the denominator is zero.
            relation_features[i, j, 2] = width_i / height_i if height_i != 0 else -1   # w_i/h_i
            relation_features[i, j, 3] = height_j / height_i if height_i != 0 else -1  # h_j/h_i
            relation_features[i, j, 4] = width_j / height_i if height_i != 0 else -1   # w_j/h_i
            relation_features[i, j, 5] = len(transcript_j) / len(transcript_i) \
                if len(transcript_i) != 0 else -1  # T_j/T_i
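# Usage sketch (illustrative; the file paths and entities list are placeholders,
# not from the original source):
from pathlib import Path

doc = Document(boxes_and_transcripts_file=Path('data/boxes/0001.tsv'),
               image_file=Path('data/images/0001.jpg'),
               label_file=Path('data/labels/0001.csv'),
               entities_list=['company', 'address', 'date', 'total'],
               iob_tagging_type='box_level',
               training=True)
# doc.whole_image, doc.text_segments, doc.boxes_coordinate, doc.relation_features,
# doc.mask and doc.iob_tags_label are then consumed by the dataset/collate step.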
state = torch.load("models/states/harvard_transformer2_state.pt", map_location=device)
model.load_state_dict(state["state_dict"])
losses = state["loss"]

test_losses = eval(test_iter, model, criterion_test)  # project-local eval(), not the builtin
losses["test"].append(test_losses)
test_loss = torch.tensor(sum(test_losses) / len(test_losses))
print(test_loss)
print('Perplexity:', torch.exp(test_loss))

model.eval()
sentence = [
    SRC.preprocess(
        "ein mann in einem blauen hemd steht auf einer leiter und putzt ein fenster"
    )
]
real_translation = TRG.preprocess(
    "a man in a blue shirt is standing on a ladder and cleaning a window")
src = SRC.process(sentence).to(device).T
src_mask = (src != SRC.vocab.stoi["<pad>"]).unsqueeze(-2)
out = greedy_decode(model, src, src_mask, max_len=60,
                    start_symbol=TRG.vocab.stoi["<sos>"])
translation = []
for i in range(1, out.size(1)):
    sym = TRG.vocab.itos[out[0, i]]
    if sym == "<eos>":  # assumed end token, matching the "<sos>" start symbol above
        break
    translation.append(sym)
print('Translation:', ' '.join(translation))
print('Reference:  ', ' '.join(real_translation))
df_train = df_train.drop(df_test.index)
df_val = df_train.groupby('label').head(0)  # head(0) keeps zero rows per label, i.e. an empty validation split
df_train = df_train.drop(df_val.index)

text_field = Field(sequential=True, tokenize='spacy', fix_length=INPUTS_LEN,
                   lower=True, use_vocab=True, include_lengths=True,
                   batch_first=True)
label_field = Field(sequential=False, use_vocab=False)

# We preprocess on train so that tokens appearing only in test and val
# will be labelled as "unknown".
preprocessed_text = df_train['text'].apply(lambda x: text_field.preprocess(x))

data_fields = [
    ('text', text_field),
    ('label', label_field),
]

trainds = DataFrameDataset(df_train, data_fields)
testds = DataFrameDataset(df_test, data_fields)
valds = DataFrameDataset(df_val, data_fields)

traindl, testdl, valdl = data.BucketIterator.splits(
    datasets=(trainds, testds, valds),  # specify train, test, and validation datasets
    batch_sizes=(64, 64, 64),           # assumed placeholder batch sizes
    sort_key=lambda x: len(x.text),
    sort_within_batch=True)
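# Usage sketch (assumed): the vocab must be built from the training split
# before the iterators can numericalize batches.
text_field.build_vocab(trainds, max_size=25000)  # 25000 is a placeholder cap

for batch in traindl:
    text, text_lengths = batch.text  # include_lengths=True returns (padded tensor, lengths)
    labels = batch.label
    # ... forward (text, text_lengths) through the model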
class NexusMillenialTorchTextRepresenter(NexusBaseDataInventory):
    """
    This module will be used to represent features for deep learning in PyTorch
    """

    def __init__(self, tokenizer_name='nltk_wordtokenize', seq_len: int = 50,
                 lowercase: bool = True, batch_size: int = 64,
                 vocab_size: int = None, min_freq: int = 1,
                 fit_first: str = 'fit_to_train', fit_first_args=None,
                 fit_first_custom_data=None, binary=False):
        """
        Torch Text iterator/dataloader creator

        Parameters
        ----------
        tokenizer_name: str
            How to split the text. One of 'nltk_wordtokenize', 'default', 'spacy'
        seq_len: int
            Max sequence length for the input of the model
        lowercase: bool
            Whether to lowercase the text or not
        batch_size: int
            Batch size on loading the data
        vocab_size: int
            Max vocabulary size. None if unlimited.
        min_freq: int
            Minimum frequency of a token to be added to the vocabulary
        fit_first: str
            Fitting strategy. None if fitting manually on the training set
        fit_first_args: dict
            Fitting strategy arguments. The 'manual_split' strategy needs
            data_choice_type, data_reader_type, and data_reader_args
        fit_first_custom_data: str
            Other data. NOT IMPLEMENTED YET
        binary: bool
            Controls the label dtype: torch.long if multiclass, torch.float if binary
        """
        from torchtext.data import Field
        import torch

        dtype = torch.float if binary else torch.long
        self.logger.info("Tokenizing data with {}".format(tokenizer_name))
        self.tokenizer = self.get_tokenizer(tokenizer_name)
        self.models = []
        self.seq_len = seq_len
        self.text_field = Field(sequential=True, tokenize=self.tokenizer,
                                lower=lowercase, init_token='<START>',
                                eos_token='<END>', fix_length=seq_len)
        self.label_field = Field(sequential=False, use_vocab=False, dtype=dtype)
        self.batch_size = batch_size
        self.already_fit = False
        self.vocab_size = vocab_size
        self.min_freq = min_freq
        self.fit_first(fit_first, fit_first_args, fit_first_custom_data)
        self.label_dist = []

    def fit_first(self, fit_to: str = 'fit_to_train', fit_first_args=None,
                  fit_first_custom_data=None):
        """
        Fit the data by selecting from a variety of choices.
        fit_first_args arguments -> {
            'data_choice_type': 'manual_split',
            'data_reader_args': {},
            'data_reader_type': ...
        }
        """
        if fit_to == 'fit_all_dataset':
            if fit_first_args['data_choice_type'] == 'manual_split':
                data_reader_type = fit_first_args['data_reader_type']
                data_reader_args = fit_first_args['data_reader_args']
                reader_func = import_class(
                    *(nexus_inv_data_reader[data_reader_type]))
                x_train, _ = reader_func(**data_reader_args['train'])
                x_dev, _ = reader_func(**data_reader_args['dev'])
                x_test, _ = reader_func(**data_reader_args['test'])
                x_combined = np.concatenate([x_train, x_dev, x_test])
                logger.info(
                    "Fitting Vocabulary (Torch Text) with option {}".format(fit_to))
                self.fit(x_combined)
            else:
                raise NotImplementedError('{} is not implemented yet'.format(
                    fit_first_args['data_choice_type']))

    def get_tokenizer(self, tokenizer_name='nltk_wordtokenize'):
        if tokenizer_name == 'nltk_wordtokenize':
            return word_tokenize
        elif tokenizer_name == 'default':
            return lambda x: x.split()
        elif tokenizer_name == 'spacy':
            return 'spacy'
        else:
            raise NotImplementedError(
                "Tokenizer {} is not implemented".format(tokenizer_name))

    def get_model(self):
        return self.text_field

    def _transform(self, x):
        pass

    def fit(self, x):
        """
        Build the vocabulary from a list of raw strings.

        Parameters
        ----------
        x: list[str]
            List of strings

        Returns
        -------
        """
        x = [self.text_field.preprocess(x_elem) for x_elem in x]
        self.already_fit = True
        self.text_field.build_vocab(x, max_size=self.vocab_size,
                                    min_freq=self.min_freq)

    def __call__(self, x, y, fit_to_data=False, shuffle=True, *args, **kwargs):
        from torchtext.data import BucketIterator, Iterator
        # REFACTOR THIS
        from ..nexula_inventory.representer_torchtext.torchtext_helper import DataFrameDataset

        if not self.already_fit:
            self.fit(x)
        df = pd.DataFrame(dict(x=x, y=y))
        if fit_to_data:
            self.label_dist = df.y.value_counts().to_dict()
        ds = DataFrameDataset(df, dict(x=self.text_field, y=self.label_field))
        if fit_to_data:
            iterate = BucketIterator(dataset=ds, batch_size=self.batch_size,
                                     sort_key=lambda ex: len(ex.x),  # examples carry fields named 'x' and 'y'
                                     shuffle=shuffle)
        else:
            iterate = Iterator(dataset=ds, batch_size=self.batch_size, shuffle=False)
        return iterate, y

    def _tokenize(self, texts: np.array):
        # Leftover Keras-style stub: self.tokenizer is a plain callable here and
        # has no fit_on_texts(), so this path is left unimplemented.
        raise NotImplementedError
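# Usage sketch (assumed; texts and labels are placeholders). Passing
# fit_first=None skips the automatic vocabulary fitting, per the docstring,
# so the vocab is built manually with fit():
rep = NexusMillenialTorchTextRepresenter(tokenizer_name='default', seq_len=50,
                                         batch_size=32, fit_first=None)
rep.fit(["the first training sentence", "a second training sentence"])
train_iter, y = rep(["the first training sentence", "a second training sentence"],
                    [0, 1], fit_to_data=True)
for batch in train_iter:
    tokens, labels = batch.x, batch.y  # field names follow the DataFrameDataset mapping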
def train():
    target_field = Field(sequential=True, init_token=START_DECODING,
                         eos_token=STOP_DECODING, pad_token=PAD_TOKEN,
                         batch_first=True, include_lengths=True,
                         unk_token=UNKNOWN_TOKEN, lower=True)
    source_field = Field(sequential=True, init_token=SENTENCE_START,
                         eos_token=SENTENCE_END, pad_token=PAD_TOKEN,
                         batch_first=True, include_lengths=True,
                         unk_token=UNKNOWN_TOKEN, lower=True)

    train_path = '../data/incar_alexa/train_public.pickle'
    dev_path = '../data/incar_alexa/dev_public.pickle'
    test_path = '../data/incar_alexa/test_public.pickle'
    path = '../data/cnn_stories_tokenized'
    summary_writer = SummaryWriter(config.summary_path)

    train_src, train_tgt, train_id = load_data(train_path)
    dev_src, dev_tgt, dev_id = load_data(dev_path)
    test_src, test_tgt, test_id = load_data(test_path)

    # Alternative CNN/DailyMail pipeline, kept for reference:
    # train_data = prepare_data_cnn(path)
    # train_src = [dt['src'] for dt in train_data]
    # train_tgt = [dt['tgt'] for dt in train_data]
    # train_id = [dt['id'] for dt in train_data]
    # train_src, test_src, train_tgt, test_tgt = train_test_split(
    #     train_src, train_tgt, test_size=0.15, random_state=123)
    # train_id, test_id = train_test_split(
    #     train_id, test_size=0.15, random_state=123)
    # train_src, dev_src, train_tgt, dev_tgt = train_test_split(
    #     train_src, train_tgt, test_size=0.15, random_state=123)
    # train_id, dev_id = train_test_split(
    #     train_id, test_size=0.15, random_state=123)

    train_src_preprocessed = [source_field.preprocess(x) for x in train_src]
    dev_src_preprocessed = [source_field.preprocess(x) for x in dev_src]
    test_src_preprocessed = [source_field.preprocess(x) for x in test_src]

    train_tgt_preprocessed = [target_field.preprocess(x) for x in train_tgt]
    dev_tgt_preprocessed = [target_field.preprocess(x) for x in dev_tgt]
    test_tgt_preprocessed = [target_field.preprocess(x) for x in test_tgt]

    vectors = Vectors(
        name='/home/binhna/Downloads/shared_resources/cc.en.300.vec',
        cache='/home/binhna/Downloads/shared_resources/')

    # Source and target share one joint vocabulary built over both sides.
    source_field.build_vocab([
        train_src_preprocessed, dev_src_preprocessed,
        train_tgt_preprocessed, dev_tgt_preprocessed
    ], vectors=vectors)
    target_field.build_vocab([
        train_src_preprocessed, dev_src_preprocessed,
        train_tgt_preprocessed, dev_tgt_preprocessed
    ], vectors=vectors)

    train_data = [{'src': src, 'tgt': tgt, 'id': id}
                  for src, tgt, id in zip(train_src, train_tgt, train_id)]
    train_data = Mydataset(data=train_data,
                           fields=(('source', source_field), ('target', target_field)))
    dev_data = [{'src': src, 'tgt': tgt, 'id': id}
                for src, tgt, id in zip(dev_src, dev_tgt, dev_id)]
    dev_data = Mydataset(data=dev_data,
                         fields=(('source', source_field), ('target', target_field)))
    test_data = [{'src': src, 'tgt': tgt, 'id': id}
                 for src, tgt, id in zip(test_src, test_tgt, test_id)]
    test_data = Mydataset(data=test_data,
                          fields=(('source', source_field), ('target', target_field)))

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    train_iter, test_iter, dev_iter = BucketIterator.splits(
        datasets=(train_data, test_data, dev_data),
        batch_sizes=(config.batch_size, config.batch_size, config.batch_size),
        device=device,
        sort_key=lambda x: len(x.source),
        sort_within_batch=True)

    args = ARGS()
    setattr(args, 'vectors', source_field.vocab.vectors)
    setattr(args,
            'vocab_size', len(source_field.vocab.itos))
    setattr(args, 'emb_dim', vectors.dim)

    model = Model(args)
    params = list(model.encoder.parameters()) + list(model.decoder.parameters()) \
        + list(model.reduce_state.parameters())
    initial_lr = config.lr_coverage if config.is_coverage else config.lr
    optimizer = Adagrad(params, lr=initial_lr,
                        initial_accumulator_value=config.adagrad_init_acc)

    iter, running_avg_loss = 0, 0
    start = time.time()
    for epoch in range(500):
        print(f"Epoch: {epoch + 1}")
        for i, batch in tqdm(enumerate(train_iter), total=len(train_iter)):
            batch_size = batch.batch_size

            # encoder part
            enc_padding_mask = get_mask(batch.source, device)
            enc_batch = batch.source[0]
            enc_lens = batch.source[1]
            encoder_outputs, encoder_feature, encoder_hidden = model.encoder(enc_batch, enc_lens)
            s_t_1 = model.reduce_state(encoder_hidden)
            coverage = Variable(torch.zeros(batch.source[0].size())).to(device)
            c_t_1 = Variable(torch.zeros((batch_size, 2 * config.hidden_dim))).to(device)
            extra_zeros, enc_batch_extend_vocab, max_art_oovs = get_extra_features(
                batch.source[0], source_field.vocab)
            extra_zeros = extra_zeros.to(device)
            enc_batch_extend_vocab = enc_batch_extend_vocab.to(device)

            # decoder part: inputs are the target tokens, and the gold targets
            # are the same sequence shifted one step to the left
            dec_batch = batch.target[0][:, :-1]
            target_batch = batch.target[0][:, 1:]
            dec_lens_var = batch.target[1]
            dec_padding_mask = get_mask(batch.target, device)
            max_dec_len = max(dec_lens_var)

            optimizer.zero_grad()
            step_losses = []
            for di in range(min(max_dec_len, config.max_dec_steps) - 1):
                y_t_1 = dec_batch[:, di]  # Teacher forcing
                final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = model.decoder(
                    y_t_1, s_t_1, encoder_outputs, encoder_feature,
                    enc_padding_mask, c_t_1, extra_zeros,
                    enc_batch_extend_vocab, coverage, di)
                target = target_batch[:, di]
                gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
                step_loss = -torch.log(gold_probs + config.eps)
                if config.is_coverage:
                    step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                    step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                    coverage = next_coverage
                step_mask = dec_padding_mask[:, di]
                step_loss = step_loss * step_mask
                step_losses.append(step_loss)

            sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
            batch_avg_loss = sum_losses / dec_lens_var
            loss = torch.mean(batch_avg_loss)

            loss.backward()

            norm = clip_grad_norm_(model.encoder.parameters(), config.max_grad_norm)
            clip_grad_norm_(model.decoder.parameters(), config.max_grad_norm)
            clip_grad_norm_(model.reduce_state.parameters(), config.max_grad_norm)

            optimizer.step()

            running_avg_loss = calc_running_avg_loss(loss.item(), running_avg_loss,
                                                     summary_writer, iter)
            iter += 1
            summary_writer.flush()

            # print_interval = 10
            # if iter % print_interval == 0:
            #     print(f'steps {iter}, batch number: {i} with {time.time() - start} seconds, loss: {loss}')
            #     start = time.time()
            if iter % 300 == 0:
                save_model(model, optimizer, running_avg_loss, iter, config.model_dir)
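# The objective above, written out: with attention distribution a^t, running
# coverage vector c^t, and gold token y*_t under teacher forcing, each decoder
# step contributes
#
#   loss_t = -log( P(y*_t) + eps ) + cov_loss_wt * sum_k min(a^t_k, c^t_k)
#
# (the coverage term only when config.is_coverage is set), and the batch loss
# is the mean over examples of sum_t loss_t * mask_t / dec_len.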
                     pad_token=PAD_TOKEN, batch_first=True,
                     include_lengths=True, unk_token=UNKNOWN_TOKEN, lower=True)

train_path = '../data/incar_alexa/train_public.pickle'
dev_path = '../data/incar_alexa/dev_public.pickle'
test_path = '../data/incar_alexa/test_public.pickle'
path = '../data/cnn_stories_tokenized'
# summary_writer = SummaryWriter(config.summary_path)

train_src, train_tgt, train_id = load_data(train_path)
dev_src, dev_tgt, dev_id = load_data(dev_path)
test_src, test_tgt, test_id = load_data(test_path)

train_src_preprocessed = [source_field.preprocess(x) for x in train_src]
dev_src_preprocessed = [source_field.preprocess(x) for x in dev_src]
test_src_preprocessed = [source_field.preprocess(x) for x in test_src]

train_tgt_preprocessed = [target_field.preprocess(x) for x in train_tgt]
dev_tgt_preprocessed = [target_field.preprocess(x) for x in dev_tgt]
test_tgt_preprocessed = [target_field.preprocess(x) for x in test_tgt]

vectors = Vectors(
    name='/home/binhna/Downloads/shared_resources/cc.en.300.vec',
    cache='/home/binhna/Downloads/shared_resources/')

source_field.build_vocab([
    train_src_preprocessed, dev_src_preprocessed,
    train_tgt_preprocessed, dev_tgt_preprocessed