def epoch_time(start_time, end_time): elapsed_time = end_time - start_time elapsed_mins = int(elapsed_time / 60) elapsed_secs = int(elapsed_time - (elapsed_mins * 60)) return elapsed_mins, elapsed_secs def count_parameters(model): return sum(p.numel() for p in model.parameters() if p.requires_grad) SRC = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True, batch_first=True) TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True, batch_first=True) print('Loading Data...') data_train, data_val, data_test = TabularDataset.splits( path='data/', format='json', train='cnn_dataset_train.json', validation='cnn_dataset_valid.json',
def __init__(self, boxes_and_transcripts_file: Path, image_file: Path, label_file: Path, entities_list: List[str],
             resized_image_size: Tuple[int, int] = (480, 960),
             iob_tagging_type: str = 'box_level', entities_file: Path = None, training: bool = True,
             image_index=None):
    '''
    Build one item returned by the dataset: read the boxes/transcripts, the image and the
    label file, then derive the resized image, resized box coordinates, pairwise relation
    features, padded text tensors, the text mask and (in training mode) the IOB tag labels.

    :param boxes_and_transcripts_file: gt or ocr results file
    :param image_file: whole images file
    :param label_file: text file read one entry per line; the resulting list is stored in ``self.label``
    :param entities_list: list with entities
    :param resized_image_size: resize whole image size, (w, h)
    :param iob_tagging_type: 'box_level', 'document_level', 'box_and_within_box_level'
    :param entities_file: exactly entity type and entity value of documents, json file
    :param training: True for train and validation mode, False for test mode. True will also load labels,
    and entities_file must be set.
    :param image_index: image index, used to get image file name
    :raises IOError: when reading the boxes/transcripts, image or label file fails
    :raises RuntimeError: when any later processing step fails
    '''
    # text string label converter
    self.text_segments_field = Field(sequential=True, use_vocab=True, include_lengths=True, batch_first=True)
    self.text_segments_field.vocab = vocab_cls['keys']
    # iob string label converter
    self.iob_tags_field = Field(sequential=True, is_target=True, use_vocab=True, batch_first=True)
    self.iob_tags_field.vocab = vocab_cls['iob_labels']

    self.resized_image_size = resized_image_size
    self.training = training
    assert iob_tagging_type in ['box_level', 'document_level', 'box_and_within_box_level'], \
        'iob tagging type {} is not supported'.format(iob_tagging_type)
    self.iob_tagging_type = iob_tagging_type

    # For easier debug:
    # we will know what we are running on.
    self.image_filename = image_file.as_posix()

    try:
        # read boxes, transcripts, and entity types of boxes in one documents from boxes_and_transcripts file
        # match with regex pattern: index,x1,y1,x2,y2,x3,y3,x4,y4,transcript,type from boxes_and_transcripts tsv file
        # data format as [(index, points, transcription, entity_type)...]
        if self.training:
            # boxes_and_transcripts_data = [(index, [x1, y1, ...], transcript, entity_type), ...]
            boxes_and_transcripts_data = read_gt_file_with_box_entity_type(boxes_and_transcripts_file.as_posix())
        else:
            boxes_and_transcripts_data = read_ocr_file_without_box_entity_type(
                boxes_and_transcripts_file.as_posix())

        # Sort the box based on the position.
        # boxes_and_transcripts_data = sort_box_with_list(boxes_and_transcripts_data)

        # read image
        # NOTE(review): cv2.imread is documented to return None instead of raising when the
        # file cannot be read; if so the failure only surfaces at `image.shape` below and is
        # reported as RuntimeError rather than IOError -- confirm.
        image = cv2.imread(image_file.as_posix())
        # One label per line of label_file (the first column of a header-less frame).
        label = pd.read_csv(label_file.as_posix(),sep='\n',header=None)[0].to_list()
    except Exception as e:
        raise IOError('Error occurs in image {}: {}'.format(image_file.stem, e.args))

    # Unpack the rows into parallel lists; empty transcripts become a single space so that
    # every box contributes at least one character.
    boxes, transcripts, box_entity_types = [], [], []
    if self.training:
        for index, points, transcript, entity_type in boxes_and_transcripts_data:
            if len(transcript) == 0:
                transcript = ' '
            boxes.append(points)
            transcripts.append(transcript)
            box_entity_types.append(entity_type)
    else:
        for index, points, transcript in boxes_and_transcripts_data:
            if len(transcript) == 0:
                transcript = ' '
            boxes.append(points)
            transcripts.append(transcript)

    # Limit the number of boxes and number of transcripts to process.
    boxes_num = min(len(boxes), MAX_BOXES_NUM)
    # max() raises ValueError here (outside both try blocks) when there are no transcripts.
    transcript_len = min(max([len(t) for t in transcripts[:boxes_num]]), MAX_TRANSCRIPT_LEN)
    # mask has shape (boxes_num, transcript_len); 1 marks a real character position.
    mask = np.zeros((boxes_num, transcript_len), dtype=int)

    # Six features for every ordered pair of boxes.
    relation_features = np.zeros((boxes_num, boxes_num, 6))

    try:
        height, width, _ = image.shape

        # resize image
        image = cv2.resize(image, self.resized_image_size, interpolation=cv2.INTER_LINEAR)
        x_scale = self.resized_image_size[0] / width
        y_scale = self.resized_image_size[1] / height

        # get min area box for each (original) boxes, for calculate initial relation features
        min_area_boxes = [cv2.minAreaRect(np.array(box, dtype=np.float32).reshape(4, 2)) for box in
                          boxes[:boxes_num]]

        # calculate resized image box coordinate, and initial relation features between boxes (nodes)
        resized_boxes = []
        for i in range(boxes_num):
            box_i = boxes[i]
            transcript_i = transcripts[i]

            # get resized images's boxes coordinate, used to ROIAlign in Encoder layer
            # Even positions are scaled by x_scale and odd positions by y_scale. The
            # comprehension's own `i` is scoped to the comprehension and does not disturb
            # the outer loop index.
            resized_box_i = [int(np.round(pos * x_scale)) if i % 2 == 0 else int(np.round(pos * y_scale))
                             for i, pos in enumerate(box_i)]

            # resized_rect_output_i = cv2.minAreaRect(np.array(resized_box_i, dtype=np.float32).reshape(4, 2))
            # resized_box_i = cv2.boxPoints(resized_rect_output_i)
            resized_box_i = np.array(resized_box_i).reshape((8,))
            resized_boxes.append(resized_box_i)

            # enumerate each box, calculate relation features between i and other nodes.
            # formula (9)
            # presumably fills relation_features[i, ...] in place (the return value is unused) -- confirm
            self.relation_features_between_ij_nodes(boxes_num, i, min_area_boxes, relation_features, transcript_i,
                                                    transcripts)

        relation_features = normalize_relation_features(relation_features, width=width, height=height)
        # The length of texts of each segment.
        # Each transcript is split into a list of single characters.
        text_segments = [list(trans) for trans in transcripts[:boxes_num]]

        if self.training:
            # assign iob label to input text through exactly match way, this process needs entity-level label
            if self.iob_tagging_type != 'box_level':
                # entities_file must not be None for these tagging types.
                with entities_file.open() as f:
                    entities = json.load(f)

            if self.iob_tagging_type == 'box_level':
                # convert transcript of every boxes to iob label, using entity type of corresponding box
                iob_tags_label = text2iob_label_with_box_level_match(box_entity_types[:boxes_num],
                                                                     transcripts[:boxes_num],
                                                                     entities_list=entities_list)
            elif self.iob_tagging_type == 'document_level':
                # convert transcripts to iob label using document level tagging match method, all transcripts will
                # be concatenated as a sequences
                iob_tags_label = text2iob_label_with_document_level_exactly_match(transcripts[:boxes_num], entities,
                                                                                  entities_list=entities_list)
            elif self.iob_tagging_type == 'box_and_within_box_level':
                # perform exactly tagging within specific box, box_level_entities params will perform box level tagging.
                iob_tags_label = text2iob_label_with_box_and_within_box_exactly_level(box_entity_types[:boxes_num],
                                                                                      transcripts[:boxes_num],
                                                                                      entities, ['address'],
                                                                                      entities_list=entities_list)

            # Numericalize the tags and cut them to the same width as the text tensor.
            iob_tags_label = self.iob_tags_field.process(iob_tags_label)[:, :transcript_len].numpy()
            # Map each box's entity type name to its index in the 'entities' vocabulary.
            box_entity_types = [vocab_cls['entities'].stoi[t] for t in box_entity_types[:boxes_num]]

        # texts shape is (num_texts, max_texts_len), texts_len shape is (num_texts,)
        texts, texts_len = self.text_segments_field.process(text_segments)
        texts = texts[:, :transcript_len].numpy()
        # Lengths are clipped so they never exceed the truncated width.
        texts_len = np.clip(texts_len.numpy(), 0, transcript_len)
        text_segments = (texts, texts_len)

        for i in range(boxes_num):
            mask[i, :texts_len[i]] = 1

        self.whole_image = RawField().preprocess(image)
        self.text_segments = self.text_segments_field.preprocess(text_segments)  # (text, texts_len)
        self.boxes_coordinate = RawField().preprocess(resized_boxes)
        self.relation_features = RawField().preprocess(relation_features)
        self.mask = RawField().preprocess(mask)
        self.boxes_num = RawField().preprocess(boxes_num)
        self.transcript_len = RawField().preprocess(transcript_len)  # max transcript len of current document
        if self.training:
            self.iob_tags_label = self.iob_tags_field.preprocess(iob_tags_label)
        else:
            self.image_index = RawField().preprocess(image_index)
        self.label = RawField().preprocess(label)

    except Exception as e:
        raise RuntimeError('Error occurs in image {}: {}'.format(boxes_and_transcripts_file.stem, e.args))
def test_single_gpu_batch_parse():
    """Verify that ``batch_to_device`` moves each supported batch layout to ``cuda:0``."""
    trainer = Trainer(gpus=1)
    float_type = 'torch.cuda.FloatTensor'
    long_type = 'torch.cuda.LongTensor'

    def move(payload):
        # Every case goes through the same accelerator call and target device.
        return trainer.accelerator.batch_to_device(payload, torch.device('cuda:0'))

    # non-transferrable types must come back unchanged
    for untouched in (None, {}, [], 1.0, "x", [None, 2], {"x": (1, 2), "y": None}):
        assert move(untouched) == untouched

    # batch is just a tensor
    moved = move(torch.rand(2, 3))
    assert moved.device.index == 0 and moved.type() == float_type

    # tensor list
    moved = move([torch.rand(2, 3), torch.rand(2, 3)])
    for position in (0, 1):
        assert moved[position].device.index == 0 and moved[position].type() == float_type

    # tensor list of lists
    moved = move([[torch.rand(2, 3), torch.rand(2, 3)]])
    for position in (0, 1):
        assert moved[0][position].device.index == 0 and moved[0][position].type() == float_type

    # tensor dict
    moved = move([{'a': torch.rand(2, 3), 'b': torch.rand(2, 3)}])
    for key in ('a', 'b'):
        assert moved[0][key].device.index == 0 and moved[0][key].type() == float_type

    # tuple of tensor list and list of tensor dict
    tensor_list = [torch.rand(2, 3) for _ in range(2)]
    dict_list = [{'a': torch.rand(2, 3), 'b': torch.rand(2, 3)} for _ in range(2)]
    moved = move((tensor_list, dict_list))
    assert moved[0][0].device.index == 0 and moved[0][0].type() == float_type
    for key in ('a', 'b'):
        assert moved[1][0][key].device.index == 0
        assert moved[1][0][key].type() == float_type

    # namedtuple of tensor
    BatchType = namedtuple('BatchType', ['a', 'b'])
    moved = move([BatchType(a=torch.rand(2, 3), b=torch.rand(2, 3)) for _ in range(2)])
    assert moved[0].a.device.index == 0
    assert moved[0].a.type() == float_type

    # non-Tensor that has `.to()` defined
    class CustomBatchType:

        def __init__(self):
            self.a = torch.rand(2, 2)

        def to(self, *args, **kwargs):
            self.a = self.a.to(*args, **kwargs)
            return self

    moved = move(CustomBatchType())
    assert moved.a.type() == float_type

    # torchtext.data.Batch
    samples = [
        {'text': 'PyTorch Lightning is awesome!', 'label': 0},
        {'text': 'Please make it work with torchtext', 'label': 1},
    ]

    text_field = Field()
    label_field = LabelField()
    fields = {'text': ('text', text_field), 'label': ('label', label_field)}

    examples = [Example.fromdict(sample, fields) for sample in samples]
    dataset = Dataset(examples=examples, fields=fields.values())

    # Batch runs field.process() that numericalizes tokens, but it requires to build dictionary first
    for vocab_owner in (text_field, label_field):
        vocab_owner.build_vocab(dataset)

    moved = move(Batch(data=examples, dataset=dataset))
    assert moved.text.type() == long_type
    assert moved.label.type() == long_type
parser.add_argument('--num_layers', type=int, default=3, metavar='NL', help='number of model tail layers (default: 3)') parser.add_argument('--vectors', type=str, default="glove.6B.100d", metavar='PV', help='pretrained vectors model (default: glove.6B.100d)') args = parser.parse_args() args.cuda = not args.no_cuda and torch.cuda.is_available() tokenize = lambda x: x.split() TEXT = Field(sequential=True, tokenize=tokenize, lower=True) LABEL = Field(sequential=False, use_vocab=False) tv_datafields = [("qid", None), ("question_text", TEXT), ("target", LABEL)] trn, vld = TabularDataset.splits(path=TRAIN_DIR, train='train.csv', validation="val.csv", format='csv', skip_header=True, fields=tv_datafields) tst_datafields = [("qid", None), ("question_text", TEXT)] tst = TabularDataset(path=TEST_CSV, format='csv',
def get_fields(src_data_type, n_src_feats, n_tgt_feats, pad='<blank>', bos='<s>', eos='</s>',
               dynamic_dict=False, with_align=False, src_truncate=None, tgt_truncate=None):
    """
    Assemble the name -> field mapping used to build dataset examples.

    Args:
        src_data_type: type of the source input. Accepted values are
            ``'text'``, ``'img'``, ``'audio'`` and ``'vec'``. A ``"node"``
            getter is registered below but the assertion does not admit it.
        n_src_feats (int): number of source features (not counting tokens)
            forwarded to the source field getter as ``n_feats``.
        n_tgt_feats (int): same, for the target side.
        pad (str): pad symbol, used on both src and tgt side.
        bos (str): beginning-of-sequence symbol; only passed to the tgt side
            (the src side receives ``None``).
        eos (str): end-of-sequence symbol; only passed to the tgt side
            (the src side receives ``None``).
        dynamic_dict (bool): when true, also create the ``src_map``,
            ``src_ex_vocab`` and ``alignment`` fields. Only allowed with
            text input.
        with_align (bool): when true, also create the ``align`` field.
        src_truncate: forwarded as ``truncate`` to the source field getter.
        tgt_truncate: forwarded as ``truncate`` to the target field getter.

    Returns:
        A dict mapping names to fields. The keys are inserted in the order
        ``src``, ``tgt``, ``graph``, ``indices`` and then the optional ones.
    """
    assert src_data_type in ('text', 'img', 'audio', 'vec'), \
        "Data type not implemented"
    assert not dynamic_dict or src_data_type == 'text', \
        'it is not possible to use dynamic_dict with non-text input'

    builders = {
        "text": text_fields,
        "node": node_fields,
        "img": image_fields,
        "audio": audio_fields,
        "vec": vec_fields,
    }
    build_src = builders[src_data_type]
    build_tgt = builders["text"]

    fields = {
        # Source side: lengths are included, no bos/eos symbols.
        "src": build_src(
            n_feats=n_src_feats,
            include_lengths=True,
            pad=pad,
            bos=None,
            eos=None,
            truncate=src_truncate,
            base_name="src",
        ),
        # Target side is always built by the text getter.
        "tgt": build_tgt(
            n_feats=n_tgt_feats,
            include_lengths=False,
            pad=pad,
            bos=bos,
            eos=eos,
            truncate=tgt_truncate,
            base_name="tgt",
        ),
        # The options dict is handed over as a single positional argument.
        "graph": GraphField({"base_name": "graph"}),
        "indices": Field(use_vocab=False, dtype=torch.long, sequential=False),
    }

    if dynamic_dict:
        fields["src_map"] = Field(
            use_vocab=False, dtype=torch.float,
            postprocessing=make_src, sequential=False)
        fields["src_ex_vocab"] = RawField()
        fields["alignment"] = Field(
            use_vocab=False, dtype=torch.long,
            postprocessing=make_tgt, sequential=False)

    if with_align:
        fields["align"] = AlignField()

    return fields
python -m spacy download de """ spacy_eng = spacy.load("en") spacy_ger = spacy.load("de") def tokenize_eng(text): return [tok.text for tok in spacy_eng.tokenizer(text)] def tokenize_ger(text): return [tok.text for tok in spacy_ger.tokenizer(text)] english = Field(sequential=True, use_vocab=True, tokenize=tokenize_eng, lower=True) german = Field(sequential=True, use_vocab=True, tokenize=tokenize_ger, lower=True) fields = {"English": ("eng", english), "German": ("ger", german)} train_data, test_data = TabularDataset.splits( path="", train="train.json", test="test.json", format="json", fields=fields ) english.build_vocab(train_data, max_size=10000, min_freq=2) german.build_vocab(train_data, max_size=10000, min_freq=2) train_iterator, test_iterator = BucketIterator.splits( (train_data, test_data), batch_size=32, device="cuda" )
def load_dataset(batch_size, debug=True):
    """Build the EN->ZH training and validation iterators.

    Reads the tab-separated training file (``docID, senID, en, zh`` per
    line) and the SGM validation/test files, wraps them in datasets, builds
    both vocabularies from the training set and returns bucketed iterators.

    Args:
        batch_size: batch size handed to ``BucketIterator.splits``.
        debug: when true, print up to the first 10 examples of each split.

    Returns:
        ``(train_iter, val_iter, ZH, EN)`` -- note the target field comes
        before the source field.
    """
    spacy_en = spacy.load('en')
    spacy_zh = Chinese()

    # Each language is tokenized with its own pipeline (these two were
    # previously wired to each other's tokenizer).
    def tokenize_en(line):
        return [token.text for token in spacy_en.tokenizer(line)]

    def tokenize_zh(line):
        return [token.text for token in spacy_zh.tokenizer(line)]

    EN = Field(tokenize=tokenize_en, include_lengths=True,
               init_token='<sos>', eos_token='<eos>')
    ZH = Field(tokenize=tokenize_zh, include_lengths=True,
               init_token='<sos>', eos_token='<eos>')

    # Close the file deterministically instead of relying on garbage collection.
    with open(train_file, 'rt', encoding='utf-8') as train_handle:
        lines = train_handle.read().splitlines()
    train_samples = [line.split('\t') for line in lines]
    train_docID, train_senID, train_en, train_zh = zip(*train_samples)

    val_docID, val_senID, val_en = extract_data_from_sgm(val_en_file, cols=3)
    val_zh, = extract_data_from_sgm(val_zh_file, cols=1)
    test_docID, test_senID, test_en = extract_data_from_sgm(test_en_file, cols=3)

    if debug:
        debug_info_size = 10
        # Clamp every preview to the data actually available so that small
        # datasets do not raise IndexError.
        print('\n[Debug] First %d training examples:\n' % debug_info_size)
        for i in range(min(debug_info_size, len(train_docID))):
            print(train_docID[i], train_senID[i], train_en[i], train_zh[i])
        print('\n[Debug] First %d validation examples:\n' % debug_info_size)
        for i in range(min(debug_info_size, len(val_docID), len(val_zh))):
            print(val_docID[i], val_senID[i], val_en[i], val_zh[i])
        print('\n[Debug] First %d test examples:\n' % debug_info_size)
        for i in range(min(debug_info_size, len(test_en))):
            print(test_en[i])

    # The four training columns come from one zip(*...) and so have equal length.
    train_examples = [
        sentence_translation(doc_id, sen_id, en, zh)
        for doc_id, sen_id, en, zh in zip(train_docID, train_senID, train_en, train_zh)
    ]
    # Validation columns come from two files; indexing keeps the IndexError
    # that signals a length mismatch between them.
    val_examples = [
        sentence_translation(val_docID[i], val_senID[i], val_en[i], val_zh[i])
        for i in range(len(val_docID))
    ]
    print("Train size = %d" % len(train_examples))
    print("Eval size = %d" % len(val_examples))

    train_dataset = Dataset(train_examples, {'src': EN, 'trg': ZH})
    val_dataset = Dataset(val_examples, {'src': EN, 'trg': ZH})
    print('Datasets Built!')

    EN.build_vocab(train_dataset.src, min_freq=2)
    ZH.build_vocab(train_dataset.trg, max_size=10000)
    print('Vocabularies Built!')

    train_iter, val_iter = BucketIterator.splits(
        (train_dataset, val_dataset),
        batch_size=batch_size,
        repeat=False,
        sort_key=lambda x: interleave_keys(len(x.src), len(x.trg)))
    print('Training Iterators Built!')
    return train_iter, val_iter, ZH, EN
def image_fields(base_name, **kwargs):
    """Create the field list for image inputs.

    The single field is non-sequential, uses no vocabulary, has float dtype
    and is post-processed by ``batch_img``. Extra keyword arguments are
    accepted and ignored.

    Returns:
        A one-element list containing the pair ``(base_name, field)``.
    """
    image_field = Field(
        sequential=False,
        use_vocab=False,
        dtype=torch.float,
        postprocessing=batch_img,
    )
    named_field = (base_name, image_field)
    return [named_field]
class IMDBHierarchical(IMDB):
    """IMDB variant whose text field is a nested field.

    Only the two field attributes are overridden; everything else is
    inherited from ``IMDB``.
    """
    # Inner field: batch-first, tokenized with clean_string.
    NESTING_FIELD = Field(batch_first=True, tokenize=clean_string)
    # Outer field wrapping NESTING_FIELD, tokenized with split_sents
    # (presumably a sentence splitter -- confirm against its definition).
    TEXT_FIELD = NestedField(NESTING_FIELD, tokenize=split_sents)
from torch.utils.tensorboard import SummaryWriter
from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint

spacy_ger = spacy.load('de')
spacy_eng = spacy.load('en')


def tockenizerGermen(text):
    """Split German ``text`` into a list of token strings using spaCy.

    The (misspelled) function name is kept so existing references still resolve.
    """
    return [tok.text for tok in spacy_ger.tokenizer(text)]


def tockenizerEnglish(text):
    """Split English ``text`` into a list of token strings using spaCy.

    The (misspelled) function name is kept so existing references still resolve.
    """
    return [tok.text for tok in spacy_eng.tokenizer(text)]


# Keyword names must match the Field / Multi30k.splits signatures exactly
# (tokenize, init_token, eos_token, fields); the earlier spellings were
# rejected as unexpected keyword arguments.
german = Field(tokenize=tockenizerGermen, lower=True,
               init_token='<sos>', eos_token='<eos>')

english = Field(tokenize=tockenizerEnglish, lower=True,
                init_token='<sos>', eos_token='<eos>')

train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                    fields=(german, english))

german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)


class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super(Encoder, self).__init__()
num_samples: Optional[int] = None, add_cls: bool = False, random_state: int = 162, max_len: Optional[int] = None, verbose: bool = True, **kwargs): path = os.path.join(root, cls.name, 'train.csv') full_dataset = YahooAnswers(path=path, fields=fields, num_samples=num_samples, verbose=verbose, add_cls=add_cls, random_state=random_state, max_len=max_len, **kwargs) splitted_data = full_dataset.split(split_ratio=split_ratio, stratified=stratified, strata_field=strata_field) return splitted_data if __name__ == '__main__': tokenize = lambda x: x.strip().split() text_field = Field(sequential=True, use_vocab=True, init_token='<s>', eos_token='</s>', tokenize=tokenize, include_lengths=True) print(PTB.splits(fields=(('inp', text_field), ('trg', text_field))))
def load_data(train_file, test_file, pretrain=None, save_dir=None):
    """Load tab-separated ``text<TAB>label`` files into nested-field datasets.

    Each non-empty line becomes one example with three fields: the raw text,
    the lower-cased text as a document of sentences, and the label. The
    vocabulary (with FastText 'vi' vectors) and the label set are built from
    both files together.

    Args:
        train_file: path of the training file; must exist.
        test_file: path of the test file; must exist.
        pretrain: unused, kept for interface compatibility.
        save_dir: when truthy, ``vocab.json`` and ``fileds.json`` are written
            into this directory.

    Returns:
        ``(train_dataset, test_dataset, num_vocab, num_classes, pad_idx,
        vectors.vectors)``.

    Raises:
        AssertionError: if either input file does not exist.
        ValueError: if a non-empty line does not split into exactly two
            tab-separated parts.
    """
    assert os.path.exists(train_file), f"{train_file} is not exist!"
    assert os.path.exists(test_file), f"{test_file} is not exist!"

    print("=" * 30 + "DATASET LOADER" + "=" * 30)
    sent_field = Field(tokenize=lambda x: x.split(),
                       unk_token='<unk>',
                       pad_token='<pad>',
                       init_token=None,
                       eos_token=None)
    doc_field = NestedField(sent_field,
                            tokenize=sent_tokenize,
                            pad_token='<pad>',
                            init_token=None,
                            eos_token=None,
                            include_lengths=True)
    label_field = LabelField()
    fields = [("raw", RawField()), ("doc", doc_field), ("label", label_field)]

    def _read_dataset(path):
        # Parse one file into (Dataset, number_of_examples).
        examples = []
        with open(path, "r", encoding="utf-8") as reader:
            for line in reader:
                # Drop the line terminator so the label does not carry a
                # trailing newline (which made an unterminated last line a
                # separate class), and skip blank lines.
                line = line.rstrip("\r\n")
                if not line:
                    continue
                text, label = line.split('\t')
                examples.append(
                    Example.fromlist([text, text.lower(), label], fields))
        return Dataset(examples, fields), len(examples)

    print(f"Reading {train_file} ...")
    train_dataset, num_train = _read_dataset(train_file)
    print(f"\tNum of train examples: {num_train}")

    print(f"Reading {test_file} ...")
    test_dataset, num_test = _read_dataset(test_file)
    print(f"\tNum of valid examples: {num_test}")

    print(f"Building vocabulary ...")
    vectors = FastText('vi')
    doc_field.build_vocab(train_dataset, test_dataset, vectors=vectors)
    label_field.build_vocab(train_dataset, test_dataset)

    num_vocab = len(doc_field.vocab)
    num_classes = len(label_field.vocab)
    pad_idx = doc_field.vocab.stoi['<pad>']

    print(f"\tNum of vocabulary: {num_vocab}")
    print(f"\tNum of classes: {num_classes}")

    if save_dir:
        # The `with` blocks close the files; no explicit close() is needed.
        with open(save_dir + "/vocab.json", "w", encoding="utf-8") as fv:
            vocabs = {
                "word": doc_field.vocab.stoi,
                "class": label_field.vocab.itos,
                'pad_idx': pad_idx
            }
            json.dump(vocabs, fv)
        # The file name's spelling is kept as-is since other code may read it.
        with open(save_dir + "/fileds.json", "w", encoding="utf-8") as ff:
            field_vocabs = {
                "doc": doc_field.vocab.freqs,
                "label": label_field.vocab.freqs
            }
            json.dump(field_vocabs, ff)

    print("=" * 73)
    return train_dataset, test_dataset, num_vocab, num_classes, pad_idx, vectors.vectors
ax=axs[h]) plt.show() print("Decoder Src Layer", layer + 1) fig, axs = plt.subplots(1, 4, figsize=(16, 8)) for h in range(4): draw(model.decoder.layers[layer].self_attn.attn[ 0, h].data[:len(tgt_sent), :len(sent)].cpu(), sent, tgt_sent if h == 0 else [], ax=axs[h]) plt.show() SRC = Field(tokenize="spacy", tokenizer_language="de_core_news_sm", init_token='<sos>', eos_token='<eos>', lower=True) TRG = Field(tokenize="spacy", tokenizer_language="en_core_web_sm", init_token='<sos>', eos_token='<eos>', lower=True) MAX_LEN = 100 train_data, valid_data, test_data = Multi30k.splits( exts=('.de', '.en'), fields=(SRC, TRG), filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and len( vars(x)['trg']) <= MAX_LEN)
def translate(cfg_file, ckpt: str, output_path: str = None) -> None:
    """
    Interactive translation function.

    Loads model from checkpoint and translates either the stdin input or
    asks for input to translate interactively.
    The input has to be pre-processed according to the data that the model
    was trained on, i.e. tokenized or split into subwords.
    Translations are printed to stdout unless ``output_path`` is given.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load; when None, the latest
        checkpoint in the configured model directory is used
    :param output_path: when stdin is not a terminal and this is not None,
        translations are written to this file (UTF-8) instead of stdout
    """

    def _load_line_as_data(line):
        """ Create a dataset from one line via a temporary file. """
        # Local import keeps this block self-contained.
        import tempfile

        # Use a unique temporary file so that an existing "tmp.src" in the
        # working directory is never overwritten or deleted.
        tmp_suffix = ".src"
        tmp_fd, tmp_filename = tempfile.mkstemp(suffix=tmp_suffix, text=True)
        # MonoDataset is given the path without the extension plus the
        # extension separately, as before.
        tmp_name = tmp_filename[:-len(tmp_suffix)]
        try:
            # write src input to temporary file
            with os.fdopen(tmp_fd, "w") as tmp_file:
                tmp_file.write("{}\n".format(line))

            test_data = MonoDataset(path=tmp_name, ext=tmp_suffix,
                                    field=src_field)
        finally:
            # remove temporary file even if building the dataset failed
            if os.path.exists(tmp_filename):
                os.remove(tmp_filename)

        return test_data

    def _translate_data(test_data):
        """ Translates given dataset, using parameters from outer scope. """
        # pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
            hypotheses_raw, attention_scores = validate_on_data(
                model, data=test_data, batch_size=batch_size, level=level,
                max_output_length=max_output_length, eval_metric="",
                use_cuda=use_cuda, loss_function=None, beam_size=beam_size,
                beam_alpha=beam_alpha)
        return hypotheses

    cfg = load_config(cfg_file)

    # when checkpoint is not specified, take the latest one from model dir
    if ckpt is None:
        model_dir = cfg["training"]["model_dir"]
        ckpt = get_latest_checkpoint(model_dir)

    batch_size = cfg["training"].get("batch_size", 1)
    use_cuda = cfg["training"].get("use_cuda", False)
    max_output_length = cfg["training"].get("max_output_length", None)

    # read vocabs
    src_vocab_file = cfg["training"].get(
        "src_vocab", cfg["training"]["model_dir"] + "/src_vocab.txt")
    trg_vocab_file = cfg["training"].get(
        "trg_vocab", cfg["training"]["model_dir"] + "/trg_vocab.txt")
    src_vocab = Vocabulary(file=src_vocab_file)
    trg_vocab = Vocabulary(file=trg_vocab_file)

    data_cfg = cfg["data"]
    level = data_cfg["level"]
    lowercase = data_cfg["lowercase"]

    # Character-level models split into characters, everything else on whitespace.
    tok_fun = lambda s: list(s) if level == "char" else s.split()

    src_field = Field(init_token=None, eos_token=EOS_TOKEN,
                      pad_token=PAD_TOKEN, tokenize=tok_fun,
                      batch_first=True, lower=lowercase,
                      unk_token=UNK_TOKEN,
                      include_lengths=True)
    src_field.vocab = src_vocab

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.cuda()

    # whether to use beam search for decoding, 0: greedy decoding
    if "testing" in cfg.keys():
        beam_size = cfg["testing"].get("beam_size", 0)
        beam_alpha = cfg["testing"].get("alpha", -1)
    else:
        beam_size = 0
        beam_alpha = -1

    if not sys.stdin.isatty():
        # file given
        test_data = MonoDataset(path=sys.stdin, ext="", field=src_field)
        hypotheses = _translate_data(test_data)

        if output_path is not None:
            output_path_set = "{}".format(output_path)
            with open(output_path_set, mode="w", encoding="utf-8") as out_file:
                for hyp in hypotheses:
                    out_file.write(hyp + "\n")
            print("Translations saved to: {}".format(output_path_set))
        else:
            for hyp in hypotheses:
                print(hyp)

    else:
        # enter interactive mode; one sentence is translated at a time
        batch_size = 1
        while True:
            try:
                src_input = input("\nPlease enter a source sentence "
                                  "(pre-processed): \n")
                # an empty input ends the session
                if not src_input.strip():
                    break

                # every line has to be made into dataset
                test_data = _load_line_as_data(line=src_input)

                hypotheses = _translate_data(test_data)
                print("JoeyNMT: {}".format(hypotheses[0]))

            except (KeyboardInterrupt, EOFError):
                print("\nBye.")
                break
def main(colab_args=None):
    """Parse options, build tokenizer/datasets/model and run training.

    Args:
        colab_args: an already-populated options object. When given, command
            line parsing is skipped and this object is used as ``args``.

    Raises:
        ValueError: if ``--should_continue`` is set but no checkpoint is
            found, or if the output directory is non-empty and neither
            ``--overwrite_output_dir`` nor ``--should_continue`` is set.
    """
    if colab_args:
        args = colab_args
    else:
        parser = argparse.ArgumentParser()

        parser.add_argument(
            "--output_dir",
            type=str,
            required=True,
            help="The output directory where the model predictions and checkpoints will be written.",
        )
        parser.add_argument(
            "--should_continue",
            action="store_true",
            help="Whether to continue from latest checkpoint in output_dir")
        parser.add_argument(
            "--model_name_or_path",
            default=None,
            type=str,
            help="The model checkpoint for weights initialization. Leave None if you want to train a model from scratch.",
        )
        parser.add_argument("--train_data_path",
                            default=None,
                            type=str,
                            help="The json file for training the model")
        parser.add_argument("--eval_data_path",
                            default=None,
                            type=str,
                            help="The json file for evaluating the model")
        parser.add_argument(
            "--config_name",
            default=None,
            type=str,
            help="Optional pretrained config name or path if not the same as model_name_or_path. If both are None, initialize a new config.",
        )
        parser.add_argument(
            "--block_size",
            default=-1,
            type=int,
            help="Optional input sequence length after tokenization."
            "The training dataset will be truncated in block of this size for training."
            "Default to the model max input length for single sentence inputs (take into account special tokens).",
        )
        parser.add_argument("--per_gpu_train_batch_size",
                            default=4,
                            type=int,
                            help="Batch size per GPU/CPU for training.")
        parser.add_argument(
            "--gradient_accumulation_steps",
            type=int,
            default=1,
            help="Number of updates steps to accumulate before performing a backward/update pass.",
        )
        parser.add_argument("--learning_rate",
                            default=1e-5,
                            type=float,
                            help="The initial learning rate for Adam.")
        parser.add_argument("--weight_decay",
                            default=0.0,
                            type=float,
                            help="Weight decay if we apply some.")
        parser.add_argument("--adam_epsilon",
                            default=1e-8,
                            type=float,
                            help="Epsilon for Adam optimizer.")
        parser.add_argument("--max_grad_norm",
                            default=1.0,
                            type=float,
                            help="Max gradient norm.")
        parser.add_argument("--num_train_epochs",
                            default=1.0,
                            type=float,
                            help="Total number of training epochs to perform.")
        parser.add_argument(
            "--max_steps",
            default=-1,
            type=int,
            help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
        )
        parser.add_argument("--log_dir",
                            default=".",
                            type=str,
                            help="Directory to store the logs.")
        parser.add_argument("--warmup_steps",
                            default=0,
                            type=int,
                            help="Linear warmup over warmup_steps.")
        parser.add_argument("--local_rank",
                            type=int,
                            default=-1,
                            help="For distributed training: local_rank")
        parser.add_argument("--logging_steps",
                            type=int,
                            default=500,
                            help="Log every X updates steps.")
        parser.add_argument("--save_steps",
                            type=int,
                            default=500,
                            help="Save checkpoint every X updates steps.")
        parser.add_argument(
            "--save_total_limit",
            type=int,
            default=None,
            help="Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default",
        )
        parser.add_argument(
            "--overwrite_output_dir",
            action="store_true",
            help="Overwrite the content of the output directory")
        parser.add_argument(
            "--overwrite_cache",
            action="store_true",
            help="Overwrite the cached training and evaluation sets")
        parser.add_argument("--seed",
                            type=int,
                            default=42,
                            help="random seed for initialization")
        args = parser.parse_args()

    if args.should_continue:
        # Resume from the last entry returned by _sorted_checkpoints.
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError(
                "Used --should_continue but no checkpoint was found in --output_dir."
            )
        else:
            args.model_name_or_path = sorted_checkpoints[-1]

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and not args.overwrite_output_dir and not args.should_continue):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup CUDA, GPU
    device = torch.device('cuda:{}'.format(torch.cuda.current_device(
    )) if torch.cuda.is_available() else "cpu")
    # Compare the device *type*: a torch.device never equals the plain string 'cpu'.
    args.n_gpu = 0 if device.type == 'cpu' else torch.cuda.device_count()
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )

    # Set seed
    set_seed(args)

    # setup tokenizer and model
    if os.path.exists(os.path.join(args.output_dir, "tokenizer.pt")):
        new_tokenizer = False
        # NOTE: torch.load unpickles the file; only point output_dir at
        # directories whose contents you trust.
        tokenizer = torch.load(os.path.join(args.output_dir, "tokenizer.pt"))
    else:
        new_tokenizer = True
        tokenizer = Field(tokenize=tokenize_en,
                          init_token='<sos>',
                          eos_token='<eos>',
                          lower=True,
                          batch_first=True)

    train_dataset = VideoBertDataset(tokenizer,
                                     build_tokenizer=new_tokenizer,
                                     data_path=args.train_data_path)
    # The eval set reuses the tokenizer held by the training dataset.
    eval_dataset = VideoBertDataset(train_dataset.tokenizer,
                                    build_tokenizer=False,
                                    data_path=args.eval_data_path)

    # Model vocabulary = text vocabulary plus 20736 additional entries.
    data_globals.config.vocab_size = len(
        train_dataset.tokenizer.vocab.itos) + 20736

    print("total vocab size of",
          len(train_dataset.tokenizer.vocab.itos) + 20736)

    if args.model_name_or_path is None:
        # start from inital model
        print('### LOADING INITIAL MODEL ###')
        model = VideoTransformer(config=data_globals.config, args=args)
        model.apply(initialize_weights)
    else:
        # start from checkpoint
        print('### LOADING MODEL FROM CHECKPOINT:', args.model_name_or_path,
              '###')
        model = VideoTransformer.from_pretrained(config=data_globals.config,
                                                 args=args)

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if new_tokenizer:
        # Make sure the target directory exists before writing into it.
        os.makedirs(args.output_dir, exist_ok=True)
        torch.save(train_dataset.tokenizer,
                   os.path.join(args.output_dir, "tokenizer.pt"))
        logger.info("Saving tokenizer to %s", args.output_dir)

    # Benchmark Evaluation
    # total_avg_loss, text_avg_loss, video_avg_loss, joint_avg_loss = evaluate(args, model, eval_dataset)
    # print("Benchmark Eval:\n"
    #       "Total: {}\n"
    #       "Text: {}\n"
    #       "Video: {}\n"
    #       "Joint: {}\n".format(total_avg_loss, text_avg_loss, video_avg_loss, joint_avg_loss))
    #
    # print("After Eval:")
    # print(torch.cuda.memory_summary(args.device))

    # Start Training
    model.train()
    global_step, tr_loss = train(args, model, train_dataset, eval_dataset)
    logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
class IMDB(TabularDataset):
    """IMDB dataset read from TSV files whose columns are ``label`` then ``text``."""

    NAME = 'IMDB'
    NUM_CLASSES = 10
    TEXT_FIELD = Field(batch_first=True, tokenize=clean_string, include_lengths=True)
    LABEL_FIELD = Field(sequential=False, use_vocab=False, batch_first=True,
                        preprocessing=process_labels)

    @staticmethod
    def sort_key(ex):
        """Return the length of the example's ``text`` attribute."""
        return len(ex.text)

    @classmethod
    def splits(cls, path,
               train=os.path.join('IMDB', 'data', 'imdb_train.tsv'),
               validation=os.path.join('IMDB', 'data', 'imdb_validation.tsv'),
               test=os.path.join('IMDB', 'data', 'imdb_test.tsv'),
               **kwargs):
        """
        Load the train / validation / test TSV files found under ``path``.

        :param path: directory the three relative file names are resolved against
        :param train: relative path of the training file
        :param validation: relative path of the validation file
        :param test: relative path of the test file
        :param kwargs: accepted for signature compatibility; not forwarded to the parent
        :return: whatever ``TabularDataset.splits`` returns for the three files
        """
        column_fields = [('label', cls.LABEL_FIELD), ('text', cls.TEXT_FIELD)]
        return super(IMDB, cls).splits(
            path,
            train=train,
            validation=validation,
            test=test,
            format='tsv',
            fields=column_fields,
        )

    @classmethod
    def iters(cls, path, vectors_name, vectors_cache, batch_size=64, shuffle=True, device=0,
              vectors=None, unk_init=torch.Tensor.zero_):
        """
        Build bucketed iterators over the three splits.

        :param path: directory containing train, test, dev files
        :param vectors_name: name of word vectors file
        :param vectors_cache: path to directory containing word vectors file
        :param batch_size: batch size
        :param shuffle: passed through to ``BucketIterator.splits``
        :param device: GPU device
        :param vectors: custom vectors - either predefined torchtext vectors or your own custom Vector classes
        :param unk_init: function used to generate vector for OOV words
        :return: the iterators produced by ``BucketIterator.splits`` for (train, val, test)
        """
        # Only read the vectors file when the caller did not supply vectors.
        if vectors is None:
            vectors = Vectors(name=vectors_name, cache=vectors_cache, unk_init=unk_init)

        train_set, dev_set, test_set = cls.splits(path)
        all_sets = (train_set, dev_set, test_set)

        # The text vocabulary is built over all three splits.
        cls.TEXT_FIELD.build_vocab(*all_sets, vectors=vectors)

        return BucketIterator.splits(
            all_sets,
            batch_size=batch_size,
            repeat=False,
            shuffle=shuffle,
            sort_within_batch=True,
            device=device,
        )
config.ls_mode = 'origin' # for vae config.vae_struct = True config.vae_word_dim = 30000 config.decoder_dataset = 'IMDB_10' config.decoder_channel = config.word_num_hidden * 2 if config.vae_struct: assert config.word_num_hidden == config.sentence_num_hidden #front-end cnn config.frontend_cnn = False if args.dataset == 'Yelp2013': args.dataset = 'Yelp2014' dataset_map[args.dataset].Year = 13 dataset_map[args.dataset].NESTING_FIELD = Field( batch_first=True, tokenize=Word_Tokenize(), fix_length=config.fix_length) dataset_map[args.dataset].TEXT_FIELD = SentenceWord_field(dataset_map[args.dataset].NESTING_FIELD,\ tokenize=Sentence_Tokenize(),\ vae_struct=config.vae_struct) time_tmp = time.time() if args.dataset not in dataset_map: raise ValueError('Unrecognized dataset') else: dataset_class = dataset_map[args.dataset] train_iter, dev_iter, test_iter = dataset_class.iters( args.data_dir, args.word_vectors_file, args.word_vectors_dir, batch_size=args.batch_size,
def predict(invocations, result_cnt=5):
    """
    Function called by the evaluation script to interface the participants model.

    `predict` accepts the natural language invocations as input, and returns
    the predicted commands along with confidences as output. For each
    invocation, `result_cnt` number of predicted commands are expected to be
    returned.

    Args:
        1. invocations : `list (str)` : list of `n_batch` (default 16) natural
           language invocations
        2. result_cnt : `int` : number of predicted commands to return for each
           invocation

    Returns:
        1. commands : `list [ list (str) ]` : a list of list of strings of
           shape (n_batch, result_cnt)
        2. confidences: `list[ list (float) ]` : confidences corresponding to
           the predicted commands. Confidence values should be between 0.0 and
           1.0. Shape: (n_batch, result_cnt)
    """
    english = Field(tokenize=tokenize_eng, lower=True,
                    init_token="<sos>", eos_token="<eos>")
    bash = Field(tokenize=tokenize_bash, lower=True,
                 init_token="<sos>", eos_token="<eos>")
    fields = {"English": ("eng", english), "Bash": ("bash", bash)}

    # test_data is loaded by the split call but only train_data feeds the vocabularies.
    train_data, test_data = TabularDataset.splits(
        path="",
        train="src/submission_code/train.json",
        test="src/submission_code/test.json",
        format="json",
        fields=fields)

    english.build_vocab(train_data, max_size=10000, min_freq=2)
    bash.build_vocab(train_data, max_size=10000, min_freq=2)

    # Inference is pinned to the CPU.
    device = torch.device("cpu")
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    load_model = True
    learning_rate = 1e-4

    # Model hyperparameters
    src_vocab_size = len(english.vocab)
    trg_vocab_size = len(bash.vocab)
    embedding_size = 256
    num_heads = 8
    num_encoder_layers = 8
    num_decoder_layers = 8
    dropout = 0.10
    max_len = 100
    forward_expansion = 2048
    src_pad_idx = english.vocab.stoi["<pad>"]

    model = Transformer(
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_len,
        device,
    ).to(device)

    # The optimizer is created only because load_checkpoint takes one.
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    if load_model:
        load_checkpoint(
            torch.load("src/my_checkpoint.pth.tar", map_location='cpu'),
            model, optimizer)

    n_batch = len(invocations)

    # Every slot but the last gets confidence 1.0 and the last gets 0.
    # Each row is a fresh list so a caller mutating one row cannot change the others.
    row_confidences = [1.0] * (result_cnt - 1)
    row_confidences.append(0)
    confidences = [list(row_confidences) for _ in range(n_batch)]

    # `commands` has shape (n_batch, result_cnt)
    commands = []
    for inv in invocations:
        # Call the translate method to retrieve the translation; the final
        # token of the returned sequence is dropped.
        prediction = translate_sentence(model, inv, english, bash, device,
                                        max_length=30)[:-1]
        command = " ".join(prediction)

        # Only one translation is produced, so it is repeated to fill all
        # `result_cnt` slots (sized by result_cnt, not a fixed 5).
        top_commands = [command] * result_cnt
        print(top_commands)
        commands.append(top_commands)

    return commands, confidences
spacy_en = spacy.load('en')


def tokenize_fr(text):
    """Tokenize a French sentence into a list of token strings.

    Uses the module-level ``spacy_fr`` pipeline (expected to be loaded earlier in the file).
    """
    doc = spacy_fr.tokenizer(text)
    return [token.text for token in doc]


def tokenize_en(text):
    """Tokenize an English sentence into a list of token strings using ``spacy_en``."""
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]


# Both fields share the same sentence markers and lower-casing; only the tokenizer differs.
_FIELD_OPTIONS = dict(init_token='<sos>', eos_token='<eos>', lower=True)
SRC = Field(tokenize=tokenize_fr, **_FIELD_OPTIONS)
TRG = Field(tokenize=tokenize_en, **_FIELD_OPTIONS)

# Split train / valid / test dataset (French source, English target)
train_data, valid_data, test_data = Multi30k.splits(
    exts=('.fr', '.en'),
    fields=(SRC, TRG),
)

# Report split sizes.
print("Number of training examples:", len(train_data.examples))
print("Number of validation examples:", len(valid_data.examples))
print("Number of testing examples:", len(test_data.examples))

# Show the first example of every split as a plain attribute dict.
print("First train data:", vars(train_data.examples[0]))
print("First valid data:", vars(valid_data.examples[0]))
print("First test data:", vars(test_data.examples[0]))
best_loss = 65535.0 temp_model=None num = -1 for item in file_list: file = p.match(item) if(file): num = file.groups()[1] loss = file.groups()[2] if float(loss)<best_loss: temp_model=file.groups()[0] return num,temp_model #Data process TKNIZER_PATTERN = re.compile("[^\w]+") LABEL = Field(sequential=(False),batch_first=(True),pad_token=None,unk_token=None) SENTENCE_FIRST = Field(sequential=(True),tokenize=lambda x: TKNIZER_PATTERN.split(x)[1:-1],lower=True,unk_token='<unk>') SENTENCE_SECOND = Field(sequential=(True),tokenize=lambda x: TKNIZER_PATTERN.split(x)[1:-1],lower=True,unk_token='<unk>',init_token='<start>') def dataset2iter(workpath=WORK_PATH,train_path=FILE_TRAIN,validation_path=FILE_VALID,test_path=FILE_TEST): fields =[('gold_label',LABEL), ('sentence1_binary_parse',SENTENCE_FIRST), ('sentence2_binary_parse',SENTENCE_SECOND), ] data_train = TabularDataset(workpath+train_path, format="tsv", fields=fields, skip_header=True) data_valid = TabularDataset(workpath+validation_path, format="tsv", fields=fields, skip_header=True) data_test = TabularDataset(workpath+test_path, format="tsv", fields=fields, skip_header=True) pretrained_vectors = Vectors(name = GLOVE_PATH+TRAINED_VECTORS+'.txt',cache=GLOVE_PATH) SENTENCE_FIRST.build_vocab(data_train,vectors=pretrained_vectors,unk_init= lambda x:torch.nn.init.uniform_(x, a=-0.25, b=0.25) )
# device = torch.device('cuda' if torch.cuda else "cpu")

## load data
root = "/home/cp/dataSet/text-classification-sample/"
train_path = "/home/cp/dataSet/text-classification-sample/train.csv"
valid_path = '/home/cp/dataSet/text-classification-sample/valid.csv'
test_path = '/home/cp/dataSet/text-classification-sample/test.csv'

# Preview the first rows and the column names of the training CSV.
data_train = pd.read_csv(train_path).head()
print(data_train)
print(data_train.columns)


# 1 Define the Field objects that configure how the text is processed
def tokenize(x):
    """Split a string on whitespace."""
    return x.split()


text = Field(sequential=True, lower=True, tokenize=tokenize, use_vocab=True)
label = Field(sequential=False, use_vocab=False)

# 2 Define the Dataset objects that load the raw corpus
# The "id" column is mapped to None; every label column shares the one `label` field.
label_columns = ("toxic", 'severe_toxic', 'obscene', 'threat', 'insult', "identity_hate")
tv_datafields = [("id", None), ("comment_text", text)]
tv_datafields.extend((column, label) for column in label_columns)

train, valid = TabularDataset.splits(
    path=root,
    train='train.csv',
    validation='valid.csv',
    format='csv',
    skip_header=True,
    fields=tv_datafields,
)
import math
import torch
import torch.nn as nn
from torchtext.data import Field, BucketIterator
from torchtext.datasets import TranslationDataset
from torch import nn
import torch
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# script does not depend on a CUDA device being present.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Lang1 sequences get an end-of-sentence marker; Lang2 sequences get both a
# start and an end marker.
Lang1 = Field(eos_token='<eos>')
Lang2 = Field(init_token='<sos>', eos_token='<eos>')

# Parallel corpus: the two extensions are paired with the two fields in order.
train = TranslationDataset(path='../Datasets/MT_data/',
                           exts=('eng-fra.train.fr', 'eng-fra.train.en'),
                           fields=[('Lang1', Lang1), ('Lang2', Lang2)])

# NOTE(review): all three iterators are built over the same `train` dataset, so
# val_iter and test_iter do not see held-out data - confirm this is intended.
train_iter, val_iter, test_iter = BucketIterator.splits((train, train, train),
                                                        batch_size=16,
                                                        repeat=False)

# Vocabularies are built from the training corpus for each language.
Lang1.build_vocab(train)
Lang2.build_vocab(train)
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext.data import Field, TabularDataset, BucketIterator, Iterator
import config_train as args
import pickle
from mgnn import MGNN
import numpy as np
from embeddings import GloveEmbedding
# from sklearn.metrics import classification_report, precision_recall_fscore_support
import spacy
import time

# The four sequence inputs use one identical configuration: lower-cased,
# batch-first tensors returned together with their lengths. Each gets its own
# Field instance.
_SEQUENCE_FIELD_OPTIONS = dict(
    sequential=True,
    batch_first=True,
    lower=True,
    include_lengths=True,
)

queryF = Field(**_SEQUENCE_FIELD_OPTIONS)
syntaxF = Field(**_SEQUENCE_FIELD_OPTIONS)
hierF = Field(**_SEQUENCE_FIELD_OPTIONS)
relF = Field(**_SEQUENCE_FIELD_OPTIONS)

# Labels are single values used as-is, without a vocabulary.
labelF = Field(sequential=False, batch_first=True, use_vocab=False)
random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) os.environ['PYTHONHASHSEED'] = str(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) torch.cuda.manual_seed_all(seed) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = True seed_everything(SEED) TEXT = Field(lower=True, use_vocab=True, sequential=True, batch_first=True, include_lengths=True) LABEL = Field(lower=True, use_vocab=True, sequential=True, unk_token = None, batch_first=True) fields = [('text', TEXT), ('tags', LABEL)] train_data, valid_data, test_data = datasets.UDPOS.splits(fields) TEXT.build_vocab(train_data, max_size=25000,
def __init__(self, image, boxes_and_transcripts_data, iob_tagging_type: str = 'box_level',
             image_index=None, resized_image_size: Tuple[int, int] = (480, 960)):
    '''
    An item returned by dataset, built from an in-memory image and its box records.

    :param image: image array; ``image.shape`` is unpacked as (height, width, channels)
    :param boxes_and_transcripts_data: records shaped [(index, points, transcription, entity_type)...];
        only ``points`` and ``transcription`` are read here
    :param iob_tagging_type: 'box_level', 'document_level', 'box_and_within_box_level'
    :param image_index: image index, used to get image file name
    :param resized_image_size: resize whole image size, (w, h)
    '''
    # text string label converter
    self.text_segments_field = Field(sequential=True, use_vocab=True, include_lengths=True,
                                     batch_first=True)
    self.text_segments_field.vocab = vocab_cls['keys']
    # iob string label converter
    self.iob_tags_field = Field(sequential=True, is_target=True, use_vocab=True, batch_first=True)
    self.iob_tags_field.vocab = vocab_cls['iob_labels']

    supported_tagging_types = ['box_level', 'document_level', 'box_and_within_box_level']
    assert iob_tagging_type in supported_tagging_types, \
        'iob tagging type {} is not supported'.format(iob_tagging_type)
    self.iob_tagging_type = iob_tagging_type

    self.resized_image_size = resized_image_size

    # Collect box coordinates and transcripts; an empty transcript is replaced
    # by a single space so every box carries at least one character.
    boxes, transcripts = [], []
    for _, points, transcript, _ in boxes_and_transcripts_data:
        if len(transcript) == 0:
            transcript = ' '
        boxes.append(points)
        transcripts.append(transcript)

    # Limit the number of boxes and number of transcripts to process.
    boxes_num = min(len(boxes), MAX_BOXES_NUM)
    kept_boxes = boxes[:boxes_num]
    kept_transcripts = transcripts[:boxes_num]
    longest_transcript = max(len(t) for t in kept_transcripts)
    transcript_len = min(longest_transcript, MAX_TRANSCRIPT_LEN)

    mask = np.zeros((boxes_num, transcript_len), dtype=int)
    relation_features = np.zeros((boxes_num, boxes_num, 6))

    # Scale factors are derived from the original size, read before resizing.
    height, width, _ = image.shape
    image = cv2.resize(image, self.resized_image_size, interpolation=cv2.INTER_LINEAR)
    x_scale = self.resized_image_size[0] / width
    y_scale = self.resized_image_size[1] / height

    # get min area box for each (original) boxes, for calculate initial relation features
    min_area_boxes = [
        cv2.minAreaRect(np.array(box, dtype=np.float32).reshape(4, 2))
        for box in kept_boxes
    ]

    # calculate resized image box coordinate, and initial relation features between boxes (nodes)
    resized_boxes = []
    for i, (box_i, transcript_i) in enumerate(zip(kept_boxes, kept_transcripts)):
        # get resized images's boxes coordinate, used to ROIAlign in Encoder layer.
        # Even positions of the flat coordinate list are scaled by x_scale, odd ones by y_scale.
        scaled_coords = []
        for coord_idx, pos in enumerate(box_i):
            scale = x_scale if coord_idx % 2 == 0 else y_scale
            scaled_coords.append(int(np.round(pos * scale)))
        resized_boxes.append(np.array(scaled_coords).reshape((8,)))

        # enumerate each box, calculate relation features between i and other nodes.
        # formula (9)
        self.relation_features_between_ij_nodes(boxes_num, i, min_area_boxes, relation_features,
                                                transcript_i, transcripts)

    relation_features = normalize_relation_features(relation_features, width=width, height=height)

    # Each transcript is split into a list of characters before numericalizing.
    text_segments = [list(trans) for trans in kept_transcripts]
    # texts shape is (num_texts, max_texts_len), texts_len shape is (num_texts,)
    texts, texts_len = self.text_segments_field.process(text_segments)
    # Truncate to transcript_len and clip the reported lengths to match.
    texts = texts[:, :transcript_len].numpy()
    texts_len = np.clip(texts_len.numpy(), 0, transcript_len)
    text_segments = (texts, texts_len)

    # Mark the valid (non-padding) positions of every transcript.
    for row in range(boxes_num):
        mask[row, :texts_len[row]] = 1

    self.whole_image = RawField().preprocess(image)
    self.text_segments = self.text_segments_field.preprocess(text_segments)  # (text, texts_len)
    self.boxes_coordinate = RawField().preprocess(resized_boxes)
    self.relation_features = RawField().preprocess(relation_features)
    self.mask = RawField().preprocess(mask)
    self.boxes_num = RawField().preprocess(boxes_num)
    self.transcript_len = RawField().preprocess(transcript_len)  # max transcript len of current document
    self.image_index = RawField().preprocess(image_index)
def load_naive_lm(args):
    """
    Convenience function to load pickle or dataset.

    Loads the three dataset splits, then either restores the vocabularies and
    embeddings from the pickle at ``args.prepared_data`` or builds them and
    writes that pickle. ``args.emb_dim`` is set as a side effect.

    :param args: namespace read for ``expanded_dataset``, ``src_ext``, ``trg_ext``,
        ``prepared_data``, ``embedding_type``, ``embeddings_path``, ``single_vocab``,
        ``batch_size`` and ``device``
    :return: (train_iterator, valid_iterator, test_iterator, src, trg, loaded_vectors)
    """
    src = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>',
                lower=True, include_lengths=True)
    trg = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>',
                lower=True)

    path = (".data/stories/story_commonsense/torchtext_expanded"
            if args.expanded_dataset
            else ".data/stories/story_commonsense/torchtext")

    train_data, valid_data, test_data = NaiveDatasetLM.splits(
        exts=(args.src_ext, args.trg_ext),
        fields=(src, trg),
        path=path)

    # Build vocabularies
    if os.path.isfile(args.prepared_data):
        # Load from pickle
        # NOTE: unpickling executes code stored in the file; only point
        # args.prepared_data at caches this program wrote itself.
        print(f"Found data pickle, loading from {args.prepared_data}")
        with open(args.prepared_data, 'rb') as p:
            d = pickle.load(p)
        src.vocab = d["src.vocab"]
        trg.vocab = d["trg.vocab"]
        combined_vocab = d["combined_vocab"]
        args.emb_dim = d["emb_dim"]
        loaded_vectors = d["loaded_vectors"]
    else:
        # Build vocabs. Will check `src` or `trg` field in `train_data`
        src.build_vocab(train_data, min_freq=2)
        trg.build_vocab(train_data, min_freq=2)

        # Build single vocab
        combined_vocab = build_combined_vocab(src, train_data)

        # `loaded_vectors` is a dictionary of words to embeddings.
        # To be sure to include entire vocab, the embeddings are loaded for
        # the combined vocab's word-to-index mapping.
        uses_contextual_embeddings = ("elmo" in args.embedding_type
                                      or "gpt" in args.embedding_type)
        if uses_contextual_embeddings:
            # No vectors are loaded for these types; the size is fixed at 1024.
            loaded_vectors, embedding_size = [], 1024
        else:
            loaded_vectors, embedding_size = load_text_vec(
                combined_vocab.stoi, args.embeddings_path)
        args.emb_dim = embedding_size

        # Pickle Field vocab for later faster load
        d = {
            "src.vocab": src.vocab,
            "trg.vocab": trg.vocab,
            "combined_vocab": combined_vocab,
            "emb_dim": args.emb_dim,
            "loaded_vectors": loaded_vectors,
        }
        with open(args.prepared_data, 'wb') as p:
            pickle.dump(d, p, protocol=pickle.HIGHEST_PROTOCOL)
        print(
            f"Saved prepared data for future fast load to: {args.prepared_data}"
        )

    # Build single vocab for both src and trg
    if args.single_vocab:
        src.vocab = combined_vocab
        trg.vocab = combined_vocab

    print(f"Source vocab size: {len(src.vocab)}")
    print(f"Target vocab size: {len(trg.vocab)}")

    # Data iterators, batches sorted by source length
    iterators = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=args.batch_size,
        sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        device=args.device)
    train_iterator, valid_iterator, test_iterator = iterators

    return train_iterator, valid_iterator, test_iterator, src, trg, loaded_vectors
from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint

# Load the spaCy language models used for tokenization
spacy_ger = spacy.load("de")
spacy_eng = spacy.load("en")


# create tokenizer function
def tokenizer_ger(text):
    """Tokenize German text into a list of token strings with the German model."""
    return [tok.text for tok in spacy_ger.tokenizer(text)]


def tokenizer_eng(text):
    """Tokenize English text into a list of token strings with the English model.

    NOTE: this previously tokenized with ``spacy_ger``. Vocabularies built with
    this function may therefore differ from ones behind older checkpoints.
    """
    return [tok.text for tok in spacy_eng.tokenizer(text)]


# define vocab form
german = Field(tokenize=tokenizer_ger, lower=True,
               init_token="<sos>", eos_token="<eos>")
english = Field(tokenize=tokenizer_eng, lower=True,
                init_token="<sos>", eos_token="<eos>")

# German is the source side, English the target side.
train_data, valid_data, test_data = Multi30k.splits(exts=(".de", ".en"),
                                                    fields=(german, english))

# build vocab
german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)


class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(dropout)
        # One embedding row per input index, each of width embedding_size.
        self.embedding = nn.Embedding(input_size, embedding_size)
os.environ["CUDA_VISIBLE_DEVICES"] = args.device_list

# Device setting
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# dont use writer temporarily
writer = None

# NOTE(review): best_bleu starts at 999.00 - confirm the comparison direction
# wherever it is updated.
best_bleu = 999.00
start_epoch = 0

# Train with Transformer
if __name__ == '__main__':
    # Prepare data: both fields use spaCy tokenization, sentence markers and
    # lower-casing; only the tokenizer language differs.
    shared_field_options = dict(tokenize='spacy', init_token='<sos>',
                                eos_token='<eos>', lower=True)
    SRC = Field(tokenizer_language='de', **shared_field_options)
    TRG = Field(tokenizer_language='en', **shared_field_options)

    train_data, val_data, test_data = Multi30k.splits(
        exts=('.de', '.en'),
        fields=(SRC, TRG),
    )

    all_splits = (train_data, val_data, test_data)
    train_iter, val_iter, test_iter = BucketIterator.splits(
        all_splits,
        batch_size=args.batch_size,
    )

    # Number of batches in the training iterator.
    print(len(train_iter))
def __init__(self, args, params):
    """
    Configure the fields and load the train / validation datasets.

    :param args: read for ``data_dir``, ``bert`` and (without BERT) ``embedding_pkl_path``
    :param params: read for ``batch_size`` and ``fix_length``
    """
    self.batch_size = params.batch_size
    self.fix_length = params.fix_length
    self.root_path = args.data_dir
    self.use_bert = args.bert

    # Exactly one lookup resource is prepared, depending on the BERT flag.
    if self.use_bert:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    else:
        # NOTE: unpickling executes code stored in the file; use trusted files only.
        with open(args.embedding_pkl_path + '_word2idx.pkl', 'rb') as f:
            word2idx = pickle.load(f)

    def word_tokenize(sentence):
        """Map a space-separated sentence to a list of ids with one leading id."""
        words = sentence.split(' ')
        if self.use_bert:
            # tokenizer.tokenize(sentence) is deliberately not used: it would
            # split words into pieces, so ids would no longer line up with words.
            # The leading 101 is the extra CLS id.
            return [101] + tokenizer.convert_tokens_to_ids(words)
        # Unknown words map to 0; the leading 0 keeps the layout consistent
        # with the BERT branch.
        return [0] + [word2idx.get(word, 0) for word in words]

    def pos_tokenize(posids):
        """Parse a space-separated string of integers into a list of ints."""
        return [int(piece) for piece in posids.split(' ')]

    # dtype = torch.cuda.LongTensor if args.gpu and torch.cuda.is_available() else torch.int64
    TEXT = Field(
        sequential=True,
        tokenize=word_tokenize,
        use_vocab=False,
        batch_first=True,
        fix_length=self.fix_length + 1,  # one extra slot for the added CLS id
        pad_token=0)

    # The two position fields differ only in whether lengths are returned.
    position_options = dict(sequential=True,
                            tokenize=pos_tokenize,
                            use_vocab=False,
                            fix_length=self.fix_length,
                            pad_token=0,
                            batch_first=True)
    POSITION = Field(include_lengths=True, **position_options)
    POSITION_NO_LEN = Field(**position_options)

    LABEL = Field(sequential=False, use_vocab=False, batch_first=True)

    # JSON key -> (example attribute name, field)
    fields = {
        'sentence': ('words', TEXT),
        'label': ('label', LABEL),
        'e1': ('pos_e1', POSITION),
        'e2': ('pos_e2', POSITION_NO_LEN)
    }

    # The validation split is read from 'test.txt'.
    self.train, self.valid = TabularDataset.splits(path=self.root_path,
                                                   train='train.txt',
                                                   validation='test.txt',
                                                   format='json',
                                                   skip_header=False,
                                                   fields=fields)
def _select_tokenizer(language):
    """Return the tokenizer for ``language``, or None when the name is not recognised."""
    if language in ("azerbaijani", "turkish"):
        return tokenize_custom
    if language == "english":
        return tokenize_eng
    return None


src_tokenizer = _select_tokenizer(set_source_to)
trg_tokenizer = _select_tokenizer(set_target_to)

# Both sides are lower-cased and wrapped in start / end markers.
source_lang = Field(tokenize=src_tokenizer, lower=True,
                    init_token="<sos>", eos_token="<eos>")
target_lang = Field(tokenize=trg_tokenizer, lower=True,
                    init_token="<sos>", eos_token="<eos>")

# CSV column name -> (example attribute name, field)
fields = {
    'Source_lang': ('src', source_lang),
    'Target_lang': ('trg', target_lang),
}

train_data, test_data = TabularDataset.splits(
    path='',
    train=train_dataset_path,
    test=test_dataset_path,
    format='csv',
    fields=fields,
)