def _load_fields_2(dataset, data_type, data_dir, checkpoint):
    """Load the fields/vocab for `dataset`.

    The vocab is taken from the checkpoint if one is given, otherwise from
    the `<data_dir>.vocab.pt` file written at preprocessing time.
    """
    if checkpoint is not None:
        logger.info('Loading vocab from checkpoint.')
        fields = load_fields_from_vocab(checkpoint['vocab'], data_type)
    else:
        fields = load_fields_from_vocab(
            torch.load(data_dir + '.vocab.pt'), data_type)

    # Keep only the fields that actually occur in the dataset examples.
    fields = dict([(k, f) for (k, f) in fields.items()
                   if k in dataset.examples[0].__dict__])

    logger.info(' * vocabulary size. source = %d; target = %d' %
                (len(fields['src'].vocab), len(fields['tgt'].vocab)))

    return fields
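
# Minimal usage sketch for _load_fields_2. The names below ('demo.train.1.pt',
# 'demo') are hypothetical stand-ins for shards produced by the preprocessing
# step; a matching 'demo.vocab.pt' is assumed to exist next to them.
#
#     dataset = torch.load('demo.train.1.pt')
#     fields = _load_fields_2(dataset, 'text', 'demo', checkpoint=None)
#     print(fields['src'].vocab.stoi['the'])   # integer id of a source token
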
def load_vocabulary(vocabulary_path, tag=""):
    """
    Loads a vocabulary from the given path.

    :param vocabulary_path: path to load vocabulary from
    :param tag: tag for vocabulary (only used for logging)
    :return: vocabulary or None if path is null
    """
    vocabulary = None
    if vocabulary_path:
        vocabulary = []
        logger.info("Loading {} vocabulary from {}".format(
            tag, vocabulary_path))

        if not os.path.exists(vocabulary_path):
            raise RuntimeError("{} vocabulary not found at {}!".format(
                tag, vocabulary_path))
        else:
            with codecs.open(vocabulary_path, 'r', 'utf-8') as f:
                for line in f:
                    if len(line.strip()) == 0:
                        continue
                    word = line.strip().split()[0]
                    vocabulary.append(word)
    return vocabulary
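
# Hedged example for load_vocabulary: given a plain-text file with one token
# per line (only the first whitespace-separated field of each line is kept),
# it returns the tokens in file order. 'src.vocab.txt' is a hypothetical path.
#
#     src_words = load_vocabulary('src.vocab.txt', tag='source')
#     if src_words is not None:
#         print(len(src_words), src_words[:5])
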
def _lazy_dataset_loader(pt_file, corpus_type):
    dataset = torch.load(pt_file)
    logger.info('Loading %s dataset from %s, number of examples: %d' %
                (corpus_type, pt_file, len(dataset)))
    return dataset
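
# Usage sketch (hypothetical shard name): each preprocessed shard is a pickled
# dataset whose length is its number of examples, which is what gets logged.
#
#     train_shard = _lazy_dataset_loader('demo.train.1.pt', 'train')
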
def build_vocab(train_dataset_files, fields, data_type, share_vocab,
                src_vocab_path, src_vocab_size, src_words_min_frequency,
                tgt_vocab_path, tgt_vocab_size, tgt_words_min_frequency):
    """
    Args:
        train_dataset_files: a list of train dataset pt files.
        fields (dict): fields to build vocab for.
        data_type: "text", "img" or "audio".
        share_vocab(bool): share source and target vocabulary?
        src_vocab_path(string): path to src vocabulary file.
        src_vocab_size(int): size of the source vocabulary.
        src_words_min_frequency(int): the minimum frequency needed to
            include a source word in the vocabulary.
        tgt_vocab_path(string): path to tgt vocabulary file.
        tgt_vocab_size(int): size of the target vocabulary.
        tgt_words_min_frequency(int): the minimum frequency needed to
            include a target word in the vocabulary.

    Returns:
        Dict of Fields
    """
    counter = {}

    # Pop the src field to reduce memory usage when training on image or
    # audio data (the source side is not a text vocabulary in those cases).
    if data_type == 'img' or data_type == 'audio':
        fields.pop("src")

    for k in fields:
        counter[k] = Counter()

    # Load externally supplied vocabularies, if any.
    src_vocab = load_vocabulary(src_vocab_path, tag="source")
    if src_vocab is not None:
        src_vocab_size = len(src_vocab)
        logger.info('Loaded source vocab has %d tokens.' % src_vocab_size)
        for i, token in enumerate(src_vocab):
            # keep the order of tokens specified in the vocab file by
            # adding them to the counter with decreasing counting values
            counter['src'][token] = src_vocab_size - i

    tgt_vocab = load_vocabulary(tgt_vocab_path, tag="target")
    if tgt_vocab is not None:
        tgt_vocab_size = len(tgt_vocab)
        logger.info('Loaded target vocab has %d tokens.' % tgt_vocab_size)
        for i, token in enumerate(tgt_vocab):
            counter['tgt'][token] = tgt_vocab_size - i

    for index, path in enumerate(train_dataset_files):
        dataset = torch.load(path)
        logger.info(" * reloading %s." % path)
        for ex in dataset.examples:
            for k in fields:
                val = getattr(ex, k, None)
                if not fields[k].sequential:
                    continue
                elif k == 'src' and src_vocab:
                    continue
                elif k == 'tgt' and tgt_vocab:
                    continue
                counter[k].update(val)

        # Drop shards we no longer need from memory, but keep the last one:
        # its feature counts are read below.
        if index < len(train_dataset_files) - 1:
            dataset.examples = None
            gc.collect()
            del dataset.examples
            gc.collect()
            del dataset
            gc.collect()

    _build_field_vocab(fields["tgt"], counter["tgt"],
                       max_size=tgt_vocab_size,
                       min_freq=tgt_words_min_frequency)
    logger.info(" * tgt vocab size: %d." % len(fields["tgt"].vocab))

    # All datasets have same num of n_tgt_features,
    # getting the last one is OK.
    for j in range(dataset.n_tgt_feats):
        key = "tgt_feat_" + str(j)
        _build_field_vocab(fields[key], counter[key])
        logger.info(" * %s vocab size: %d." % (key, len(fields[key].vocab)))

    if data_type == 'text':
        _build_field_vocab(fields["src"], counter["src"],
                           max_size=src_vocab_size,
                           min_freq=src_words_min_frequency)
        logger.info(" * src vocab size: %d." % len(fields["src"].vocab))

        # All datasets have same num of n_src_features,
        # getting the last one is OK.
        for j in range(dataset.n_src_feats):
            key = "src_feat_" + str(j)
            _build_field_vocab(fields[key], counter[key])
            logger.info(" * %s vocab size: %d." %
                        (key, len(fields[key].vocab)))

        # Merge the input and output vocabularies.
        if share_vocab:
            # `tgt_vocab_size` is ignored when sharing vocabularies
            logger.info(" * merging src and tgt vocab...")
            merged_vocab = merge_vocabs(
                [fields["src"].vocab, fields["tgt"].vocab],
                vocab_size=src_vocab_size,
                min_frequency=src_words_min_frequency)
            fields["src"].vocab = merged_vocab
            fields["tgt"].vocab = merged_vocab

    return fields
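
# Sketch of a build_vocab call, assuming `fields` was obtained from the
# project's field-construction helpers (not shown in this section) and that
# the two hypothetical training shards exist on disk. Empty vocab paths mean
# the vocabularies are counted from the shards themselves.
#
#     fields = build_vocab(['demo.train.1.pt', 'demo.train.2.pt'], fields,
#                          'text', share_vocab=False,
#                          src_vocab_path='', src_vocab_size=50000,
#                          src_words_min_frequency=1,
#                          tgt_vocab_path='', tgt_vocab_size=50000,
#                          tgt_words_min_frequency=1)
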