def create_demo_batch(sentences, dataset_type, vocab, attribute_name,
                      target_columns, tmp_path='/tmp'):
  '''
  Args:
    sentences: A list of strings.
  '''
  # Write the sentences to a temporary CSV file so the dataset class can load
  # them through its usual file-based interface.
  tmp_path = os.path.join(tmp_path, common.random_string(5))
  index = [i for i in range(len(sentences))]
  dic = {
    'index': index,
    'sentence': sentences,
  }
  for col in target_columns:
    dic[col] = [EMPTY] * len(sentences)
  df = pd.DataFrame(dic).loc[:, ['index', 'sentence'] + target_columns].set_index('index')

  sys.stdout = sys.stderr
  with open(tmp_path, 'w') as f:
    f.write(df.to_csv() + '\n')
  pathes = common.dotDict({'train': tmp_path, 'valid': tmp_path, 'test': tmp_path})
  num_training_sentences = 0  # Fake value.
  dataset = getattr(self_module, dataset_type)(
    dataset_type, pathes, num_training_sentences, vocab,
    attribute_name, target_columns)
  dataset.test.load_data()
  os.remove(tmp_path)
  return dataset.test
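# A minimal, self-contained sketch of the DataFrame-to-CSV round trip used by
# create_demo_batch above. The placeholder value and the helper name are
# assumptions for illustration only; the project's dataset classes and the
# EMPTY constant are not reproduced here.
import os
import tempfile

import pandas as pd

EMPTY_DEMO = '-'  # hypothetical placeholder for unlabeled target columns

def _demo_sentences_to_csv(sentences, target_columns):
  dic = {'index': list(range(len(sentences))), 'sentence': sentences}
  for col in target_columns:
    dic[col] = [EMPTY_DEMO] * len(sentences)
  df = pd.DataFrame(dic).loc[:, ['index', 'sentence'] + target_columns].set_index('index')
  fd, path = tempfile.mkstemp(suffix='.csv')
  with os.fdopen(fd, 'w') as f:
    f.write(df.to_csv() + '\n')
  return path

# Usage:
#   path = _demo_sentences_to_csv(['a cat', 'a dog'], ['price', 'weight'])
#   print(pd.read_csv(path, index_col='index'))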
def main(args):
  if args.mode == 'train':
    sys.stderr.write('Saving config...\n')
    config = common.dotDict(args.__dict__)
    save_config(args)
  else:
    sys.stderr.write('Loading config...\n')
    config = load_config(args)

  model = getattr(myself, config.model_type)(args, config)
  if args.mode == 'train':
    model.train()
  elif args.mode == 'test':
    tests, origins = read_human_annotations(args.test_file)
    lines = [line for idx, line, anno in tests]
    predictions, cluster_ids = model.test(lines, test_filepath=args.test_file)
    model.evaluate(tests, origins, predictions, cluster_ids=cluster_ids)
  elif args.mode == 'evaluate':
    tests, origins = read_human_annotations(args.test_file)
    predictions = read_dplabels()
    model.evaluate(tests, origins, predictions)
  else:
    raise ValueError("args.mode must be 'train', 'test', or 'evaluate'.")
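# A small sketch of the getattr-based model lookup used in main above, where a
# class is selected by name from a module ('myself' appears to be a module
# alias). The class and function names below are invented for illustration.
import sys

class DemoModel:
  def __init__(self, args, config):
    self.args = args
    self.config = config

def _build_model_by_name(model_type, args, config):
  this_module = sys.modules[__name__]
  return getattr(this_module, model_type)(args, config)

# Usage:
#   model = _build_model_by_name('DemoModel', args=None, config={'lr': 0.1})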
def setup_embeddings(self, config, vocab):
  self.embeddings = dotDict()
  n_start_vocab = len(vocab.e_word.start_vocab)
  special_tokens_emb = self.initialize_embeddings(
    'SpecialTokens', vocab.e_word.embeddings[:n_start_vocab].shape,
    initializer=tf.constant_initializer(vocab.e_word.embeddings[:n_start_vocab]),
    trainable=True)

  # e_words_emb = tf.constant(vocab.e_word.embeddings[n_start_vocab:],
  #                           dtype=tf.float32)
  # j_words_emb = tf.constant(vocab.j_word.embeddings[n_start_vocab:],
  #                           dtype=tf.float32)
  e_words_emb = self.initialize_embeddings(
    'EnWords', vocab.e_word.embeddings[n_start_vocab:].shape,
    initializer=tf.constant_initializer(vocab.e_word.embeddings[n_start_vocab:]),
    trainable=config.train_embedding)
  j_words_emb = self.initialize_embeddings(
    'JPWords', vocab.j_word.embeddings[n_start_vocab:].shape,
    initializer=tf.constant_initializer(vocab.j_word.embeddings[n_start_vocab:]),
    trainable=config.train_embedding)
  self.embeddings.e_word = tf.concat([special_tokens_emb, e_words_emb], axis=0)
  self.embeddings.j_word = tf.concat([special_tokens_emb, j_words_emb], axis=0)
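# A rough TF2 sketch of the embedding layout built in setup_embeddings above:
# a small trainable block of special-token vectors is concatenated with the
# pretrained word vectors, whose trainability is controlled by a flag. The
# shapes and names are made up; the project's initialize_embeddings helper is
# not reproduced here.
import numpy as np
import tensorflow as tf

def build_embedding_matrix(pretrained, n_special, train_embedding):
  # pretrained: [vocab_size, emb_dim] array; the first n_special rows hold the
  # special-token vectors.
  special = tf.Variable(pretrained[:n_special], dtype=tf.float32,
                        trainable=True, name='SpecialTokens')
  words = tf.Variable(pretrained[n_special:], dtype=tf.float32,
                      trainable=train_embedding, name='Words')
  return tf.concat([special, words], axis=0)

# Usage:
#   emb = build_embedding_matrix(np.random.rand(100, 8).astype(np.float32),
#                                n_special=4, train_embedding=False)
#   vectors = tf.nn.embedding_lookup(emb, [0, 5, 7])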
def __init__(self, args, sess, vocab=None):
  self.sess = sess
  self.config = self.load_config(args)
  self.logger = (common.logManager(handler=FileHandler(args.log_file))
                 if args.log_file else common.logManager())
  sys.stderr.write(str(self.config) + '\n')

  data_class = getattr(datasets, self.config.dataset_type)
  self.vocab = common.dotDict()
  if self.config.embeddings:
    emb_conf = self.config.embeddings
    self.vocab.e_word = vocabularies.WordVocabularyWithEmbedding(
      emb_conf.en.path,
      vocab_size=self.config.w_vocab_size,
      lowercase=self.config.lowercase,
      normalize_digits=self.config.normalize_digits,
      skip_first=emb_conf.en.skip_first)
    self.vocab.j_word = vocabularies.WordVocabularyWithEmbedding(
      emb_conf.ja.path,
      vocab_size=self.config.w_vocab_size,
      lowercase=self.config.lowercase,
      normalize_digits=self.config.normalize_digits,
      skip_first=emb_conf.ja.skip_first)
  self.c_vocab = None
  #self.w_vocab, self.c_vocab = data_class.create_vocab_from_data(self.config)
  self.dataset = data_class(self.config.dataset_info, self.vocab.e_word, self.c_vocab)
def yield_batch(self, batch_by_column):
  b_sources, b_targets, b_ori_sources, b_pos = batch_by_column
  b_targets = list(zip(*b_targets))  # to column-major.
  return common.dotDict({
    'sources': np.array(b_sources),
    # Include only the labels in 'target_columns' in the batch.
    'targets': [np.array(t) for t, col in zip(b_targets, self.all_columns)
                if col in self.target_columns],
    'original_sources': b_ori_sources,
    'pos': b_pos,
  })
def load_data(self):
  self.load = True
  sys.stderr.write('Loading dataset from %s ...\n' % self.path)
  df = pd.read_csv(self.path, nrows=self.max_lines)
  sys.stderr.write('Preprocessing ...\n')
  contexts, responses, speaker_changes = self.preprocess(df)

  if not self.wbase and not self.cbase:
    raise ValueError("Either 'wbase' or 'cbase' must be True.")

  self.speaker_changes = [self.sc_vocab.sent2id(sc) for sc in speaker_changes]

  # Separate contexts and responses into words (or chars), and convert them
  # into their IDs.
  self.original = common.dotDict({})
  self.symbolized = common.dotDict({})

  if self.wbase:
    self.original.w_contexts = [[self.w_vocab.tokenizer(u) for u in context]
                                for context in contexts]
    self.symbolized.w_contexts = [[self.w_vocab.sent2id(u) for u in context]
                                  for context in self.original.w_contexts]
  else:
    self.original.w_contexts = [None for context in contexts]
    self.symbolized.w_contexts = [None for context in contexts]

  if self.cbase:
    self.original.c_contexts = [[self.c_vocab.tokenizer(u) for u in context]
                                for context in contexts]
    self.symbolized.c_contexts = [[self.c_vocab.sent2id(u) for u in context]
                                  for context in self.original.c_contexts]
  else:
    self.original.c_contexts = [None for context in contexts]
    self.symbolized.c_contexts = [None for context in contexts]

  self.original.responses = [self.w_vocab.tokenizer(r) for r in responses]
  self.symbolized.responses = [self.w_vocab.sent2id(r) for r in responses]

  responses = self.symbolized.responses
  w_contexts = self.symbolized.w_contexts
  self.texts = common.flatten(w_contexts) + list(responses)
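# A tiny, self-contained illustration of the tokenize-then-symbolize pattern
# used in load_data above. The toy vocabulary and UNK handling are assumptions
# for this example only; the project's w_vocab/c_vocab classes are not
# reproduced here.
TOY_VOCAB = {'<pad>': 0, '<unk>': 1, 'how': 2, 'are': 3, 'you': 4}

def toy_tokenizer(utterance):
  return utterance.lower().split()

def toy_sent2id(tokens):
  return [TOY_VOCAB.get(t, TOY_VOCAB['<unk>']) for t in tokens]

# Usage:
#   contexts = [['How are you ?', 'Fine , thanks .']]
#   original = [[toy_tokenizer(u) for u in context] for context in contexts]
#   symbolized = [[toy_sent2id(u) for u in context] for context in original]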
def yield_batch(self, batch):
  '''
  Args:
  - batch: A list of lists containing 'batch_size' examples (batch_size is
    specified as an argument to get_batch()). batch[i] contains each of the
    return values of get_batch_data(), i.e. the shape of 'batch' is
    [len(self.get_batch_data(...)), batch_size].
  Return:
    A batch as a dictionary.
  '''
  b_sources, b_targets, b_ori_sources = batch
  b_targets = list(zip(*b_targets))  # to column-major.
  return common.dotDict({
    'sources': np.array(b_sources),
    # Include only the labels in 'target_columns' in the batch.
    'targets': [np.array(t) for t, col in zip(b_targets, self.all_columns)
                if col in self.target_columns],
    'original_sources': b_ori_sources,
  })
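# A minimal sketch of the row-major to column-major transposition done with
# list(zip(*...)) in yield_batch above, using made-up label rows.
row_major = [(1, 'a'), (2, 'b'), (3, 'c')]   # one tuple of labels per example
column_major = list(zip(*row_major))         # one tuple per label column
assert column_major == [(1, 2, 3), ('a', 'b', 'c')]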
def get_batch(self, batch_size, input_max_len=None, output_max_len=None,
              shuffle=False):
  sources, targets = self.symbolized
  if input_max_len:
    paired = [(s, t) for s, t in zip(sources, targets)
              if len(s) <= input_max_len]
    sources, targets = list(zip(*paired))

  sources = tf.keras.preprocessing.sequence.pad_sequences(
    sources, maxlen=input_max_len, padding='post', truncating='post',
    value=PAD_ID)

  targets = list(zip(*targets))  # to column-major. (for padding)
  targets = [
    tf.keras.preprocessing.sequence.pad_sequences(
      targets_by_column, maxlen=output_max_len, padding='post',
      truncating='post', value=PAD_ID)
    for targets_by_column in targets
  ]
  targets = list(zip(*targets))  # to idx-major. (for shuffling)

  data = [
    tuple(x) for x in zip(sources, targets, self.original_sources, self.targets)
  ]
  if shuffle:
    random.shuffle(data)

  for i, b in itertools.groupby(enumerate(data), lambda x: x[0] // batch_size):
    batch = [x[1] for x in b]
    b_sources, b_targets, b_ori_sources, b_ori_targets = zip(*batch)
    b_targets = list(zip(*b_targets))  # to column-major.
    yield common.dotDict({
      'sources': np.array(b_sources),
      'targets': [np.array(t) for t in b_targets],
      'original_sources': b_ori_sources,
      'original_targets': b_ori_targets,
    })
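# A self-contained sketch of the batching idiom used in get_batch above:
# grouping consecutive examples by index // batch_size yields fixed-size
# chunks, with a possibly smaller final chunk. The data below is dummy data.
import itertools

def chunk(data, batch_size):
  for _, group in itertools.groupby(enumerate(data),
                                    lambda x: x[0] // batch_size):
    yield [item for _, item in group]

# Usage:
#   list(chunk(list(range(7)), 3))  ->  [[0, 1, 2], [3, 4, 5], [6]]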
def load_config(args):
  config_path = os.path.join(args.output_dir, CONFIG_NAME + '.txt')
  if os.path.exists(config_path):
    config = collections.defaultdict()
    for l in open(config_path):
      k, v, type_name = l.replace('\n', '').split('\t')
      if type_name == 'tuple':
        config[k] = common.str2tuple(v)
      elif type_name == 'int':
        config[k] = int(v)
      elif type_name == 'float':
        config[k] = float(v)
      else:
        config[k] = v
    config = common.dotDict(config)
  else:
    raise ValueError('No config file is found.')
  sys.stderr.write(str(config) + '\n')
  return config
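# A minimal sketch of the tab-separated config format that load_config above
# expects: one "key<TAB>value<TAB>type" entry per line. The sample entries are
# invented for illustration.
sample = 'batch_size\t32\tint\nlearning_rate\t0.001\tfloat\nmodel_type\tMyModel\tstr\n'

demo_config = {}
for line in sample.splitlines():
  k, v, type_name = line.split('\t')
  if type_name == 'int':
    demo_config[k] = int(v)
  elif type_name == 'float':
    demo_config[k] = float(v)
  else:
    demo_config[k] = v
# demo_config == {'batch_size': 32, 'learning_rate': 0.001, 'model_type': 'MyModel'}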
def create_demo_batch(self, text, output_max_len):
  source = [self.vocab.tokens2ids(text)]
  targets = [[[0] for _ in self.targets_name]]
  targets = list(zip(*targets))  # to column-major. (for padding)

  source = tf.keras.preprocessing.sequence.pad_sequences(
    source, padding='post', truncating='post', value=PAD_ID)
  targets = [
    tf.keras.preprocessing.sequence.pad_sequences(
      targets_by_column, maxlen=output_max_len, padding='post',
      truncating='post', value=PAD_ID)
    for targets_by_column in targets
  ]
  yield common.dotDict({
    'sources': np.array(source),
    'targets': [np.array(t) for t in targets],
  })
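# A standalone illustration of how the pad_sequences calls above behave with
# post-padding and post-truncation. PAD_ID is assumed to be 0 here.
import tensorflow as tf

padded = tf.keras.preprocessing.sequence.pad_sequences(
  [[5, 6], [7, 8, 9, 10, 11]], maxlen=4, padding='post', truncating='post',
  value=0)
# padded == [[5, 6, 0, 0],
#            [7, 8, 9, 10]]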
def __init__(self, args, sess, vocab=None):
  self.sess = sess
  self.config = config = self.get_config(args)
  self.mode = args.mode
  self.logger = (common.logManager(handler=FileHandler(args.log_file))
                 if args.log_file else common.logManager())
  sys.stderr.write(str(self.config) + '\n')

  # Lazy loading.
  self.dataset = common.dotDict({
    'train': None,
    'valid': None,
    'test': None,
  })
  self.dataset_type = getattr(datasets, config.dataset_type)

  if not args.interactive:  # For saving time when running in jupyter.
    self.vocab = WordVocabularyWithEmbedding(
      config.embeddings,
      vocab_size=config.vocab_size,
      lowercase=config.lowercase) if vocab is None else vocab
def __init__(self, args, sess, vocab=None):
  self.sess = sess
  self.config = self.load_config(args)
  self.mode = args.mode
  self.logger = (common.logManager(handler=FileHandler(args.log_file))
                 if args.log_file else common.logManager())
  sys.stderr.write(str(self.config) + '\n')

  # 'True or' short-circuits the check, so the vocabulary and dataset are
  # always loaded, even in interactive mode.
  if True or not args.interactive:
    self.vocab = common.dotDict()
    self.vocab.word = WordVocabularyWithEmbedding(
      self.config.embeddings,
      vocab_size=self.config.vocab_size,
      lowercase=self.config.lowercase,
      normalize_digits=self.config.normalize_digits,
    ) if vocab is None else vocab
    self.dataset = getattr(datasets, self.config.dataset_type)(
      self.config.dataset_type,
      self.config.dataset_path,
      self.config.num_train_data,
      self.vocab,
      self.config.target_attribute,
      self.config.target_columns)
def get_batch(self, batch_size, word_max_len=0, utterance_max_len=0,
              shuffle=False):
  if not self.load:
    self.load_data()  # Lazy loading.

  data = self.texts
  if shuffle:  # For training.
    random.shuffle(data)

  for i, b in itertools.groupby(enumerate(data), lambda x: x[0] // batch_size):
    batch = [x[1] for x in b]
    texts = batch
    # Pad each batch to the length of its longest utterance, capped by
    # utterance_max_len if it is given.
    _utterance_max_len_data = max([len(u) for u in texts])
    if not utterance_max_len or _utterance_max_len_data < utterance_max_len:
      _utterance_max_len = _utterance_max_len_data
    else:
      _utterance_max_len = utterance_max_len

    texts = tf.keras.preprocessing.sequence.pad_sequences(
      texts, maxlen=_utterance_max_len, padding='post', truncating='post',
      value=PAD_ID)
    yield common.dotDict({
      'texts': texts,
    })
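# A standalone sketch of the per-batch length cap computed above: pad to the
# longest utterance in the batch unless a global utterance_max_len is set and
# is smaller. The batches below are dummy data.
def batch_max_len(texts, utterance_max_len=0):
  longest = max(len(u) for u in texts)
  if not utterance_max_len or longest < utterance_max_len:
    return longest
  return utterance_max_len

# Usage:
#   batch_max_len([[1, 2], [3, 4, 5]])                      -> 3
#   batch_max_len([[1, 2], [3, 4, 5]], utterance_max_len=2) -> 2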