def main(config):
    """Train or run inference with the model variant named by ``config.model``.

    Args:
        config: Object exposing ``model`` (one of ``'seq2seq'``,
            ``'seq2seq_gru'``, ``'seq2seq_attention'``) and ``mode``
            (``'train'`` selects training; any other value runs inference).

    Raises:
        KeyError: If ``config.model`` is not one of the known variants.
    """
    # All three parameter files are loaded up front; only the entry selected
    # by ``config.model`` is used afterwards.
    available_params = {
        'seq2seq': Params('configs/params.json'),
        'seq2seq_gru': Params('configs/params_gru.json'),
        'seq2seq_attention': Params('configs/params_attention.json'),
    }
    params = available_params[config.model]

    if config.mode != 'train':
        # Inference path: a single test iterator.
        test_data = load_dataset(config.mode)
        test_iter = make_iter(params.batch_size, config.mode,
                              test_data=test_data)
        trainer = Trainer(params, config.mode, test_iter=test_iter)
        trainer.inference()
        return

    # Training path: train and validation iterators.
    train_data, valid_data = load_dataset(config.mode)
    train_iter, valid_iter = make_iter(params.batch_size, config.mode,
                                       train_data=train_data,
                                       valid_data=valid_data)
    trainer = Trainer(params, config.mode,
                      train_iter=train_iter, valid_iter=valid_iter)
    trainer.train()
def on_receive_sensor_data(self, data, sensor_id, name):
    """Store one range reading in the sensor cache and react to obstacles.

    The reading is converted to ``(max_range - range) / max_range``; an
    infinite reading becomes 0 and a negative reading is replaced by
    ``data.min_range`` first. Values for ``sensor_id >= 5`` are negated
    (presumably the rear sensors -- confirm against the sensor layout).

    Args:
        data: Message exposing ``range``, ``max_range`` and ``min_range``.
        sensor_id: Index into ``self.sensors_cache_values``.
        name: Unused here.
    """
    reading = data.range
    max_range = data.max_range

    if reading == np.inf:
        proximity = 0
    else:
        if reading < 0:
            reading = data.min_range
        proximity = (max_range - reading) / max_range

    if sensor_id >= 5:
        proximity *= -1

    cache = self.sensors_cache_values
    cache[sensor_id] = proximity
    self.obstacle = np.sum(cache) != 0

    if not self.obstacle:
        return

    # Linear error: mean of all cached values.
    lin_err = np.sum(cache) / cache.shape[0]
    # Angular error: sensors 0-1 against 3-4, plus sensor 5 against 6.
    ang_err = np.sum(cache[:2] - cache[3:5]) + (cache[5] - cache[6])

    # Both controllers are stepped with a fixed dt of 0.1.
    ang_vel = self.angular_pid.step(ang_err, 0.1)
    vel = self.linear_pid.step(lin_err, 0.1)

    self.last_elapsed_sensors = self.time_elapsed
    self.update_vel(Params(x=-vel), Params(z=-ang_vel))
def main():
    """Parse command-line options, configure the first GPU and start training.

    Raises:
        AssertionError: If TensorFlow reports no GPU device.
    """
    parser = argparse.ArgumentParser()
    for flag, default, help_text in (
            ('--model_dir', '.', ""),
            ('--data_dir', './data/casp7', ""),
            ('--restore_dir', None, ""),
            ('--param_path', './hparams.json', "Hyperparameter file"),
    ):
        parser.add_argument(flag, default=default, help=help_text)
    args = parser.parse_args()

    gpus = tf.config.experimental.list_physical_devices('GPU')
    assert len(gpus) > 0, "Not enough GPU hardware devices available"
    # Allocate GPU memory on demand on the first device.
    tf.config.experimental.set_memory_growth(gpus[0], True)

    tf.random.set_seed(117)

    hparams = Params(args.param_path)
    model = Pronet(hparams)
    feeder = Feeder(args.data_dir, hparams)

    # TODO: --restore_dir is parsed but restoring is not implemented here.
    train(model, feeder, hparams)
def main(argv):
    """Display the test images whose embeddings are nearest to a query image.

    Embeddings for the whole test set are computed with the estimator stored
    in ``FLAGS.experiment_dir``; Euclidean distances to the embedding at index
    ``FLAGS.query`` are sorted and the ten closest images are plotted next to
    the query image.

    Args:
        argv: Unused command-line remainder.
    """
    tf.logging.set_verbosity(tf.logging.DEBUG)

    # Load params
    params = Params(os.path.join(FLAGS.experiment_dir, "params.json"))

    # Load model
    config = tf.estimator.RunConfig(
        tf_random_seed=230,
        model_dir=FLAGS.experiment_dir,
        save_summary_steps=params.save_summary_steps)
    estimator = tf.estimator.Estimator(model_fn, params=params, config=config)

    # Compute embeddings on the test set
    tf.logging.info("Computing embeddings")
    predictions = estimator.predict(
        lambda: fashion_mnist_dataset.test_dataset_fn(params),
        checkpoint_path=FLAGS.checkpoint)
    # Size the matrix from the predictions themselves instead of a hard-coded
    # 10000 rows, so a smaller set leaves no all-zero rows and a larger one
    # does not overflow. float64 matches the former np.zeros buffer.
    embeddings = np.asarray(
        [p['embeddings'] for p in predictions],
        dtype=np.float64).reshape(-1, params.embedding_size)
    tf.logging.info("Embeddings shape: {}".format(embeddings.shape))

    # Get 10 closest examples to query item
    num_results = 10
    query_embedding = embeddings[np.newaxis, FLAGS.query, :]
    tf.logging.info("Query embedding shape: {}".format(query_embedding.shape))

    # Compute distances to all other items
    distances = np.sqrt(
        np.sum(np.square(query_embedding - embeddings), axis=1))
    tf.logging.info("Distances shape: {}".format(distances.shape))

    # Get most similar items, ignore closest item as it is the query
    # (distance=0)
    closest_idxs = np.argsort(distances)[1:]
    sorted_distances = np.sort(distances)[1:]
    print("{} closest distances:\n{}".format(num_results,
                                             sorted_distances[:num_results]))

    # Display query image and 10 most similar results
    f, ax = plt.subplots(1, num_results + 1, figsize=(8, 2))
    query_image = cv2.imread(
        os.path.join(fashion_mnist_dataset.FASHION_MNIST_DIR, "test/",
                     "{}.png".format(FLAGS.query)), 0)
    ax[0].imshow(query_image, cmap="gray")
    ax[0].title.set_text("Query")
    ax[0].axis('off')

    for i in range(num_results):
        current_image = cv2.imread(
            os.path.join(fashion_mnist_dataset.FASHION_MNIST_DIR, "test/",
                         "{}.png".format(closest_idxs[i])), 0)
        ax[i + 1].imshow(current_image, cmap='gray')
        ax[i + 1].title.set_text("{:.3f}".format(sorted_distances[i]))
        ax[i + 1].axis('off')

    plt.show()
def setUp(self):
    """Create the dataset names, parameters and data loader used by the tests."""
    # Dataset parameters and the loader built from them.
    self.params = Params(config.datasets_params_file)
    self.loader = DataLoader(config.data_dir, self.params)

    # Names of the three dataset splits, taken from the module config.
    self.trainset = config.train_name
    self.valset = config.val_name
    self.testset = config.test_name

    # Value used to pad tag sequences.
    self.tag_padding = -1
def main(argv):
    """Print the Euclidean distance between the embeddings of two images.

    The images at ``FLAGS.image1`` and ``FLAGS.image2`` are read with flag 0,
    embedded with the model restored from ``FLAGS.checkpoint`` (or the latest
    checkpoint in ``FLAGS.experiment_dir`` when that flag is empty), and the
    norm of the difference is printed.

    Args:
        argv: Unused command-line remainder.
    """
    tf.logging.set_verbosity(tf.logging.DEBUG)

    # Load params
    params = Params(os.path.join(FLAGS.experiment_dir, "params.json"))

    # Load images and add a leading batch axis and a trailing channel axis.
    image1 = cv2.imread(FLAGS.image1, 0)[None, :, :, None]
    image2 = cv2.imread(FLAGS.image2, 0)[None, :, :, None]

    # Load model
    images = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    model = build_model(images, tf.estimator.ModeKeys.PREDICT, params)
    saver = tf.train.Saver()

    # Extract embeddings
    with tf.Session() as sess:
        if not FLAGS.checkpoint:
            saver.restore(sess,
                          tf.train.latest_checkpoint(FLAGS.experiment_dir))
        else:
            # restore the explicitly requested checkpoint
            saver.restore(sess, FLAGS.checkpoint)

        embeddings1, = sess.run([model], feed_dict={images: image1})
        embeddings2, = sess.run([model], feed_dict={images: image2})

    # Compute distance between embeddings
    distance = np.linalg.norm(embeddings2 - embeddings1)
    print("Distance between the embeddings of the two images: ", distance)
def main(argv):
    """Train and periodically evaluate the estimator on Fashion-MNIST.

    Hyper-parameters are read from ``params.json`` inside
    ``FLAGS.experiment_dir``, which is also the estimator's model directory.

    Args:
        argv: Unused command-line remainder.
    """
    tf.logging.set_verbosity(tf.logging.DEBUG)

    # Load params
    params = Params(os.path.join(FLAGS.experiment_dir, "params.json"))

    # Create model; model_dir is the directory where model parameters,
    # graph, etc. are saved.
    run_config = tf.estimator.RunConfig(
        tf_random_seed=230,
        model_dir=FLAGS.experiment_dir,
        save_summary_steps=params.save_summary_steps,
        keep_checkpoint_max=params.keep_checkpoint_max,
        save_checkpoints_secs=params.save_checkpoints_secs)
    estimator = tf.estimator.Estimator(model_fn, params=params,
                                       config=run_config)

    # Train the model
    tf.logging.info(
        "Starting training for {} steps(s).".format(params.num_steps))

    def train_input():
        return fashion_mnist_dataset.train_dataset_fn(params)

    def eval_input():
        return fashion_mnist_dataset.val_dataset_fn(params)

    train_spec = tf.estimator.TrainSpec(input_fn=train_input,
                                        max_steps=params.num_steps)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=eval_input,
        start_delay_secs=params.start_delay_secs,
        throttle_secs=params.throttle_secs)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
def prediction(text):
    """Decode a reply for ``text`` with the trained Transformer.

    The input is tokenized, mapped to indices with the ``kor`` vocabulary,
    decoded greedily one position at a time, and the resulting indices are
    mapped back to tokens with the ``eng`` vocabulary.

    Args:
        text: Input sentence.

    Returns:
        The reply tokens joined by single spaces.
    """
    params = Params('config/params.json')

    # Load tokenizer and torchtext Fields. The files are opened with ``with``
    # so the handles are closed even when unpickling fails.
    # NOTE(security): pickle.load can execute arbitrary code; only load pickle
    # files that come from a trusted source.
    with open('pickles/tokenizer.pickle', 'rb') as pickle_tokenizer:
        cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    with open('pickles/kor.pickle', 'rb') as pickle_kor:
        kor = pickle.load(pickle_kor)

    with open('pickles/eng.pickle', 'rb') as pickle_eng:
        eng = pickle.load(pickle_eng)
    eos_idx = eng.vocab.stoi['<eos>']

    # Select model and load trained weights
    model = Transformer(params)
    model.load_state_dict(torch.load(params.save_model))
    model.to(params.device)
    model.eval()

    # Convert input into a tensor and forward it through the model
    tokenized = tokenizer.tokenize(text)
    indexed = [kor.vocab.stoi[token] for token in tokenized]

    # [1, source_len]: unsqueeze to add batch size
    source = torch.LongTensor(indexed).unsqueeze(0).to(params.device)
    # [1, max_len]
    target = torch.zeros(1, params.max_len).type_as(source.data)

    encoder_output = model.encoder(source)
    next_symbol = eng.vocab.stoi['<sos>']

    # Greedy decoding: write the current symbol at position i, re-run the
    # decoder and take the arg-max prediction at that position.
    for i in range(0, params.max_len):
        if next_symbol == eos_idx:
            break
        target[0][i] = next_symbol
        # [1, target length, output dim]
        decoder_output, _ = model.decoder(target, source, encoder_output)
        prob = decoder_output.squeeze(0).max(dim=-1, keepdim=False)[1]
        next_word = prob.data[i]
        next_symbol = next_word.item()

    print(eos_idx)
    # NOTE(review): this slices by the *vocabulary index* of '<eos>', not by
    # the number of decoded positions -- looks unintended; confirm before
    # relying on the truncation length.
    target = target[0][:eos_idx].unsqueeze(0)

    target, attention_map = model(source, target)
    target = target.squeeze(0).max(dim=-1)[1]

    # Index 3 is skipped -- presumably the '<eos>' token; verify against vocab.
    reply_token = [eng.vocab.itos[token] for token in target if token != 3]
    print(reply_token)

    reply = ' '.join(reply_token)
    return reply
def __init__(self, currentDnnNote, notesDict, appctxt, parentPlayer, parent):
    """Load configuration, create the input buffers and initialise the models.

    Args:
        currentDnnNote: Stored as ``self.currentDnnNote``.
        notesDict: Stored as ``self.notesDict``.
        appctxt: Application context; used to locate ``bachDuet.json``.
        parentPlayer: Player object whose ``type`` and ``name`` are recorded
            in ``self.info``.
        parent: Stored as ``self.parent``.
    """
    super(NeuralNet, self).__init__()

    self.appctxt = appctxt
    self.parentPlayer = parentPlayer
    self.parent = parent
    self.config = Params(appctxt.get_resource("bachDuet.json"))
    self.loadVocabularies()
    self.notesDict = notesDict
    # ('cuda:0' if torch.cuda.is_available() else 'cpu')
    self.device = torch.device('cpu')
    self.currentDnnNote = currentDnnNote

    def make_buffer(shape, rest_index):
        # All buffers share the same time length (4) and device.
        return TensorBuffer(maxLen=4, shape=shape, restIndex=rest_index,
                            device=self.device)

    # Tensor buffers for the 4 types of input that go in the NN.
    # The size in the time axis is 4, but currently the NN accepts only 1
    # every timestep.
    self.tensorBuffer = make_buffer([2, 1], self.restTokenIndex)
    self.tensorBufferPC = make_buffer([2, 1], 12)
    self.tensorBufferRhythm = make_buffer([1, 1], 0)
    self.tensorBufferKey = make_buffer([1, 1], 12)

    self.first = 0
    self.prevPredictionTokenIndex = \
        self.vocabMidiArticGlobal.token2index['0_1']
    self.prevPredictionTokenIndexPC = 12
    self.prevPredictionTokenIndexKey = 12  # C major
    self.timeSignature2ticks(self.config.timeSignature)
    self.old = time.time()
    self.enforceMode = False  # True
    self.voices = 2

    # Conditioning state, all switched off initially.
    self.conditionFlag = False
    self.useCondition = False
    self.condition = None
    self.insideCondition = False

    self.initializeModel()

    # The model paths and args read below are not set in this method;
    # presumably initializeModel() provides them -- confirm.
    midi_model_info = {'path': self.midiModelPath, 'args': self.args}
    key_model_info = {'path': self.keyModelPath, 'args': self.argsKey}
    self.info = {
        'type': self.parentPlayer.type,
        'name': self.parentPlayer.name,
        'midiModel': midi_model_info,
        'keyModel': key_model_info,
    }
def test_debug_net(src_lang_batch, trg_lang_batch):
    """Run one forward pass of ``Net`` on the given source/target batches."""
    settings = dict(src='en', trg='hu', num_return_sequences=3, num_beams=3,
                    cuda=False)
    model = Net(Params(**settings))
    # Checkpoint loading is intentionally disabled here:
    # checkpoint = torch.load('../experiments/Net/runs/last.pth.tar',
    #                         map_location=torch.device('cpu'))
    # model.load_state_dict(checkpoint['state_dict'])
    model(src_lang_batch, trg_lang_batch)
def test_debug_net2(src_lang_batch, trg_lang_batch):
    """Load the best debug checkpoint into ``Net2`` and run one forward pass."""
    settings = dict(src='en', trg='hu', num_codes=3, gumbel_temp=0.1,
                    num_beams=3, max_length=20, cuda=False)
    model = Net2(Params(**settings))

    # Weights are mapped onto the CPU regardless of where they were saved.
    cpu = torch.device('cpu')
    checkpoint = torch.load('../experiments/debug_model/runs/best.pth.tar',
                            map_location=cpu)
    model.load_state_dict(checkpoint['state_dict'])

    model(src_lang_batch, trg_lang_batch)
def get_parameters():
    """Merge command-line options into the hyper-parameters of a model directory.

    Returns:
        The ``Params`` loaded from ``<model_dir>/config.json`` with every
        parsed command-line option written on top. In ``'test'`` mode the
        batch size is forced to 1.

    Raises:
        FileNotFoundError: If ``config.json`` is missing from the model
            directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', default='0', type=str)
    parser.add_argument('--model_dir', '--md',
                        default='experiments/base_model', type=str)
    parser.add_argument('--mode', '--m', default='train', type=str)
    parser.add_argument('--resume', '--r', action='store_true')
    parser.add_argument('--beam_size', '--bs', default=0, type=int)
    parser.add_argument('--heatmap', '--hm', action='store_true')
    args = parser.parse_args()

    hps_path = os.path.join(args.model_dir, 'config.json')
    if os.path.exists(hps_path) is False:
        raise FileNotFoundError('there is no config json file')

    hps = Params(hps_path)
    # Command-line values override whatever the JSON file defined.
    hps.dict.update(vars(args))

    if hps.mode == 'test':
        hps.batch_size = 1
    return hps
def __init__(self, appctxt):
    """Create a paused clock configured from ``bachDuet.json``.

    Args:
        appctxt: Application context used to locate ``bachDuet.json``.
    """
    super(Clock, self).__init__()
    self.stopit = False

    self.config = Params(appctxt.get_resource("bachDuet.json"))
    metronome_settings = self.config.default['metronome']
    self.metronomeBPM = metronome_settings["BPM"]
    self.setTimeSignature(self.config.timeSignature)

    # Counters start at zero and the clock starts paused.
    self.tick = 0
    self.globalTick = 0
    self.paused = True
def setUp(self):
    """Create dataset names, parameters, data loader and logger for the tests."""
    # Names of the three dataset splits, taken from the module config.
    self.trainset, self.valset, self.testset = (
        config.train_name, config.val_name, config.test_name)

    # Value used to pad tag sequences.
    self.tag_padding = -1

    # Dataset parameters, with model sizes fixed to 50 for the tests.
    params = Params(config.datasets_params_file)
    params.embedding_dim = 50
    params.lstm_hidden_dim = 50
    self.params = params

    self.loader = DataLoader(config.data_dir, self.params)
    self.logger = Logger.get()
def __init__(self, policy_cls, env, gamma, learning_rate,
             exploration_fraction, exploration_final_eps, batch_size,
             learning_starts, log_freq, verbose, train_freq,
             replay_memory_capacity):
    """Initialise the base agent and collect the DQN hyper-parameters.

    ``policy_cls``, ``env``, ``verbose`` and ``replay_memory_capacity`` are
    forwarded to the parent constructor; the remaining arguments (plus
    ``verbose``) are stored on ``self.params`` under their own names.
    ``self.params.DEVICE`` is ``cuda:0`` when CUDA is available, else ``cpu``.
    """
    super(BaseDQN, self).__init__(
        policy_cls=policy_cls,
        env=env,
        verbose=verbose,
        replay_memory_capacity=replay_memory_capacity)

    hyper_params = dict(
        gamma=gamma,
        learning_rate=learning_rate,
        exploration_fraction=exploration_fraction,
        exploration_final_eps=exploration_final_eps,
        batch_size=batch_size,
        learning_starts=learning_starts,
        log_freq=log_freq,
        verbose=verbose,
        train_freq=train_freq)
    self.params = Params(**hyper_params)

    device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
    self.params.DEVICE = torch.device(device_name)
def __init__(self, pianoRollView, appctxt):
    """Store the view, load the GUI configuration and create the note buffers.

    Args:
        pianoRollView: Stored as ``self.pianoRollView``.
        appctxt: Application context used to locate ``bachDuet.json``.
    """
    super().__init__()

    self.pianoRollView = pianoRollView
    self.appctxt = appctxt
    self.showMidiKeyboardFlag = True

    self.config = Params(appctxt.get_resource("bachDuet.json"))
    self.midiPositions = self.config.gui["keyPos"]

    # Buffers that keep the past 205 notes for both voices; the notes in
    # these buffers are plotted to form the piano roll view.
    history_length = 205  # TODO
    self.busMidiKeyboard = deque(maxlen=history_length)
    self.busDnn = deque(maxlen=history_length)
def predict(config):
    """Translate ``config.input`` with the trained Transformer and print it.

    The cleaned input is tokenized, mapped to indices with the ``kor``
    vocabulary, decoded greedily for ``params.max_len`` steps, and the output
    indices are mapped back to tokens with the ``eng`` vocabulary.

    Args:
        config: Object exposing ``input``, the sentence to translate.
    """
    params = Params('config/params.json')

    # Load tokenizer and torchtext Fields. The files are opened with ``with``
    # so the handles are closed even when unpickling fails.
    # NOTE(security): pickle.load can execute arbitrary code; only load pickle
    # files that come from a trusted source.
    with open('pickles/tokenizer.pickle', 'rb') as pickle_tokenizer:
        cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    with open('pickles/kor.pickle', 'rb') as pickle_kor:
        kor = pickle.load(pickle_kor)

    with open('pickles/eng.pickle', 'rb') as pickle_eng:
        eng = pickle.load(pickle_eng)

    # Select model and load trained weights
    model = Transformer(params)
    model.load_state_dict(torch.load(params.save_model))
    model.to(params.device)
    model.eval()

    # Named ``cleaned`` rather than ``input`` to avoid shadowing the builtin.
    cleaned = clean_text(config.input)

    # Convert input into a tensor and forward it through the model
    tokenized = tokenizer.tokenize(cleaned)
    indexed = [kor.vocab.stoi[token] for token in tokenized]

    # [1, source length]: unsqueeze to add batch size
    source = torch.LongTensor(indexed).unsqueeze(0).to(params.device)
    target = torch.zeros(1, params.max_len).type_as(source.data)

    encoder_output = model.encoder(source)
    next_symbol = eng.vocab.stoi['<sos>']

    # Greedy decoding: write the current symbol at position i, re-run the
    # decoder and take the arg-max prediction at that position.
    for i in range(0, params.max_len):
        target[0][i] = next_symbol
        # dec_output = [1, target length, output dim]
        dec_output = model.decoder(target, source, encoder_output)
        prob = dec_output.squeeze(0).max(dim=-1, keepdim=False)[1]
        next_word = prob.data[i]
        next_symbol = next_word.item()

    # After the arg-max, target = [target length] filled with word indices
    target = model(source, target)
    target = torch.argmax(target.squeeze(0), -1)

    # The first output token is dropped before joining.
    translation = [eng.vocab.itos[token] for token in target][1:]
    translation = ' '.join(translation)

    print(f'kor> {config.input}')
    print(f'eng> {translation.capitalize()}')
def on_target(self, target_data):
    """Turn towards a detected target while driving forward.

    The error returned by ``self.get_error`` is divided by the camera width,
    fed to ``self.object_pid`` together with the time since the previous call,
    and the negated controller output is sent as the angular velocity.

    Args:
        target_data: Detection record; ``target_data['class']`` is printed in
            debug mode and the record is passed to ``self.get_box``.
    """
    box = self.get_box(target_data)
    err, offset = self.get_error(box)
    _, width = self.camera_res
    err = err / width  # normalize in %

    dt = self.time_elapsed - self.last_elapsed
    ang_vel = self.object_pid.step(err, dt)

    if self.DEBUG:
        print('---------------')
        print('err   : {:.2f}'.format(err))
        print('offset: {:.2f}'.format(offset))
        print('dt    : {:.2f}'.format(dt))
        print('vel   : {:.2f}'.format(ang_vel))
        # format() must be applied to the string: print() returns None, so
        # calling .format on its result raised AttributeError.
        print('Found {}'.format(target_data['class']))

    self.last_elapsed = self.time_elapsed
    self.on_target_turn_on_leds(box)
    self.update_vel(Params(self.FORWARD_VEL), Params(z=-ang_vel))
def main():
    """Run style transfer on the image or video named in the params file.

    The ``--params_path`` option is required. The text after the last ``.`` in
    ``params.content_target`` selects the pipeline: ``jpg``, ``jpeg``, ``png``
    and ``tiff`` (in any letter case) are treated as images, everything else
    as video.
    """
    parser = ArgumentParser()
    parser.add_argument("--params_path",
                        type=str,
                        help="path to `params.json`",
                        required=True)
    args = parser.parse_args()

    params = Params(args.params_path)
    print(params)

    # Compare case-insensitively so that e.g. "photo.JPG" is still an image.
    extension = params.content_target.rsplit(".", 1)[-1].lower()
    if extension in ("jpg", "jpeg", "png", "tiff"):
        style_transfer_image(params)
    else:
        style_transfer_video(params)
def main(config):
    """Train the model or run inference, depending on ``config.mode``.

    Args:
        config: Object exposing ``mode``; ``'train'`` selects training, any
            other value runs inference.
    """
    params = Params('config/params.json')

    if config.mode != 'train':
        # Inference path: a single test iterator.
        test_data = load_dataset(config.mode)
        test_iter = make_iter(params.batch_size, config.mode,
                              test_data=test_data)
        trainer = Trainer(params, config.mode, test_iter=test_iter)
        trainer.inference()
        return

    # Training path: train and validation iterators.
    train_data, valid_data = load_dataset(config.mode)
    train_iter, valid_iter = make_iter(params.batch_size, config.mode,
                                       train_data=train_data,
                                       valid_data=valid_data)
    trainer = Trainer(params, config.mode,
                      train_iter=train_iter, valid_iter=valid_iter)
    trainer.train()
def convert_particle_position_to_params(position):
    """
    Builds controller params from a particle position.

    The first four entries of the position are read, in this order:
    max_linear_speed_command, kp, ki, kd.

    :param position: particle position.
    :type position: numpy array.
    :return: controller params.
    """
    field_order = ('max_linear_speed_command', 'kp', 'ki', 'kd')
    params = Params()
    for index, field in enumerate(field_order):
        setattr(params, field, position[index])
    return params
def __init__(self, token):  # , reply_dict, **kwargs):
    """Connect to Telegram and MongoDB and build the bot-selection keyboard.

    Args:
        token: Telegram bot token.
    """
    print("Bot initialization.")

    # initialize telegram bot
    self.bot = telegram.Bot(token)
    try:
        self.update_id = self.bot.get_updates()[0].update_id
    except IndexError:
        # No pending updates.
        self.update_id = None

    # initialize bot responses and parameters
    self.reply_dict = get_response_dict()
    self.reply_dict_informal = get_response_dict_informal()
    self.params = Params()
    self.config = Config()

    # initialize database
    if self.params.MODE != Modes.TEXT:
        self.db = MongoClient().voicebot_telegram
    else:
        # MongoClient().textbot_telegram
        self.db = MongoClient().study_Informal_Nov_2019

    # initialize user info
    self.user_history = defaultdict(list)
    # self.user_name_dict = self.load_names(self.db.user_history)
    self.user_bot_state_dict = defaultdict(lambda: (None, None))
    self.user_problem_dict = {}
    self.condition = True
    # self.user_parameters_dict = self.load_user_parameters(self.db.user_history)
    loaded = self.load_parameters(self.db.user_history)
    self.user_name_dict, self.user_parameters_dict, self.ids = loaded

    # One button per bot name (list positions 4 and 7 are left out),
    # preceded by a "Choose for me" button.
    skipped_positions = {4, 7}
    keyboards = [telegram.InlineKeyboardButton("Choose for me")]
    keyboards += [
        telegram.InlineKeyboardButton(name)
        for idx, name in enumerate(self.params.bot_name_list)
        if idx not in skipped_positions
    ]

    # Lay the buttons out two per row; an odd one out gets its own row.
    self.bots_keyboard = [
        [left, right]
        for left, right in zip(keyboards[0::2], keyboards[1::2])
    ]
    if len(keyboards) % 2 == 1:
        self.bots_keyboard.append([keyboards[-1]])
def date_report(start_time=-1, end_time=-1, weekday=()):
    """Write a usage report for a time interval to ``output/time_report.txt``.

    Every document of the ``chatbot.user_history`` collection whose history
    lies inside the interval (and, when ``weekday`` is non-empty, started on
    one of the given weekdays) is counted.

    Args:
        start_time: Start of the interval, passed to ``get_time_interval``.
        end_time: End of the interval, passed to ``get_time_interval``.
        weekday: Collection of weekday keys to keep; empty keeps every day.
    """
    pymongo_client = MongoClient()
    db = pymongo_client.chatbot
    collection = db.user_history
    # Result is not used here; the call is kept in case it has side effects.
    get_text_from_db()
    start_time_formatted, end_time_formatted = get_time_interval(
        start_time, end_time)

    user_set = set()
    bot_choice = defaultdict(list)
    weekday_choice = defaultdict(int)
    separator = '==================================\n'

    # ``with`` guarantees the report file is closed even if a query or a
    # lookup below raises.
    with open('output/time_report.txt', 'w') as out:
        params = Params()

        for each in collection.find({}):
            history = each['user_history']
            # Skip histories that start before or end after the interval.
            if start_time_formatted > history[0][-1] \
                    or end_time_formatted < history[-1][-1]:
                continue
            each_weekday = get_weekday(history[0][-1])
            if weekday and each_weekday not in weekday:
                continue
            user_id = each['thread_id']
            user_set.add(user_id)
            bot_choice[history[0][0]].append(user_id)
            weekday_choice[each_weekday] += 1

        out.write('total number of user: {}\n'.format(len(user_set)))
        out.write(separator)
        out.write('user list\n')
        for each in user_set:
            out.write("{}\n".format(each))
        out.write(separator)

        if weekday:
            for key, val in weekday_choice.items():
                out.write("There are {} users on {}\n".format(
                    val, weekdays[key]))
            out.write(separator)

        for key, val in bot_choice.items():
            out.write("{} is used {} time(s)\n".format(
                params.bot_name_list[key], len(val)))
            out.write("{}\n".format(val))

        if user_set:
            out.write(separator)
def __init__(self, line_follower, track):
    """
    Creates the simulation: the robot is reset to the track's initial point
    (heading 0.0) and a sprite matching the robot's dimensions is built.

    :param line_follower: the line follower robot.
    :type line_follower: LineFollower.
    :param track: the line track.
    :type track: Track.
    """
    self.line_follower = line_follower
    self.track = track

    initial_point = self.track.get_initial_point()
    initial_pose = Pose(initial_point.x, initial_point.y, 0.0)
    self.line_follower.reset(initial_pose)

    # Points visited by the robot, kept to draw its path.
    self.point_list = []

    # Sprite parameters: two fixed drawing thicknesses plus the robot's own
    # geometry.
    sprite_params = Params()
    sprite_params.wheel_thickness = 0.01
    sprite_params.sensor_thickness = 0.02
    for attribute in ('wheel_radius', 'wheels_distance', 'sensor_offset'):
        setattr(sprite_params, attribute, getattr(line_follower, attribute))
    sprite_params.array_width = line_follower.line_sensor.array_width

    # Creating the robot sprite
    self.sprite = RobotSprite(sprite_params)
def __init__(self, token):  # , reply_dict, **kwargs):
    """Connect to Telegram and build the bot-selection keyboard.

    Args:
        token: Telegram bot token.
    """
    print("Bot initialization.")

    # initialize telegram bot
    self.bot = telegram.Bot(token)
    self.msg_engine = Message()
    try:
        self.update_id = self.bot.get_updates()[0].update_id
    except IndexError:
        # No pending updates.
        self.update_id = None

    self.params = Params()
    self.config = Config()

    # One button per bot name (list positions 4 and 7 are left out),
    # preceded by a "Choose for me" button.
    skipped_positions = {4, 7}
    keyboards = [telegram.InlineKeyboardButton("Choose for me")]
    keyboards += [
        telegram.InlineKeyboardButton(name)
        for idx, name in enumerate(self.params.bot_name_list)
        if idx not in skipped_positions
    ]

    # Lay the buttons out two per row; an odd one out gets its own row.
    self.bots_keyboard = [
        [left, right]
        for left, right in zip(keyboards[0::2], keyboards[1::2])
    ]
    if len(keyboards) % 2 == 1:
        self.bots_keyboard.append([keyboards[-1]])
def search_hyperparam(hyperparam, parent_dir, data_dir, checkpoint,
                      params_filename):
    """Train one model per candidate value of a single hyper-parameter.

    For every candidate a sub-directory ``<name>_<candidate>`` is created
    under ``parent_dir``, the parent's params file is written there with the
    hyper-parameter set to the candidate, and ``train_model`` is run on it.

    Args:
        hyperparam: Tuple ``(name, candidates)``.
        parent_dir: Directory holding the base params file.
        data_dir: Passed through to ``train_model``.
        checkpoint: Passed through to ``train_model``.
        params_filename: File name of the params file inside each directory.

    Returns:
        List of the experiment directories, in candidate order.
    """
    assert isinstance(hyperparam, tuple)
    name, candidates = hyperparam

    params_file = os.path.join(parent_dir, params_filename)
    msg = "Parent directory doesn't contain params file."
    assert os.path.isfile(params_file), msg
    params = Params(params_file)

    model_dirs = []
    for candidate in candidates:
        model_dir = os.path.join(parent_dir,
                                 '{}_{}'.format(name, candidate))
        if not os.path.isdir(model_dir):
            os.makedirs(model_dir)
        model_dirs.append(model_dir)

        # create params file for the experiment
        params.set(name, candidate)
        params.dump(os.path.join(model_dir, params_filename))

        # run subprocess to train model
        train_model(model_dir, data_dir, checkpoint)

    return model_dirs
outputs) # (batch_size, seq_len, hidden_size[embedding_dim]) # (seq_len, batch_size, tag_size) lstm_feats = self.bilstm.get_lstm_features( sequence_output.transpose(1, 0), attention_mask.transpose(1, 0)) # CRF if labels is not None: # total scores forward_score = self.crf(lstm_feats, attention_mask.transpose(1, 0)) gold_score = self.crf.score_sentence( lstm_feats, labels.transpose(1, 0), attention_mask.transpose(1, 0)) loss = (forward_score - gold_score).sum() return loss else: # 维特比算法 best_paths = self.crf.viterbi_decode( lstm_feats, attention_mask.transpose(1, 0)) return best_paths if __name__ == '__main__': from utils import Params params = Params() model = BertForTokenClassification.from_pretrained(params.bert_model_dir, params=params) param_optimizer = list(model.named_parameters()) for n, _ in param_optimizer: print(n)
def net2():
    """Return a ``Net2`` model built from a fixed en->hu configuration."""
    settings = dict(src='en', trg='hu', num_codes=1, gumbel_temp=0.5,
                    num_beams=3, max_length=20, cuda=False)
    return Net2(Params(**settings))
def test_net(src_lang_batch, trg_lang_batch):
    """Run one forward pass of ``Net`` on the given batches and return the model."""
    settings = dict(src='en', trg='hu', num_beams=3, max_length=20,
                    num_return_sequences=3, cuda=False)
    model = Net(Params(**settings))
    model(src_lang_batch, trg_lang_batch)
    return model
def dump(self, data_dir, encoding='utf8', shuffle=True, min_count_word=1,
         min_count_tag=1):
    """Write datasets, vocabularies and dataset parameters to disk.

    Do dirty job, you should modify it to suit for your project.

    Args:
        data_dir: Directory that receives one sub-directory per dataset and
            the two vocabulary files.
        encoding: Text encoding used for every file written.
        shuffle: Passed to ``self.datasets``.
        min_count_word: Passed as ``min_count`` when listing the word vocab.
        min_count_tag: Passed as ``min_count`` when listing the tag vocab.
    """
    log = self.logger

    # datasets params
    params = Params(data={
        'word_vocab_size': 0,
        'tag_vocab_size': 0,
        'pad_word': self.PAD_WORD,
        'unk_word': self.UNK_WORD,
        'pad_tag': self.PAD_TAG
    })

    # dataset and vocab
    tag_vocab = VocabCounter([self.PAD_TAG])
    word_vocab = VocabCounter([self.PAD_WORD, self.UNK_WORD])
    datasets = self.datasets(shuffle=shuffle)

    # save train/val/test dataset
    for dataset in datasets:
        name = dataset.name
        log.info('Saving {} dataset...'.format(name))
        # add dataset size
        params.set('{}_size'.format(name), len(dataset))

        dirpath = os.path.join(data_dir, name)
        if not os.path.isdir(dirpath):
            os.makedirs(dirpath)

        sentences_file = os.path.join(dirpath, self.sentences_filename)
        labels_file = os.path.join(dirpath, self.labels_filename)
        with open(sentences_file, 'w', encoding=encoding) as fs, \
                open(labels_file, 'w', encoding=encoding) as fl:
            # One line per sample in each file; both vocab counters are
            # updated from the same samples.
            for sample in dataset:
                words = sample.words
                tags = sample.tags
                fs.write('{}\n'.format(' '.join(words)))
                fl.write('{}\n'.format(' '.join(tags)))
                tag_vocab.update(tags)
                word_vocab.update(words)
        log.info('- done!')

    params.word_vocab_size = len(word_vocab)
    params.tag_vocab_size = len(tag_vocab)

    # save word vocab
    log.info('Saving word vocab...')
    word_vocab_file = os.path.join(data_dir, self.word_vocab_filename)
    with open(word_vocab_file, 'w', encoding=encoding) as f:
        for word in word_vocab.get(min_count=min_count_word):
            f.write('{}\n'.format(word))
    log.info('- done!')

    # save tag vocab
    log.info('Saving tag vocab...')
    tag_vocab_file = os.path.join(data_dir, self.tag_vocab_filename)
    with open(tag_vocab_file, 'w', encoding=encoding) as f:
        for tag in tag_vocab.get(min_count=min_count_tag):
            f.write('{}\n'.format(tag))
    log.info('- done!')

    # save datasets parameters
    log.info('Saving datasets parameters...')
    params.dump(self.datasets_params_file, encoding=encoding)
    log.info('- done!')

    # print dataset characteristics
    log.info("Characteristics of the dataset:")
    for key, value in params:
        log.info("- {}: {}".format(key, value))