def main(config):
    """Train a model from ``config``, optionally save it, then validate it.

    ``init_trainer(config)`` supplies both the model and the resolved ``args``
    mapping. The ``args`` keys read here are ``data_path``, ``save_model``,
    ``model_dir``, ``save_validation`` and ``validation_dir``. The two
    ``save_*`` flags are evaluated for truthiness.

    Args:
        config: Run configuration handed to ``init_trainer``. When
            ``args['save_validation']`` is set, ``config['output']`` is
            overwritten with ``results['similarity']`` and the whole object
            is pickled, so the caller's ``config`` is mutated.

    Returns:
        None. Results are logged and, when requested, written to disk.
    """
    import os  # local import: only needed to build the validation file path

    # Initialise the model type and arguments
    model, args = init_trainer(config)
    logger.info(args)

    # Read training data
    dataset = Dataset(args['data_path'], args)
    corpus = dataset.get_corpus()

    # Train model
    model.train(corpus)

    # Save run
    if args['save_model']:
        logger.info('Saving Model')
        model.save(args['model_dir'])
        dataset.save(args['data_path'])

    # Perform validation
    valid = Validation()
    x = model.get_vectors()
    df = dataset.get_df()
    # PCA plotting is currently disabled:
    # valid.plot_pca(x, df['variety_region'])
    results = valid.cluster_similarities(x, df)
    logger.info(results)

    if args['save_validation']:
        logger.info('Saving Validation')
        config['output'] = results['similarity']
        # NOTE(review): str(datetime.now()) contains spaces and ':' characters,
        # which are not valid in Windows file names. The format is kept
        # unchanged so existing tooling that looks for these files still works.
        file_name = '{}.pkl'.format(datetime.now())
        # Join instead of concatenating so a validation_dir without a trailing
        # separator still places the file inside the directory.
        validation_path = os.path.join(args['validation_dir'], file_name)
        with open(validation_path, "wb") as pickle_file:
            pickle.dump(config, pickle_file)
def main(args):
    """Run the TF-IDF pipeline: load data, train, save the run, validate.

    Args:
        args: Mapping that provides ``data_path`` (passed to ``Dataset``) and
            ``model_dir`` (passed to both ``save`` calls).

    Returns:
        None. The cluster-similarity result is printed to stdout.
    """
    # Load the dataset and pull out the training corpus
    data = Dataset(args['data_path'])
    documents = data.get_corpus()

    # Fit the TF-IDF trainer on the corpus
    trainer = TfidfTrainer()
    trainer.train(documents)

    # Persist the run: the model and the dataset both go to model_dir
    run_dir = args['model_dir']
    trainer.save(run_dir)
    data.save(run_dir)

    # Validate: PCA plot over the 'variety_region' column, then similarities
    validator = Validation()
    vectors = trainer.get_vectors()
    frame = data.get_df()
    validator.plot_pca(vectors, frame['variety_region'])
    similarities = validator.cluster_similarities(vectors, frame)
    print(similarities)