def main(args): """ Load generated model checkpoints from by default in /checkpoint/run1 and generate new text """ try: config_path = project_path + "/" + args.config input_data_path = project_path + "/" + args.input output_data_path = project_path + "/" + args.output config = load_config(config_path) # load data df = read_csv(input_data_path) lines = list(df['raw_line']) random.seed(config['generate']['random_seed']) sample_seeds = random.choices(lines, k=config['generate']['num']) sess = gpt2.start_tf_sess() gpt2.load_gpt2(sess) pred = [] for i in sample_seeds: out = gpt2.generate(sess, prefix=i, **config['generate']['generator']) pred.append(out) pred_df = pd.DataFrame(pred, columns=['raw_line']) save_csv(pred_df, output_data_path) except Exception as e: logger.error( "Unexpected error occurred when generating dialogues with gpt2: " + str(e))
def main(args): """ main function perform data augmentation with clean data and save the augmented data to csv :param args: (argparse) user-input configuration file """ # try: config_path = project_path + "/" + args.config input_data_path = project_path + "/" + args.input output_data_path = project_path + "/" + args.output config = load_config(config_path) # load data df = read_csv(input_data_path) lines = list(df['line']) charactors = list(df['label']) augmented = augment(lines, config['aug']) # Union original lines and augmented lines df2 = pd.DataFrame(list(zip(charactors, augmented)), columns=['label', 'line']) df = df[['label', 'line']] df['type'] = 'original' df2['type'] = 'augmented' result = pd.concat([df, df2]) save_csv(result, output_data_path)
def main(args): """ main function to split data :param args: (argparse) user-input configuration file """ try: config_path = project_path + "/" + args.config input_data_path = project_path + "/" + args.input out_train_path = project_path + "/" + args.output_train out_test_path = project_path + "/" + args.output_test config = load_config(config_path) df = read_csv(input_data_path) df_train, df_test = split(df, **config['split_data']) # Write to output file save_csv(df_train, out_train_path) save_csv(df_test, out_test_path) except ValueError as e1: logger.error("ValueError: " + str(e1) + " Please validate Values in the configuration file.") except Exception as e: logger.error("Unexpected error occurred when splitting data: " + str(e))
def main(args): """ main function to load raw data, clean data and save leaned data to csv :param args: (argparse) user-input configuration file """ try: config_path = project_path + "/" + args.config input_data_path = project_path + "/" + args.input output_data_path = project_path + "/" + args.output config = load_config(config_path) # load data logger.info("Trying to load data from %s", input_data_path) with open(input_data_path, 'r') as f: text = f.read() logger.info("Successfully loaded data from {}".format(input_data_path)) clean_data = clean(text, **config['clean']) # Write to output file save_csv(clean_data, output_data_path) except KeyError as e3: logger.error("KeyError: " + str(e3)) except FileNotFoundError as e1: logger.error('FileNotFoundError: {}'.format(e1)) except Exception as e: logger.error("Unexpected error occurred when cleaning data: " + str(e))
def main(args): """ main function to load cleaned data, conduct eda, visualize most important tokens with tfidf score :param args: (argparse) user-input configuration file """ try: config_path = project_path + "/" + args.config input_data_path = project_path + "/" + args.input output_data_path = project_path + "/" + args.output config = load_config(config_path) # load data df = read_csv(input_data_path) df.loc[:, 'season'] = df['season'].astype('int') sys.stdout = open(output_data_path, 'w') check_balance(df) check_linelen(df, config['eda']['quantile']) groups = config['eda']['groups'] for i in range(len(groups)): df_top_words = most_important_words(df, groups[i], **config['eda']['top_n_words']) fig = plot_tfidf_classfeats_h(df_top_words) fig.savefig('{}/EDA/top_words_{}.png'.format(project_path, i)) except Exception as e: logger.error("Unexpected error occurred when eda: " + str(e))
def reproducibility_tests(args):
    """Compare the files generated by each model pipeline step against the expected files listed in the config file."""
    config_path = project_path + "/" + args.config
    modules = load_config(config_path)

    all_passed = True
    for module in modules:
        # look up the paths of the test outcome and the expected outcome for this module
        conf = modules[module]

        # compare whether the csv files generated by the model pipeline are the same
        # as the expected files located in the test/true folder
        true_dir, test_dir = conf["true_dir"], conf["test_dir"]
        files_to_compare = [
            f for f in conf["files_to_compare"]
            if f.split('.')[-1] not in dict_file_types
        ]
        match, mismatch, errors = filecmp.cmpfiles(true_dir, test_dir,
                                                   files_to_compare,
                                                   shallow=True)

        # the test fails if any file mismatches or if no file matches at all
        if len(mismatch) > 0 or len(match) == 0:
            logger.error(
                "{} file(s) do(es) not match, reproducibility test of model pipeline step {}: FAILED"
                .format(mismatch, module))
            all_passed = False
        else:
            logger.info(
                "Reproducibility test of model pipeline stage {}: PASSED".format(module))

    if all_passed:
        logger.info("Success, all reproducibility tests passed!")
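# A sketch of the config that load_config is expected to return for the tests
# above: one entry per pipeline step, each naming the directory of expected
# outputs, the directory of freshly generated outputs, and the files to diff.
# The step names, paths, and file names below are illustrative placeholders.
example_modules = {
    'clean': {
        'true_dir': 'test/true/clean',
        'test_dir': 'test/out/clean',
        'files_to_compare': ['clean.csv'],
    },
    'split_data': {
        'true_dir': 'test/true/split',
        'test_dir': 'test/out/split',
        'files_to_compare': ['train.csv', 'test.csv'],
    },
}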
def main(args): """ main function to create object table for products form the cleaned transactions :param args: (argparse) user-input configuration file """ try: config_path = project_path + "/" + args.config input_data_path = project_path + "/" + args.input output_data_path = project_path + "/" + args.output config = load_config(config_path) df = read_csv(input_data_path) product = product_dim(df, **config['product_dim']) # Write to output file save_csv(product, output_data_path) except Exception as e: logger.error("Unexpected error occurred when creating object table for products: " + str(e))
def main(args): """ main function to load raw data, clean data and save leaned data to csv :param args: (argparse) user-input configuration file """ try: config_path = project_path + "/" + args.config input_data_path = project_path + "/" + args.input output_data_path = project_path + "/" + args.output config = load_config(config_path) df = read_csv(input_data_path) clean_data = clean(df, **config['clean']) # Write to output file save_csv(clean_data, output_data_path) except KeyError as e3: logger.error("KeyError: " + str(e3)) except Exception as e: logger.error("Unexpected error occurred when cleaning data: " + str(e))
def main(args): """ main function to run the market basket analysis and save the recommendations to csv :param args: (argparse) user-input configuration file """ try: config_path = project_path + "/" + args.config input_data_path = project_path + "/" + args.input output_data_path = project_path + "/" + args.output product_path = project_path + "/" + conf.PRODUCT_DIM config = load_config(config_path) df = read_csv(input_data_path) product = read_csv(product_path) result = train(df, **config['train']) # Join product object table to get the name and price. final_results = join_info(result, product, "StockCode", "StockCode") final_results = join_info(final_results, product, "rec1", "StockCode") final_results = join_info(final_results, product, "rec2", "StockCode") # format conf final_results['conf1'] = round(final_results['conf1'] * 100, 2) final_results['conf2'] = round(final_results['conf2'] * 100, 2) final_results = final_results[config["result_columns"]] # Write to output file save_csv(final_results, output_data_path) except KeyError as e3: logger.error("KeyError: " + str(e3)) except ValueError as e4: logger.error("ValueError: " + str(e4) + " Please validate Values in the configuration file.") except Exception as e: logger.error( "Unexpected error occurred when making recommendations: " + str(e))
def main(args): """ main function to load cleaned data, create baskets and same baskets to csv :param args: (argparse) user-input configuration file """ try: config_path = project_path + "/" + args.config input_data_path = project_path + "/" + args.input output_data_path = project_path + "/" + args.output config = load_config(config_path) df = read_csv(input_data_path) basket = create_basket(df, **config['create_basket']) # Write to output file save_csv(basket, output_data_path, index=True) except KeyError as e1: logger.error("KeyError: " + str(e1)) except ValueError as e2: logger.error("ValueError: " + str(e2) + " Please validate Values in the configuration file.") except Exception as e: logger.error("Unexpected error occurred when creating basket: " + str(e))
import argparse
import logging
from os import path

import numpy as np
import torch.nn.functional as F
from flask import Flask
from sklearn import preprocessing

from src.bert_classification import pro_pipline

logging.basicConfig(format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize the Flask application
app = Flask(__name__, template_folder="templates")

# Configuration file
config_path = config.CONFIG_YAML
configs = load_config(config_path)

# Load model
project_path = path.dirname(path.abspath(__file__))

# which model to load; command-line values override the configuration file
parser = argparse.ArgumentParser()
parser.add_argument("--num_epoch", help="number of epochs for training the model")
parser.add_argument("--batch_size", help="batch size for training the model")
parser.add_argument("--max_length", help="max length of reviews")
args = parser.parse_args()

if not args.max_length:
    max_length = configs['bert']['max_length']
else:
    max_length = int(args.max_length)
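# For reference, a sketch of the `bert` section that load_config is expected to
# return, based on the keys read above and in the training script below; the
# values are placeholders, not the project's actual settings.
example_bert_config = {
    'max_length': 128,           # fallback for --max_length
    'batch_size': 16,            # fallback for --batch_size
    'num_epoch': 3,              # fallback for --num_epoch
    'seed': 42,                  # seeds python/numpy/torch in training
    'training_test_split': {},   # kwargs for training_test_split()
    'tokenize': {},              # tokenizer options passed to pro_pipline()
    'optimizer': {'lr': 2e-5},   # kwargs for AdamW
}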
def main(args): """ main function to fune tuning bert classification model :param args: (argparse) user-input configuration file """ try: config_path = project_path + "/" + args.config input_data_path = project_path + "/" + args.input model_path = project_path + "/" + args.model evaluation_path = project_path + "/" + args.evaluation config = load_config(config_path) # load data df = read_csv(input_data_path) # # -- debug # df = df[:100] # Encode the classes for BERT. encoder = preprocessing.LabelEncoder() df['label'] = encoder.fit_transform(df['label']) # Split data into training and test sets. X_train, X_test, y_train, y_test = training_test_split( df, **config['bert']['training_test_split']) # Bert tokenization logger.info("Tokenizing...") tokenizer = transformers.BertTokenizer.from_pretrained( 'bert-base-uncased', do_lower_case=True) if not args.max_length: max_length = config['bert']['max_length'] else: max_length = int(args.max_length) # DataLoaders for running the model if not args.batch_size: batch_size = config['bert']['batch_size'] else: batch_size = int(args.batch_size) dataloader_train = pro_pipline(X_train, tokenizer, max_length, config['bert']['tokenize'], batch_size, y_train) dataloader_test = pro_pipline(X_test, tokenizer, max_length, config['bert']['tokenize'], batch_size, y_test) # Initialize the model. model = transformers.BertForSequenceClassification.from_pretrained( "bert-base-uncased", num_labels=df['label'].nunique(), output_attentions=False, output_hidden_states=False) # Setting optimizer optimizer = AdamW(model.parameters(), **config['bert']['optimizer']) # Setting epochs if not args.num_epoch: epochs = config['bert']['num_epoch'] else: epochs = int(args.num_epoch) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=len(dataloader_train) * epochs) # Setting seeds seed = config['bert']['seed'] random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) # Write prints to .txt model_name = 'max_length' + str(max_length) + 'batch_size' + str( batch_size) + 'num_epoch' + str(epochs) e_dir = evaluation_path + "/" + model_name if not os.path.exists(e_dir): os.makedirs(e_dir) sys.stdout = open(e_dir + "/" + model_name + '.txt', 'w') logger.info("Training... 
and evaluations will be saved into %s", e_dir) device = torch.device('cuda') # device = torch.device('cpu') model.to(device) complete_epoch, training_loss, test_accuracy = [], [], [] for epoch in tqdm(range(1, epochs + 1)): model.train() loss_train_total = 0 progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False) for batch in progress_bar: model.zero_grad() batch = tuple(b.to(device) for b in batch) inputs = { 'input_ids': batch[0].to(device), 'attention_mask': batch[1].to(device), 'labels': batch[2].to(device), } outputs = model(**inputs) loss = outputs[0] loss_train_total += loss.item() loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) optimizer.step() scheduler.step() progress_bar.set_postfix({ 'training_loss': '{:.3f}'.format(loss.item() / len(batch)) }) # training loss tqdm.write(f'\nEpoch {epoch}') loss_train_avg = loss_train_total / len(dataloader_train) training_loss.append(loss_train_avg) tqdm.write(f'Training loss: {loss_train_avg}') # evaluate the model plt, val_accuracy = run_evaluation(dataloader_test, model, device, encoder) plt.savefig(e_dir + "/" + model_name + '-' + str(epoch) + '.png') test_accuracy.append(val_accuracy) complete_epoch.append(epoch) loss_plt = plot_loss(complete_epoch, training_loss, test_accuracy) loss_plt.savefig(e_dir + "/" + model_name + '_loss' + '.png') # save the model for future use/retrain output_dir = model_path + '/' + model_name + "/" if not os.path.exists(output_dir): os.makedirs(output_dir) logging.info("Saving model to %s" % output_dir) model_to_save = model.module if hasattr( model, 'module') else model # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) except KeyError as e3: logger.error("KeyError: " + str(e3)) except Exception as e: logger.error("Unexpected error occurred when training with Bert: " + str(e))
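# A minimal sketch of the entry point that would drive main() above; the flag
# names mirror the attributes read from `args` in main() (config, input, model,
# evaluation, plus the optional hyperparameter overrides), but the default
# paths are illustrative placeholders, not the repo's actual defaults.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description="Fine-tune the BERT classifier")
    parser.add_argument("--config", default="config/config.yaml",
                        help="path to the configuration file, relative to the project root")
    parser.add_argument("--input", default="data/clean.csv",
                        help="path to the cleaned input csv")
    parser.add_argument("--model", default="models",
                        help="directory in which to save the fine-tuned model")
    parser.add_argument("--evaluation", default="evaluations",
                        help="directory in which to save evaluation plots and logs")
    parser.add_argument("--num_epoch", help="override config['bert']['num_epoch']")
    parser.add_argument("--batch_size", help="override config['bert']['batch_size']")
    parser.add_argument("--max_length", help="override config['bert']['max_length']")
    main(parser.parse_args())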