def main(args):
    """
    Load model checkpoints (by default from /checkpoint/run1) and generate new text.
    """
    try:
        config_path = project_path + "/" + args.config
        input_data_path = project_path + "/" + args.input
        output_data_path = project_path + "/" + args.output

        config = load_config(config_path)

        # load data
        df = read_csv(input_data_path)
        lines = list(df['raw_line'])
        random.seed(config['generate']['random_seed'])
        sample_seeds = random.choices(lines, k=config['generate']['num'])

        sess = gpt2.start_tf_sess()
        gpt2.load_gpt2(sess)

        pred = []
        for seed_line in sample_seeds:
            # NOTE: gpt2.generate() returns text only when return_as_list=True
            # is passed (here it would have to come in through
            # config['generate']['generator']); otherwise it prints to stdout
            # and returns None.
            out = gpt2.generate(sess,
                                prefix=seed_line,
                                **config['generate']['generator'])
            pred.append(out)

        pred_df = pd.DataFrame(pred, columns=['raw_line'])
        save_csv(pred_df, output_data_path)

    except Exception as e:
        logger.error(
            "Unexpected error occurred when generating dialogues with gpt2: " +
            str(e))
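These main() functions all read their paths off an argparse namespace. A minimal sketch of the wiring that would invoke the generator above; the flag names are inferred from the attributes accessed (args.config, args.input, args.output), and the default value is hypothetical:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Generate new dialogue lines with GPT-2')
    parser.add_argument('--config', default='config/config.yml',  # hypothetical default
                        help='YAML configuration file, relative to the project root')
    parser.add_argument('--input', help='input CSV, relative to the project root')
    parser.add_argument('--output', help='output CSV, relative to the project root')
    main(parser.parse_args())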
Example #2

def main(args):
    """
    Main function to perform data augmentation on the cleaned data and save the augmented data to CSV.
    :param args: (argparse) user-input configuration file
    """
    config_path = project_path + "/" + args.config
    input_data_path = project_path + "/" + args.input
    output_data_path = project_path + "/" + args.output

    config = load_config(config_path)

    # load data
    df = read_csv(input_data_path)

    lines = list(df['line'])
    characters = list(df['label'])

    augmented = augment(lines, config['aug'])

    # Union original lines and augmented lines
    df2 = pd.DataFrame(list(zip(characters, augmented)),
                       columns=['label', 'line'])
    df = df[['label', 'line']]

    df['type'] = 'original'
    df2['type'] = 'augmented'
    result = pd.concat([df, df2])

    save_csv(result, output_data_path)
Example #3
def main(args):
    """
    Main function to split the data into training and test sets.
    :param args: (argparse) user-input configuration file
    """
    try:
        config_path = project_path + "/" + args.config
        input_data_path = project_path + "/" + args.input
        out_train_path = project_path + "/" + args.output_train
        out_test_path = project_path + "/" + args.output_test

        config = load_config(config_path)
        df = read_csv(input_data_path)
        df_train, df_test = split(df, **config['split_data'])

        # Write to output file
        save_csv(df_train, out_train_path)
        save_csv(df_test, out_test_path)
    except ValueError as e1:
        logger.error("ValueError: " + str(e1) +
                     " Please validate Values in the configuration file.")
    except Exception as e:
        logger.error("Unexpected error occurred when splitting data: " +
                     str(e))
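The split parameters come straight from the configuration file. A sketch of what the split_data block might contain; the key names (test_size, random_state) are hypothetical, since only **config['split_data'] is visible above:

config = {
    'split_data': {
        'test_size': 0.2,     # hypothetical: fraction of rows held out for testing
        'random_state': 42,   # hypothetical: seed for a reproducible split
    }
}
df_train, df_test = split(df, **config['split_data'])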
Example #4

def main(args):
    """
    Main function to load the raw data, clean it, and save the cleaned data to CSV.
    :param args: (argparse) user-input configuration file
    """
    try:
        config_path = project_path + "/" + args.config
        input_data_path = project_path + "/" + args.input
        output_data_path = project_path + "/" + args.output

        config = load_config(config_path)

        # load data
        logger.info("Trying to load data from %s", input_data_path)
        with open(input_data_path, 'r') as f:
            text = f.read()
        logger.info("Successfully loaded data from {}".format(input_data_path))

        clean_data = clean(text, **config['clean'])

        # Write to output file
        save_csv(clean_data, output_data_path)
    except KeyError as e3:
        logger.error("KeyError: " + str(e3))
    except FileNotFoundError as e1:
        logger.error('FileNotFoundError: {}'.format(e1))
    except Exception as e:
        logger.error("Unexpected error occurred when cleaning data: " + str(e))
Example #5
def main(args):
    """
    Main function to load the cleaned data, conduct EDA, and visualize the most important tokens by TF-IDF score.
    :param args: (argparse) user-input configuration file
    """
    try:
        config_path = project_path + "/" + args.config
        input_data_path = project_path + "/" + args.input
        output_data_path = project_path + "/" + args.output

        config = load_config(config_path)

        # load data
        df = read_csv(input_data_path)
        df.loc[:, 'season'] = df['season'].astype('int')

        # redirect stdout so the EDA summaries are written to the output file
        # (note: stdout is never restored afterwards; see the sketch below)
        sys.stdout = open(output_data_path, 'w')
        check_balance(df)
        check_linelen(df, config['eda']['quantile'])

        groups = config['eda']['groups']

        for i, group in enumerate(groups):
            df_top_words = most_important_words(df, group,
                                                **config['eda']['top_n_words'])
            fig = plot_tfidf_classfeats_h(df_top_words)
            fig.savefig('{}/EDA/top_words_{}.png'.format(project_path, i))

    except Exception as e:
        logger.error("Unexpected error occurred when eda: " + str(e))
Example #6
def reproducibility_tests(args):
    """Runs commands in config file and compares the generated files to those that are expected."""
    config_path = project_path + "/" + args.config

    modules = load_config(config_path)

    all_passed = True
    for module in modules:
        # look up the expected and test output locations for this module
        conf = modules[module]
        # compare whether the CSV files generated by the model pipeline are
        # the same as the expected files located in the test/true folder
        true_dir, test_dir = conf["true_dir"], conf["test_dir"]
        files_to_compare = [
            f for f in conf["files_to_compare"]
            if f.split('.')[-1] not in dict_file_types
        ]
        match, mismatch, errors = filecmp.cmpfiles(true_dir,
                                                   test_dir,
                                                   files_to_compare,
                                                   shallow=True)
        # if any file mismatches, or nothing matched at all, the test fails
        if len(mismatch) > 0 or len(match) == 0:
            logger.error(
                "{} file(s) do(es) not match; reproducibility test of model pipeline step {}: FAILED"
                .format(mismatch, module))
            all_passed = False
        else:
            logger.info(
                "Reproducibility test of model pipeline stage {}: PASSED".
                format(module))

    if all_passed:
        logger.info("Success, all reproducibility tests passed!")
Example #7

def main(args):
    """
    Main function to create the object table for products from the cleaned transactions.
    :param args: (argparse) user-input configuration file
    """
    try:
        config_path = project_path + "/" + args.config
        input_data_path = project_path + "/" + args.input
        output_data_path = project_path + "/" + args.output

        config = load_config(config_path)
        df = read_csv(input_data_path)
        product = product_dim(df, **config['product_dim'])

        # Write to output file
        save_csv(product, output_data_path)
    except Exception as e:
        logger.error("Unexpected error occurred when creating object table for products: " + str(e))
Example #8

def main(args):
    """
    Main function to load the raw data, clean it, and save the cleaned data to CSV.
    :param args: (argparse) user-input configuration file
    """
    try:
        config_path = project_path + "/" + args.config
        input_data_path = project_path + "/" + args.input
        output_data_path = project_path + "/" + args.output

        config = load_config(config_path)
        df = read_csv(input_data_path)
        clean_data = clean(df, **config['clean'])

        # Write to output file
        save_csv(clean_data, output_data_path)
    except KeyError as e3:
        logger.error("KeyError: " + str(e3))
    except Exception as e:
        logger.error("Unexpected error occurred when cleaning data: " + str(e))
Example #9
def main(args):
    """
    Main function to run the market basket analysis and save the recommendations to CSV.
    :param args: (argparse) user-input configuration file
    """
    try:
        config_path = project_path + "/" + args.config
        input_data_path = project_path + "/" + args.input
        output_data_path = project_path + "/" + args.output
        product_path = project_path + "/" + conf.PRODUCT_DIM

        config = load_config(config_path)
        df = read_csv(input_data_path)
        product = read_csv(product_path)

        result = train(df, **config['train'])

        # Join the product object table to get the name and price.

        final_results = join_info(result, product, "StockCode", "StockCode")
        final_results = join_info(final_results, product, "rec1", "StockCode")
        final_results = join_info(final_results, product, "rec2", "StockCode")

        # format the confidence scores as percentages
        final_results['conf1'] = round(final_results['conf1'] * 100, 2)
        final_results['conf2'] = round(final_results['conf2'] * 100, 2)

        final_results = final_results[config["result_columns"]]

        # Write to output file
        save_csv(final_results, output_data_path)
    except KeyError as e3:
        logger.error("KeyError: " + str(e3))
    except ValueError as e4:
        logger.error("ValueError: " + str(e4) +
                     " Please validate Values in the configuration file.")
    except Exception as e:
        logger.error(
            "Unexpected error occurred when making recommendations: " + str(e))
Example #10

def main(args):
    """
    Main function to load the cleaned data, create baskets, and save the baskets to CSV.
    :param args: (argparse) user-input configuration file
    """
    try:
        config_path = project_path + "/" + args.config
        input_data_path = project_path + "/" + args.input
        output_data_path = project_path + "/" + args.output

        config = load_config(config_path)
        df = read_csv(input_data_path)
        basket = create_basket(df, **config['create_basket'])

        # Write to output file
        save_csv(basket, output_data_path, index=True)
    except KeyError as e1:
        logger.error("KeyError: " + str(e1))
    except ValueError as e2:
        logger.error("ValueError: " + str(e2) + " Please validate Values in the configuration file.")
    except Exception as e:
        logger.error("Unexpected error occurred when creating basket: " + str(e))
Example #11
import argparse
import logging
import os
import random
import sys
from os import path

import numpy as np
import torch
import torch.nn.functional as F
import transformers
from flask import Flask
from sklearn import preprocessing
from tqdm import tqdm
# AdamW ships with older transformers releases; torch.optim.AdamW is the
# modern equivalent
from transformers import AdamW, get_linear_schedule_with_warmup

from src.bert_classification import pro_pipline
# config, load_config, read_csv, training_test_split, run_evaluation and
# plot_loss are project helpers whose import paths are not shown in this
# snippet


logging.basicConfig(format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize the Flask application
app = Flask(__name__, template_folder="templates")

# Configuration File
config_path = config.CONFIG_YAML
configs = load_config(config_path)

# Load model
project_path = path.dirname(path.abspath(__file__))

# which model to load
parser = argparse.ArgumentParser()
parser.add_argument("--num_epoch", help="number of epochs for training the model")
parser.add_argument("--batch_size", help="batch_size for training the model")
parser.add_argument("--max_length", help="max length of reviews")
args = parser.parse_args()

if not args.max_length:
    max_length = configs['bert']['max_length']
else:
    max_length = int(args.max_length)
def main(args):
    """
    Main function to fine-tune the BERT classification model.
    :param args: (argparse) user-input configuration file
    """
    try:
        config_path = project_path + "/" + args.config
        input_data_path = project_path + "/" + args.input
        model_path = project_path + "/" + args.model
        evaluation_path = project_path + "/" + args.evaluation

        config = load_config(config_path)

        # load data
        df = read_csv(input_data_path)
        # Encode the classes for BERT.
        encoder = preprocessing.LabelEncoder()
        df['label'] = encoder.fit_transform(df['label'])

        # Split data into training and test sets.
        X_train, X_test, y_train, y_test = training_test_split(
            df, **config['bert']['training_test_split'])

        # Bert tokenization
        logger.info("Tokenizing...")
        tokenizer = transformers.BertTokenizer.from_pretrained(
            'bert-base-uncased', do_lower_case=True)
        if not args.max_length:
            max_length = config['bert']['max_length']
        else:
            max_length = int(args.max_length)

        # DataLoaders for running the model
        if not args.batch_size:
            batch_size = config['bert']['batch_size']
        else:
            batch_size = int(args.batch_size)

        dataloader_train = pro_pipline(X_train, tokenizer, max_length,
                                       config['bert']['tokenize'], batch_size,
                                       y_train)
        dataloader_test = pro_pipline(X_test, tokenizer, max_length,
                                      config['bert']['tokenize'], batch_size,
                                      y_test)

        # Initialize the model.
        model = transformers.BertForSequenceClassification.from_pretrained(
            "bert-base-uncased",
            num_labels=df['label'].nunique(),
            output_attentions=False,
            output_hidden_states=False)
        # Setting optimizer
        optimizer = AdamW(model.parameters(), **config['bert']['optimizer'])

        # Setting epochs
        if not args.num_epoch:
            epochs = config['bert']['num_epoch']
        else:
            epochs = int(args.num_epoch)

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=len(dataloader_train) * epochs)

        # Setting seeds
        seed = config['bert']['seed']
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        # Write prints to .txt
        model_name = 'max_length{}batch_size{}num_epoch{}'.format(
            max_length, batch_size, epochs)
        e_dir = evaluation_path + "/" + model_name
        if not os.path.exists(e_dir):
            os.makedirs(e_dir)
        # redirect prints (training/evaluation summaries) into the .txt file;
        # note that stdout is not restored afterwards
        sys.stdout = open(e_dir + "/" + model_name + '.txt', 'w')
        logger.info("Training... and evaluations will be saved into %s", e_dir)

        # use the GPU when available, otherwise fall back to CPU
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model.to(device)

        complete_epoch, training_loss, test_accuracy = [], [], []

        for epoch in tqdm(range(1, epochs + 1)):
            model.train()
            loss_train_total = 0
            progress_bar = tqdm(dataloader_train,
                                desc='Epoch {:1d}'.format(epoch),
                                leave=False,
                                disable=False)
            for batch in progress_bar:
                model.zero_grad()
                # move the whole batch to the device once
                batch = tuple(b.to(device) for b in batch)
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'labels': batch[2],
                }
                outputs = model(**inputs)

                loss = outputs[0]
                loss_train_total += loss.item()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                # len(batch) is the number of tensors in the tuple (3), not
                # the batch size, so report the raw per-batch loss instead
                progress_bar.set_postfix(
                    {'training_loss': '{:.3f}'.format(loss.item())})
            # training loss
            tqdm.write(f'\nEpoch {epoch}')
            loss_train_avg = loss_train_total / len(dataloader_train)
            training_loss.append(loss_train_avg)
            tqdm.write(f'Training loss: {loss_train_avg}')
            # evaluate the model
            plt, val_accuracy = run_evaluation(dataloader_test, model, device,
                                               encoder)
            plt.savefig(e_dir + "/" + model_name + '-' + str(epoch) + '.png')

            test_accuracy.append(val_accuracy)
            complete_epoch.append(epoch)
            loss_plt = plot_loss(complete_epoch, training_loss, test_accuracy)
            loss_plt.savefig(e_dir + "/" + model_name + '_loss' + '.png')

        # save the model for future use/retrain
        output_dir = model_path + '/' + model_name + "/"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        logging.info("Saving model to %s" % output_dir)

        model_to_save = model.module if hasattr(
            model,
            'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)

    except KeyError as e3:
        logger.error("KeyError: " + str(e3))
    except Exception as e:
        logger.error("Unexpected error occurred when training with Bert: " +
                     str(e))
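Because the model and tokenizer are saved with save_pretrained, they can later be reloaded for inference through the standard transformers API. A minimal sketch; the directory name is hypothetical, following the model_name pattern above:

import torch
import transformers

output_dir = 'models/max_length64batch_size32num_epoch4/'  # hypothetical path
tokenizer = transformers.BertTokenizer.from_pretrained(output_dir)
model = transformers.BertForSequenceClassification.from_pretrained(output_dir)
model.eval()

inputs = tokenizer('an example line of dialogue', return_tensors='pt',
                   truncation=True, max_length=64)
with torch.no_grad():
    logits = model(**inputs).logits
predicted_label = logits.argmax(dim=-1).item()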