def process_data_for_language_model(save_path, val_sample=5000, tokenizer=None):
    """
        Read the IMDB files under DATA_DIR and create the language model
        training and validation files.

        :params:
            - save_path: Path where to save lm_train.txt and lm_valid.txt
            - val_sample: Number of files to select as validation set
            - tokenizer: Tokenizer to use for tokenizing sentences into tokens
    """
    all_files = []
    for root, dirnames, filenames in os.walk(DATA_DIR):
        all_files.extend(
            [os.path.join(root, filename) for filename in filenames])
    train_files, test_files = train_test_split(all_files, test_size=val_sample)

    test_text = [
        open(i, encoding='utf-8', errors='ignore').read() for i in test_files
    ]
    train_text = [
        open(i, encoding='utf-8', errors='ignore').read() for i in train_files
    ]

    tokenizer = get_tokenizer(tokenizer)

    process = lambda x: [' '.join(tokenizer(i)) for i in tqdm(x)]
    test = process(test_text)
    train = process(train_text)

    open(os.path.join(save_path, LM_TRAIN_FILE), "w").write("\n".join(train))
    open(os.path.join(save_path, LM_VAL_FILE), "w").write("\n".join(test))
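The function above writes one whitespace-joined review per line; a minimal, self-contained sketch of the same split-and-write pattern, with a toy corpus, a made-up output directory, and scikit-learn assumed to be installed:

import os
from sklearn.model_selection import train_test_split

corpus = [
    "first review text", "second review text", "third review text",
    "fourth review text", "fifth review text"
]
# hold out 20% of the documents for validation
train_docs, valid_docs = train_test_split(corpus, test_size=0.2, random_state=42)

os.makedirs("lm_data", exist_ok=True)
with open(os.path.join("lm_data", "lm_train.txt"), "w", encoding="utf-8") as f:
    f.write("\n".join(train_docs))
with open(os.path.join("lm_data", "lm_valid.txt"), "w", encoding="utf-8") as f:
    f.write("\n".join(valid_docs))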
Example #2
def process_data_for_language_model(save_path,
                                    imdb_path,
                                    val_sample=5000,
                                    tokenizer=None):
    """
        Read the IMDB folder files and create language model training and
        validation file

        :params:
            - save_path: Path where to save lm_train.txt and lm_valid.txt
            - imdb_path: Root directory of imdb dataset
            - val_sample: Number of files to select as validation set
    """
    tokenizer = get_tokenizer(tokenizer)

    pos = [open(os.path.join(imdb_path, TRAIN_POS_DIR, i)).read() for i in\
                            tqdm(os.listdir(os.path.join(imdb_path, TRAIN_POS_DIR)))]
    neg = [open(os.path.join(imdb_path, TRAIN_NEG_DIR, i)).read() for i in \
                            tqdm(os.listdir(os.path.join(imdb_path, TRAIN_NEG_DIR)))]
    test_pos = [open(os.path.join(imdb_path, TEST_POS_DIR, i)).read() for i in \
                            tqdm(os.listdir(os.path.join(imdb_path, TEST_POS_DIR)))]
    test_neg = [open(os.path.join(imdb_path, TEST_NEG_DIR, i)).read() for i in \
                            tqdm(os.listdir(os.path.join(imdb_path, TEST_NEG_DIR)))]
    unsup = [open(os.path.join(imdb_path, TRAIN_UNSUP_DIR, i)).read() for i in \
                            tqdm(os.listdir(os.path.join(imdb_path, TRAIN_UNSUP_DIR)))]

    process = lambda x: [' '.join(tokenizer(i)) for i in tqdm(x)]
    pos = process(pos)
    neg = process(neg)
    test_pos = process(test_pos)
    test_neg = process(test_neg)
    unsup = process(unsup)

    text = pos + neg + unsup + test_neg + test_pos

    print('Test Data Creation..')
    val_index = set(random.sample(range(len(text)), val_sample))
    train = [i for idx, i in enumerate(text) if idx not in val_index]
    val = [i for idx, i in enumerate(text) if idx in val_index]
    open(os.path.join(save_path, LM_TRAIN_FILE), "w").write("\n".join(train))
    open(os.path.join(save_path, LM_VAL_FILE), "w").write("\n".join(val))
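The hold-out above is purely index based; a stdlib-only sketch of the same idea on toy data (a set keeps the membership tests cheap):

import random

documents = ["tokenized review {}".format(i) for i in range(100)]
val_sample = 10
# pick val_sample random indices for validation, keep the rest for training
val_index = set(random.sample(range(len(documents)), val_sample))
train = [doc for idx, doc in enumerate(documents) if idx not in val_index]
val = [doc for idx, doc in enumerate(documents) if idx in val_index]
assert len(train) == 90 and len(val) == 10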
Example #3
    params = default_params()

    ntokens = sum(1
                  for _ in open(os.path.join(args.data_dir, 'vocab.txt'))) + 1

    if args.model == 'DAN':
        model = Model(emb_dim=64,
                      ntokens=ntokens,
                      hidden_dim=32,
                      output_dim=16).to(device)
        vocab = load_vocab(os.path.join(args.data_dir, 'vocab.txt'))
        tokenizer = None
    elif args.model in ['GPT', 'GPT-2']:
        model = PretrainedModel(model_name=args.model).to(device)
        vocab = None
        tokenizer = get_tokenizer(args.model)
    else:
        raise NotImplementedError(f'{args.model} --- no such model')

    model = load_checkpoint(
        model, os.path.join(args.model_dir, f'checkpoint_{args.epoch}'))

    pred = Predictor(model, tokenizer)

    dataset = CsvDataset(csv_path=os.path.join(args.data_dir, 'data.csv'),
                         vocab=vocab,
                         max_len=50,
                         tokenizer=tokenizer)
    dataloader = DataLoader(dataset,
                            batch_size=1,
                            shuffle=False,
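The ntokens line above counts one vocabulary entry per line of vocab.txt and reserves one extra index; a toy, self-contained illustration (a temporary directory stands in for args.data_dir):

import os
import tempfile

data_dir = tempfile.mkdtemp()  # made-up stand-in for args.data_dir
with open(os.path.join(data_dir, "vocab.txt"), "w", encoding="utf-8") as f:
    f.write("\n".join(["the", "movie", "was", "great"]))

with open(os.path.join(data_dir, "vocab.txt"), encoding="utf-8") as f:
    ntokens = sum(1 for _ in f) + 1  # one token per line, plus one extra slot
print(ntokens)  # 5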
Example #4
def pretrain_encoder(train_file, valid_file,\
                     save_folder='saved_model/base', tokenizer=None,
                     restore=False,
                     **kwargs):
    """
        Module for running the training and validation subroutines.

        :params:
            - train_file: Training File, File with sentences separated by newline
            - valid_file: Validation File, same format as above
            - save_folder: Folder to save output files and models
            - restore: Whether to restore from the save_folder (can be used
                        to finetune on a smaller dataset)
            - tokenizer: Tokenizer to use for tokenizing sentences into tokens
            - **kwargs: other params:
                        * batch_size
                        * hidden_size
                        * num_layers
                        * epochs
                        * seq_length
        :outputs:
            None
    """
    config = FW_CONFIG
    tokenizer = get_tokenizer(tokenizer) if tokenizer else None
    batch_size = kwargs.get("batch_size") or FW_CONFIG["batch_size"]
    hidden_size = kwargs.get("hidden_size") or FW_CONFIG["hidden_size"]
    num_layers = kwargs.get("num_layers") or FW_CONFIG["num_layers"]
    epochs = kwargs.get("epochs") or FW_CONFIG["epochs"]
    FW_CONFIG.pop("epochs", None)
    FW_CONFIG["num_candidate_samples"] = kwargs.get("num_candidate_samples") or FW_CONFIG["num_candidate_samples"]
    seq_length = FW_CONFIG.pop("seq_length")
    learning_rate = kwargs.get("learning_rate", 0.001)
    optimizer = kwargs.get("optimizer", "adam")
    print_progress = kwargs.get("print_progress", False)
    type_ = kwargs.get("type", "rnn")

    learning_rate_decay = 0.1
    lr_cosine_decay_params = {
            "learning_rate": learning_rate,
            "first_decay_steps": 2000,
            "t_mul": 2.0,
            "alpha": 0.01
    }
    tokenizer_json_file = os.path.join(save_folder, "tokenizer.json")
    # Load data and Batchify
    all_data = load_and_process_data(train_file, valid_file,
                                       max_vocab_size=config["max_vocab_size"],
                                       custom_tokenizer_function=tokenizer,
                                       tokenizer_json_file=tokenizer_json_file,
                                       restore_from=tokenizer_json_file if restore else None)

    word_freq, word_index, train_data, valid_data = all_data
    X_train, y_train = batchify(train_data, batch_size)
    X_valid, y_valid = batchify(valid_data, batch_size)

    # Save the Vocab and frequency files
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    # Saving config and word_index file
    json.dump(word_index, open(os.path.join(save_folder, "word_index.json"), "w"))
    json.dump(word_freq, open(os.path.join(save_folder, "word_freq.json"), "w"))

    json.dump(FW_CONFIG, open(os.path.join(save_folder, "config.json"), "w"))

    # Arrange tokens in a list; this will go in the vocab file
    vocab = [" "] + [i[0] for i in sorted(word_index.items(), key=lambda x: x[1])][:FW_CONFIG["max_vocab_size"]+1]
    open(os.path.join(save_folder, "vocab.txt"), "w").write("\n".join(vocab))
    open(os.path.join(save_folder, "word_freqs.txt"), "w").write("\n".join(word_index))

    # Check max_vocab_size
    FW_CONFIG["max_vocab_size"] = min(len(word_index) + 1, FW_CONFIG["max_vocab_size"])
    print("Vocabulary Size: {}".format(FW_CONFIG["max_vocab_size"]))

    # Define Placeholder and Initial States
    inputs  = tf.placeholder(dtype=tf.int32, shape=(batch_size,None), name='input')
    targets = tf.placeholder(dtype=tf.int64, shape=(batch_size,None), name='target')
    initial_state_c  = tf.placeholder(dtype=tf.float32, shape=(num_layers, batch_size, hidden_size),\
                                    name='input_state_c')
    initial_state_h = tf.placeholder(dtype=tf.float32, shape=(num_layers, batch_size, hidden_size),\
                                    name='input_state_h')

    # Create the Graph
    train_op, training_flag, sampled_loss,\
    loss, rnn_states, weights, learning_rate_var = language_model_graph(inputs, targets,
                                                     (initial_state_c, initial_state_h),
                                                     vocab_freqs=word_freq,
                                                     optimizer=optimizer,
                                                     type_=type_,
                                                      **config)

    final_state_c, final_state_h = rnn_states
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    print("total number of trainable params {}".format(get_trainbale_params()))

    # Define run epoch function params (passed as kwargs)
    run_epoch_params = {"session": sess,
                        "sampled_loss": sampled_loss,
                        "loss": loss,
                        "num_layers": num_layers,
                        "input_placeholder": inputs,
                        "target_placeholder": targets,
                        "initial_state_c": initial_state_c,
                        "initial_state_h": initial_state_h,
                        "learning_rate_var":learning_rate_var,
                        "learning_rate":learning_rate,
                        "train_op": train_op,
                        "final_state_c": final_state_c,
                        "final_state_h": final_state_h,
                        "seq_length": seq_length,
                        "batch_size": batch_size,
                        "hidden_size":hidden_size,
                        "training_flag": training_flag,
                        "lr_cosine_decay_params": lr_cosine_decay_params}

    valid_losses = [1000]
    vars = tf.trainable_variables()
    vars = [i for i in vars if 'optimizer' not in i.name]
    saver = tf.train.Saver(vars)
    if restore:
        saver.restore(sess, os.path.join(save_folder, "model.ckpt"))
    for epoch in range(epochs):
        decay = (learning_rate_decay ** int((max(epoch - 5, 0)/2)))

        run_epoch_params['learning_rate'] = learning_rate * decay
        # Training Epoch
        train_loss = _run_epoch(X_train, y_train,
                                train=True,
                                epoch=epoch,
                                print_progress=print_progress,
                                **run_epoch_params)
        # Valid Epoch
        valid_loss = _run_epoch(X_valid, y_valid,
                                train=False,
                                print_progress=False,
                                epoch=epoch,
                                **run_epoch_params)

        format_values = [epoch, train_loss, np.exp(train_loss),\
                                valid_loss, np.exp(valid_loss)]

        print("Epoch {0}, Train Loss {1:.2f}, Train Perplexity {2:.2f},\
                    Val Loss {3:.2f}, Val Perplexity {4:.2f}".format(*format_values))

        if valid_loss < min(valid_losses):
            saver.save(sess, os.path.join(save_folder, "model.ckpt"))
            numpy_weights = {}
            for layer in weights:
                numpy_weights[layer] = sess.run(weights[layer])
            pickle.dump(numpy_weights, open(os.path.join(save_folder, "weights.pkl"), "wb"))

        valid_losses.append(valid_loss)
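The decay factor computed at the top of the epoch loop keeps the base learning rate for the first epochs and then scales it by learning_rate_decay every second epoch; isolating just that schedule:

# Stand-alone sketch of the per-epoch decay used above.
learning_rate = 0.001
learning_rate_decay = 0.1
for epoch in range(10):
    decay = learning_rate_decay ** int(max(epoch - 5, 0) / 2)
    print(epoch, learning_rate * decay)
# epochs 0-6 keep 1e-3, epochs 7-8 drop to 1e-4, epoch 9 drops to 1e-5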
Example #5
def create_tfrecords(params,
                     write_remainder=True,
                     write_every_n_files=1,
                     save_checkpoints=False,
                     resume_from_checkpoint=False,
                     display_pbar=False):
    # iterates through files in input_dir, splitting into <args.chunk_size> chunks and saving a tfrecords file every <args.files_per> chunks.
    files, args, process_no = params
    enc = get_tokenizer()  # get tokenizer

    # init metadata
    discarded_files = 0
    files_processed = 0
    pbar = tqdm(
        desc=
        f"Writing TFRecord Files to {args.output_dir}. Parsed 0 input files. files_written ",
        disable=not display_pbar)
    checkpoint_path = f"{args.output_dir}/checkpoint.txt"
    resume_files_processed, tfrecord_count = read_checkpoint(
        checkpoint_path, resume_from_checkpoint)

    data_to_prepend = []
    tokenized_files_array = []

    for f in files:
        for tokenized_files in archive_to_tokens(f, enc, args):
            files_processed += 1
            if files_processed < resume_files_processed:
                continue  # resume from checkpoint

            # if the last chunk < chunk size, but > minimum_size, take it and append it to the beginning of the next file
            n_tokens = len(tokenized_files[-1])
            if n_tokens < args.chunk_size:
                data = tokenized_files.pop(-1)
                if n_tokens >= args.minimum_size:
                    data_to_prepend.extend(data)
                else:
                    discarded_files += 1

            if len(data_to_prepend) >= args.chunk_size:
                # if length of data_to_prepend becomes greater than chunk size, add concatted files to tokenized files
                tokenized_files_array.append(data_to_prepend[:args.chunk_size])
                data_to_prepend = data_to_prepend[args.chunk_size:]
            # add tokenized files > chunk size to main array
            tokenized_files_array.extend(tokenized_files)

            if len(tokenized_files_array) >= args.files_per * write_every_n_files:  # write every n files
                _tfrecord_count, remainder = write_files(
                    tokenized_files_array,
                    files_per=args.files_per,
                    output_dir=args.output_dir,
                    out_name=args.name,
                    start_no=tfrecord_count,
                    process_no=process_no)
                pbar.update(_tfrecord_count -
                            tfrecord_count)  # update progress bar
                pbar.set_description(
                    f"Writing TFRecord Files to {args.output_dir}. Parsed {files_processed} input files. files_written "
                )
                tfrecord_count = _tfrecord_count
                tokenized_files_array = remainder if remainder is not None else []  # add remaining files to next chunk
                with open(checkpoint_path, "w") as checkpoint_file:
                    checkpoint_file.write(
                        f"{files_processed}, {tfrecord_count}")

    if len(tokenized_files_array) >= args.files_per:  # also write at end
        _tfrecord_count, remainder = write_files(tokenized_files_array,
                                                 files_per=args.files_per,
                                                 output_dir=args.output_dir,
                                                 out_name=args.name,
                                                 start_no=tfrecord_count,
                                                 process_no=process_no)
        pbar.update(_tfrecord_count - tfrecord_count)
        pbar.set_description(
            f"Writing TFRecord Files to {args.output_dir}. Parsed {files_processed} input files. files_written "
        )
        tfrecord_count = _tfrecord_count
        with open(checkpoint_path, "w") as checkpoint_file:
            checkpoint_file.write(f"{files_processed}, {tfrecord_count}")
    else:
        remainder = tokenized_files_array  # add remaining to remainder

    if write_remainder:
        # write out the remaining files even if there's less than files_per
        write_files(remainder,
                    files_per=args.files_per,
                    output_dir=args.output_dir,
                    out_name=args.name,
                    start_no=tfrecord_count,
                    write_remainder=True)

    successful_files = files_processed - discarded_files
    return {
        "discarded": discarded_files,
        "processed": files_processed,
        "successful": successful_files
    }
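The chunking bookkeeping above (pop a short tail, keep it if it exceeds minimum_size, prepend it to later data, discard it otherwise) is easier to see in isolation; a simplified, stdlib-only sketch of the same idea, not the exact logic of create_tfrecords:

def chunk_tokens(token_lists, chunk_size=8, minimum_size=3):
    """Cut token lists into fixed-size chunks, carrying short tails forward."""
    chunks, carry, discarded = [], [], 0
    for tokens in token_lists:
        tokens = carry + tokens
        carry = []
        while len(tokens) >= chunk_size:
            chunks.append(tokens[:chunk_size])
            tokens = tokens[chunk_size:]
        if len(tokens) >= minimum_size:
            carry = tokens          # prepend to the next document
        elif tokens:
            discarded += 1          # too small to keep
    return chunks, carry, discarded

chunks, carry, discarded = chunk_tokens([list(range(10)), list(range(5)), [0, 1]])
print(len(chunks), len(carry), discarded)  # 1 chunk kept, 7 tokens carried over, 1 tail discarded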
Example #6
def process_data_for_classification(imdb_path,
                                    val_sample=5000,
                                    tokenizer='nltk',
                                    maxlen=500):
    """
        Read the IMDB folder files and create language model training and
        validation file

        :params:
            - save_path: Path where to save lm_train.txt and lm_valid.txt
            - imdb_path: Root directory of imdb dataset
            - val_sample: Number of files to select as validation set
    """
    file_name = 'imdb_processed_data_maxlen_{}_tokenizer_{}_val_{}.pkl'.format(
        maxlen, tokenizer, val_sample)
    if not os.path.exists(file_name):
        pos = [open(os.path.join(imdb_path, TRAIN_POS_DIR, i)).read() for i in\
                                tqdm(os.listdir(os.path.join(imdb_path, TRAIN_POS_DIR)))]
        neg = [open(os.path.join(imdb_path, TRAIN_NEG_DIR, i)).read() for i in \
                                tqdm(os.listdir(os.path.join(imdb_path, TRAIN_NEG_DIR)))]
        test_pos = [open(os.path.join(imdb_path, TEST_POS_DIR, i)).read() for i in \
                                tqdm(os.listdir(os.path.join(imdb_path, TEST_POS_DIR)))]
        test_neg = [open(os.path.join(imdb_path, TEST_NEG_DIR, i)).read() for i in \
                                tqdm(os.listdir(os.path.join(imdb_path, TEST_NEG_DIR)))]

        train = pos + neg
        test = test_pos + test_neg

        print('Test Data Creation..')
        val_index = set(random.sample(range(len(train)), val_sample))
        x_train = [i for idx, i in enumerate(train) if idx not in val_index]
        y_train = [
            1 if idx < len(pos) else 0 for idx, i in enumerate(train)
            if idx not in val_index
        ]
        x_val = [i for idx, i in enumerate(train) if idx in val_index]
        y_val = [
            1 if idx < len(pos) else 0 for idx, i in enumerate(train)
            if idx in val_index
        ]

        x_test = test
        y_test = [1] * len(test_pos) + [0] * len(test_neg)

        if not tokenizer:
            tokenizer = text_to_word_sequence
        else:
            tokenizer = get_tokenizer(tokenizer)
        x_train = list(map(tokenizer, x_train))
        x_val = list(map(tokenizer, x_val))
        x_test = list(map(tokenizer, x_test))

        custom_pad_sequences = lambda x: pad_sequences(
            x, maxlen=maxlen, dtype=object, padding='pre', value="-pad-")
        x_train = custom_pad_sequences(x_train)
        x_val = custom_pad_sequences(x_val)
        x_test = custom_pad_sequences(x_test)
        pickle.dump([x_train, y_train, x_val, y_val, x_test, y_test],
                    open(file_name, 'wb'))
        return x_train, y_train, x_val, y_val, x_test, y_test
    else:
        x_train, y_train, x_val, y_val, x_test, y_test = pickle.load(
            open(file_name, 'rb'))
        return x_train, y_train, x_val, y_val, x_test, y_test
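The padding step relies on Keras's pad_sequences with padding='pre' (and its default pre-truncation); a pure-Python sketch of the equivalent behaviour on toy token lists:

def pad_pre(tokens, maxlen=6, value="-pad-"):
    tokens = tokens[-maxlen:]                         # pre-truncate: keep the last maxlen tokens
    return [value] * (maxlen - len(tokens)) + tokens  # pre-pad on the left

print(pad_pre(["a", "great", "movie"]))
# ['-pad-', '-pad-', '-pad-', 'a', 'great', 'movie']
print(pad_pre(list("abcdefgh")))
# ['c', 'd', 'e', 'f', 'g', 'h']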
Example #7
def main_siamese_lstm(bug_contents_path, code_contents_path, file_oracle_path,
                      sequence_oracle_path, model_dir_path, prediction_dir_path,
                      evaluation_file_path, vocabulary_size, lstm_core_length,
                      word2vec_model_path=None, lstm_seq_length=200, sample_num=50,
                      split_ratio=0.8, activation_function='tanh',
                      inner_activation_function='hard_sigmoid', distance_function='cos',
                      initializer='glorot_uniform', inner_initializer='orthogonal',
                      regularizer=None,
                      optimizer=RMSprop(lr=0.001, rho=0.9, epsilon=1e-8, decay=0.0),
                      dropout=0.0, epoch_num=100, k_value=10, rel_threshold=0.5,
                      embedding_dimension=-1, word2vec=False):

    if not os.path.isdir(model_dir_path):
        os.mkdir(model_dir_path)

    #Loading the pretrained word2vec model
    word2vec_model = None
    if word2vec:
        print("loading word2vec model:")
        word2vec_model = KeyedVectors.load_word2vec_format(word2vec_model_path, binary=False)
        print("finished loading word2vec model.")


    #Loading the generated data from file
    print("loading data from file:")
    [bug_contents,code_contents,file_oracle,sequence_oracle] = load_data(bug_contents_path, code_contents_path, file_oracle_path, sequence_oracle_path, split_length = lstm_seq_length, encoding = 'utf-8')
    print("finished loading data from file.")


    #Initializing the tokenizer
    print("initializing tokenizer:")
    tokenizer = get_tokenizer(bug_contents, code_contents, vocabulary_size)
    print("finished initializing tokenizer.")


    #The previous bugs are used for training
    #The remaining bugs are used for testing
    nb_train_bug = int(math.floor(len(bug_contents)* split_ratio))


    #Building the LSTM Siamese Network.
    print("building lstm siamese network:")
    model = siamese_lstm(lstm_seq_length, vocabulary_size, lstm_core_length, activation_function = activation_function, inner_activation_function = inner_activation_function,distance_function = distance_function, initializer = initializer, inner_initializer = inner_initializer, regularizer = regularizer, optimizer = optimizer, dropout = dropout, embedding_dimension = embedding_dimension)

    #Saving the Model Structure to File
    model_structure_path = os.path.join(model_dir_path, "model_structure")
    save_model_structure(model, model_structure_path)

    print("finished building lstm siamese network.")
    
    #Building the LSTM Validation Set
    bug_val = np.zeros((0,lstm_seq_length, vocabulary_size))
    code_val = np.zeros((0,lstm_seq_length, vocabulary_size))
    rel_val = np.zeros((0,))
    bug_contents_val = bug_contents[nb_train_bug:]
    nb_validation_bug = 100
    for bug_batch, code_batch, label_batch in batch_gen(bug_contents_val, sequence_oracle, tokenizer, vocabulary_size, lstm_seq_length, nb_validation_bug, word2vec_model, embedding_dimension= embedding_dimension, sample_num = sample_num,  word2vec = word2vec):
        bug_val = np.vstack((bug_val,bug_batch))
        code_val = np.vstack((code_val,code_batch))
        rel_val = np.append(rel_val,label_batch, axis = 0)
    print(bug_val.shape)

    #Training the LSTM Siamese Network
    print("training lstm siamese network:")
    acc_train_list = []
    acc_val_list = []
    for epoch in range(epoch_num):
        bug_train = np.zeros((0, lstm_seq_length, vocabulary_size))
        code_train = np.zeros((0, lstm_seq_length, vocabulary_size))
        rel_train = np.zeros((0,))
        print("training epoch {}:".format(epoch))
        batch_index = 1
        for bug_batch, code_batch, label_batch in batch_gen(bug_contents, sequence_oracle, tokenizer, vocabulary_size, lstm_seq_length, nb_train_bug, word2vec_model, embedding_dimension=embedding_dimension, sample_num=sample_num, word2vec=word2vec):
            print("training batch {}, size {}".format(batch_index, len(bug_batch)))
            model.train_on_batch([bug_batch, code_batch], label_batch)
            batch_index = batch_index + 1
            bug_train = np.vstack((bug_train, bug_batch))
            code_train = np.vstack((code_train, code_batch))
            rel_train = np.append(rel_train, label_batch, axis=0)

            #predicting the training accuracy of this batch
            pred_batch = model.predict([bug_batch, code_batch])
            print(pred_batch)
            print(label_batch)
            acc_batch = predict_accuracy(pred_batch, label_batch)
            print("training accuracy = {}".format(acc_batch))

            #predicting the validation accuracy of this batch
            pred_val = model.predict([bug_val, code_val])
            print(pred_val)
            print(rel_val)
            acc_val = predict_accuracy(pred_val, rel_val)
            print("validation accuracy = {}".format(acc_val))
            acc_val_list.append(acc_val)

        #save the model weights after this epoch to file
        one_epoch_weight_path = os.path.join(model_dir_path, "weight_epoch_{}".format(epoch))
        save_model_weights(model, one_epoch_weight_path)
        #compute the validation accuracy
        #pred_val = model.predict([bug_val, code_val])
        #acc_val = predict_accuracy(pred_val, rel_val)
        #print("validation accuracy = {}".format(acc_val))
        #acc_val_list.append(acc_val)

    print("finished training lstm siamese network.")
    #plt.plot(acc_train_list)
    #plt.plot(acc_val_list)
    #plt.savefig('learning_curve.eps')

    # Generating Predictions on the Test Bugs
    print("computing predictions on the test data:")

    #Code Vectors
    code_vec_list = generate_code_vec(model, code_contents, lstm_seq_length,tokenizer, vocabulary_size, word2vec_model, embedding_dimension = embedding_dimension, word2vec = word2vec)

    #Test Bug Vectors
    bug_vec_list = generate_bug_vec(model, bug_contents[nb_train_bug:], lstm_seq_length, tokenizer, vocabulary_size,word2vec_model, embedding_dimension = embedding_dimension, word2vec = word2vec)

    #Generating Oracles for Test Bugs
    test_oracle = generate_test_oracle(file_oracle[nb_train_bug:])

    #Generating Prediction Scores for Each Test Bug
    predictions = generate_predictions_full(bug_vec_list, code_vec_list)


    if not os.path.isdir(prediction_dir_path):
        os.mkdir(prediction_dir_path)

    i = 1
    #Traversing each bug oracle/prediction results
    for one_test_oracle, prediction in zip(test_oracle, predictions):
        if len(one_test_oracle)>0:

            #Export
            file_path = os.path.join(prediction_dir_path, "bug_num_{}".format(i))
            export_one_bug_prediction(one_test_oracle, prediction, file_path)
            #Some strategies for ...

            #evaluations = evaluate_one_bug(prediction, one_test_oracle)
            # print(evaluations)
            #export_one_evaluation(evaluations, evaluation_file_path)
            i = i+1

    print("finished computing predictions on the test data.")

    #Evaluating Performance on Test Bugs
    print("evaluating performance on the test data:")
    evaluate_prediction_dir(prediction_dir_path, evaluation_file_path)

    print("finished evaluating performance on the test data.")
Example #8
def _load_quora_data(data_file,\
                    max_length=60,
                    validation_split=5000,
                    test_split=5000,
                    seed=100,
                    tokenizer="nltk",
                    processor_config_filepath='preprocessor.pkl'):
    """
        Load Quora Dataset from TSV file
        :params:
            - data_file: TSV data file provided by Quora
            - max_length: Max Length of Questions, Questions data will be
                                truncated upto this length
            - validation_split: How much to sample for validation
            - test_split: How much to sample for testing
            - seed: Random seed
            - processor_config_filepath: Where to save the tokenizer, etc.
    """
    # Read the TSV data file
    data = pd.read_csv(data_file, sep='\t')

    # Shuffle and split dataframe
    np.random.seed(seed)
    data = data.iloc[np.random.permutation(len(data))]

    train_df, valid_df, test_df = data.iloc[:-(validation_split+test_split)],\
                                  data.iloc[-(validation_split+test_split):-test_split],\
                                  data.iloc[-test_split:, :]

    convert_list_to_str = lambda x: list(map(str, x))
    train_question1 = convert_list_to_str(train_df['question1'].tolist())
    train_question2 = convert_list_to_str(train_df['question2'].tolist())
    y_train = train_df['is_duplicate']

    valid_question1 = convert_list_to_str(valid_df['question1'].tolist())
    valid_question2 = convert_list_to_str(valid_df['question2'].tolist())
    y_valid = valid_df['is_duplicate']

    test_question1 = convert_list_to_str(test_df['question1'].tolist())
    test_question2 = convert_list_to_str(test_df['question2'].tolist())
    y_test = test_df['is_duplicate']

    if not tokenizer:
        tokenizer = text_to_word_sequence
    else:
        tokenizer = get_tokenizer(tokenizer)

    def process_list_of_text(list_of_text):
        tokenized = list(map(tokenizer, list_of_text))
        return pad_sequences(tokenized,
                             maxlen=max_length,
                             dtype=object,
                             padding='pre',
                             value="-pad-")

    # Processing Training Data
    train_question1 = process_list_of_text(train_question1)
    train_question2 = process_list_of_text(train_question2)

    # Processing Validation Data
    valid_question1 = process_list_of_text(valid_question1)
    valid_question2 = process_list_of_text(valid_question2)

    # Processing Test Data
    test_question1 = process_list_of_text(test_question1)
    test_question2 = process_list_of_text(test_question2)


    return  (train_question1, train_question2), y_train,\
                        (valid_question1, valid_question2), y_valid,\
                        (test_question1, test_question2), y_test
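A toy illustration of the shuffle-then-slice split used above (pandas and numpy assumed), which also shows why the permuted frame has to be assigned back:

import numpy as np
import pandas as pd

data = pd.DataFrame({"question1": ["q1_%d" % i for i in range(10)],
                     "question2": ["q2_%d" % i for i in range(10)],
                     "is_duplicate": np.random.randint(0, 2, size=10)})
validation_split, test_split = 2, 2

np.random.seed(100)
data = data.iloc[np.random.permutation(len(data))]   # assign the shuffled frame back
train_df = data.iloc[:-(validation_split + test_split)]
valid_df = data.iloc[-(validation_split + test_split):-test_split]
test_df = data.iloc[-test_split:]
print(len(train_df), len(valid_df), len(test_df))  # 6 2 2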