    def __init__(self, data_file):
        data_processor = DataProcessor(data_file, seperator=',,,')
        self.data, self.labels = data_processor.get_training_data(
            raw_text=True)
        # self.data, self.labels = twenty_train.data, twenty_train.target

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.data, self.labels, test_size=0.33, random_state=42)

        # print('Running Naive Bayes...')
        # pipeline, parameters = self.get_naive_bayes_model()

        @use_named_args(space)
        def objective(**params):
            print(params)
            # max_df,ngram_range,max_features,alpha
            pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(max_df=params['max_df'],
                                          ngram_range=(1, params['ngram_range']),
                                          max_features=params['max_features'])),
                ('clf', LinearSVC(loss=params['loss'], C=params['C'], max_iter=1000))
            ])
            pipeline.fit(self.X_train, self.y_train)
            accuracy = accuracy_score(self.y_test,
                                      pipeline.predict(self.X_test))
            print('Accuracy {}'.format(accuracy))
            return -accuracy

        res_gp = gp_minimize(objective, space, n_calls=100, random_state=10)
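        # A minimal sketch of reading the result (res_gp is skopt's
        # OptimizeResult; `space` is the search space defined elsewhere):
        print('Best hyperparameters: {}'.format(res_gp.x))  # ordered as in `space`
        print('Best accuracy: {}'.format(-res_gp.fun))  # objective returned -accuracy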
Example #2
def test_on_guild_join(gid, obj_guild):
    # Make sure guild is removed from guilds table
    DataConnector.run_query("DELETE FROM {}.guilds WHERE guild_id='{}'".format(
        SCHEMA_NAME, gid))
    df = DataConnector.read_data(
        "SELECT * FROM {}.guilds WHERE guild_id='{}'".format(SCHEMA_NAME, gid))
    assert df.shape[0] == 0

    # Make sure guild is removed from days table
    DataConnector.run_query("DELETE FROM {}.days WHERE guild_id='{}'".format(
        SCHEMA_NAME, gid))
    df = DataConnector.read_data(
        "SELECT * FROM {}.days WHERE guild_id='{}'".format(SCHEMA_NAME, gid))
    assert df.shape[0] == 0

    # Check on guild join
    DataProcessor._on_guild_join(obj_guild)
    df = DataConnector.read_data(
        "SELECT * FROM {}.guilds WHERE guild_id='{}'".format(SCHEMA_NAME, gid))
    assert df.shape[0] == 1

    # Also verify the days table
    df = DataConnector.read_data(
        "SELECT * FROM {}.days WHERE guild_id='{}'".format(SCHEMA_NAME, gid))
    assert df.shape[0] == 1

    # Check that the size doesn't change on the same guild
    DataProcessor._on_guild_join(obj_guild)
    df = DataConnector.read_data(
        "SELECT * FROM {}.guilds WHERE guild_id='{}'".format(SCHEMA_NAME, gid))
    assert df.shape[0] == 1

    df = DataConnector.read_data(
        "SELECT * FROM {}.days WHERE guild_id='{}'".format(SCHEMA_NAME, gid))
    assert df.shape[0] == 1
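
# The delete/read/assert pattern above repeats; a small helper keeps the
# assertions readable (hypothetical name, same DataConnector API as above):
def assert_guild_rows(table, guild_id, expected):
    df = DataConnector.read_data(
        "SELECT * FROM {}.{} WHERE guild_id='{}'".format(SCHEMA_NAME, table,
                                                         guild_id))
    assert df.shape[0] == expected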
Example #3
            def valid(full_path, in_vocab):
                data_processor_valid = DataProcessor(full_path, in_vocab)
                pred_scores = []
                true_scores = []
                eval_loss = 0
                num_loss = 0

                while True:
                    a_ids_data, a_context_ids_data, a_keyword_index, a_len_data, p_ids_data, p_context_ids_data, \
                    p_keyword_index, p_len_data, y_data = \
                        data_processor_valid.get_batch_siamese(arg)
                    if len(a_ids_data) != 0:
                        feed_dict = {
                            model.input_a.name: a_ids_data,
                            model.input_a_context.name: a_context_ids_data,
                            model.input_a_keyword_index.name: a_keyword_index,
                            model.input_a_len.name: a_len_data,
                            model.input_n.name: p_ids_data,
                            model.input_n_context.name: p_context_ids_data,
                            model.input_n_keyword_index.name: p_keyword_index,
                            model.input_n_len.name: p_len_data,
                            model.input_y: y_data
                        }
                        ret = sess.run(inference_outputs, feed_dict)
                        eval_loss += np.mean(ret[1])
                        num_loss += 1
                        pred_scores.append(ret[0])
                        true_scores.append(y_data)

                    if data_processor_valid.end == 1:
                        break

                pred_scores = np.concatenate(pred_scores)
                true_scores = np.concatenate(true_scores)
                from sklearn.metrics import roc_curve, auc as compute_auc
                fpr, tpr, thresholds = roc_curve(true_scores,
                                                 pred_scores,
                                                 pos_label=1)
                auc = compute_auc(fpr, tpr)
                mean_ap = average_precision_score(true_scores,
                                                  pred_scores,
                                                  average='micro')
                df = pd.DataFrame({
                    'model': 'siamese',
                    'score': pred_scores,
                    'class': true_scores
                })
                logging.info('Loss: ' + str(eval_loss / num_loss))
                logging.info('AUC: ' + str(auc))
                logging.info('MAP: ' + str(mean_ap))

                data_processor_valid.close()
                return (eval_loss / num_loss), auc, df
Example #4
def test_on_ready(gid, obj_guild):
    # New guild test
    DataConnector.run_query("DELETE FROM {}.guilds WHERE guild_id='{}'".format(
        SCHEMA_NAME, gid))
    df = DataConnector.read_data(
        "SELECT * FROM {}.guilds WHERE guild_id='{}'".format(SCHEMA_NAME, gid))
    assert df.shape[0] == 0

    lst_guilds = []
    lst_guilds.append(obj_guild)

    DataProcessor._on_ready(lst_guilds)
    df = DataConnector.read_data(
        "SELECT * FROM {}.guilds WHERE guild_id='{}'".format(SCHEMA_NAME, gid))
    assert df.shape[0] == 1

    # Same guild list test
    DataProcessor._on_ready(lst_guilds)
    df = DataConnector.read_data(
        "SELECT * FROM {}.guilds WHERE guild_id='{}'".format(SCHEMA_NAME, gid))
    assert df.shape[0] == 1

    # Additional guild test
    new_gid = '54321'
    DataConnector.run_query("DELETE FROM {}.guilds WHERE guild_id='{}'".format(
        SCHEMA_NAME, new_gid))
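    # `guilds` is assumed to be a test factory that builds a guild object
    # with the given id (the same kind of object as the `obj_guild` fixture).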
    new_obj_guild = guilds(new_gid)
    lst_guilds.append(new_obj_guild)
    DataProcessor._on_ready(lst_guilds)
    df = DataConnector.read_data("""SELECT *
                                    FROM {}.guilds
                                    WHERE guild_id='{}' or guild_id='{}'
                                 """.format(SCHEMA_NAME, gid, new_gid))
    assert df.shape[0] == 2

    # Removed guilds test
    empty_lst_guilds = []
    DataProcessor._on_ready(empty_lst_guilds)
    df = DataConnector.read_data("""SELECT *
                                    FROM {}.guilds
                                    WHERE guild_id='{}' or guild_id='{}'
                                 """.format(SCHEMA_NAME, gid, new_gid))
    assert df.shape[0] == 0

    # One last sanity check
    lst_guilds.remove(new_obj_guild)
    DataProcessor._on_ready(lst_guilds)
    df = DataConnector.read_data("""SELECT *
                                    FROM {}.guilds
                                    WHERE guild_id='{}' or guild_id='{}'
                                 """.format(SCHEMA_NAME, gid, new_gid))
    assert df.shape[0] == 1
Example #5
            def inference(full_path, full_inference_label_file, in_vocab):
                data_processor_valid = DataProcessor(full_path, in_vocab)
                pred_scores = []

                while True:
                    a_ids_data, a_context_ids_data, a_keyword_index, a_len_data, p_ids_data, p_context_ids_data, \
                    p_keyword_index, p_len_data, n_ids_data, n_context_ids_data, n_keyword_index, n_len_data = \
                        data_processor_valid.get_batch_triple(arg)

                    if len(a_ids_data) != 0:
                        feed_dict = {
                            model.input_a.name: a_ids_data,
                            model.input_a_context.name: a_context_ids_data,
                            model.input_a_keyword_index.name: a_keyword_index,
                            model.input_a_len.name: a_len_data,
                            model.input_n.name: n_ids_data,
                            model.input_n_context.name: n_context_ids_data,
                            model.input_n_keyword_index.name: n_keyword_index,
                            model.input_n_len.name: n_len_data,
                            model.input_p.name: p_ids_data,
                            model.input_p_context.name: p_context_ids_data,
                            model.input_p_keyword_index.name: p_keyword_index,
                            model.input_p_len.name: p_len_data,
                        }

                        ret = sess.run(inference_outputs, feed_dict)
                        pred_scores.append(ret[0][1])

                    if data_processor_valid.end == 1:
                        break

                pred_scores = np.concatenate(pred_scores, axis=0)
                import pandas as pd
                df = pd.read_csv(full_inference_label_file,
                                 sep='\t',
                                 header=None)
                true_scores = df.iloc[:, 2]
                chunks = df.groupby(df.iloc[:, 0]).groups
                for k_num in [1, 5, 10, 20]:
                    topkp, topkr, topkf1 = evaluateTopN(
                        pred_scores, true_scores, chunks, k_num)
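                # Only the metrics for the last k survive the loop above;
                # collect or log topkp/topkr/topkf1 per k if they are needed.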
                data_processor_valid.close()
                return
Example #6
def test_on_message_watch_channel(gid, obj_guild):
    DataProcessor._on_guild_join(obj_guild)

    cid = '1234567890'
    DataProcessor._on_message_watch_channel(gid, cid)
    df = DataConnector.read_data(
        "SELECT channel_id FROM {}.guilds WHERE guild_id='{}'".format(
            SCHEMA_NAME, gid))
    assert df['channel_id'][0] == cid

    DataProcessor._on_guild_join(obj_guild)
Example #7
def test_on_message_watch_message(gid, obj_guild):
    DataProcessor._on_guild_join(obj_guild)

    mid = '1234567890'
    DataProcessor._on_message_watch_message(gid, mid)
    df = DataConnector.read_data(
        "SELECT message_id FROM {}.guilds WHERE guild_id='{}'".format(
            SCHEMA_NAME, gid))
    assert df['message_id'][0] == mid

    df = DataConnector.read_data(
        "SELECT COUNT(*) FROM {}.guilds WHERE guild_id='{}'".format(
            SCHEMA_NAME, gid))
    assert df['count'][0] == 1

    DataProcessor._on_guild_join(obj_guild)
Example #8
def calculate(i):
    global last_sum_volume, last_price
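    # Assumes module-level state created elsewhere: `binance_api`, `symbol`,
    # the plotting deques (`sum_volume_deque`, `diff_price_deque`, `zero_deque`,
    # `x_deque`, `last_sum_volume_deque`), `max_lenth` (their capacity, spelled
    # as in the original), and the matplotlib artists `ax`, `line1`, `line2`.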
    # response = binance_api.send_public_request('/api/v3/trades', payload={'symbol': symbol, "limit": 1000})
    response = binance_api.send_public_request('/fapi/v1/trades',
                                               payload={
                                                   'symbol': symbol,
                                                   "limit": 1000
                                               })

    df = DataProcessor.convert_trade_response_to_dataframe(response)
    # df["volume_buy"] = df[ df["isBuyerMaker"] == False ]["qty"].sum()
    # df["volume_sell"] = df[ df["isBuyerMaker"] == True ]["qty"].sum()
    # df["count_buy"] = df[ df["isBuyerMaker"] == False ].shape[0]
    # df["count_sell"] = df[ df["isBuyerMaker"] == True ].shape[0]
    # df["isBuyerMaker"][ df["isBuyerMaker"] == False ] = "buy"
    # df["isBuyerMaker"][ df["isBuyerMaker"] == True ] = "sell"
    # df.to_csv("current_trade_logs.csv")

    sum_volume = df[df["isBuyerMaker"] == False]["qty"].sum() - df[
        df["isBuyerMaker"] == True]["qty"].sum()
    sum_volume_deque.append(sum_volume)
    diff_price = df["price"].iloc[-1] - last_price
    last_price = df["price"].iloc[-1]
    diff_price_deque.append(diff_price)
    zero_deque.append(0)
    if i < max_lenth:
        x_deque.append(i)

    diff_volume = sum_volume - last_sum_volume
    last_sum_volume = sum_volume
    last_sum_volume_deque.append(str(int(diff_volume)))
    ax.set_title(int(sum_volume), loc="left", fontsize=20)
    ax.set_title(int(diff_volume), loc="right", fontsize=20)
    ax.xaxis.set_label_position('top')
    ax.xaxis.tick_top()
    ax.set_xlabel("    ".join(last_sum_volume_deque))
    line1.set_data(x_deque, sum_volume_deque)
    line2.set_data(x_deque, diff_price_deque)
    return [line1, line2]
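
# A minimal sketch of driving `calculate` (assumed wiring; `fig` is the figure
# that owns `ax`):
# from matplotlib import animation
# ani = animation.FuncAnimation(fig, calculate, interval=1000, blit=True)
# plt.show()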
Example #9
def test_on_guild_remove(gid, obj_guild):
    DataProcessor._on_guild_join(obj_guild)

    # Ensure that the size is 1
    df = DataConnector.read_data(
        "SELECT * FROM {}.guilds WHERE guild_id='{}'".format(SCHEMA_NAME, gid))
    assert df.shape[0] == 1
    df = DataConnector.read_data(
        "SELECT * FROM {}.days WHERE guild_id='{}'".format(SCHEMA_NAME, gid))
    assert df.shape[0] == 1

    # Remove the guild and verify none in guilds and days tables
    DataProcessor._on_guild_remove(obj_guild)
    df = DataConnector.read_data(
        "SELECT * FROM {}.guilds WHERE guild_id='{}'".format(SCHEMA_NAME, gid))
    assert df.shape[0] == 0
    df = DataConnector.read_data(
        "SELECT * FROM {}.days WHERE guild_id='{}'".format(SCHEMA_NAME, gid))
    assert df.shape[0] == 0

    DataProcessor._on_guild_join(obj_guild)
Example #10
            def valid(in_path, slot_path, intent_path):
                data_processor_valid = DataProcessor(in_path, slot_path,
                                                     intent_path, in_vocab,
                                                     slot_vocab, intent_vocab)

                pred_intents = []
                correct_intents = []
                slot_outputs = []
                correct_slots = []
                input_words = []

                gate_seq = []
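                # Evaluate batch by batch: collect the argmax intent per
                # utterance and the per-token slot labels (decoded through the
                # reverse vocabularies) until the processor reports the end.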
                while True:
                    in_data, slot_data, slot_weight, length, intents, in_seq, slot_seq, intent_seq = data_processor_valid.get_batch(
                        arg.batch_size)
                    if len(in_data) <= 0:
                        break
                    feed_dict = {
                        input_data.name: in_data,
                        sequence_length.name: length
                    }
                    ret = sess.run(inference_outputs, feed_dict)
                    for i in ret[0]:
                        pred_intents.append(np.argmax(i))
                    for i in intents:
                        correct_intents.append(i)

                    pred_slots = ret[1].reshape(
                        (slot_data.shape[0], slot_data.shape[1], -1))
                    for p, t, i, l in zip(pred_slots, slot_data, in_data,
                                          length):
                        if arg.use_crf:
                            p = p.reshape([-1])
                        else:
                            p = np.argmax(p, 1)
                        tmp_pred = []
                        tmp_correct = []
                        tmp_input = []
                        for j in range(l):
                            tmp_pred.append(slot_vocab['rev'][p[j]])
                            tmp_correct.append(slot_vocab['rev'][t[j]])
                            tmp_input.append(in_vocab['rev'][i[j]])

                        slot_outputs.append(tmp_pred)
                        correct_slots.append(tmp_correct)
                        input_words.append(tmp_input)

                    if data_processor_valid.end == 1:
                        break

                pred_intents = np.array(pred_intents)
                correct_intents = np.array(correct_intents)
                accuracy = (pred_intents == correct_intents)
                # Boolean per-utterance intent correctness; reused below as the
                # starting point for semantic accuracy (intent and all slots correct).
                semantic_acc = accuracy
                accuracy = accuracy.astype(float)
                accuracy = np.mean(accuracy) * 100.0

                index = 0
                for t, p in zip(correct_slots, slot_outputs):
                    # Process Semantic Error
                    if len(t) != len(p):
                        raise ValueError(
                            'Slot sequence length mismatch between prediction and gold')

                    for j in range(len(t)):
                        if p[j] != t[j]:
                            semantic_acc[index] = False
                            break
                    index += 1
                semantic_acc = semantic_acc.astype(float)
                semantic_acc = np.mean(semantic_acc) * 100.0

                f1, precision, recall = computeF1Score(correct_slots,
                                                       slot_outputs)
                if "test" in in_path:
                    print("save result_intent.out")
                    with open(str(epochs) + "intent.out", "w") as outfile:
                        for true, pred in zip(correct_intents, pred_intents):
                            outfile.write("{} {}\n".format(true, pred))

                    print("save slot.out")
                    with open(in_path) as infile:
                        data = infile.readlines()
                        lines = [line.split() for line in data]
                        with open(str(epochs) + "-slot.out", "w") as outfile:
                            print(len(lines), len(correct_slots),
                                  len(slot_outputs))
                            for i in range(len(lines)):
                                for w, true, pred in zip(
                                        lines[i], correct_slots[i],
                                        slot_outputs[i]):
                                    outfile.write("{} {} {}\n".format(
                                        w, true, pred))
                                outfile.write("\n")
                logging.info('slot f1: ' + str(f1))
                logging.info('intent accuracy: ' + str(accuracy))
                logging.info('semantic acc (intent and all slots correct): ' +
                             str(semantic_acc))

                data_processor_valid.close()
                return f1, accuracy, semantic_acc, pred_intents, correct_intents, slot_outputs, correct_slots, input_words, gate_seq
Example #11
    num_loss = 0
    step = 0
    no_improve = 0

    valid_slot = 0
    test_slot = 0
    valid_intent = 0
    test_intent = 0
    valid_err = 0
    test_err = 0
    best_epoch_num = 0
    while True:
        if data_processor is None:
            data_processor = DataProcessor(
                os.path.join(full_train_path, arg.input_file),
                os.path.join(full_train_path, arg.slot_file),
                os.path.join(full_train_path, arg.intent_file), in_vocab,
                slot_vocab, intent_vocab)
        in_data, slot_data, slot_weight, length, intents, _, _, _ = data_processor.get_batch(
            arg.batch_size)
        feed_dict = {
            input_data.name: in_data,
            slots.name: slot_data,
            slot_weights.name: slot_weight,
            sequence_length.name: length,
            intent.name: intents
        }
        ret = sess.run(training_outputs, feed_dict)
        #print(feed_dict)
        loss += np.mean(ret[1])
Example #12

###########################################
#### brake discs located at the right and left sides of a train ####
df['ComponentParentLocation'].unique()
df[df['ComponentParentLocation'] == 2]
df[df['ComponentParentLocation'].isna()]

mask = (df['ComponentParentLocation'] == 1) & (df['BrskChangeDate1'].notnull())
col_names = ['BrskThickness1', 'TotalPerformanceSnapshot']
thick_1 = df.loc[mask, col_names]

df_sort = df.sort_values('ComponentParentLocation')

cols = ['BrskThickness1', 'BrskThickness2', 'BrskThickness3', 'BrskThickness4']
data_prep = DataProcessor(df)
df_1 = data_prep.string_to_numeric(df, cols=cols)

def mu_sd(df, col_1):
    return df.groupby('ComponentParentLocation').agg({col_1: ['mean', 'std']})

df_1.groupby('ComponentParentLocation').agg({'BrskThickness1': ['mean', 'std']})
df_1.groupby('ComponentParentLocation')[cols].describe()
##########################################

columns_to_remove = ['PostID', 'BrskLatheDate1', 'BrskLatheDate2',
                    'BrskLatheDate3', 'BrskLatheDate4',
                    'ReportingDateTime', 'DataSavedInDBDateTime']
df = data_loader.remove_col(df, column_name_list=columns_to_remove)

# create a new column for brake disc 1 and 2 thickness
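# A hypothetical sketch of that column (the derivation itself was not part of
# this example; column names assumed from the pattern above):
df_1['BrskThickness_mean_1_2'] = df_1[['BrskThickness1', 'BrskThickness2']].mean(axis=1)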
Example #13
    line = 0
    num_loss = 0
    step = 0
    no_improve = 0

    # variables to store the highest values across epochs; only 'valid_err' is used for now
    valid_slot = 0
    test_slot = 0
    valid_intent = 0
    test_intent = 0
    valid_err = 0
    test_err = 0

    while True:
        if data_processor is None:
            data_processor = DataProcessor(
                os.path.join(full_train_path, arg.input_file),
                os.path.join(full_train_path, arg.slot_file),
                os.path.join(full_train_path, arg.intent_file), in_vocab,
                slot_vocab, intent_vocab)
        in_data, slot_data, slot_weight, length, intents, _, _, _ = \
            data_processor.get_batch(arg.batch_size)
        feed_dict = {
            input_data.name: in_data,
            slots.name: slot_data,
            slot_weights.name: slot_weight,
            sequence_length.name: length,
            intent.name: intents
        }
        ret = sess.run(training_outputs, feed_dict)
        loss += np.mean(ret[1])

        line += arg.batch_size
        step = ret[0]
        num_loss += 1

        if data_processor.end == 1:
            line = 0
            data_processor.close()
            data_processor = None
            epochs += 1
            logging.info('Step: ' + str(step))
Example #14
    epochs = 0
    eval_loss = 0.0
    data_processor = None
    line = 0
    num_loss = 0
    step = 0
    no_improve = 0

    valid_err = 1
    test_err = 1

    while True:
        if data_processor is None:
            data_processor = DataProcessor(full_train_path,
                                           in_vocab,
                                           shuffle=True)
        a_ids_data, a_context_ids_data, a_keyword_index, a_len_data, n_ids_data, \
            n_context_ids_data, n_keyword_index, n_len_data, y = \
            data_processor.get_batch_siamese(arg)
        if len(a_ids_data) != 0:
            feed_dict = {
                model.input_a.name: a_ids_data,
                model.input_a_context.name: a_context_ids_data,
                model.input_a_keyword_index.name: a_keyword_index,
                model.input_a_len.name: a_len_data,
                model.input_n.name: n_ids_data,
                model.input_n_context.name: n_context_ids_data,
                model.input_n_keyword_index.name: n_keyword_index,
                model.input_n_len.name: n_len_data,
                model.input_y.name: y
            }
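            # Presumably followed, as in the sibling training loops above, by:
            # ret = sess.run(training_outputs, feed_dict)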
Example #15
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--bert_model",
        default='bert-base-uncased',
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument(
        '--task',
        type=str,
        default=None,
        required=True,
        help="Task code in {hotpot_open, hotpot_distractor, squad, nq}")

    # Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=378,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=1,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=5,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam. (def: 5e-5)")
    parser.add_argument("--num_train_epochs",
                        default=5.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument('--local_rank', default=-1, type=int)

    # RNN graph retriever-specific parameters
    parser.add_argument("--example_limit", default=None, type=int)

    parser.add_argument("--max_para_num", default=10, type=int)
    parser.add_argument(
        "--neg_chunk",
        default=8,
        type=int,
        help="The chunk size of negative examples during training (to "
        "reduce GPU memory consumption with negative sampling)")
    parser.add_argument(
        "--eval_chunk",
        default=100000,
        type=int,
        help=
        "The chunk size of evaluation examples (to reduce RAM consumption during evaluation)"
    )
    parser.add_argument(
        "--split_chunk",
        default=300,
        type=int,
        help=
        "The chunk size of BERT encoding during inference (to reduce GPU memory consumption)"
    )

    parser.add_argument('--train_file_path',
                        type=str,
                        default=None,
                        help="File path to the training data")
    parser.add_argument('--dev_file_path',
                        type=str,
                        default=None,
                        help="File path to the eval data")

    parser.add_argument('--beam', type=int, default=1, help="Beam size")
    parser.add_argument('--min_select_num',
                        type=int,
                        default=1,
                        help="Minimum number of selected paragraphs")
    parser.add_argument('--max_select_num',
                        type=int,
                        default=3,
                        help="Maximum number of selected paragraphs")
    parser.add_argument(
        "--use_redundant",
        action='store_true',
        help="Whether to use simulated seqs (only for training)")
    parser.add_argument(
        "--use_multiple_redundant",
        action='store_true',
        help="Whether to use multiple simulated seqs (only for training)")
    parser.add_argument(
        '--max_redundant_num',
        type=int,
        default=100000,
        help=
        "Whether to limit the number of the initial TF-IDF pool (only for open-domain eval)"
    )
    parser.add_argument(
        "--no_links",
        action='store_true',
        help=
        "Whether to omit any links (or in other words, only use TF-IDF-based paragraphs)"
    )
    parser.add_argument("--pruning_by_links",
                        action='store_true',
                        help="Whether to do pruning by links (and top 1)")
    parser.add_argument(
        "--expand_links",
        action='store_true',
        help=
        "Whether to expand links with paragraphs in the same article (for NQ)")
    parser.add_argument(
        '--tfidf_limit',
        type=int,
        default=None,
        help=
        "Whether to limit the number of the initial TF-IDF pool (only for open-domain eval)"
    )

    parser.add_argument("--pred_file",
                        default=None,
                        type=str,
                        help="File name to write paragraph selection results")
    parser.add_argument("--tagme",
                        action='store_true',
                        help="Whether to use tagme at inference")
    parser.add_argument(
        '--topk',
        type=int,
        default=2,
        help="Whether to use how many paragraphs from the previous steps")

    parser.add_argument(
        "--model_suffix",
        default=None,
        type=str,
        help="Suffix to load a model file ('pytorch_model_' + suffix +'.bin')")

    parser.add_argument("--db_save_path",
                        default=None,
                        type=str,
                        help="File path to DB")
    parser.add_argument("--fp16", default=False, action='store_true')
    parser.add_argument("--fp16_opt_level", default="O1", type=str)
    parser.add_argument("--do_label",
                        default=False,
                        action='store_true',
                        help="For pre-processing features only.")

    parser.add_argument("--oss_cache_dir", default=None, type=str)
    parser.add_argument("--cache_dir", default=None, type=str)
    parser.add_argument("--dist",
                        default=False,
                        action='store_true',
                        help='use distributed training.')
    parser.add_argument("--save_steps", default=5000, type=int)
    parser.add_argument("--resume", default=None, type=int)
    parser.add_argument("--oss_pretrain", default=None, type=str)
    parser.add_argument("--model_version", default='v1', type=str)
    parser.add_argument("--disable_rnn_layer_norm",
                        default=False,
                        action='store_true')

    args = parser.parse_args()

    if args.dist:
        dist.init_process_group(backend='nccl')
        print(f"local rank: {args.local_rank}")
        print(f"global rank: {dist.get_rank()}")
        print(f"world size: {dist.get_world_size()}")

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        dist.init_process_group(backend='nccl')

    if args.dist:
        global_rank = dist.get_rank()
        world_size = dist.get_world_size()
        if world_size > 1:
            args.local_rank = global_rank

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if args.train_file_path is not None:
        do_train = True

        if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
            raise ValueError(
                "Output directory ({}) already exists and is not empty.".
                format(args.output_dir))
        if args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir, exist_ok=True)

    elif args.dev_file_path is not None:
        do_train = False

    else:
        raise ValueError(
            'One of train_file_path: {} or dev_file_path: {} must be non-None'.
            format(args.train_file_path, args.dev_file_path))

    processor = DataProcessor()

    # Configurations of the graph retriever
    graph_retriever_config = GraphRetrieverConfig(
        example_limit=args.example_limit,
        task=args.task,
        max_seq_length=args.max_seq_length,
        max_select_num=args.max_select_num,
        max_para_num=args.max_para_num,
        tfidf_limit=args.tfidf_limit,
        train_file_path=args.train_file_path,
        use_redundant=args.use_redundant,
        use_multiple_redundant=args.use_multiple_redundant,
        max_redundant_num=args.max_redundant_num,
        dev_file_path=args.dev_file_path,
        beam=args.beam,
        min_select_num=args.min_select_num,
        no_links=args.no_links,
        pruning_by_links=args.pruning_by_links,
        expand_links=args.expand_links,
        eval_chunk=args.eval_chunk,
        tagme=args.tagme,
        topk=args.topk,
        db_save_path=args.db_save_path,
        disable_rnn_layer_norm=args.disable_rnn_layer_norm)

    logger.info(graph_retriever_config)
    logger.info(args)

    tokenizer = AutoTokenizer.from_pretrained(args.bert_model)

    if args.model_version == 'roberta':
        from modeling_graph_retriever_roberta import RobertaForGraphRetriever
    elif args.model_version == 'v3':
        from modeling_graph_retriever_roberta import RobertaForGraphRetrieverIterV3 as RobertaForGraphRetriever
    else:
        raise RuntimeError('Unknown model_version: {}'.format(args.model_version))

    ##############################
    # Training                   #
    ##############################
    if do_train:
        _model_state_dict = None
        if args.oss_pretrain is not None:
            _model_state_dict = torch.load(load_pretrain_from_oss(
                args.oss_pretrain),
                                           map_location='cpu')
            logger.info(f"Loaded pretrained model from {args.oss_pretrain}")

        if args.resume is not None:
            _model_state_dict = torch.load(load_buffer_from_oss(
                os.path.join(args.oss_cache_dir,
                             f"pytorch_model_{args.resume}.bin")),
                                           map_location='cpu')

        model = RobertaForGraphRetriever.from_pretrained(
            args.bert_model,
            graph_retriever_config=graph_retriever_config,
            state_dict=_model_state_dict)

        model.to(device)

        global_step = 0

        POSITIVE = 1.0
        NEGATIVE = 0.0
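        # Per-step target values: the gold paragraph at each retrieval step and
        # the end-of-evidence (EOE) symbol get POSITIVE; all others stay NEGATIVE.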

        _cache_file_name = f"cache_roberta_train_{args.max_seq_length}_{args.max_para_num}"
        _examples_cache_file_name = f"examples_{_cache_file_name}"
        _features_cache_file_name = f"features_{_cache_file_name}"

        # Load training examples
        logger.info(f"Loading training examples and features.")
        try:
            if args.cache_dir is not None and os.path.exists(
                    os.path.join(args.cache_dir, _features_cache_file_name)):
                logger.info(
                    f"Loading pre-processed features from {os.path.join(args.cache_dir, _features_cache_file_name)}"
                )
                train_features = torch.load(
                    os.path.join(args.cache_dir, _features_cache_file_name))
            else:
                # train_examples = torch.load(load_buffer_from_oss(os.path.join(oss_features_cache_dir,
                #                                                               _examples_cache_file_name)))
                train_features = torch.load(
                    load_buffer_from_oss(
                        os.path.join(oss_features_cache_dir,
                                     _features_cache_file_name)))
                logger.info(
                    f"Pre-processed features are loaded from oss: "
                    f"{os.path.join(oss_features_cache_dir, _features_cache_file_name)}"
                )
        except Exception:
            train_examples = processor.get_train_examples(
                graph_retriever_config)
            train_features = convert_examples_to_features(
                train_examples,
                args.max_seq_length,
                args.max_para_num,
                graph_retriever_config,
                tokenizer,
                train=True)
            logger.info(
                f"Saving pre-processed features into oss: {oss_features_cache_dir}"
            )
            torch_save_to_oss(
                train_examples,
                os.path.join(oss_features_cache_dir,
                             _examples_cache_file_name))
            torch_save_to_oss(
                train_features,
                os.path.join(oss_features_cache_dir,
                             _features_cache_file_name))

        if args.do_label:
            logger.info("Finished.")
            return

        # len(train_examples) and len(train_features) can be different, depending on the redundant setting
        num_train_steps = int(
            len(train_features) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'layer_norm']
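        # Exclude biases and LayerNorm parameters from weight decay, per the
        # standard BERT fine-tuning recipe.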
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        t_total = num_train_steps
        if args.local_rank != -1:
            t_total = t_total // dist.get_world_size()

        optimizer = AdamW(optimizer_grouped_parameters,
                          betas=(0.9, 0.98),
                          lr=args.learning_rate)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, int(t_total * args.warmup_proportion), t_total)

        logger.info(optimizer)
        if args.fp16:
            from apex import amp
            amp.register_half_function(torch, "einsum")

            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=args.fp16_opt_level)

        if args.local_rank != -1:
            if args.fp16_opt_level == 'O2':
                try:
                    import apex
                    model = apex.parallel.DistributedDataParallel(
                        model, delay_allreduce=True)
                except ImportError:
                    model = torch.nn.parallel.DistributedDataParallel(
                        model, find_unused_parameters=True)
            else:
                model = torch.nn.parallel.DistributedDataParallel(
                    model, find_unused_parameters=True)

        if n_gpu > 1:
            model = torch.nn.DataParallel(model)

        if args.resume is not None:
            _amp_state_dict = os.path.join(args.oss_cache_dir,
                                           f"amp_{args.resume}.bin")
            _optimizer_state_dict = os.path.join(
                args.oss_cache_dir, f"optimizer_{args.resume}.pt")
            _scheduler_state_dict = os.path.join(
                args.oss_cache_dir, f"scheduler_{args.resume}.pt")

            amp.load_state_dict(
                torch.load(load_buffer_from_oss(_amp_state_dict)))
            optimizer.load_state_dict(
                torch.load(load_buffer_from_oss(_optimizer_state_dict)))
            scheduler.load_state_dict(
                torch.load(load_buffer_from_oss(_scheduler_state_dict)))

            logger.info(f"Loaded resumed state dict of step {args.resume}")

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_features))
        logger.info("  Instantaneous batch size per GPU = %d",
                    args.train_batch_size)
        logger.info(
            "  Total train batch size (w. parallel, distributed & accumulation) = %d",
            args.train_batch_size * args.gradient_accumulation_steps *
            (dist.get_world_size() if args.local_rank != -1 else 1),
        )
        logger.info("  Gradient Accumulation steps = %d",
                    args.gradient_accumulation_steps)
        logger.info("  Total optimization steps = %d", t_total)

        model.train()
        epc = 0
        # Save the initial optimizer/scheduler (and amp) state at step 0
        if args.local_rank in [-1, 0]:
            if args.fp16:
                amp_file = os.path.join(args.oss_cache_dir,
                                        f"amp_{global_step}.bin")
                torch_save_to_oss(amp.state_dict(), amp_file)
            optimizer_file = os.path.join(args.oss_cache_dir,
                                          f"optimizer_{global_step}.pt")
            torch_save_to_oss(optimizer.state_dict(), optimizer_file)
            scheduler_file = os.path.join(args.oss_cache_dir,
                                          f"scheduler_{global_step}.pt")
            torch_save_to_oss(scheduler.state_dict(), scheduler_file)

        tr_loss = 0
        for _ in range(int(args.num_train_epochs)):
            logger.info('Epoch ' + str(epc + 1))

            TOTAL_NUM = len(train_features)
            train_start_index = 0
            CHUNK_NUM = 8
            train_chunk = TOTAL_NUM // CHUNK_NUM
            chunk_index = 0

            random.shuffle(train_features)

            save_retry = False
            while train_start_index < TOTAL_NUM:
                train_end_index = min(train_start_index + train_chunk - 1,
                                      TOTAL_NUM - 1)
                chunk_len = train_end_index - train_start_index + 1

                if args.resume is not None and global_step < args.resume:
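                    # When resuming, fast-forward `global_step` past chunks
                    # that were already trained instead of rebuilding their
                    # tensors.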
                    _chunk_steps = int(
                        math.ceil(chunk_len * 1.0 / args.train_batch_size /
                                  (1 if args.local_rank == -1 else
                                   dist.get_world_size())))
                    _chunk_steps = _chunk_steps // args.gradient_accumulation_steps
                    if global_step + _chunk_steps <= args.resume:
                        global_step += _chunk_steps
                        train_start_index = train_end_index + 1
                        continue

                train_features_ = train_features[
                    train_start_index:train_start_index + chunk_len]

                all_input_ids = torch.tensor(
                    [f.input_ids for f in train_features_], dtype=torch.long)
                all_input_masks = torch.tensor(
                    [f.input_masks for f in train_features_], dtype=torch.long)
                all_segment_ids = torch.tensor(
                    [f.segment_ids for f in train_features_], dtype=torch.long)
                all_output_masks = torch.tensor(
                    [f.output_masks for f in train_features_],
                    dtype=torch.float)
                all_num_paragraphs = torch.tensor(
                    [f.num_paragraphs for f in train_features_],
                    dtype=torch.long)
                all_num_steps = torch.tensor(
                    [f.num_steps for f in train_features_], dtype=torch.long)
                train_data = TensorDataset(all_input_ids, all_input_masks,
                                           all_segment_ids, all_output_masks,
                                           all_num_paragraphs, all_num_steps)

                if args.local_rank != -1:
                    train_sampler = torch.utils.data.DistributedSampler(
                        train_data)
                else:
                    train_sampler = RandomSampler(train_data)
                train_dataloader = DataLoader(train_data,
                                              sampler=train_sampler,
                                              batch_size=args.train_batch_size,
                                              pin_memory=True,
                                              num_workers=4)

                if args.local_rank != -1:
                    train_dataloader.sampler.set_epoch(epc)

                logger.info('Examples from ' + str(train_start_index) +
                            ' to ' + str(train_end_index))
                for step, batch in enumerate(
                        tqdm(train_dataloader,
                             desc="Iteration",
                             disable=args.local_rank not in [-1, 0])):
                    if args.resume is not None and global_step < args.resume:
                        if (step + 1) % args.gradient_accumulation_steps == 0:
                            global_step += 1
                        continue

                    input_masks = batch[1]
                    batch_max_len = input_masks.sum(dim=2).max().item()

                    num_paragraphs = batch[4]
                    batch_max_para_num = num_paragraphs.max().item()

                    num_steps = batch[5]
                    batch_max_steps = num_steps.max().item()

                    # output_masks_cpu = (batch[3])[:, :batch_max_steps, :batch_max_para_num + 1]

                    batch = tuple(t.to(device) for t in batch)
                    input_ids, input_masks, segment_ids, output_masks, _, _ = batch
                    B = input_ids.size(0)

                    input_ids = input_ids[:, :batch_max_para_num, :
                                          batch_max_len]
                    input_masks = input_masks[:, :batch_max_para_num, :
                                              batch_max_len]
                    segment_ids = segment_ids[:, :batch_max_para_num, :
                                              batch_max_len]
                    output_masks = output_masks[:, :batch_max_steps, :
                                                batch_max_para_num +
                                                1]  # 1 for EOE

                    target = torch.zeros(output_masks.size()).fill_(
                        NEGATIVE)  # (B, NUM_STEPS, |P|+1) <- 1 for EOE
                    for i in range(B):
                        output_masks[i, :num_steps[i], -1] = 1.0  # for EOE

                        for j in range(num_steps[i].item() - 1):
                            target[i, j, j].fill_(POSITIVE)

                        target[i, num_steps[i] - 1, -1].fill_(POSITIVE)
                    target = target.to(device)

                    neg_start = batch_max_steps - 1
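                    # Process negatives in chunks of args.neg_chunk to bound
                    # GPU memory; each chunk is concatenated with the gold-path
                    # paragraphs before the forward/backward pass.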
                    while neg_start < batch_max_para_num:
                        neg_end = min(neg_start + args.neg_chunk - 1,
                                      batch_max_para_num - 1)
                        neg_len = (neg_end - neg_start + 1)

                        input_ids_ = torch.cat(
                            (input_ids[:, :batch_max_steps - 1, :],
                             input_ids[:, neg_start:neg_start + neg_len, :]),
                            dim=1)
                        input_masks_ = torch.cat(
                            (input_masks[:, :batch_max_steps - 1, :],
                             input_masks[:, neg_start:neg_start + neg_len, :]),
                            dim=1)
                        segment_ids_ = torch.cat(
                            (segment_ids[:, :batch_max_steps - 1, :],
                             segment_ids[:, neg_start:neg_start + neg_len, :]),
                            dim=1)
                        output_masks_ = torch.cat(
                            (output_masks[:, :, :batch_max_steps - 1],
                             output_masks[:, :, neg_start:neg_start + neg_len],
                             output_masks[:, :, batch_max_para_num:
                                          batch_max_para_num + 1]),
                            dim=2)
                        target_ = torch.cat(
                            (target[:, :, :batch_max_steps - 1],
                             target[:, :, neg_start:neg_start + neg_len],
                             target[:, :,
                                    batch_max_para_num:batch_max_para_num +
                                    1]),
                            dim=2)

                        if neg_start != batch_max_steps - 1:
                            output_masks_[:, :, :batch_max_steps - 1] = 0.0
                            output_masks_[:, :, -1] = 0.0

                        loss = model(input_ids_, segment_ids_, input_masks_,
                                     output_masks_, target_, batch_max_steps)

                        if n_gpu > 1:
                            loss = loss.mean(
                            )  # mean() to average on multi-gpu.
                        if args.gradient_accumulation_steps > 1:
                            loss = loss / args.gradient_accumulation_steps

                        if args.fp16:
                            with amp.scale_loss(loss,
                                                optimizer) as scaled_loss:
                                scaled_loss.backward()
                        else:
                            loss.backward()

                        tr_loss += loss.item()
                        neg_start = neg_end + 1

                        # del input_ids_
                        # del input_masks_
                        # del segment_ids_
                        # del output_masks_
                        # del target_

                    if (step + 1) % args.gradient_accumulation_steps == 0:

                        if args.fp16:
                            torch.nn.utils.clip_grad_norm_(
                                amp.master_params(optimizer), 1.0)
                        else:
                            torch.nn.utils.clip_grad_norm_(
                                model.parameters(), 1.0)

                        optimizer.step()
                        scheduler.step()
                        # optimizer.zero_grad()
                        model.zero_grad()
                        global_step += 1

                        if global_step % 50 == 0:
                            _cur_steps = global_step if args.resume is None else global_step - args.resume
                            logger.info(
                                f"Training loss: {tr_loss / _cur_steps}\t"
                                f"Learning rate: {scheduler.get_lr()[0]}\t"
                                f"Global step: {global_step}")

                        if global_step % args.save_steps == 0:
                            if args.local_rank in [-1, 0]:
                                model_to_save = model.module if hasattr(
                                    model, 'module') else model
                                output_model_file = os.path.join(
                                    args.oss_cache_dir,
                                    f"pytorch_model_{global_step}.bin")
                                torch_save_to_oss(model_to_save.state_dict(),
                                                  output_model_file)

                            _suffix = "" if args.local_rank == -1 else f"_{args.local_rank}"
                            if args.fp16:
                                amp_file = os.path.join(
                                    args.oss_cache_dir,
                                    f"amp_{global_step}{_suffix}.bin")
                                torch_save_to_oss(amp.state_dict(), amp_file)
                            optimizer_file = os.path.join(
                                args.oss_cache_dir,
                                f"optimizer_{global_step}{_suffix}.pt")
                            torch_save_to_oss(optimizer.state_dict(),
                                              optimizer_file)
                            scheduler_file = os.path.join(
                                args.oss_cache_dir,
                                f"scheduler_{global_step}{_suffix}.pt")
                            torch_save_to_oss(scheduler.state_dict(),
                                              scheduler_file)

                            logger.info(
                                f"checkpoint of step {global_step} is saved to oss."
                            )

                    # del input_ids
                    # del input_masks
                    # del segment_ids
                    # del output_masks
                    # del target
                    # del batch

                chunk_index += 1
                train_start_index = train_end_index + 1

                # Save the model at the half of the epoch
                if (chunk_index == CHUNK_NUM // 2
                        or save_retry) and args.local_rank in [-1, 0]:
                    status = save(model, args.output_dir, str(epc + 0.5))
                    save_retry = (not status)

                del train_features_
                del all_input_ids
                del all_input_masks
                del all_segment_ids
                del all_output_masks
                del all_num_paragraphs
                del all_num_steps
                del train_data
                del train_sampler
                del train_dataloader
                gc.collect()

            # Save the model at the end of the epoch
            if args.local_rank in [-1, 0]:
                save(model, args.output_dir, str(epc + 1))
                # model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
                # output_model_file = os.path.join(args.oss_cache_dir, "pytorch_model_" + str(epc + 1) + ".bin")
                # torch_save_to_oss(model_to_save.state_dict(), output_model_file)

            epc += 1

    if do_train:
        return

    ##############################
    # Evaluation                 #
    ##############################
    assert args.model_suffix is not None

    if graph_retriever_config.db_save_path is not None:
        import sys
        sys.path.append('../')
        from pipeline.tfidf_retriever import TfidfRetriever
        tfidf_retriever = TfidfRetriever(graph_retriever_config.db_save_path,
                                         None)
    else:
        tfidf_retriever = None

    if args.oss_cache_dir is not None:
        file_name = 'pytorch_model_' + args.model_suffix + '.bin'
        model_state_dict = torch.load(
            load_buffer_from_oss(os.path.join(args.oss_cache_dir, file_name)))
    else:
        model_state_dict = load(args.output_dir, args.model_suffix)

    model = RobertaForGraphRetriever.from_pretrained(
        args.bert_model,
        state_dict=model_state_dict,
        graph_retriever_config=graph_retriever_config)
    model.to(device)

    model.eval()

    if args.pred_file is not None:
        pred_output = []

    eval_examples = processor.get_dev_examples(graph_retriever_config)

    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_examples))
    logger.info("  Batch size = %d", args.eval_batch_size)

    TOTAL_NUM = len(eval_examples)
    eval_start_index = 0

    while eval_start_index < TOTAL_NUM:
        eval_end_index = min(
            eval_start_index + graph_retriever_config.eval_chunk - 1,
            TOTAL_NUM - 1)
        chunk_len = eval_end_index - eval_start_index + 1

        eval_features = convert_examples_to_features(
            eval_examples[eval_start_index:eval_start_index + chunk_len],
            args.max_seq_length, args.max_para_num, graph_retriever_config,
            tokenizer)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_masks = torch.tensor([f.input_masks for f in eval_features],
                                       dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_output_masks = torch.tensor(
            [f.output_masks for f in eval_features], dtype=torch.float)
        all_num_paragraphs = torch.tensor(
            [f.num_paragraphs for f in eval_features], dtype=torch.long)
        all_num_steps = torch.tensor([f.num_steps for f in eval_features],
                                     dtype=torch.long)
        all_ex_indices = torch.tensor([f.ex_index for f in eval_features],
                                      dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_masks,
                                  all_segment_ids, all_output_masks,
                                  all_num_paragraphs, all_num_steps,
                                  all_ex_indices)

        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        for input_ids, input_masks, segment_ids, output_masks, num_paragraphs, num_steps, ex_indices in tqdm(
                eval_dataloader, desc="Evaluating"):
            batch_max_len = input_masks.sum(dim=2).max().item()
            batch_max_para_num = num_paragraphs.max().item()

            batch_max_steps = num_steps.max().item()

            input_ids = input_ids[:, :batch_max_para_num, :batch_max_len]
            input_masks = input_masks[:, :batch_max_para_num, :batch_max_len]
            segment_ids = segment_ids[:, :batch_max_para_num, :batch_max_len]
            output_masks = output_masks[:, :batch_max_para_num +
                                        2, :batch_max_para_num + 1]
            output_masks[:, 1:, -1] = 1.0  # Ignore EOE in the first step

            input_ids = input_ids.to(device)
            input_masks = input_masks.to(device)
            segment_ids = segment_ids.to(device)
            output_masks = output_masks.to(device)

            examples = [
                eval_examples[eval_start_index + ex_indices[i].item()]
                for i in range(input_ids.size(0))
            ]

            with torch.no_grad():
                pred, prob, topk_pred, topk_prob = model.beam_search(
                    input_ids,
                    segment_ids,
                    input_masks,
                    examples=examples,
                    tokenizer=tokenizer,
                    retriever=tfidf_retriever,
                    split_chunk=args.split_chunk)

            for i in range(len(pred)):
                e = examples[i]
                titles = [e.title_order[p] for p in pred[i]]

                # Output predictions to a file
                if args.pred_file is not None:
                    pred_output.append({})
                    pred_output[-1]['q_id'] = e.guid

                    pred_output[-1]['titles'] = titles
                    pred_output[-1]['probs'] = []
                    for prob_ in prob[i]:
                        entry = {'EOE': prob_[-1]}
                        for j in range(len(e.title_order)):
                            entry[e.title_order[j]] = prob_[j]
                        pred_output[-1]['probs'].append(entry)

                    topk_titles = [[e.title_order[p] for p in topk_pred[i][j]]
                                   for j in range(len(topk_pred[i]))]
                    pred_output[-1]['topk_titles'] = topk_titles

                    topk_probs = []
                    for k in range(len(topk_prob[i])):
                        topk_probs.append([])
                        for prob_ in topk_prob[i][k]:
                            entry = {'EOE': prob_[-1]}
                            for j in range(len(e.title_order)):
                                entry[e.title_order[j]] = prob_[j]
                            topk_probs[-1].append(entry)
                    pred_output[-1]['topk_probs'] = topk_probs

                    # Output the selected paragraphs
                    context = {}
                    for ts in topk_titles:
                        for t in ts:
                            context[t] = e.all_paras[t]
                    pred_output[-1]['context'] = context

        eval_start_index = eval_end_index + 1

        del eval_features
        del all_input_ids
        del all_input_masks
        del all_segment_ids
        del all_output_masks
        del all_num_paragraphs
        del all_num_steps
        del all_ex_indices
        del eval_data

    if args.pred_file is not None:
        with open(args.pred_file, 'w') as f:
            json.dump(pred_output, f)
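# The evaluation loop above materializes features for at most eval_chunk
# examples at a time. The chunk index arithmetic in isolation, as a small
# runnable sketch (the chunk size here is illustrative):
def iter_chunks(total_num, chunk):
    start = 0
    while start < total_num:
        end = min(start + chunk - 1, total_num - 1)  # inclusive end index
        yield start, end - start + 1                 # (start, chunk_len)
        start = end + 1

assert list(iter_chunks(10, 4)) == [(0, 4), (4, 4), (8, 2)]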
Example #16
    def __init__(
            self,
            args: DataTrainingArguments,
            op_args: GeneralArguments,
            tokenizer: PreTrainedTokenizer,
            limit_length: Optional[int] = None,
            mode: Split = Split.train,
            cache_dir: Optional[str] = None,
    ):
        self.args = args
        self.processor = DataProcessor()
        self.output_mode = op_args.output_mode

        self.processor.set_labels(op_args._labels())

        if isinstance(mode, str):
            try:
                mode = Split[mode]
            except KeyError:
                raise KeyError("mode is not a valid split name")

        # Load data features from cache or dataset file
        cached_features_file = os.path.join(
            cache_dir if cache_dir is not None else args.data_dir,
            "cached_{}_{}_{}_{}".format(
                mode.value,
                tokenizer.__class__.__name__,
                str(args.max_seq_length),
                args.task_name,
            )
        )

        label_list = self.processor.get_labels()
        if args.task_name in ["mnli", "mnli-mm"] and tokenizer.__class__ in (
                RobertaTokenizer,
                RobertaTokenizerFast,
                XLMRobertaTokenizer,
                BartTokenizer,
                BartTokenizerFast,
        ):
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1]
        self.label_list = label_list

        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        lock_path = cached_features_file + ".lock"
        with FileLock(lock_path):

            if os.path.exists(cached_features_file) and not args.overwrite_cache:
                start = time.time()
                self.features = torch.load(cached_features_file)
                logger.info(
                    "Loading features from cached file %s [took %.3f s]", cached_features_file, time.time() - start
                )
            else:
                logger.info(f"Creating features from dataset file at {args.data_dir}")
                label_list = self.processor.get_labels()

                if mode.value == 'train':
                    examples = self.processor.get_train_examples(args.data_dir)
                elif mode.value == 'dev':
                    # note: the 'dev' split reuses the test examples here
                    examples = self.processor.get_test_examples(args.data_dir)
                elif mode.value == 'test':
                    examples = self.processor.get_test_examples(args.data_dir)
                else:
                    examples = None

                if limit_length is not None:
                    examples = examples[:limit_length]

                self.features = convert_examples_to_features(
                    examples,
                    tokenizer,
                    max_length=args.max_seq_length,
                    task=args.task_name,
                    label_list=label_list,
                    output_mode=self.output_mode,
                )
                start = time.time()
                torch.save(self.features, cached_features_file)
                # ^ This seems to take a lot of time so I want to investigate why and how we can improve.
                logger.info(
                    "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
                )
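# The FileLock-plus-cache pattern used above, in isolation: the first process
# builds and saves the features while any others block on the lock and then
# load the cached file. A minimal runnable sketch (path and payload are
# illustrative stand-ins):
import os
import torch
from filelock import FileLock

cache_file = "/tmp/cached_features_demo.pt"
with FileLock(cache_file + ".lock"):
    if os.path.exists(cache_file):
        features = torch.load(cache_file)
    else:
        features = [1, 2, 3]  # stand-in for convert_examples_to_features(...)
        torch.save(features, cache_file)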
Example #17
            def valid(in_path, slot_path, intent_path):
                data_processor_valid = DataProcessor(in_path,
                                                     slot_path,
                                                     intent_path,
                                                     in_vocab,
                                                     slot_vocab,
                                                     intent_vocab,
                                                     use_bert=arg.use_bert)

                pred_intents = []
                correct_intents = []
                slot_outputs = []
                correct_slots = []
                input_words = []

                while True:
                    in_data, slot_data, slot_weight, length, intents, in_seq, slot_seq, intent_seq = data_processor_valid.get_batch(
                        arg.batch_size)

                    input_seq_embeddings = np.empty(
                        shape=[0, 0, arg.embed_dim])
                    if arg.use_bert:
                        input_seq_embeddings = get_bert_embeddings(in_seq)

                    feed_dict = {
                        input_data.name: in_data,
                        sequence_length.name: length,
                        input_sequence_embeddings.name: input_seq_embeddings
                    }

                    if len(in_data) != 0:
                        ret = sess.run(inference_outputs, feed_dict)
                        for i in ret[2]:
                            pred_intents.append(np.argmax(i))
                        for i in intents:
                            correct_intents.append(i)

                        pred_slots = ret[3][-1, :, :, :].reshape(
                            (slot_data.shape[0], slot_data.shape[1], -1))
                        for p, t, i, l, s in zip(pred_slots, slot_data,
                                                 in_data, length, slot_seq):
                            p = np.argmax(p, 1)
                            tmp_pred = []
                            tmp_correct = []
                            tmp_input = []
                            for j in range(l):
                                tmp_pred.append(slot_vocab['rev'][p[j]])
                                tmp_correct.append(slot_vocab['rev'][t[j]])
                                tmp_input.append(in_vocab['rev'][i[j]])

                            slot_outputs.append(tmp_pred)
                            correct_slots.append(tmp_correct)
                            input_words.append(tmp_input)
                    if data_processor_valid.end == 1:
                        break
                pred_intents = np.array(pred_intents)
                correct_intents = np.array(correct_intents)
                from sklearn.metrics import classification_report
                logging.info(
                    classification_report(y_true=correct_intents,
                                          y_pred=pred_intents,
                                          digits=4))
                accuracy = (pred_intents == correct_intents)
                # note: semantic_error aliases this bool array; entries are
                # flipped to False below whenever any slot tag is wrong, so it
                # ends up holding per-utterance exact-match flags
                semantic_error = accuracy
                accuracy = accuracy.astype(float)
                accuracy = np.mean(accuracy) * 100.0

                index = 0
                for t, p in zip(correct_slots, slot_outputs):
                    # Process Semantic Error
                    if len(t) != len(p):
                        raise ValueError('Slot prediction and reference differ in length')

                    for j in range(len(t)):
                        if p[j] != t[j]:
                            semantic_error[index] = False
                            break
                    index += 1
                semantic_error = semantic_error.astype(float)
                semantic_error = np.mean(semantic_error) * 100.0

                f1, precision, recall = computeF1Score(correct_slots,
                                                       slot_outputs)
                logging.info('slot f1: ' + str(f1))
                logging.info('intent accuracy: ' + str(accuracy))
                logging.info(
                    'semantic error(intent, slots are all correct): ' +
                    str(semantic_error))

                return f1, accuracy, semantic_error, pred_intents, correct_intents, slot_outputs, correct_slots, input_words
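# Toy check of the exact-match ("semantic") metric computed above: an
# utterance counts only if the intent and every slot tag are correct
# (values illustrative):
import numpy as np

pred_intents = np.array([0, 1, 1])
true_intents = np.array([0, 1, 0])
pred_slots = [['O', 'B-loc'], ['O'], ['O']]
true_slots = [['O', 'B-loc'], ['B-date'], ['O']]

exact = (pred_intents == true_intents)
for idx, (t, p) in enumerate(zip(true_slots, pred_slots)):
    if any(tj != pj for tj, pj in zip(t, p)):
        exact[idx] = False
print(exact.astype(float).mean() * 100.0)  # 33.3...: only the first utterance is fully correct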
Example #18
            # Optionally rewrite the training data with UNK tokens first
            if arg.use_unk:
                unker = UNKer(os.path.join(full_train_path, arg.input_file),
                              os.path.join(
                                  full_train_path,
                                  arg.input_file + ".unk." + arg.unk_priority),
                              os.path.join(full_train_path, arg.slot_file),
                              ratio=arg.unk_ratio,
                              threshold=arg.unk_threshold,
                              priority=arg.unk_priority)
                data_processor = DataProcessor(
                    os.path.join(full_train_path,
                                 arg.input_file + ".unk." + arg.unk_priority),
                    os.path.join(full_train_path, arg.slot_file),
                    os.path.join(full_train_path, arg.intent_file),
                    in_vocab,
                    slot_vocab,
                    intent_vocab,
                    shuffle=True,
                    use_bert=arg.use_bert)
            else:
                data_processor = DataProcessor(
                    os.path.join(full_train_path, arg.input_file),
                    os.path.join(full_train_path, arg.slot_file),
                    os.path.join(full_train_path, arg.intent_file),
                    in_vocab,
                    slot_vocab,
                    intent_vocab,
                    shuffle=True,
                    use_bert=arg.use_bert)
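            # UNKer is project-specific; from the arguments above it is
            # assumed to write a copy of the training file in which
            # low-frequency tokens (below `threshold`) are replaced by an UNK
            # symbol at the given `ratio`, with `priority` deciding which
            # tokens are replaced first.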
Example #19
            def valid(in_path, slot_path, intent_path):
                data_processor_valid = DataProcessor(in_path, slot_path, intent_path, in_vocab, slot_vocab, intent_vocab)

                pred_intents = []
                correct_intents = []
                slot_outputs = []
                correct_slots = []
                input_words = []

                #used to gate
                gate_seq = []
                while True:
                    in_data, slot_data, slot_weight, length, intents, in_seq, slot_seq, intent_seq = data_processor_valid.get_batch(arg.batch_size)
                    feed_dict = {input_data.name: in_data, sequence_length.name: length}
                    ret = sess.run(inference_outputs, feed_dict)
                    for i in ret[0]:
                        pred_intents.append(np.argmax(i))
                    for i in intents:
                        correct_intents.append(i)

                    pred_slots = ret[1].reshape((slot_data.shape[0], slot_data.shape[1], -1))
                    for p, t, i, l in zip(pred_slots, slot_data, in_data, length):
                        p = np.argmax(p, 1)
                        tmp_pred = []
                        tmp_correct = []
                        tmp_input = []
                        for j in range(l):
                            tmp_pred.append(slot_vocab['rev'][p[j]])
                            tmp_correct.append(slot_vocab['rev'][t[j]])
                            tmp_input.append(in_vocab['rev'][i[j]])

                        slot_outputs.append(tmp_pred)
                        correct_slots.append(tmp_correct)
                        input_words.append(tmp_input)

                    if data_processor_valid.end == 1:
                        break

                pred_intents = np.array(pred_intents)
                correct_intents = np.array(correct_intents)
                accuracy = (pred_intents == correct_intents)
                semantic_error = accuracy
                accuracy = accuracy.astype(float)
                accuracy = np.mean(accuracy) * 100.0

                index = 0
                for t, p in zip(correct_slots, slot_outputs):
                    # Process Semantic Error
                    if len(t) != len(p):
                        raise ValueError('Slot prediction and reference differ in length')

                    for j in range(len(t)):
                        if p[j] != t[j]:
                            semantic_error[index] = False
                            break
                    index += 1
                semantic_error = semantic_error.astype(float)
                semantic_error = np.mean(semantic_error) * 100.0

                f1, precision, recall = computeF1Score(correct_slots, slot_outputs)
                logging.info('slot f1: ' + str(f1))
                logging.info('intent accuracy: ' + str(accuracy))
                logging.info('semantic error(intent, slots are all correct): ' + str(semantic_error))

                data_processor_valid.close()
                return f1, accuracy, semantic_error, pred_intents, correct_intents, slot_outputs, correct_slots, input_words, gate_seq
Example #20
data_loader.correlation_numeric_col(df, corr_method="spearman")

cols_list = ["TotalPerformanceSnapshot", "BrskThickness1", 
             "BrskThickness2", "BrskThickness3", "BrskThickness4"]
feature_selector = FeatureSelect(data = df[cols_list])

feature_selector.corr_standarised_num_col(corr_method="pearson")
feature_selector.heat_map()


# wide dataframe to long
# melt
import pandas as pd
df_changedate = pd.melt(df, 
                        id_vars=['PostID'], 
                        value_vars=['BrskChangeDate1', 'BrskChangeDate2', 'BrskChangeDate3', 'BrskChangeDate4'],
                        var_name='Brakes', value_name='ChangeDate')

# merge on the PostID column (DataFrame.join would align on the index here)
df_long = df.merge(df_changedate, on='PostID', how='right')
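# Toy illustration of the wide-to-long reshape above (column names assumed to
# match the real data):
import pandas as pd

toy = pd.DataFrame({'PostID': [1],
                    'BrskChangeDate1': ['2020-01-01'],
                    'BrskChangeDate2': ['2020-06-01']})
print(pd.melt(toy, id_vars=['PostID'],
              value_vars=['BrskChangeDate1', 'BrskChangeDate2'],
              var_name='Brakes', value_name='ChangeDate'))
#    PostID           Brakes  ChangeDate
# 0       1  BrskChangeDate1  2020-01-01
# 1       1  BrskChangeDate2  2020-06-01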

dat_prep = DataProcessor(df)
dat_prep.eda()

one_hot_enc = DataProcessor(df)
one_hotify_these_categorical = ["VehicleOperatorName", "Littera"]
one_hot_enc.one_hot_encoding(one_hotify_these_categorical)




Example #21
    def __init__(self, data_file, seperator=','):

        data_processor = DataProcessor(data_file,
                                       seperator=seperator,
                                       raw_data=True)
        self.data, self.labels = data_processor.get_training_data(
            raw_text=True)

        self.X_train = self.data
        self.y_train = self.labels

        test_data, test_labels = data_processor.process_test_file(
            '../data/imdb/test.csv', contains_label=True, header=0)

        print('Running Naive Bayes...')
        pipeline, parameters = self.get_naive_bayes_model()
        grid_search_tune = GridSearchCV(pipeline,
                                        parameters,
                                        cv=2,
                                        n_jobs=4,
                                        verbose=10)
        grid_search_tune.fit(self.X_train, self.y_train)
        print("Best parameters set:")
        self.best_estimator_ = grid_search_tune.best_estimator_
        print(grid_search_tune.best_score_)
        self.calculate_metric(test_data, test_labels)
        print('#' * 80)

        print('Running Linear SVM...')
        pipeline, parameters = self.get_linear_svm_model()
        grid_search_tune = GridSearchCV(pipeline,
                                        parameters,
                                        cv=2,
                                        n_jobs=4,
                                        verbose=10)
        grid_search_tune.fit(self.X_train, self.y_train)
        print("Best parameters set:")
        self.best_estimator_ = grid_search_tune.best_estimator_
        print(grid_search_tune.best_score_)
        self.calculate_metric(test_data, test_labels)
        print('#' * 80)

        print('Running Non Linear SVM...')
        pipeline, parameters = self.get_non_linear_svm_model()
        grid_search_tune = GridSearchCV(pipeline,
                                        parameters,
                                        cv=2,
                                        n_jobs=4,
                                        verbose=10)
        grid_search_tune.fit(self.X_train, self.y_train)
        print("Best parameters set:")
        self.best_estimator_ = grid_search_tune.best_estimator_
        print(grid_search_tune.best_score_)
        self.calculate_metric(test_data, test_labels)
        print('#' * 80)

        print('Running Naive Bayes SVM...')
        pipeline, parameters = self.get_nbsvm_model()
        grid_search_tune = GridSearchCV(pipeline,
                                        parameters,
                                        cv=2,
                                        n_jobs=6,
                                        verbose=10)
        grid_search_tune.fit(self.X_train, self.y_train)
        print("Best parameters set:")
        self.best_estimator_ = grid_search_tune.best_estimator_
        print(grid_search_tune.best_score_)
        self.calculate_metric(test_data, test_labels)
        print('#' * 80)
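    # get_naive_bayes_model() and its siblings are assumed to return a
    # (pipeline, parameter-grid) pair that GridSearchCV can consume. A minimal
    # sketch of the naive Bayes variant (the grid values are illustrative, not
    # the author's settings):
    def get_naive_bayes_model_sketch(self):
        from sklearn.pipeline import Pipeline
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.naive_bayes import MultinomialNB

        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('clf', MultinomialNB()),
        ])
        parameters = {
            'tfidf__max_df': (0.5, 0.75, 1.0),
            'tfidf__ngram_range': ((1, 1), (1, 2)),
            'clf__alpha': (0.01, 0.1, 1.0),
        }
        return pipeline, parameters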
Example #22
    no_improve = 0

    # variables to store highest values among epochs, only use 'valid_err' for now
    valid_slot = 0
    test_slot = 0
    valid_intent = 0
    test_intent = 0
    valid_err = 0
    test_err = 0

    while True:
        if data_processor is None:
            data_processor = DataProcessor(
                os.path.join(full_train_path, arg.input_file),
                os.path.join(full_train_path, arg.slot_file),
                os.path.join(full_train_path, arg.intent_file),
                in_vocab,
                slot_vocab,
                intent_vocab,
            )
        (
            in_data,
            slot_data,
            slot_weight,
            length,
            intents,
            _,
            _,
            _,
        ) = data_processor.get_batch(arg.batch_size)
        feed_dict = {
            input_data.name: in_data,
Example #23
import config

from api.BinanceApi import BinanceApi
from utils import DataProcessor, DataDownloader

binance_api = BinanceApi(api_type="future")

# symbol = "BNBUSDT"
symbol = "DOGEUSDT"

response = binance_api.send_public_request('/fapi/v1/trades',
                                           payload={
                                               'symbol': symbol,
                                               "limit": 1000
                                           })
df = DataProcessor.convert_trade_response_to_dataframe(response)
print(df)

DataDownloader.download_trade_id('/fapi/v1/historicalTrades',
                                 symbol=symbol,
                                 last_id=363853000,
                                 n_trade=10000000,
                                 delay_time=2.0)

# df = DataDownloader.download_with_number_trade('/fapi/v1/historicalTrades', symbol=symbol, last_id=174000000, n_trade=100000, delay_time=2)
# df.to_csv(f"{config.trade_logs_binance_data_dir}{symbol}.csv")

# DataDownloader.download_period('/api/v3/historicalTrades', symbol=symbol, id=224114000)
# DataDownloader.download_period('/fapi/v1/historicalTrades', symbol=symbol, last_id=174000000, until_date=datetime(2021, 4, 1), delay_time=1.2)
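# DataDownloader's internals are not shown; a rough sketch of the paging logic
# download_trade_id is assumed to implement: walk forward from last_id in
# 1000-trade pages (the endpoint's fromId/limit parameters), sleeping between
# requests to stay under the rate limit.
import time

def download_trades_sketch(api, path, symbol, last_id, n_trade, delay_time):
    trades = []
    while len(trades) < n_trade:
        page = api.send_public_request(path, payload={
            'symbol': symbol, 'fromId': last_id, 'limit': 1000})
        if not page:
            break
        trades.extend(page)
        last_id = page[-1]['id'] + 1  # resume after the newest trade seen
        time.sleep(delay_time)
    return trades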
Example #24
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--bert_model",
        default='bert-base-uncased',
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument(
        '--task',
        type=str,
        default=None,
        required=True,
        help="Task code in {hotpot_open, hotpot_distractor, squad, nq, ambigqa}"
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=378,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=1,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=5,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam. (def: 5e-5)")
    parser.add_argument("--num_train_epochs",
                        default=5.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )

    # RNN graph retriever-specific parameters
    parser.add_argument("--example_limit", default=None, type=int)

    parser.add_argument("--max_para_num", default=10, type=int)
    parser.add_argument(
        "--neg_chunk",
        default=8,
        type=int,
        help=
        "The chunk size of negative examples during training (to reduce GPU memory consumption with negative sampling)"
    )
    parser.add_argument(
        "--eval_chunk",
        default=100000,
        type=int,
        help=
        "The chunk size of evaluation examples (to reduce RAM consumption during evaluation)"
    )
    parser.add_argument(
        "--split_chunk",
        default=300,
        type=int,
        help=
        "The chunk size of BERT encoding during inference (to reduce GPU memory consumption)"
    )

    parser.add_argument('--train_file_path',
                        type=str,
                        default=None,
                        help="File path to the training data")
    parser.add_argument('--dev_file_path',
                        type=str,
                        default=None,
                        help="File path to the eval data")

    parser.add_argument('--beam', type=int, default=1, help="Beam size")
    parser.add_argument('--min_select_num',
                        type=int,
                        default=1,
                        help="Minimum number of selected paragraphs")
    parser.add_argument('--max_select_num',
                        type=int,
                        default=3,
                        help="Maximum number of selected paragraphs")
    parser.add_argument(
        "--use_redundant",
        action='store_true',
        help="Whether to use simulated seqs (only for training)")
    parser.add_argument(
        "--use_multiple_redundant",
        action='store_true',
        help="Whether to use multiple simulated seqs (only for training)")
    parser.add_argument(
        '--max_redundant_num',
        type=int,
        default=100000,
        help=
        "The maximum number of simulated redundant sequences to use (only for training)"
    )
    parser.add_argument(
        "--no_links",
        action='store_true',
        help=
        "Whether to omit any links (or in other words, only use TF-IDF-based paragraphs)"
    )
    parser.add_argument("--pruning_by_links",
                        action='store_true',
                        help="Whether to do pruning by links (and top 1)")
    parser.add_argument(
        "--expand_links",
        action='store_true',
        help=
        "Whether to expand links with paragraphs in the same article (for NQ)")
    parser.add_argument(
        '--tfidf_limit',
        type=int,
        default=None,
        help=
        "Whether to limit the number of the initial TF-IDF pool (only for open-domain eval)"
    )

    parser.add_argument("--pred_file",
                        default=None,
                        type=str,
                        help="File name to write paragraph selection results")
    parser.add_argument("--tagme",
                        action='store_true',
                        help="Whether to use tagme at inference")
    parser.add_argument(
        '--topk',
        type=int,
        default=2,
        help="How many paragraphs to use from the previous steps")

    parser.add_argument(
        "--model_suffix",
        default=None,
        type=str,
        help="Suffix to load a model file ('pytorch_model_' + suffix +'.bin')")

    parser.add_argument("--db_save_path",
                        default=None,
                        type=str,
                        help="File path to DB")

    args = parser.parse_args()

    cpu = torch.device('cpu')

    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if args.train_file_path is not None:
        do_train = True

        if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
            raise ValueError(
                "Output directory ({}) already exists and is not empty.".
                format(args.output_dir))
        os.makedirs(args.output_dir, exist_ok=True)

    elif args.dev_file_path is not None:
        do_train = False

    else:
        raise ValueError(
            'One of train_file_path: {} or dev_file_path: {} must be non-None'.
            format(args.train_file_path, args.dev_file_path))

    processor = DataProcessor()

    # Configurations of the graph retriever
    graph_retriever_config = GraphRetrieverConfig(
        example_limit=args.example_limit,
        task=args.task,
        max_seq_length=args.max_seq_length,
        max_select_num=args.max_select_num,
        max_para_num=args.max_para_num,
        tfidf_limit=args.tfidf_limit,
        train_file_path=args.train_file_path,
        use_redundant=args.use_redundant,
        use_multiple_redundant=args.use_multiple_redundant,
        max_redundant_num=args.max_redundant_num,
        dev_file_path=args.dev_file_path,
        beam=args.beam,
        min_select_num=args.min_select_num,
        no_links=args.no_links,
        pruning_by_links=args.pruning_by_links,
        expand_links=args.expand_links,
        eval_chunk=args.eval_chunk,
        tagme=args.tagme,
        topk=args.topk,
        db_save_path=args.db_save_path)

    logger.info(graph_retriever_config)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    ##############################
    # Training                   #
    ##############################
    if do_train:
        model = BertForGraphRetriever.from_pretrained(
            args.bert_model,
            cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
            'distributed_{}'.format(-1),
            graph_retriever_config=graph_retriever_config)

        model.to(device)

        if n_gpu > 1:
            print("Parallel Training.")
            model = torch.nn.DataParallel(model)

        global_step = 0
        nb_tr_steps = 0
        tr_loss = 0

        POSITIVE = 1.0
        NEGATIVE = 0.0

        # Load training examples
        train_examples = None
        num_train_steps = None
        train_examples = processor.get_train_examples(graph_retriever_config)
        train_features = convert_examples_to_features(train_examples,
                                                      args.max_seq_length,
                                                      args.max_para_num,
                                                      graph_retriever_config,
                                                      tokenizer,
                                                      train=True)
        # len(train_examples) and len(train_features) can be different, depending on the redundant setting
        num_train_steps = int(
            len(train_features) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        t_total = num_train_steps

        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total,
                             max_grad_norm=1.0)
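        # warmup_linear (assumed here to be the schedule helper from
        # pytorch-pretrained-bert) is linear warmup followed by linear decay,
        # roughly:
        #   warmup_linear(x, warmup) = x / warmup  if x < warmup, else 1.0 - x
        # The training loop below multiplies it into args.learning_rate by
        # hand at each optimizer step.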

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        model.train()
        epc = 0
        for _ in range(int(args.num_train_epochs)):
            logger.info('Epoch ' + str(epc + 1))

            TOTAL_NUM = len(train_features)
            train_start_index = 0
            CHUNK_NUM = 4  # this doesn't matter for performance
            train_chunk = TOTAL_NUM // CHUNK_NUM
            chunk_index = 0

            random.shuffle(train_features)

            save_retry = False

            while train_start_index < TOTAL_NUM:
                train_end_index = min(train_start_index + train_chunk - 1,
                                      TOTAL_NUM - 1)
                chunk_len = train_end_index - train_start_index + 1

                train_features_ = train_features[
                    train_start_index:train_start_index + chunk_len]

                all_input_ids = torch.tensor(
                    [f.input_ids for f in train_features_], dtype=torch.long)
                all_input_masks = torch.tensor(
                    [f.input_masks for f in train_features_], dtype=torch.long)
                all_segment_ids = torch.tensor(
                    [f.segment_ids for f in train_features_], dtype=torch.long)
                all_output_masks = torch.tensor(
                    [f.output_masks for f in train_features_],
                    dtype=torch.float)
                all_num_paragraphs = torch.tensor(
                    [f.num_paragraphs for f in train_features_],
                    dtype=torch.long)
                all_num_steps = torch.tensor(
                    [f.num_steps for f in train_features_], dtype=torch.long)
                train_data = TensorDataset(all_input_ids, all_input_masks,
                                           all_segment_ids, all_output_masks,
                                           all_num_paragraphs, all_num_steps)

                train_sampler = RandomSampler(train_data)
                train_dataloader = DataLoader(train_data,
                                              sampler=train_sampler,
                                              batch_size=args.train_batch_size)

                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                logger.info('Examples from ' + str(train_start_index) +
                            ' to ' + str(train_end_index))
                for step, batch in enumerate(
                        tqdm(train_dataloader, desc="Iteration")):
                    input_masks = batch[1]
                    batch_max_len = input_masks.sum(dim=2).max().item()

                    num_paragraphs = batch[4]
                    batch_max_para_num = num_paragraphs.max().item()

                    num_steps = batch[5]
                    batch_max_steps = num_steps.max().item()

                    output_masks_cpu = (
                        batch[3])[:, :batch_max_steps, :batch_max_para_num + 1]

                    batch = tuple(t.to(device) for t in batch)
                    input_ids, input_masks, segment_ids, output_masks, _, __ = batch
                    B = input_ids.size(0)

                    input_ids = input_ids[:, :batch_max_para_num, :
                                          batch_max_len]
                    input_masks = input_masks[:, :batch_max_para_num, :
                                              batch_max_len]
                    segment_ids = segment_ids[:, :batch_max_para_num, :
                                              batch_max_len]
                    output_masks = output_masks[:, :batch_max_steps, :
                                                batch_max_para_num +
                                                1]  # 1 for EOE

                    target = torch.FloatTensor(output_masks.size()).fill_(
                        NEGATIVE)  # (B, NUM_STEPS, |P|+1) <- 1 for EOE
                    for i in range(B):
                        output_masks[i, :num_steps[i], -1] = 1.0  # for EOE

                        for j in range(num_steps[i].item() - 1):
                            target[i, j, j].fill_(
                                POSITIVE
                            )  # positive paragraphs are stored in order of the right path

                        target[i, num_steps[i] - 1, -1].fill_(POSITIVE)  # EOE
                    target = target.to(device)

                    neg_start = batch_max_steps - 1
                    while neg_start < batch_max_para_num:
                        neg_end = min(neg_start + args.neg_chunk - 1,
                                      batch_max_para_num - 1)
                        neg_len = (neg_end - neg_start + 1)

                        input_ids_ = torch.cat(
                            (input_ids[:, :batch_max_steps - 1, :],
                             input_ids[:, neg_start:neg_start + neg_len, :]),
                            dim=1)
                        input_masks_ = torch.cat(
                            (input_masks[:, :batch_max_steps - 1, :],
                             input_masks[:, neg_start:neg_start + neg_len, :]),
                            dim=1)
                        segment_ids_ = torch.cat(
                            (segment_ids[:, :batch_max_steps - 1, :],
                             segment_ids[:, neg_start:neg_start + neg_len, :]),
                            dim=1)
                        output_masks_ = torch.cat(
                            (output_masks[:, :, :batch_max_steps - 1],
                             output_masks[:, :, neg_start:neg_start + neg_len],
                             output_masks[:, :, batch_max_para_num:
                                          batch_max_para_num + 1]),
                            dim=2)
                        target_ = torch.cat(
                            (target[:, :, :batch_max_steps - 1],
                             target[:, :, neg_start:neg_start + neg_len],
                             target[:, :,
                                    batch_max_para_num:batch_max_para_num +
                                    1]),
                            dim=2)

                        if neg_start != batch_max_steps - 1:
                            output_masks_[:, :, :batch_max_steps - 1] = 0.0
                            output_masks_[:, :, -1] = 0.0

                        loss = model(input_ids_, segment_ids_, input_masks_,
                                     output_masks_, target_, batch_max_steps)

                        if n_gpu > 1:
                            loss = loss.mean(
                            )  # mean() to average on multi-gpu.
                        if args.gradient_accumulation_steps > 1:
                            loss = loss / args.gradient_accumulation_steps

                        loss.backward()
                        tr_loss += loss.item()
                        neg_start = neg_end + 1

                    nb_tr_examples += B
                    nb_tr_steps += 1
                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        # modify learning rate with special warm up BERT uses
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / t_total, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                        optimizer.step()
                        optimizer.zero_grad()
                        global_step += 1

                chunk_index += 1
                train_start_index = train_end_index + 1

                # Save the model at the half of the epoch
                if (chunk_index == CHUNK_NUM // 2 or save_retry):
                    status = save(model, args.output_dir, str(epc + 0.5))
                    save_retry = (not status)

                del all_input_ids
                del all_input_masks
                del all_segment_ids
                del all_output_masks
                del all_num_paragraphs
                del all_num_steps
                del train_data

            # Save the model at the end of the epoch
            save(model, args.output_dir, str(epc + 1))

            epc += 1

    if do_train:
        return

    ##############################
    # Evaluation                 #
    ##############################
    assert args.model_suffix is not None

    if graph_retriever_config.db_save_path is not None:
        import sys
        sys.path.append('../')
        from pipeline.tfidf_retriever import TfidfRetriever
        tfidf_retriever = TfidfRetriever(graph_retriever_config.db_save_path,
                                         None)
    else:
        tfidf_retriever = None

    model_state_dict = load(args.output_dir, args.model_suffix)

    model = BertForGraphRetriever.from_pretrained(
        args.bert_model,
        state_dict=model_state_dict,
        graph_retriever_config=graph_retriever_config)
    model.to(device)

    model.eval()

    if args.pred_file is not None:
        pred_output = []

    eval_examples = processor.get_dev_examples(graph_retriever_config)

    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_examples))
    logger.info("  Batch size = %d", args.eval_batch_size)

    TOTAL_NUM = len(eval_examples)
    eval_start_index = 0

    while eval_start_index < TOTAL_NUM:
        eval_end_index = min(
            eval_start_index + graph_retriever_config.eval_chunk - 1,
            TOTAL_NUM - 1)
        chunk_len = eval_end_index - eval_start_index + 1

        eval_features = convert_examples_to_features(
            eval_examples[eval_start_index:eval_start_index + chunk_len],
            args.max_seq_length, args.max_para_num, graph_retriever_config,
            tokenizer)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_masks = torch.tensor([f.input_masks for f in eval_features],
                                       dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_output_masks = torch.tensor(
            [f.output_masks for f in eval_features], dtype=torch.float)
        all_num_paragraphs = torch.tensor(
            [f.num_paragraphs for f in eval_features], dtype=torch.long)
        all_num_steps = torch.tensor([f.num_steps for f in eval_features],
                                     dtype=torch.long)
        all_ex_indices = torch.tensor([f.ex_index for f in eval_features],
                                      dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_masks,
                                  all_segment_ids, all_output_masks,
                                  all_num_paragraphs, all_num_steps,
                                  all_ex_indices)

        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        for input_ids, input_masks, segment_ids, output_masks, num_paragraphs, num_steps, ex_indices in tqdm(
                eval_dataloader, desc="Evaluating"):
            batch_max_len = input_masks.sum(dim=2).max().item()
            batch_max_para_num = num_paragraphs.max().item()

            batch_max_steps = num_steps.max().item()

            input_ids = input_ids[:, :batch_max_para_num, :batch_max_len]
            input_masks = input_masks[:, :batch_max_para_num, :batch_max_len]
            segment_ids = segment_ids[:, :batch_max_para_num, :batch_max_len]
            output_masks = output_masks[:, :batch_max_para_num +
                                        2, :batch_max_para_num + 1]
            output_masks[:, 1:, -1] = 1.0  # Ignore EOE in the first step

            input_ids = input_ids.to(device)
            input_masks = input_masks.to(device)
            segment_ids = segment_ids.to(device)
            output_masks = output_masks.to(device)

            examples = [
                eval_examples[eval_start_index + ex_indices[i].item()]
                for i in range(input_ids.size(0))
            ]

            with torch.no_grad():
                pred, prob, topk_pred, topk_prob = model.beam_search(
                    input_ids,
                    segment_ids,
                    input_masks,
                    examples=examples,
                    tokenizer=tokenizer,
                    retriever=tfidf_retriever,
                    split_chunk=args.split_chunk)

            for i in range(len(pred)):
                e = examples[i]
                titles = [e.title_order[p] for p in pred[i]]

                # Output predictions to a file
                if args.pred_file is not None:
                    pred_output.append({})
                    pred_output[-1]['q_id'] = e.guid

                    pred_output[-1]['titles'] = titles
                    pred_output[-1]['probs'] = []
                    for prob_ in prob[i]:
                        entry = {'EOE': prob_[-1]}
                        for j in range(len(e.title_order)):
                            entry[e.title_order[j]] = prob_[j]
                        pred_output[-1]['probs'].append(entry)

                    topk_titles = [[e.title_order[p] for p in topk_pred[i][j]]
                                   for j in range(len(topk_pred[i]))]
                    pred_output[-1]['topk_titles'] = topk_titles

                    topk_probs = []
                    for k in range(len(topk_prob[i])):
                        topk_probs.append([])
                        for prob_ in topk_prob[i][k]:
                            entry = {'EOE': prob_[-1]}
                            for j in range(len(e.title_order)):
                                entry[e.title_order[j]] = prob_[j]
                            topk_probs[-1].append(entry)
                    pred_output[-1]['topk_probs'] = topk_probs

                    # Output the selected paragraphs
                    context = {}
                    for ts in topk_titles:
                        for t in ts:
                            context[t] = e.all_paras[t]
                    pred_output[-1]['context'] = context

        eval_start_index = eval_end_index + 1

        del eval_features
        del all_input_ids
        del all_input_masks
        del all_segment_ids
        del all_output_masks
        del all_num_paragraphs
        del all_num_steps
        del all_ex_indices
        del eval_data

    if args.pred_file is not None:
        with open(args.pred_file, 'w') as f:
            json.dump(pred_output, f)
Example #25
    #     'num_steps': 300,
    #     'embed_size': 300,
    #     'hidden_size':300,
    #     'keep_prob': 0.5,
    #     'batch_size': 32,
    #     'num_classes': 2,
    #     'vocab_size': 40000,
    #     'combine_mode': 'last',
    #     'weight_decay': 1e-8,
    #     'save_path': 'checkpoint/imdb/'
    # }



    if len(sys.argv) == 2 and sys.argv[1] == 'std':
        x_train, x_test, y_train, y_test = imdb_for_library(seq_len=config['num_steps'], max_features=config['vocab_size'])
    else:
        data_path = 'data/imdb/train.csv'
        data_processor = DataProcessor(data_path, vocab_size=config['vocab_size'],
                                       seperator=',', max_seq_len=config['num_steps'],
                                       header=0, reverse=True)
        x_train, y_train = data_processor.get_training_data()
        x_test, y_test = data_processor.process_test_file('data/imdb/test.csv', contains_label=True, header=0)
        # embedding = data_processor.get_embedding(config['embed_size'])
        # print('Embedding Shape', embedding.shape)

    trainer = Trainer(config, x_train, y_train, embedding=None)
    trainer.train()
    trainer.load_best_model()
    pred = trainer.predict(x_test)
    print(classification_report(y_true=y_test, y_pred=pred))
Example #26
from trainer import train_model, save_png

import warnings
warnings.filterwarnings('ignore')

# ===============
# Settings
# ===============
parser = argparse.ArgumentParser()
parser.add_argument('--common', default='../configs/common/default.yml')
parser.add_argument('--notify', default='../configs/common/notify.yml')
parser.add_argument('-m', '--model')
parser.add_argument('-c', '--comment')
options = parser.parse_args()

dp = DataProcessor()
config = dp.load(options.common)
config.update(dp.load(f'../configs/exp/{options.model}.yml'))
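# dict-style update: keys from the per-experiment YAML override the common
# defaults wherever they collide, e.g. (illustrative values):
#   common yml -> {'seed': 42, 'epochs': 10}
#   exp yml    -> {'epochs': 30}
#   merged     -> {'seed': 42, 'epochs': 30}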

# ===============
# Constants
# ===============
comment = options.comment
now = datetime.datetime.now()
model_name = options.model
run_name = f'{model_name}_{now:%Y%m%d%H%M%S}'

compe_params = config.compe
data_params = config.data
train_params = config.train_params
setting_params = config.settings
Example #27
            def valid(in_path, slot_path, intent_path):
                data_processor_valid = DataProcessor(in_path, slot_path,
                                                     intent_path, in_vocab,
                                                     slot_vocab, intent_vocab)

                pred_intents = []
                correct_intents = []
                slot_outputs = []
                correct_slots = []
                input_words = []

                # used to gate
                gate_seq = []
                while True:
                    (
                        in_data,
                        slot_data,
                        slot_weight,
                        length,
                        intents,
                        in_seq,
                        slot_seq,
                        intent_seq,
                    ) = data_processor_valid.get_batch(arg.batch_size)
                    feed_dict = {
                        input_data.name: in_data,
                        sequence_length.name: length
                    }
                    ret = sess.run(inference_outputs, feed_dict)
                    for i in ret[0]:
                        pred_intents.append(np.argmax(i))
                    for i in intents:
                        correct_intents.append(i)

                    pred_slots = ret[1].reshape(
                        (slot_data.shape[0], slot_data.shape[1], -1))
                    for p, t, i, l in zip(pred_slots, slot_data, in_data,
                                          length):
                        p = np.argmax(p, 1)
                        tmp_pred = []
                        tmp_correct = []
                        tmp_input = []
                        for j in range(l):
                            tmp_pred.append(slot_vocab["rev"][p[j]])
                            tmp_correct.append(slot_vocab["rev"][t[j]])
                            tmp_input.append(in_vocab["rev"][i[j]])

                        slot_outputs.append(tmp_pred)
                        correct_slots.append(tmp_correct)
                        input_words.append(tmp_input)

                    if data_processor_valid.end == 1:
                        break

                pred_intents = np.array(pred_intents)
                correct_intents = np.array(correct_intents)
                accuracy = pred_intents == correct_intents
                semantic_error = accuracy
                accuracy = accuracy.astype(float)
                accuracy = np.mean(accuracy) * 100.0

                index = 0
                for t, p in zip(correct_slots, slot_outputs):
                    # Process Semantic Error
                    if len(t) != len(p):
                        raise ValueError("Slot prediction and reference differ in length")

                    for j in range(len(t)):
                        if p[j] != t[j]:
                            semantic_error[index] = False
                            break
                    index += 1
                semantic_error = semantic_error.astype(float)
                semantic_error = np.mean(semantic_error) * 100.0

                f1, precision, recall = computeF1Score(correct_slots,
                                                       slot_outputs)
                logging.info("slot f1: " + str(f1))
                logging.info("intent accuracy: " + str(accuracy))
                logging.info(
                    "semantic error(intent, slots are all correct): " +
                    str(semantic_error))

                data_processor_valid.close()
                return (
                    f1,
                    accuracy,
                    semantic_error,
                    pred_intents,
                    correct_intents,
                    slot_outputs,
                    correct_slots,
                    input_words,
                    gate_seq,
                )
Example #28
File: train.py  Project: hrlinlp/JAVE-1
    "/txts.embedded.npy",  # text encoded by pre-trained bert, shape=[N, max_len_of_word_seqs, dim_of_bert_output]
    paths["embedded"] +
    "/txts.embeddedG.npy",  # vectors of [CLS] encoded by a pre-trained bert, shape=[N, dim_of_bert_output]
    paths["embedded"] +
    "/cids_of_imgs",  # indexes used to look up the encoded image vectors
    paths["embedded"] +
    "/imgs.embedded.npy",  # image encoded by pre-trained resnet, shape=[N, image_region_num, dim_of_resnet_output]
    paths["embedded"] +
    "/imgs.embeddedG.npy"  # image encoded by pre-trained resnet, shape=[N, dim_of_resnet_output]
)

# data_processor: utils for data processing(load data, get batch samples, etc.)
data_processor_train = DataProcessor(paths["train_data"] + "/indexs",
                                     paths["train_data"] + "/input.seq",
                                     paths["train_data"] + "/output.seq",
                                     paths["train_data"] + "/output.label",
                                     w2i_word,
                                     w2i_bio,
                                     w2i_label,
                                     shuffling=True)

data_processor_valid = DataProcessor(paths["valid_data"] + "/indexs",
                                     paths["valid_data"] + "/input.seq",
                                     paths["valid_data"] + "/output.seq",
                                     paths["valid_data"] + "/output.label",
                                     w2i_word,
                                     w2i_bio,
                                     w2i_label,
                                     shuffling=False)

data_processor_test = DataProcessor(paths["test_data"] + "/indexs",
                                    paths["test_data"] + "/input.seq",
Example #29
chlr.setFormatter(formatter)
fhlr = logging.FileHandler(log_file_path)
fhlr.setFormatter(formatter)
logger.addHandler(chlr)
logger.addHandler(fhlr)

logger.info("loading vocab...")

w2i_char, i2w_char = load_vocabulary("./data/vocab_char.txt")
w2i_bio, i2w_bio = load_vocabulary("./data/vocab_bioattr.txt")

logger.info("loading data...")

data_processor_train = DataProcessor("./data/train/input.seq.char",
                                     "./data/train/output.seq.bioattr",
                                     w2i_char,
                                     w2i_bio,
                                     shuffling=True)

data_processor_valid = DataProcessor("./data/test/input.seq.char",
                                     "./data/test/output.seq.bioattr",
                                     w2i_char,
                                     w2i_bio,
                                     shuffling=True)

logger.info("building model...")

model = MyModel(embedding_dim=300,
                hidden_dim=300,
                vocab_size_char=len(w2i_char),
                vocab_size_bio=len(w2i_bio),
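The load_vocabulary calls above return paired word-to-index and index-to-word maps. A minimal sketch of such a helper, assuming one token per line in the vocab file (the repository's actual implementation is not shown here):

def load_vocabulary(path):
    # One token per line; blank lines are skipped. Returns (w2i, i2w).
    with open(path, encoding="utf-8") as f:
        tokens = [line.rstrip("\n") for line in f if line.strip()]
    w2i = {w: i for i, w in enumerate(tokens)}
    i2w = dict(enumerate(tokens))
    return w2i, i2w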
Example #30
File: predict.py  Project: sunnyhuma171/mae
    "image_vector": "./data/image_fc_vectors.npy"
}

use_image = False

print("load data...")

w2i_word, i2w_word = load_vocabulary(paths["vocab_word"])
w2i_attr, i2w_attr = load_vocabulary(paths["vocab_attr"])
w2i_value, i2w_value = load_vocabulary(paths["vocab_value"])

data_processor = DataProcessor(
    paths["test_data"] + "/input.seq",
    paths["test_data"] + "/input.imageindex",
    paths["test_data"] + "/input.attr",
    paths["test_data"] + "/output.value",
    w2i_word,
    w2i_attr,
    w2i_value,
    shuffling=False
)

if use_image:
    image_vector_container = VectorContainer(paths["image_vector"], img_embedding_size)
else:
    image_vector_container = VectorContainer_ZERO(img_embedding_size)

print("loading checkpoint from", paths["ckpt"], "...")
    
tf_config = tf.ConfigProto(allow_soft_placement=True)
tf_config.gpu_options.allow_growth = True
sess = tf.Session(config=tf_config)
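With the session configured, the checkpoint still has to be restored into it. A hedged continuation using the standard TF1 Saver API; the original predict.py's exact restore code is not shown:

# Restore model weights; tf.train.Saver / latest_checkpoint are standard
# TF1 APIs, and paths["ckpt"] follows the snippet above.
saver = tf.train.Saver()
ckpt = tf.train.latest_checkpoint(paths["ckpt"])
saver.restore(sess, ckpt)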
Example #31
from utils.uiUtils import yesNoPrompt
from utils import uiUtils
# Custom dataset processor
from utils import DataProcessor
import json
import cv2
import numpy as np

# Prompt the user to select a dataset
print("Which dataset would you like to visualize?")
dataset = uiUtils.listOptionsPrompt(['train', 'validate', 'test', 'exit'])

while dataset != 'exit':
    pp = DataProcessor.DataPreProcessor(args.data_directory,
                                        1,
                                        dataset,
                                        args,
                                        debug=True)
    for i, (inputs, labels, meta) in enumerate(pp):
        title = 'Image #' + str(i)
        title += ', ' + str(meta[0]).replace(args.data_directory, '')
        print(title)

        print('LABELS')
        print(labels[0])
        fgDisplay = cv2.resize(np.reshape(inputs['input_4'], (25, 25)),
                               (224, 224))
        fgDisplay = np.stack((fgDisplay, fgDisplay, fgDisplay), axis=2)

        # Place 3 images side by side for display
        output = np.concatenate((inputs['input_3'][0], fgDisplay,
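The truncated line above is assembling a three-panel preview image. A hedged sketch of the display step that typically follows, using standard OpenCV calls; output and title follow the snippet, and the original's exact display code is cut off:

panel = np.clip(output, 0, 255).astype(np.uint8)  # OpenCV expects uint8
cv2.imshow(title, panel)
cv2.waitKey(0)           # block until a key press before the next sample
cv2.destroyAllWindows()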
        'num_layers': 1,
        'num_steps': 50,
        'embed_size': 300,
        'hidden_size': 600,
        'keep_prob': 0.7,
        'batch_size': 16,
        'num_classes': 2,
        'vocab_size': 40000,
        'combine_mode': 'last',
        'weight_decay': 3e-6,
        'save_path': 'checkpoint/cove/'
    }

    data_path = 'data/imdb/train.csv'
    data_processor = DataProcessor(data_path,
                                   vocab_size=config['vocab_size'],
                                   seperator=',',
                                   max_seq_len=config['num_steps'],
                                   header=0,
                                   reverse=True)
    data, labels = data_processor.get_training_data()
    print('Train Data Shape', data.shape, labels.shape)
    embedding = data_processor.get_embedding(config['embed_size'])
    print('Embedding Shape', embedding.shape)
    test_data, test_labels = data_processor.process_test_file(
        'data/imdb/test.csv', contains_label=True, header=0)
    trainer = Trainer(config, data, labels, embedding)
    # trainer.X_test = test_data
    # trainer.y_test = test_labels
    trainer.train()
    trainer.load_best_model()
    pred = trainer.predict(test_data)
    print(classification_report(y_true=test_labels, y_pred=pred))