def main():
    """
    Using the fastText model here to predict licenses using automatic hyperparameter tuning
    """
    os.chdir('../../../all_files_generated')
    current_dir = os.getcwd()

    text_files_dir = os.path.join(current_dir, 'text_files')
    model_pickles_dir = os.path.join(current_dir, 'model_pickles')
    model_confusion_matrix_dir = os.path.join(current_dir, 'model_confusion_matrix_files')

    training_validation_file_path = os.path.join(text_files_dir, 'train_validation.txt')
    test_file_path = os.path.join(text_files_dir, 'test.txt')

    model_path = os.path.join(model_pickles_dir, 'fasttext.pickle')
    confusion_matrix_path = os.path.join(model_confusion_matrix_dir, 'fast_text_confusion_matrix.png')

    try:
        license_classifier = fasttext.load_model(model_path)
        print('Model was loaded in successfully!')
    except ValueError as e:
        print('fastText model will begin training ...')
        license_classifier = fasttext.train_supervised(input=training_validation_file_path,
                                                       autotuneValidationFile=test_file_path,
                                                       autotuneDuration=60)
        print('fastText model finished training')
        print('Saving model ...')
        license_classifier.save_model(model_path)
        print('Saved!')

    print('Starting predictions ...')

    x_train = []
    y_train = []
    train_predictions = []
    with open(training_validation_file_path, 'r', encoding='utf-8') as train_file:
        for line in train_file.readlines():
            line_array = line.split('__label__')
            comment_block_text = line_array[0].strip()
            label = int(line_array[1])
            x_train.append(comment_block_text)
            y_train.append(label)
            train_predictions.append(int(license_classifier.predict(comment_block_text)[0][0][9:]))

    x_test = []
    test_predictions = []
    y_test = []
    with open(test_file_path, 'r', encoding='utf-8') as validation_file:
        for line in validation_file.readlines():
            line_array = line.split('__label__')
            comment_block_text = line_array[0].strip()
            label = int(line_array[1])
            x_test.append(comment_block_text)
            y_test.append(label)
            test_predictions.append(int(license_classifier.predict(comment_block_text)[0][0][9:]))

    print('Predictions complete!')

    # Training accuracy
    print("The training accuracy is: ")
    print(accuracy_score(y_train, train_predictions))

    # Test accuracy
    print("The test accuracy is: ")
    print(accuracy_score(y_test, test_predictions))

    # Classification report
    print("Classification report")
    print(classification_report(y_test, test_predictions))

    # Confusion Matrix
    conf_matrix = confusion_matrix(y_test, test_predictions)
    print(conf_matrix)
    plt.figure(figsize=(12.8, 6))
    sns.heatmap(conf_matrix,
                annot=True,
                xticklabels=['not_license', 'license'],
                yticklabels=['not_license', 'license'],
                cmap="Blues")
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title('Confusion matrix')
    plt.savefig(confusion_matrix_path)
    plt.show()
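

# A minimal sketch (not part of the original script) showing why the "[9:]"
# slice above recovers the numeric label: fastText returns predictions as
# ('__label__<tag>', ...) tuples and len('__label__') == 9. The model object
# is assumed to be the one loaded or trained in main().
def predicted_label(model, text):
    labels, probs = model.predict(text)  # e.g. (('__label__1',), array([0.99]))
    return int(labels[0].replace('__label__', ''))
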
def calibrate(collection, issn_map, is_stratified, sample_data_file=None):
    if sample_data_file is None:
        sample_data = sample(collection, issn_map, is_stratified)
    else:
        sample_data = f"{PV_MOUNT}{sample_data_file}"
        download_object("sampling", sample_data_file, sample_data)
    data = pd.read_json(sample_data, orient="records",
                        lines=True).to_dict(orient='records')
    #download_object("tmp", sample_data.split('/')[-1], sample_data)
    logger.debug("len data = " + str(len(data)))
    logger.debug("len issn_map = " + str(len(issn_map)))
    for elt in data:
        if '_id' in elt:
            del elt['_id']
        current_label_text = []
        #current_label_text_global = []
        for issn_type in ['issn_electronic', 'issn_print']:
            issn = elt[issn_type]
            if issn in issn_map:
                current_label_text += issn_map[issn]

        current_label_text = list(set(current_label_text))

        elt["labels_text"] = current_label_text

    data_with_label = [e for e in data if len(e['labels_text'])]

    data_train, data_test = train_test_split(data_with_label,
                                             test_size=85000,
                                             random_state=0)

    for data_type in ["train", "test"]:
        logger.debug(data_type)
        outfile = {}
        # first pass: open in "w" mode and close immediately to truncate any
        # existing output files for this split
        for f in [
                'title', 'abstract', 'keywords', 'mesh_headings',
                'journal_title'
        ]:
            outfile[f] = open(f"{PV_MOUNT}{collection}_{data_type}_{f}.txt",
                              "w")
            outfile[f].close()

        for f in [
                'title', 'abstract', 'keywords', 'mesh_headings',
                'journal_title'
        ]:
            outfile[f] = open(f"{PV_MOUNT}{collection}_{data_type}_{f}.txt",
                              "a+")
            logger.debug(f)

            if data_type == "train":
                current_data = data_train
            else:
                current_data = data_test

            for ix, elt in enumerate(current_data):
                if ix % 100000 == 0:
                    logger.debug(ix)

                current_words = elt.get(f)
                if current_words is None:
                    continue

                if isinstance(current_words, list):
                    current_words = " ".join(current_words)

                if f == "abstract" and len(current_words.split(" ")) < 20:
                    continue
                elif f == "title" and len(current_words.split(" ")) < 10:
                    continue
                elif len(current_words.split(" ")) < 2:
                    continue
                elif len(current_words) < 5:
                    continue

                current_words = normalize(current_words)

                labels = [
                    "__label__" + label.replace(' ', '_')
                    for label in elt.get('labels_text', [])
                ]

                tags = " ".join(labels)

                newline = current_words + " " + tags + "\n"

                outfile[f].write(newline)
            outfile[f].close()

    for f in [
            'journal_title', 'title', 'abstract', 'keywords', 'mesh_headings'
    ]:
        logger.debug("training " + f)

        model = fasttext.train_supervised(
            f'{PV_MOUNT}{collection}_train_{f}.txt',
            wordNgrams=2,
            minCount=20,
            loss='ova',
            epoch=50)
        model_filename = f"{PV_MOUNT}{collection}_model_{f}_strat{is_stratified}.model"
        model.save_model(model_filename)
        upload_object("models", model_filename)

        test = model.test(f'{PV_MOUNT}{collection}_test_{f}.txt',
                          k=-1,
                          threshold=0.5)
        precision = test[1]
        recall = test[2]
        f1 = 2 * (recall * precision) / (recall + precision)
        logger.debug(f"precision: {precision}, recall: {recall}, f1: {f1}")
Example No. 3
        if count < 10000:
            ftrain.write(outline)
            ftrain.flush()
            continue
        elif count < 20000:
            ftest.write(outline)
            ftest.flush()
            continue
        else:
            break
ftrain.close()
ftest.close()
print("---------dataset done--------")


classifier = fasttext.train_supervised("news_fasttext_train.txt", label_prefix="_label_")  # train the model (newer fasttext releases name this parameter `label`)
classifier.save_model("Model.bin")   # save the model
# classifier=fasttext.load_model('Model.bin')   # if a trained model already exists, just load it instead
print("---------train done----------")

# evaluate and print precision
result=classifier.test("news_fasttext_test.txt")
print('precision:   ',result[1])

print("---------输出各类的统计情况----------")
#以下模块可以统计不同分类的结果
labels_right = []
texts = []
with open("news_fasttext_test.txt",encoding="utf-8") as fr:
    for line in fr:
        line = line.rstrip()
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
import fasttext
import time
import os
# train the model
# path = "fasttext"
# os.chdir(path)
# supervised training
start = time.perf_counter()  # time.clock() was removed in Python 3.8
model = fasttext.train_supervised(
    input="train.txt",
    label_prefix="__label__",
    lr=0.05,
    epoch=25,
    # wordNgrams=2,  # this parameter lowered accuracy in testing
    bucket=200000,
    dim=50,
    loss="softmax"  # 可选loss='softmax'
)
end = time.perf_counter()
print('Running time: %s Seconds' % (end - start))
model.save_model("model_news_fasttext.bin")
# load the trained model
model = fasttext.load_model('model_news_fasttext.bin')
print('Training complete!')
Example No. 5
import sys
import fasttext as ft
from pprint import pprint
'''
Parameter notes:
dim: number of embedding dimensions
lr: learning rate (closer to 1.0 trains faster but is less stable)
epoch: number of training passes (default 5; too many can overfit)
'''
model = ft.train_supervised('test.txt', dim=200, lr=0.5, epoch=10, thread=16)
model.save_model("model_filename.bin")

#pprint(model.labels)

pprint(model.test_label("test.txt"))
pprint(model.test('test.txt'))
Example No. 6
for train_idx, val_idx in kfold.split(X):

    train_X, train_y = X[train_idx], y[train_idx]
    val_X, val_y = X[val_idx], y[val_idx]

    print(f'DEBUG: train X {len(train_X)} tweets, train y {len(train_y)} tweets')
    print(f'DEBUG: validation X {len(val_X)} tweets, validation y {len(val_y)} tweets')

    fasttext_format(train_X, train_y, TRAIN_FORMATTED)

    print('Training model on train set...')
    model = ft.train_supervised(
            input=TRAIN_FORMATTED,
            epoch=PARAMS['epoch'],
            word_ngrams=PARAMS['word_ngrams'],
            min_count=PARAMS['min_count'],
            ws=PARAMS['ws'],
            lr=PARAMS['lr'],
            loss=PARAMS['loss'],
            neg=PARAMS['neg'],
            dim=PARAMS['dim'])

    print('Computing predictions on validation set...')
    preds = model.predict(list(val_X))
    val_preds = convert_preds(preds)

    print('Computing accuracy...')
    acc = accuracy(val_preds, val_y)
    accuracies.append(acc)
    print(acc)

print(np.array(accuracies).mean())
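
# The helpers fasttext_format and convert_preds are referenced above but not
# shown; the following is a hypothetical sketch of what they could look like,
# assuming integer class labels and the default "__label__" prefix.
def fasttext_format(texts, labels, path):
    # one "<text> __label__<y>" line per example, as fastText expects
    with open(path, 'w', encoding='utf-8') as f:
        for text, y in zip(texts, labels):
            f.write('{} __label__{}\n'.format(str(text).replace('\n', ' '), y))

def convert_preds(preds):
    # model.predict(list_of_texts) returns (labels_per_text, probs_per_text);
    # keep the top label for each text and strip the prefix
    labels_per_text, _ = preds
    return [int(lbls[0].replace('__label__', '')) for lbls in labels_per_text]
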
Example No. 7
)

test.to_csv(
    DIR + '/corpus_ft.test'
    , header=False
    , index=False
    , sep='\t'
)

# Train the Model
# Manual Tuning (ie trial and error)
model = fasttext.train_supervised(
    DIR + '/corpus_ft.train'
    , lr=1.0
    , epoch=25
    , wordNgrams=3
    , bucket=200000
    , dim=50
    , loss='ova'
)

# Test Manual Model
results = model.test(DIR + '/corpus_ft.test', k=4, threshold=0.7)
print(results)
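
# For reference, model.test returns a 3-tuple: (number of examples,
# precision at k, recall at k); here k=4 with a 0.7 probability threshold.
n_examples, p_at_4, r_at_4 = results
print(f'N={n_examples}  P@4={p_at_4:.3f}  R@4={r_at_4:.3f}')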

# Train the Model with Auto-Tuning
# # Auto-Tunes parameters (gets the best parameters for the above parameters like lr, wordNgrams, etc.)
# automodel = fasttext.train_supervised(
#         DIR + '/corpus_ft.train'
#     , autotuneValidationFile=DIR + '/corpus_ft.test'
#     , autotuneDuration=300 # Tune for 5 minutes
Example No. 8
import fasttext

model = fasttext.train_supervised(input="./oroscopo-data/oroscopo.train",
                                  epoch=20,
                                  dim=500,
                                  wordNgrams=2)
model.save_model("model_oroscopo_big.bin")

print(model.test("./oroscopo-data/oroscopo.valid"))
    def run_on_file(self,
                    input_filename,
                    output_filename,
                    user_id,
                    project_id,
                    label_id=None,
                    pipeline=None,
                    bootstrap_iterations=0,
                    bootstrap_threshold=0.9,
                    run_on_entire_dataset=False):
        input_filename = os.path.abspath(input_filename)
        output_filename = os.path.abspath(output_filename)
        output_folder = os.path.join(os.path.dirname(output_filename),
                                     'results')
        os.makedirs(output_folder, exist_ok=True)

        print(
            'Running text classification model on input file {}. Results will be saved to {}...'
            .format(input_filename, output_filename))
        print('Reading input file...')
        if input_filename[-8:] == '.parquet':
            df = pd.read_parquet(input_filename)
        else:
            df = pd.read_csv(input_filename, encoding='latin1')

        label_field = 'label_id'
        if 'label_id' in df.columns:
            df['label'] = df['label_id']
        elif 'label' not in df.columns:
            raise ValueError(
                "no columns 'label' or 'label_id' exist in input file")

        df = df[~pd.isnull(df['text'])]

        df.loc[:, label_field] = df[label_field].apply(
            lambda x: str(x) if not pd.isnull(x) else x)
        df.loc[df[label_field] == ' ', label_field] = None

        if label_id:
            df_labeled = df[df[label_field] == label_id]
            df_labeled = pd.concat([
                df_labeled,
                df[df[label_field] != label_id].sample(df_labeled.shape[0])
            ])
            df_labeled.loc[df_labeled[label_field] != label_id,
                           label_field] = 0
            df_labeled = df_labeled[(~pd.isnull(df_labeled[label_field]))
                                    & (df_labeled[label_field] != ' ')]
        else:
            df_labeled = df[(~pd.isnull(df[label_field]))]

        print('Pre-processing text and extracting features...')
        self.set_preprocessor(pipeline)
        X = self.pre_process(df_labeled, fit=True)

        if label_field not in df_labeled.columns:
            raise RuntimeError("column '{}' not found".format(label_field))
        else:
            y = df_labeled[label_field].values

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2)

        print('Training the model...')
        self.fit(X_train, y_train)

        print('Performance on train set:')
        _, evaluation_text = self.evaluate(X_train, y_train)
        result = 'Performance on train set: \n' + evaluation_text

        print('Performance on test set:')
        _, evaluation_text = self.evaluate(X_test, y_test)
        result = result + '\nPerformance on test set: \n' + evaluation_text

        df_gold_labels = df[df['user_id'] == 'gold_label']
        y_gold_labels = df_gold_labels[label_field].values
        if len(y_gold_labels) > 0:
            X_gold_labels = self.pre_process(df_gold_labels, fit=False)
            print('Performance on gold labels set:')
            _, evaluation_text = self.evaluate(X_gold_labels, y_gold_labels)
            result = result + '\nPerformance on gold labels set: \n' + evaluation_text
        else:
            print(
                'Gold labels do not exist - skipping the evaluation of model performance on them.'
            )

        if run_on_entire_dataset:
            print('Running the model on the entire dataset...')

            columns = ['document_id', label_field, 'user_id', 'prob']

            if bootstrap_iterations > 0:
                print('Bootstrapping...')
            y_aug = df[label_field].copy()
            for i in range(bootstrap_iterations + 1):
                # fitting on labeled examples
                has_label = ~pd.isna(y_aug)
                X_labeled = self.pre_process(df.loc[has_label], fit=False)
                self.fit(X_labeled, y_aug[has_label])

                # predict in chunks and (optionally) add bootstrapped labels
                chunk_size = 10000
                n_samples = df.shape[0]
                for chunk_start in tqdm(range(0, n_samples, chunk_size)):
                    chunk_end = min(n_samples, chunk_start + chunk_size)
                    chunk_df = df.iloc[chunk_start:chunk_end]
                    chunk_df.loc[:, label_field] = None
                    y_chunk = df.iloc[chunk_start:chunk_end][label_field]
                    X_chunk = self.pre_process(chunk_df, fit=False)

                    if i < bootstrap_iterations:
                        print('bootstrap iteration ', i, '/',
                              bootstrap_iterations, ' ',
                              list(zip(*np.unique(y_aug[has_label],
                                                  return_counts=True))))

                        # no need to re-fit the model, only predict
                        y_chunk_aug = self.bootstrap(X_chunk,
                                                     y=y_chunk,
                                                     th=bootstrap_threshold,
                                                     fit=False)
                        y_aug.iloc[chunk_start:chunk_end] = y_chunk_aug

                    # write to file only in last iteration
                    if i == bootstrap_iterations:
                        chunk_prediction_df = self.get_prediction_df(X_chunk,
                                                                     y=y_chunk)

                        chunk_prediction_df['document_id'] = df['document_id']
                        chunk_prediction_df['user_id'] = user_id
                        chunk_prediction_df = chunk_prediction_df.rename(
                            {'confidence': 'prob'}, axis=1)
                        chunk_prediction_df[label_field] = chunk_prediction_df[
                            'prediction']
                        # append after the first chunk so earlier chunks are not overwritten
                        chunk_prediction_df[columns].to_csv(
                            output_filename,
                            index=False,
                            mode='w' if chunk_start == 0 else 'a',
                            header=(chunk_start == 0))

        # output_df = pd.DataFrame(columns=columns)
        # output_df.to_csv(output_filename, index=False, header=True, index_label=False)

        print('Saving model weights to file...')
        class_weights = self.important_features
        class_weights_filename = os.path.join(
            output_folder,
            'ml_logistic_regression_weights_{project_id}.csv'.format(
                project_id=project_id))
        class_weights.to_csv(class_weights_filename, header=True, index=False)

        print('Saving model to a pickle file...')
        model_save_filename = os.path.join(
            output_folder,
            'ml_model_{project_id}.pickle'.format(project_id=project_id))
        self.save(model_save_filename)

        print('Saving model results to a text file...')
        ml_model_results_filename = os.path.join(
            output_folder, 'ml_model_results_{}.txt'.format(project_id))
        with open(ml_model_results_filename, 'wt') as f:
            f.write(result)

        y_test_pred = self.predict(X_test)
        y_test_pred_proba = self.predict_proba(X_test)

        # # Showing examples of large errors
        # df_labeled.loc[:, 'y_pred'] = self.predict(X)
        # df_labeled.loc[:, 'is_error'] = df_labeled['y_pred']!=df_labeled[label_field]
        # df_labeled.loc[:, 'y_pred_proba'] = np.max(self.predict_proba(X), axis=1)
        # df_labeled.to_csv(output_filename, index=False, header=True, index_label=False)

        # Confusion matrix
        print('Generating confusion matrix...')
        from src.utils.analyze_model import plot_confusion_matrix
        fig = plot_confusion_matrix(y_test,
                                    y_test_pred,
                                    classes=None,
                                    normalize=True,
                                    title='Normalized confusion matrix - test')
        filename = os.path.join(
            output_folder, 'confusion_matrix_test_{}.png'.format(project_id))
        fig.savefig(filename)
        plt.clf()

        fig = plot_confusion_matrix(
            y_train,
            self.predict(X_train),
            classes=None,
            normalize=True,
            title='Normalized confusion matrix - train')
        filename = os.path.join(
            output_folder, 'confusion_matrix_train_{}.png'.format(project_id))
        fig.savefig(filename)
        plt.clf()

        # Precision-recall curve
        print('Generating the Precision-Recall graph...')
        try:
            fig = plot_precision_recall_curve(y_test_pred_proba, y_test)
            filename = os.path.join(
                output_folder,
                'precision_recall_curve_{}.png'.format(project_id))
            fig.savefig(filename)
            plt.clf()
        except ValueError as e:
            print(e)

        # ROC curve
        print('Generating ROC curve...')
        try:
            fig = plot_roc_curve(y_test_pred_proba, y_test)
            filename = os.path.join(output_folder,
                                    'roc_curve_{}.png'.format(project_id))
            fig.savefig(filename)
            plt.clf()
        except ValueError as e:
            print(e)

        # Confidence-accuracy graph
        print('Generating the Confidence-Accuracy graph...')
        try:
            fig = plot_confidence_performance(y_test_pred, y_test_pred_proba,
                                              y_test)
            filename = os.path.join(
                output_folder,
                'confidence_accuracy_graph_{}.png'.format(project_id))
            fig.savefig(filename)
            plt.clf()
        except ValueError as e:
            print(e)

        # Confidence Distribution
        print('Computing distribution of confidence...')
        try:
            ax = pd.Series(np.max(y_test_pred_proba, axis=1)).hist(bins=50)
            plt.xlabel('Confidence')
            plt.ylabel('Counts')
            filename = os.path.join(
                output_folder,
                'confidence_distribution_{}.png'.format(project_id))
            plt.gcf().savefig(filename)
            plt.clf()
        except ValueError as e:
            print(e)

        # Generating learning curve
        print('Generating the learning curve...')
        from src.utils.analyze_model import plot_learning_curve_cv
        fig = plot_learning_curve_cv(X, y, estimator=self._model)
        filename = os.path.join(output_folder,
                                'learning_curve_{}.png'.format(project_id))
        fig.savefig(filename)
        plt.clf()

        # Run FastText for text classification
        df_labeled_train = df_labeled.loc[X_train.index, :]
        df_labeled_test = df_labeled.loc[X_test.index, :]

        if RUN_FASTTEXT:
            try:
                print('Running FastText model...')
                import fasttext

                def write_as_fasttext_format(df, filename):
                    with open(filename, 'wt', encoding='utf-8') as f:
                        _ = [
                            f.write('{} __label__{}\n'.format(
                                r['text'].lower().replace('\n', ' '),
                                r['label_id'].replace(' ', '_')))
                            for i, r in df.iterrows()
                        ]

                write_as_fasttext_format(df_labeled_train,
                                         output_folder + '/fasttext_train.txt')
                write_as_fasttext_format(df_labeled_test,
                                         output_folder + '/fasttext_test.txt')
                classifier = fasttext.train_supervised(
                    output_folder + '/fasttext_train.txt')
                fasttext_result = classifier.test(output_folder +
                                                  '/fasttext_test.txt')
                fasttext_pred = classifier.predict([
                    r['text'].lower().replace('\n', ' ')
                    for i, r in df_labeled_test.iterrows()
                ])
                # keep the top label per row and strip the '__label__' prefix
                fasttext_pred = [lbls[0].replace('__label__', '')
                                 for lbls in fasttext_pred[0]]

                _, evaluation_text = self.evaluate(
                    X=None,
                    y=df_labeled_test['label_id'].str.replace(' ', '_').values,
                    y_pred=fasttext_pred)
                result += '\nFastText performance on test set: \n' + evaluation_text
            except Exception as e:
                print(e)

        print('Done running the model!')
        return result
Example No. 10
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import os
from fasttext import train_supervised


def print_results(N, p, r):
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(1, p))
    print("R@{}\t{:.3f}".format(1, r))


if __name__ == "__main__":
    train_data = os.path.join(os.getenv("DATADIR", ''),
                              '../input_data/icon.txt')
    #valid_data = os.path.join(os.getenv("DATADIR", ''), 'cooking.valid')

    # train_supervised uses the same arguments and defaults as the fastText cli
    model = train_supervised(input=train_data,
                             epoch=25,
                             lr=1.0,
                             wordNgrams=2,
                             verbose=2,
                             minCount=1)
    #print_results(*model.test(valid_data))

    #print_results(*model.test(valid_data))
    model.save_model("../model/icon_1307.bin")
Example No. 11
                    help='Predict the class of a sentence')
parser.add_argument('-v',
                    action='store',
                    dest='validation',
                    help='Validate the model')
parser.add_argument('-k',
                    action='store',
                    dest='k_value',
                    help='Number of labels to predict (K value)')

results = parser.parse_args()

if results.do_training:
    # model = fasttext.train_supervised(input="stkhelp.train", autotuneValidationFile='stkhelp.test', autotuneDuration=3600)
    model = fasttext.train_supervised(input="stkhelp.train",
                                      wordNgrams=3,
                                      autotuneValidationFile='stkhelp.test',
                                      autotuneDuration=3600)
    model.save_model("model_stkhelp.bin")
elif results.sentence is not None:
    # model = fasttext.load_model("model_amazon_q.bin")
    model = fasttext.load_model("model_stkhelp_q.bin")
    text = clean_text(results.sentence)
    label = model.predict(text, k=3)
    str_label = str(label)
    # print(str_label)
    # jsonstr = json.dumps(label)
    col1 = list(label[0])
    col2 = label[1].tolist()
    print(col1)
    print(col2)
    pairs = zip(col1, col2)
Example No. 12
def predict_results(model, sentence):
    res = model.predict(preprocessing(sentence))
    return res[0][0]


if __name__ == "__main__":
    current_dir = os.getcwd()
    data_path = os.path.join(current_dir, "data")
    train_data = "../data_preprocessed/train.txt"
    valid_data = "../data_preprocessed/test.txt"
    model = train_supervised(
        input=train_data,
        epoch=150,
        lr=0.05,
        wordNgrams=2,
        verbose=2,
        loss="softmax",
        label="__lb__",
    )
    print_results(*model.test(valid_data))
    summaries, details = test(model, valid_data)
    print(summaries)
    print(details)
    model.save_model("model/ft.li.1701.bin")
    # model = load_model("model/ft.li.1701.bin")
    # with open(valid_data, "r") as f:
    #     lines = f.read().split("\n")
    #     lines = [str(model.predict((line))[0]).replace(
    #         "('", "").replace("',)", "") + " " + line for line in lines]
    # with open("test4.txt", "w") as wf:
def grid_search(train_fn, val_fn, learning_rates, minCounts, epochs, ws, wvs,
                ndims):
    best_lr = None
    best_ndim = None
    best_minCount = None
    best_epochs = None
    best_ws = None
    best_wv = None
    highest_f1 = float("-inf")
    label_counts_val = {}
    with open(val_fn) as fin:
        for line in fin:
            lbls = [
                l for l in line.strip().split(" ") if l.startswith('__label__')
            ]
            for lbl in lbls:
                label_counts_val[lbl] = label_counts_val.get(lbl, 0) + 1
    label_counts_train = {}
    with open(train_fn) as fin:
        for line in fin:
            lbls = [
                l for l in line.strip().split(" ") if l.startswith('__label__')
            ]
            for lbl in lbls:
                label_counts_train[lbl] = label_counts_train.get(lbl, 0) + 1
    grid_search_results = []
    for lr in learning_rates:
        for minCount in minCounts:
            for epoch in epochs:
                for w in ws:
                    for i in range(0, len(wvs)):
                        wv = wvs[i]
                        ndim = ndims[i]
                        print(
                            "Building fasttext model: {0} lr; {1} dim; {2} min count; {3} epochs. {4} ws. wv: {5}."
                            .format(lr, ndim, minCount, epoch, w, wv))
                        # train model
                        model = fasttext.train_supervised(
                            input=train_fn,
                            minCount=minCount,
                            wordNgrams=WORDNGRAMS,
                            pretrainedVectors=wv,
                            lr=lr,
                            epoch=epoch,
                            dim=ndim,
                            ws=w,
                            minn=MINN,
                            maxn=MAXN,
                            thread=MAXTHREADS,
                            loss=LOSS,
                            verbose=VERBOSITY)
                        # val
                        results_by_lbl = model.test_label(val_fn,
                                                          threshold=0.5,
                                                          k=-1)
                        f1_scores, support = zip(
                            *[(res['f1score'], label_counts_val[lbl])
                              for lbl, res in results_by_lbl.items()
                              if lbl in label_counts_val])
                        macro_f1 = np.average(f1_scores)
                        micro_f1 = np.average(f1_scores, weights=support)
                        f1_avg = np.average([micro_f1, macro_f1])
                        if f1_avg > highest_f1:
                            best_lr = lr
                            best_ndim = ndim
                            best_minCount = minCount
                            best_epochs = epoch
                            best_ws = w
                            best_wv = wv
                            highest_f1 = f1_avg

                        # train (check overfitting)
                        results_by_lbl = model.test_label(train_fn,
                                                          threshold=0.5,
                                                          k=-1)
                        f1_scores, support = zip(
                            *[(res['f1score'], label_counts_train[lbl])
                              for lbl, res in results_by_lbl.items()
                              if lbl in label_counts_train])
                        tr_macro_f1 = np.average(f1_scores)
                        tr_micro_f1 = np.average(f1_scores, weights=support)

                        print(
                            "{0:.3f} micro f1. {1:.3f} macro f1. {2:.3f} train micro f1. {3:.3f} train macro f1"
                            .format(micro_f1, macro_f1, tr_micro_f1,
                                    tr_macro_f1))
                        grid_search_results.append({
                            'lr': lr,
                            'ndim': ndim,
                            'minCount': minCount,
                            'epoch': epoch,
                            'ws': w,
                            'val_micro_f1': micro_f1,
                            'val_macro_f1': macro_f1,
                            'tra_micro_f1': tr_micro_f1,
                            'tra_macro_f1': tr_macro_f1,
                            'wv': wv
                        })

    print("\n==== Grid Search Results====\n")
    print(
        pd.DataFrame(grid_search_results)[[
            'lr', 'ndim', 'minCount', 'epoch', 'ws', 'val_micro_f1',
            'tra_micro_f1', 'val_macro_f1', 'tra_macro_f1', 'wv'
        ]])
    print(
        "\nBest: {0} lr; {1} dim; {2} min count; {3} epochs; {4} ws; {5} wv\n".
        format(best_lr, best_ndim, best_minCount, best_epochs, best_ws,
               best_wv))
    return best_lr, best_ndim, best_minCount, best_epochs, best_ws, best_wv
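

# Illustrative invocation only (the file names below are placeholders, not from
# the original script); grid_search returns the best hyperparameters it found:
# best = grid_search("train.txt", "val.txt", learning_rates=[0.1, 0.5],
#                    minCounts=[1, 3], epochs=[25], ws=[5], wvs=[''],
#                    ndims=[100])
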
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--training_data",
        default=
        "/home/isaacj/fastText/drafttopic/wikitext/enwiki.balanced_article_sample.w_article_text_50413_train_data.txt"
    )
    parser.add_argument(
        "--val_data",
        default=
        "/home/isaacj/fastText/drafttopic/wikitext/enwiki.balanced_article_sample.w_article_text_6301_val_data.txt"
    )
    parser.add_argument(
        "--test_data",
        default=
        "/home/isaacj/fastText/drafttopic/wikitext/enwiki.balanced_article_sample.w_article_text_6303_test_data.txt"
    )
    parser.add_argument("--false_negatives_fn")
    parser.add_argument("--output_model")
    parser.add_argument(
        "--word_vectors",
        nargs="+",
        default=[
            '/home/isaacj/fastText/drafttopic/wvs/enwiki.vectors.20191201.skipgram_50.300k.vec'
        ],
        type=str)
    parser.add_argument("--learning_rates",
                        nargs="+",
                        default=[0.1],
                        type=float)
    parser.add_argument("--minCounts", nargs="+", default=[3], type=int)
    parser.add_argument("--epochs", nargs="+", default=[25], type=int)
    parser.add_argument("--ws", nargs="+", default=[20], type=int)
    parser.add_argument("--ndims", nargs="+", default=[50], type=int)
    args = parser.parse_args()

    if args.val_data and len(args.learning_rates + args.minCounts +
                             args.epochs + args.ws + args.ndims) > 5:
        lr, ndim, minCount, epochs, ws, wv = grid_search(
            args.training_data, args.val_data, args.learning_rates,
            args.minCounts, args.epochs, args.ws, args.word_vectors,
            args.ndims)
    else:
        lr = args.learning_rates[0]
        minCount = args.minCounts[0]
        epochs = args.epochs[0]
        ws = args.ws[0]
        wv = args.word_vectors[0]
        ndim = args.ndims[0]

    print(
        "Building fasttext model: {0} lr; {1} min count; {2} epochs; {3} ws; wv: {4}"
        .format(lr, minCount, epochs, ws, wv))
    model = fasttext.train_supervised(input=args.training_data,
                                      minCount=minCount,
                                      wordNgrams=WORDNGRAMS,
                                      lr=lr,
                                      epoch=epochs,
                                      pretrainedVectors=wv,
                                      ws=ws,
                                      dim=ndim,
                                      minn=MINN,
                                      maxn=MAXN,
                                      thread=MAXTHREADS,
                                      loss=LOSS,
                                      verbose=VERBOSITY)

    if args.output_model:
        print("Dumping fasttext model to {0}".format(args.output_model))
        model.save_model(args.output_model)

    if args.test_data:
        # build statistics dataframe for printing
        print("==== test statistics ====")
        lbl_statistics = {}
        toplevel_statistics = {}
        threshold = 0.5
        all_lbls = model.get_labels()
        for lbl in all_lbls:
            lbl_statistics[lbl] = {
                'n': 0,
                'FP': 0,
                'TP': 0,
                'FN': 0,
                'TN': 0,
                'true': [],
                'pred': []
            }
            toplevel_statistics[ft_to_toplevel(lbl)] = {
                'n': 0,
                'FP': 0,
                'TP': 0,
                'FN': 0,
                'TN': 0
            }
        with open(args.test_data, 'r') as fin:
            for line_no, datapoint in enumerate(fin):
                _, topics = model.get_line(datapoint.strip())
                prediction = model.predict(datapoint.strip(), k=-1)
                predicted_labels = []
                for idx in range(len(prediction[0])):
                    prob = prediction[1][idx]
                    lbl = prediction[0][idx]
                    lbl_statistics[lbl]['true'].append(int(lbl in topics))
                    lbl_statistics[lbl]['pred'].append(prob)
                    if prob > threshold:
                        predicted_labels.append(lbl)
                for lbl in all_lbls:
                    if lbl in topics and lbl in predicted_labels:
                        lbl_statistics[lbl]['n'] += 1
                        lbl_statistics[lbl]['TP'] += 1
                    elif lbl in topics:
                        lbl_statistics[lbl]['n'] += 1
                        lbl_statistics[lbl]['FN'] += 1
                    elif lbl in predicted_labels:
                        lbl_statistics[lbl]['FP'] += 1
                    else:
                        lbl_statistics[lbl]['TN'] += 1
                toplevel_topics = [ft_to_toplevel(l) for l in topics]
                toplevel_predictions = [
                    ft_to_toplevel(l) for l in predicted_labels
                ]
                for lbl in toplevel_statistics:
                    if lbl in toplevel_topics and lbl in toplevel_predictions:
                        toplevel_statistics[lbl]['n'] += 1
                        toplevel_statistics[lbl]['TP'] += 1
                    elif lbl in toplevel_topics:
                        toplevel_statistics[lbl]['n'] += 1
                        toplevel_statistics[lbl]['FN'] += 1
                    elif lbl in toplevel_predictions:
                        toplevel_statistics[lbl]['FP'] += 1
                    else:
                        toplevel_statistics[lbl]['TN'] += 1

        for lbl in all_lbls:
            s = lbl_statistics[lbl]
            # note: this is the area under the ROC curve, although it is stored
            # under the 'pr-auc' key and reported as "PR-AUC" below
            fpr, tpr, _ = roc_curve(s['true'], s['pred'])
            s['pr-auc'] = auc(fpr, tpr)
            s['avg_pre'] = average_precision_score(s['true'], s['pred'])
            try:
                s['precision'] = s['TP'] / (s['TP'] + s['FP'])
            except ZeroDivisionError:
                s['precision'] = 0
            try:
                s['recall'] = s['TP'] / (s['TP'] + s['FN'])
            except ZeroDivisionError:
                s['recall'] = 0
            try:
                s['f1'] = 2 * (s['precision'] *
                               s['recall']) / (s['precision'] + s['recall'])
            except ZeroDivisionError:
                s['f1'] = 0

        for lbl in toplevel_statistics:
            s = toplevel_statistics[lbl]
            try:
                s['precision'] = s['TP'] / (s['TP'] + s['FP'])
            except ZeroDivisionError:
                s['precision'] = 0
            try:
                s['recall'] = s['TP'] / (s['TP'] + s['FN'])
            except ZeroDivisionError:
                s['recall'] = 0
            try:
                s['f1'] = 2 * (s['precision'] *
                               s['recall']) / (s['precision'] + s['recall'])
            except ZeroDivisionError:
                s['f1'] = 0

        print("\n=== Mid Level Categories ===")
        mlc_statistics = pd.DataFrame(lbl_statistics).T
        mlc_statistics['mid-level-category'] = [
            s.replace('__label__', '').replace('_', ' ')
            for s in mlc_statistics.index
        ]
        mlc_statistics.set_index('mid-level-category', inplace=True)
        mlc_statistics[''] = '-->'
        mlc_statistics = mlc_statistics[[
            'n', '', 'TP', 'FP', 'TN', 'FN', 'precision', 'recall', 'f1',
            'pr-auc', 'avg_pre'
        ]]
        with pd.option_context('display.max_rows', None):
            print(mlc_statistics)

        print("\nPrecision: {0:.3f} micro; {1:.3f} macro".format(
            np.average(mlc_statistics['precision'],
                       weights=mlc_statistics['n']),
            np.mean(mlc_statistics['precision'])))
        print("Recall: {0:.3f} micro; {1:.3f} macro".format(
            np.average(mlc_statistics['recall'], weights=mlc_statistics['n']),
            np.mean(mlc_statistics['recall'])))
        print("F1: {0:.3f} micro; {1:.3f} macro".format(
            np.average(mlc_statistics['f1'], weights=mlc_statistics['n']),
            np.mean(mlc_statistics['f1'])))
        print("PR-AUC: {0:.3f} micro; {1:.3f} macro".format(
            np.average(mlc_statistics['pr-auc'], weights=mlc_statistics['n']),
            np.mean(mlc_statistics['pr-auc'])))
        print("Avg pre.: {0:.3f} micro; {1:.3f} macro".format(
            np.average(mlc_statistics['avg_pre'], weights=mlc_statistics['n']),
            np.mean(mlc_statistics['avg_pre'])))

        print("\n=== Top Level Categories ===")
        tlc_statistics = pd.DataFrame(toplevel_statistics).T
        tlc_statistics.index.name = 'top-level-category'
        tlc_statistics[''] = '-->'
        tlc_statistics = tlc_statistics[[
            'n', '', 'TP', 'FP', 'TN', 'FN', 'precision', 'recall', 'f1'
        ]]
        print(tlc_statistics)

        print("\nPrecision: {0:.3f} micro; {1:.3f} macro".format(
            np.average(tlc_statistics['precision'],
                       weights=tlc_statistics['n']),
            np.mean(tlc_statistics['precision'])))
        print("Recall: {0:.3f} micro; {1:.3f} macro".format(
            np.average(tlc_statistics['recall'], weights=tlc_statistics['n']),
            np.mean(tlc_statistics['recall'])))
        print("F1: {0:.3f} micro; {1:.3f} macro".format(
            np.average(tlc_statistics['f1'], weights=tlc_statistics['n']),
            np.mean(tlc_statistics['f1'])))

        if args.false_negatives_fn:
            num_examples_per_label = 10
            false_negatives = {}
            for lbl in all_lbls:
                false_negatives[lbl] = []
            with open(args.test_data, 'r') as fin_data:
                with open(args.test_data.replace('data.txt', 'meta.txt'),
                          'r') as fin_metadata:
                    for line_no, datapoint in enumerate(fin_data):
                        claims, topics = model.get_line(datapoint.strip())
                        metadata = next(fin_metadata)
                        prediction = model.predict(datapoint.strip(), k=-1)
                        predicted_labels = [
                            l for idx, l in enumerate(prediction[0])
                            if prediction[1][idx] > threshold
                        ]
                        for lbl in topics:
                            if lbl not in predicted_labels:
                                false_negatives[lbl].append('{0}\t{1}'.format(
                                    lbl, metadata))
                    with open(args.false_negatives_fn, 'w') as fout:
                        for lbl in false_negatives:
                            num_examples = min(len(false_negatives[lbl]),
                                               num_examples_per_label)
                            random_examples = np.random.choice(
                                false_negatives[lbl],
                                num_examples,
                                replace=False)
                            for ex in random_examples:
                                fout.write(ex)
Example No. 15
def build_model():
    start = time.time()
    model = fasttext.train_supervised('train.txt')
    print("{0:-^30}".format("模型训练"))
    print("elapse time: %.3fs" % (time.time() - start))
    model.save_model("fasttext_model.bin")
Example No. 16
              
                test+=line
            count+=1
    dosya=open('train1.txt','w', encoding="utf-8")
    dosya.write(train)
    dosya.close()
    dosya=open('test1.txt','w', encoding="utf-8")
    dosya.write(test)
    dosya.close()
    print(k)
    print(s)


ayir('ortaknew.csv',20)

model = fasttext.train_supervised(input='train1.txt', epoch=25, lr=0.1, wordNgrams=2, loss='hs', dim=100)

model.predict("çok iyi",k=3)

from mlxtend.plotting import plot_confusion_matrix
rounded_pred = model.predict(s, k=1)
print(rounded_pred[0][1])

print(confusion_matrix(k,rounded_pred[0]))
print(plot_confusion_matrix(conf_mat=confusion_matrix(k,rounded_pred[0])))

total_acc=0
for i in range(len(rounded_pred[0])):

  if rounded_pred[0][i][0]==true_labels[i]:
    total_acc+=1
Example No. 17
train = pd.read_csv(data_path+"data.txt", header=0, sep='\r\n', engine='python')
ts =  train.shape
df = pd.DataFrame(train)
new_train = df.reindex(np.random.permutation(df.index))

# split 90/10 into two files
indice_90_percent = int((ts[0]/100.0)* 90)

new_train[indice_90_percent:].to_csv(data_path+'test.txt',index=False)
new_train[:indice_90_percent].to_csv(data_path+'train.txt',index=False)

# start training
model = fasttext.train_supervised(input=data_path+"train.txt",
                                    epoch=20,
                                    lr=1.0,
                                    wordNgrams=2,
                                    bucket=200000,
                                    dim=50,
                                    loss='hs')

# save the trained model
model.save_model(data_path+"model.bin")

# quantize the model to reduce its size (the input file is generally only needed when retrain=True)
model.quantize(input=data_path + 'model.bin', retrain=False)
model.save_model(data_path+"model.ftz")

# test a single phrase
print(model.predict("保暖 内衣", k=3))

# test set
Example No. 18
import fasttext

print("Training model ...")
model = fasttext.train_supervised(input="cooking.train", epoch=25, lr=1.0)

print("Saving model ...")
model.save_model("model_cooking.bin")

print("Validating model ...")
result = model.test("cooking.valid")
print(result)

print("Predicting model ...")
result = model.predict("Which baking dish is best to bake a banana bread ?")
print(result)
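
# model.predict returns a (labels, probabilities) pair; a minimal illustration
# of unpacking the result printed above:
labels, probs = result
print(labels[0], probs[0])
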
import fasttext
import os

if __name__ == "__main__":

    hyper_params = {
        "lr": 0.35,         # Learning rate
        "epoch": 100,       # Number of training epochs to train for
        "wordNgrams": 3,    # Number of word n-grams to consider during training
        "dim": 155,         # Size of word vectors
        "ws": 5,            # Size of the context window for CBOW or skip-gram
        "minn": 2,          # Min length of char ngram
        "maxn": 5,          # Max length of char ngram
        "bucket": 2014846,  # Number of buckets
    }

    training_data_path = 'sst_train.txt'

    # Train the FastText model
    model = fasttext.train_supervised(input=training_data_path, **hyper_params)
    print("FastText model trained with the hyperparameters: \n {}".format(hyper_params))

    model.save_model(os.path.join('C:/Users/mehra/OneDrive/Documents/GitHub/73StringsAssignment', "sst.bin"))

    # Quantize model to reduce space usage
    model.quantize(input=training_data_path, qnorm=True, retrain=True, cutoff=110539)
    model.save_model(os.path.join('C:/Users/mehra/OneDrive/Documents/GitHub/73StringsAssignment', "sst_quantized.ftz"))
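
# A quantized .ftz model can be loaded exactly like a regular .bin model;
# sketch only, reusing the path from the save_model call above:
# quantized = fasttext.load_model(
#     os.path.join('C:/Users/mehra/OneDrive/Documents/GitHub/73StringsAssignment', "sst_quantized.ftz"))
# print(quantized.predict("the movie was surprisingly good", k=2))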
Example No. 20
def build_classify_model():
    model = fasttext.train_supervised(config.classify_corpus_path,
                                      epoch=20,
                                      wordNgrams=2,
                                      minCount=1)
    model.save_model(config.classify_model_path)
from gensim.models import FastText, LdaMulticore
import gensim
import re
import pymorphy2
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import fasttext

# get a morphological analyzer instance (10-20 MB)
morph = pymorphy2.MorphAnalyzer()
corpus_file = datapath('lee_background.cor')

model = fasttext.train_supervised('train-comedies-horrors.txt')

model.save_model("comedies-horrors-model.bin")

result = model.predict('девушка заброшенный дом призрак')

print('test')
import fasttext
import os
import json

model = fasttext.train_supervised(
    input="data/flair_data2/train/combined.csv",
    epoch=25,
    lr=0.5,
    wordNgrams=2,
    bucket=200000,
    dim=50,
    loss="ova",
)
model.save_model("fastText_models/fastText_combined.bin")

# model = fasttext.load_model('fastText_models/fastText_combined.bin')

scores = dict()
input_folder_path = "data/flair_data2/dev/"
for filename in os.listdir(input_folder_path):
    if filename.endswith(".csv"):
        score = model.test_label(os.path.join(input_folder_path, filename))
        score["micro-avaraging"] = model.test(os.path.join(input_folder_path, filename))
        scores[filename] = score


with open("fastText_clf_outputs/prfs1.txt", "w") as jsonfile:
    json.dump(scores, jsonfile, indent=2)
Example No. 23
FOLDER = "fasttext_tool/"
def saveInfoToFile(row, output):
    output.write("__label__{} {}\n".format(row['polarity'], str(row['text'])))
    return ""

def adjustForm(dataSet, fileName):
    print("Transforming...")
    with open('{}{}'.format(FOLDER, fileName), 'w+') as output:
        dataSet.apply(lambda x: saveInfoToFile(x, output), axis=1)

if __name__ == "__main__":
    dataReader = DataReader()
    evaluator = Evaluator()
    if not "data.train" in os.listdir(FOLDER):
        dataSet = dataReader.read_data_set()
        adjustForm(dataSet, "data.train")
    if not "data.test" in os.listdir(FOLDER):
        testSet = dataReader.read_test_set()
        adjustForm(testSet, "data.test")
    if not "model.bin" in os.listdir(FOLDER):
        model = ft.train_supervised(input=FOLDER + "data.train")
        model.save_model(FOLDER + "model.bin")
    else:
        model = ft.load_model(FOLDER + "model.bin")
    (_, precision, recall) = model.test(FOLDER + "data.test")
    metrics = {'precision': precision, 'recall': recall, 'fscore': evaluator.calculate_fscore(precision, recall)}
    metrics_str = evaluator.getString(metrics)
    with open(FOLDER + "results.txt", 'w') as output:
        output.write(metrics_str)
    print(metrics_str)
Example No. 24
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


data_all['Question']=data_all['Question Sentence'].apply(lambda x: " ".join(jieba.cut(x)))

import fasttext
from sklearn.metrics import f1_score
# A

data_df=data_all[['Question','category_A']].head(5000)
data_df['label_ft'] = '__label__' + data_train['category_A'].head(5000).astype(str)
data_df[['Question','label_ft']].to_csv('train.csv', index=None, header=None, sep='\t')


model = fasttext.train_supervised('train.csv', lr=0.05, wordNgrams=2, 
                                  verbose=2, minCount=1, epoch=500, loss="hs")

val_pred_A = [model.predict(x)[0][0].split('__')[-1] for x in data_all['Question'][5000:]]


sub['category_A']=val_pred_A

print(sub['category_A'].value_counts()/3000)
print(data_train['category_A'].value_counts()/5000)

#B
data_df=data_all[['Question','category_B']].head(5000)
data_df['label_ft'] = '__label__' + data_train['category_B'].head(5000).astype(str)
data_df[['Question','label_ft']].to_csv('train.csv', index=None, header=None, sep='\t')

#loss function {ns, hs, softmax, ova}
Example No. 25
import fasttext
import pandas as pd

#data path
path = ".\\segdata\\segData.csv"

#data = pd.read_csv(path, encoding='UTF-8')
#data.iloc[0:int(len(data)*0.8)].to_csv('.\\segdata\\train.txt', header=None, index=None, encoding='utf-8-sig', mode='w')
#data.iloc[int(len(data)*0.8):int(len(data)*0.9)].to_csv('.\\segdata\\test.txt', header=None, index=None, encoding='utf-8-sig', mode='w')

#train_data = ".\\segdata\\train.txt"
#test_data = ".\\segdata\\test.txt"

#train
model = fasttext.train_supervised(path)
#model = fasttext.train_unsupervised(path, model='cbow')
#model = fasttext.train_unsupervised(path, model='skipgram')

#save model
model.save_model(".\\model\\model_news.bin")
Example No. 26
    index = np.argmax(pred[1])
    label = int(pred[0][index][-1])
    return label


def get_proba(pred):
    pred_dic = {}
    pred_dic[pred[0][0]] = pred[1][0]
    pred_dic[pred[0][1]] = pred[1][1]
    return pred_dic['__label__1']


print('------------------ training started --------------------')
model = fasttext.train_supervised(input="d:/train_semantic.txt",
                                  lr=0.1,
                                  epoch=100,
                                  wordNgrams=3,
                                  dim=300)
print('------------------ training finished --------------------')

test_pred = []
for i in range(len(test_data)):
    r = model.predict(" ".join(test_data[i]), k=2)
    test_pred.append(get_label(r))

# sklearn metrics expect (y_true, y_pred)
acc = accuracy_score(test_label, test_pred)
precision = precision_score(test_label, test_pred)
recall = recall_score(test_label, test_pred)
f1 = f1_score(test_label, test_pred)

print("准确率:" + str(acc) + "\n")
Example No. 27
def train():
    classifier = fasttext.train_supervised(input="data_for_fasttext_train.txt",
                                           epoch=20)
    classifier.save_model('fasttext_classifier.bin')
    return classifier
Example No. 28
train_df = pd.read_csv(
    '/Users/yowasa/Documents/天池入门NLP - 新闻文本分类/train_set.csv',
    sep='\t',
    nrows=15000)

train_df['label_ft'] = '__label__' + train_df['label'].astype(str)
train_df[['text', 'label_ft']].iloc[:-5000].to_csv('train.csv',
                                                   index=None,
                                                   header=None,
                                                   sep='\t')

model = fasttext.train_supervised('train.csv',
                                  lr=1,
                                  wordNgrams=3,
                                  dim=500,
                                  verbose=2,
                                  minCount=1,
                                  epoch=25,
                                  loss="softmax")

val_pred = [
    model.predict(x)[0][0].split('__')[-1]
    for x in train_df.iloc[-5000:]['text']
]

print(
    f1_score(train_df['label'].values[-5000:].astype(str),
             val_pred,
             average='macro'))
Example No. 29
import random
import os
import fasttext as ft

model = ft.train_supervised(input="__in.txt", epoch=500, lr=0.7)
model.save_model("wikihow.model")

results = model.test("__out.txt")
print(results)
Example No. 30
            classification_report(test_Y, pred_Y, target_names=mlb.classes_))
        print("accuracy score: ", str(accuracy_score(test_Y, pred_Y)))
        #report_df.to_csv('Data/fast/preds/trec_train12_classification_report.csv', index=True) # uncomment to generate the report
        # note: jaccard_similarity_score was replaced by jaccard_score in newer
        # scikit-learn releases
        from sklearn.metrics import jaccard_similarity_score
        from sklearn.metrics import hamming_loss
        jac_score = jaccard_similarity_score(test_Y, pred_Y)
        loss = hamming_loss(test_Y, pred_Y)
        print(jac_score, loss)


if __name__ == '__main__':
    ft = FastText()
    ft.prepare_dataset()
    ft.prepare_train_test_val()
    ft.prepare_testData()
    model = fasttext.train_supervised(
        input='Data/pTrec.train.txt',
        autotuneValidationFile='Data/pTrec.val.txt',
        autotunePredictions=-1,
        autotuneDuration=1200)
    model.save_model('TRECmodel_autotune.ftz')
    ## -- optional ---
    #model = fasttext.load_model('TRECmodel_autotune.ftz')
    #print(model.test('Data/fast/papertrec_tweets_test.txt', k= -1))
    ## ---------------
    ## run the command from the terminal to generate prediction files using the generated model - change the file names according to the dataset
    # ./fastText-0.9.1/fasttext predict TRECmodel_autotune.ftz Data/fast/papertrec_tweets_test.txt -1 0.2 > Data/fast/paper/prediction_results/on_trec_test.txt
    ## -- after running the command on the terminal, run the following two lines of code
    #ft.prepare_prediction_file()
    #ft.generate_classification_report()