Example #1
def validate(val_file, val_filetype, df_location, outfile):
    """ Performs validation on the file supplied in the first argument.
    ARGUMENTS: val_file: the path to the validation file, string
               val_filetype: string 'Buzzfeed Validation File' or
               'Crowdsourced File used as a validation file'
               df_location: location to load/save validation df from
               out_file: path to output file
    RETURNS: None
    """
    val_df = clean_shuffle.read_prepare_df(val_file, file_path=df_location)
    # No training happens here, so build the ParagraphVectorModel with
    # init_models=False and attach the pretrained Doc2Vec models below.
    pv = ParagraphVectorModel(val_df, init_models=False)
    # Tag the documents (title + content separately)
    pv.get_tagged_docs()
    pv.model_content_dbow = model_content_dbow
    pv.model_title_dbow = model_title_dbow
    # y_val_df is a DataFrame with id and hyperpartisan
    X_val, y_val_df = get_vector_label_mapping(pv)
    # Get the predictions
    y_pred = predict_vals(svc, X_val)
    y_pred_df = pd.DataFrame(y_pred, columns=['hyperpartisan'])
    # The order of ids will be the same
    y_pred_df['id'] = y_val_df.id
    calculate_metrics(y_val_df, y_pred_df, svc, val_filetype)
    write_to_tsv(y_pred_df, y_val_df, outfile)
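# Hedged usage sketch (not from the source): validate() reads the module-level
# globals model_content_dbow, model_title_dbow and svc, so a driver presumably
# loads them first. The Doc2Vec filenames below are assumptions; the SVC
# filename matches the one saved by the training script in Example #5.
import os
import joblib
from gensim.models.doc2vec import Doc2Vec

sem_eval_path = '/home/ashwath/Files/SemEval'
model_content_dbow = Doc2Vec.load(
    os.path.join(sem_eval_path, 'models', 'content_dbow.model'))  # assumed name
model_title_dbow = Doc2Vec.load(
    os.path.join(sem_eval_path, 'models', 'title_dbow.model'))  # assumed name
svc = joblib.load(
    os.path.join(sem_eval_path, 'models', 'svc_embeddings.joblib'))

validate(os.path.join(sem_eval_path, 'data', 'IntegratedFiles',
                      'buzzfeed_validation_withid.tsv'),
         'Buzzfeed Validation File',
         os.path.join(sem_eval_path, 'data', 'Pickles',
                      'validation_df.pickle'),
         os.path.join(sem_eval_path, 'predictions.tsv'))  # hypothetical outfile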
Example #2
def load_texts(crowdsourced=False, split=True):
    # NOTE: split is currently unused; the full DataFrame is always returned.
    tsv_name = 'crowdsourced_train' if crowdsourced else 'buzzfeed_training'
    df_name = 'crowdsourced_train_df' if crowdsourced else 'training_df'

    filename = os.path.join(sem_eval_path, 'data', 'IntegratedFiles', '{}_withid.tsv'.format(tsv_name))
    df_location = os.path.join(sem_eval_path, 'data', 'Pickles', '{}.pickle'.format(df_name))

    df = clean_shuffle.read_prepare_df(filename, file_path=df_location)

    df["text"] = df["title"] + ' ' + df["content"]

    return df
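# Hypothetical call site (not in the source): since load_texts() returns the
# full DataFrame, any train/validation split happens downstream, e.g. with
# scikit-learn.
from sklearn.model_selection import train_test_split

df = load_texts(crowdsourced=False)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
print('train: {}, validation: {}'.format(train_df.shape, val_df.shape))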
Example #3
def load_texts():
    filename = os.path.join(
        sem_eval_path, 'data', 'IntegratedFiles',
        'buzzfeed_validation_withid.tsv')  # 'crowdsourced_train_withid.tsv')
    df_location = os.path.join(sem_eval_path, 'data', 'Pickles',
                               'validation_df.pickle')

    df = clean_shuffle.read_prepare_df(filename, file_path=df_location)

    ids_to_labels = ground_truth_sqlite.select_id_hyperpartisan_mappings(
        sem_eval_path,
        'ground_truth_validation')  # 'ground_truth_crowdsourced_train'
    df['hyperpartisan'] = df.apply(
        lambda row: 1 if ids_to_labels[row['id']] == 'true' else 0, axis=1)

    df["text"] = df["title"] + ' ' + df["content"]

    return df['text'], df['hyperpartisan']
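# Illustrative sanity check (not in the source) of the two returned Series:
texts, labels = load_texts()
print('{} documents, {:.1%} labelled hyperpartisan'.format(
    len(texts), labels.mean()))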
Example #4
    def load(self, split=False, validation=False, sentences=False):
        name = 'validation' if validation else 'training'
        tsv_name = 'crowdsourced_train' if self.crowdsourced else 'buzzfeed_{}'.format(name)
        table_name = 'crowdsourced_train' if self.crowdsourced else name
        sentences_phrase = '_s' if sentences else ''
        if self.crowdsourced:
            df_name = 'crowdsourced_train{}_df'.format(sentences_phrase)
        else:
            df_name = '{}{}_df'.format(name, sentences_phrase)

        filename = os.path.join(self.sem_eval_path, 'data', 'IntegratedFiles',
                                '{}_withid.tsv'.format(tsv_name))
        df_location = os.path.join(self.sem_eval_path, 'data', 'Pickles',
                                   '{}.pickle'.format(df_name))

        print('DataFrame file location: {}'.format(df_location))
        if sentences:
            df = clean_shuffle.read_prepare_sentence_df(filename,
                                                        file_path=df_location)
        else:
            df = clean_shuffle.read_prepare_df(filename, file_path=df_location)

        print('df {} loaded. Shape: {}'.format(df_name, df.shape))

        ids_to_labels = ground_truth_sqlite.select_id_hyperpartisan_mappings(
            self.sem_eval_path, 'ground_truth_{}'.format(table_name))
        df['hyperpartisan'] = df.apply(
            lambda row: 1 if ids_to_labels[row['id']] == 'true' else 0, axis=1)

        if 'text' not in df.columns:
            df["text"] = df["title"] + ' ' + df["content"]

        if split:
            boundary = int(self.train_val_boundary * len(df))
            return (df['text'][:boundary], df['hyperpartisan'][:boundary],
                    df['text'][boundary:], df['hyperpartisan'][boundary:])
        else:
            return df['text'], df['hyperpartisan']
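# Context sketch (inferred, not from the source): load() reads three instance
# attributes, so the enclosing class is presumably initialised roughly like
# this (the class name and the default boundary are assumptions):
class TextLoader:
    def __init__(self, sem_eval_path, crowdsourced=False,
                 train_val_boundary=0.8):
        self.sem_eval_path = sem_eval_path
        self.crowdsourced = crowdsourced
        self.train_val_boundary = train_val_boundary

# With load() attached as a method, usage would look like:
#   loader = TextLoader('/home/ashwath/Files/SemEval')
#   X_train, y_train, X_val, y_val = loader.load(split=True)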
Example #5
def main():
    """ Main function which reads the training file into a shuffled data frame, builds 2 ParagraphVectorModels,
    combines them, gets the resulting vector-label mappings, and trains an SVM (SVC) model on these mappings.
    This SVM model is persisted to disk."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--path",
        '-p',
        default="/home/ashwath/Files/SemEval",
        help=
        "Use this argument to change the SemEval directory path (the default path is: '/home/ashwath/Files/SemEval')"
    )
    parser.add_argument("--skipml",
                        '-s',
                        action="store_true",
                        default=False,
                        help="Use this argument to skip training the ML model")
    parser.add_argument(
        "--retrainpv",
        '-r',
        action="store_true",
        default=False,
        help=
        "Use this argument to retrain the embeddings (loaded from previous run's pickle by default)"
    )

    args = parser.parse_args()
    sem_eval_path = args.path
    filename = os.path.join(sem_eval_path, 'data', 'IntegratedFiles',
                            'buzzfeed_training_withid.tsv')
    df_location = os.path.join(sem_eval_path, 'data', 'Pickles',
                               'training_df.pickle')
    pv_location = os.path.join(sem_eval_path, 'models', 'pv_object.pickle')
    if args.retrainpv:
        df = clean_shuffle.read_prepare_df(filename, file_path=df_location)
        print("Training paragraph vectors...")
        pv = build_pv_models(df, sem_eval_path)
    else:
        try:
            # If a paragraph vector object has already been pickled, load it.
            with open(pv_location, 'rb') as pfile:
                print("Loading paragraph vector instance from pickle...")
                pv = pickle.load(pfile)
        except FileNotFoundError:
            # No pickle was found, so Doc2Vec training is required.
            df = clean_shuffle.read_prepare_df(filename, file_path=df_location)
            print("Training paragraph vectors...")
            pv = build_pv_models(df, sem_eval_path)

    # Train machine learning model if args.skipml is False (default)
    if not args.skipml:
        # Get a composite embedding model: X_train has the vectors, y_train is a dataframe with id and
        # hyperpartisan indicator.
        print("Getting vector label mapping...")
        X_train, y_train_df = get_vector_label_mapping(pv, 'concat')
        # y_train_df is a dataframe, y_train_df.hyperpartisan has the labels.
        print("Training SVC...")
        svc = train_ml_model(X_train, y_train_df.hyperpartisan)
        # Serialize the model and save to disk
        svc_model_location = os.path.join(sem_eval_path, 'models',
                                          'svc_embeddings.joblib')
        joblib.dump(svc, svc_model_location)
    else:
        print("SVC model not trained")
    print("DONE!")
    if not args.skipml:
        print("SVC Model saved to {}".format(svc_model_location))
    print("Paragraph vector object pickle is at: {}".format(pv_location))
    print("Dataframe is pickled at: {}".format(df_location))