def validate(val_file, val_filetype, df_location, outfile):
    """ Performs validation on the file supplied in the first argument.
    ARGUMENTS: val_file: the path to the validation file, string
               val_filetype: string 'Buzzfeed Validation File' or
                             'Crowdsourced File used as a validation file'
               df_location: location to load/save the validation df from
               outfile: path to the output file
    RETURNS: None
    """
    val_df = clean_shuffle.read_prepare_df(val_file, file_path=df_location)
    # Load the model and tag the docs (no training step here, so set
    # init_models to False).
    pv = ParagraphVectorModel(val_df, init_models=False)
    # Tag the documents (title + content separately).
    pv.get_tagged_docs()
    pv.model_content_dbow = model_content_dbow
    pv.model_title_dbow = model_title_dbow
    # y_val_df is a DataFrame with id and hyperpartisan columns.
    X_val, y_val_df = get_vector_label_mapping(pv)
    # Get the predictions.
    y_pred = predict_vals(svc, X_val)
    y_pred_df = pd.DataFrame(y_pred, columns=['hyperpartisan'])
    # The order of ids will be the same.
    y_pred_df['id'] = y_val_df.id
    calculate_metrics(y_val_df, y_pred_df, svc, val_filetype)
    write_to_tsv(y_pred_df, y_val_df, outfile)
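
# Usage sketch (an assumption, not part of the original module): validate()
# relies on the module-level doc2vec models (model_content_dbow,
# model_title_dbow) and the trained SVC being loaded first. A driver call
# might look roughly like this; the output file name is hypothetical.
def example_validate_run(sem_eval_path):
    val_file = os.path.join(sem_eval_path, 'data', 'IntegratedFiles',
                            'buzzfeed_validation_withid.tsv')
    df_location = os.path.join(sem_eval_path, 'data', 'Pickles',
                               'validation_df.pickle')
    outfile = os.path.join(sem_eval_path, 'validation_predictions.tsv')
    validate(val_file, 'Buzzfeed Validation File', df_location, outfile)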
def load_texts(crowdsourced=False, split=True):
    # NOTE: the 'split' flag is accepted but currently unused by this loader.
    tsv_name = 'crowdsourced_train' if crowdsourced else 'buzzfeed_training'
    df_name = 'crowdsourced_train_df' if crowdsourced else 'training_df'
    filename = os.path.join(sem_eval_path, 'data', 'IntegratedFiles',
                            '{}_withid.tsv'.format(tsv_name))
    df_location = os.path.join(sem_eval_path, 'data', 'Pickles',
                               '{}.pickle'.format(df_name))
    df = clean_shuffle.read_prepare_df(filename, file_path=df_location)
    df['text'] = df['title'] + ' ' + df['content']
    return df
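
# Usage sketch (illustrative only, not part of the original module): the
# returned DataFrame carries the combined 'text' column used downstream.
def example_peek_training_texts():
    df = load_texts(crowdsourced=False)
    print('Training rows: {}'.format(df.shape[0]))
    print(df['text'].head())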
def load_texts():
    filename = os.path.join(sem_eval_path, 'data', 'IntegratedFiles',
                            'buzzfeed_validation_withid.tsv')
    # For the crowdsourced set, use 'crowdsourced_train_withid.tsv' instead.
    df_location = os.path.join(sem_eval_path, 'data', 'Pickles',
                               'validation_df.pickle')
    df = clean_shuffle.read_prepare_df(filename, file_path=df_location)
    ids_to_labels = ground_truth_sqlite.select_id_hyperpartisan_mappings(
        sem_eval_path, 'ground_truth_validation')
    # For the crowdsourced set, use 'ground_truth_crowdsourced_train' instead.
    df['hyperpartisan'] = df.apply(
        lambda row: 1 if ids_to_labels[row['id']] == 'true' else 0, axis=1)
    df['text'] = df['title'] + ' ' + df['content']
    return df['text'], df['hyperpartisan']
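
# Usage sketch (illustrative only): the two returned Series share the same
# index, so the label balance of the validation set can be checked directly.
def example_label_distribution():
    texts, labels = load_texts()
    print('Documents: {}'.format(texts.shape[0]))
    print(labels.value_counts())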
def load(self, split=False, validation=False, sentences=False):
    name = 'validation' if validation else 'training'
    tsv_name = ('crowdsourced_train' if self.crowdsourced
                else 'buzzfeed_{}'.format(name))
    table_name = 'crowdsourced_train' if self.crowdsourced else name
    sentences_phrase = '_s' if sentences else ''
    df_name = ('crowdsourced_train{}_df'.format(sentences_phrase)
               if self.crowdsourced
               else '{}{}_df'.format(name, sentences_phrase))
    filename = os.path.join(self.sem_eval_path, 'data', 'IntegratedFiles',
                            '{}_withid.tsv'.format(tsv_name))
    df_location = os.path.join(self.sem_eval_path, 'data', 'Pickles',
                               '{}.pickle'.format(df_name))
    print('DataFrame file location: {}'.format(df_location))

    if sentences:
        df = clean_shuffle.read_prepare_sentence_df(filename,
                                                    file_path=df_location)
    else:
        df = clean_shuffle.read_prepare_df(filename, file_path=df_location)
    print('df {} loaded. Shape: {}'.format(df_name, df.shape))

    ids_to_labels = ground_truth_sqlite.select_id_hyperpartisan_mappings(
        self.sem_eval_path, 'ground_truth_{}'.format(table_name))
    df['hyperpartisan'] = df.apply(
        lambda row: 1 if ids_to_labels[row['id']] == 'true' else 0, axis=1)

    if 'text' not in df.columns:
        df['text'] = df['title'] + ' ' + df['content']

    if split:
        boundary = int(self.train_val_boundary * df['text'].shape[0])
        return (df['text'][:boundary], df['hyperpartisan'][:boundary],
                df['text'][boundary:], df['hyperpartisan'][boundary:])
    return df['text'], df['hyperpartisan']
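
# Usage sketch, assuming a loader class that exposes sem_eval_path,
# crowdsourced, and train_val_boundary (e.g. 0.8); the class name below is
# hypothetical. With split=True, the first boundary fraction of rows becomes
# the training slice and the remainder the validation slice:
#
#   loader = TextLoader(sem_eval_path, crowdsourced=False)  # hypothetical
#   X_train, y_train, X_val, y_val = loader.load(split=True)
#   X_all, y_all = loader.load(validation=True)  # labelled validation set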
def main():
    """ Main function which reads the training file into a shuffled data
    frame, builds 2 ParagraphVectorModels, combines them, gets the resulting
    vector-label mappings, and trains an SVM (SVC) model on these mappings.
    This SVM model is persisted to disk."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--path", '-p',
        default="/home/ashwath/Files/SemEval",
        help="Use this argument to change the SemEval directory path "
             "(the default path is: '/home/ashwath/Files/SemEval')")
    parser.add_argument(
        "--skipml", '-s',
        action="store_true",
        default=False,
        help="Use this argument to skip training the ML model")
    parser.add_argument(
        "--retrainpv", '-r',
        action="store_true",
        default=False,
        help="Use this argument to retrain the embeddings "
             "(loaded from the previous run's pickle by default)")
    args = parser.parse_args()
    sem_eval_path = args.path
    filename = os.path.join(sem_eval_path, 'data', 'IntegratedFiles',
                            'buzzfeed_training_withid.tsv')
    df_location = os.path.join(sem_eval_path, 'data', 'Pickles',
                               'training_df.pickle')
    pv_location = os.path.join(sem_eval_path, 'models', 'pv_object.pickle')

    if args.retrainpv:
        df = clean_shuffle.read_prepare_df(filename, file_path=df_location)
        print("Training paragraph vectors...")
        pv = build_pv_models(df, sem_eval_path)
    else:
        try:
            # If a paragraph vector object has already been pickled, load it.
            with open(pv_location, 'rb') as pfile:
                print("Loading paragraph vector instance from pickle...")
                pv = pickle.load(pfile)
        except FileNotFoundError:
            # No pickle found: Doc2Vec training is required.
            df = clean_shuffle.read_prepare_df(filename, file_path=df_location)
            print("Training paragraph vectors...")
            pv = build_pv_models(df, sem_eval_path)

    # Train the machine learning model unless --skipml was passed.
    if not args.skipml:
        # Get a composite embedding model: X_train has the vectors, y_train_df
        # is a dataframe with id and hyperpartisan indicator.
        print("Getting vector label mapping...")
        X_train, y_train_df = get_vector_label_mapping(pv, 'concat')
        # y_train_df is a dataframe; y_train_df.hyperpartisan has the labels.
        print("Training SVC...")
        svc = train_ml_model(X_train, y_train_df.hyperpartisan)
        # Serialize the model and save it to disk.
        svc_model_location = os.path.join(sem_eval_path, 'models',
                                          'svc_embeddings.joblib')
        joblib.dump(svc, svc_model_location)
    else:
        print("SVC model not trained")

    print("DONE!")
    if not args.skipml:
        print("SVC Model saved to {}".format(svc_model_location))
    print("Paragraph vector object pickle is at: {}".format(pv_location))
    print("Dataframe is pickled at: {}".format(df_location))
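
# The script is run from the shell with the flags defined in main(), e.g.
#   python <this_script>.py --path /home/ashwath/Files/SemEval --retrainpv
# (script name elided). Below is a sketch of reloading the persisted
# artifacts in a later session, mirroring the dump calls in main(); the
# function name is hypothetical.
def example_reload_artifacts(sem_eval_path):
    svc = joblib.load(os.path.join(sem_eval_path, 'models',
                                   'svc_embeddings.joblib'))
    with open(os.path.join(sem_eval_path, 'models', 'pv_object.pickle'),
              'rb') as pfile:
        pv = pickle.load(pfile)
    return svc, pv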