def main(source=source,
         data_dir='data',
         checkpoint_dir="outputs/eval2/roberta_finetune_nogptneo",
         best_model_dir='outputs/eval2/best_model_roberta_finetune_nogptneo',
         n_train=240000,
         n_valid=4000,
         n_test=4000,
         n_epochs=10,
         learning_rate=4e-05,
         train_batch_size=64,
         eval_batch_size=64,
         evaluate_during_training=True,
         evaluate_during_training_steps=2000,
         reprocess_input=True,
         overwrite_output_dir=True,
         n_gpu=2):
    """Fine-tune roberta-large as a text classifier and evaluate it.

    Training data comes from the module-level `source`; validation and test
    data come from the module-level `source_test` (cross-generator
    evaluation — presumably intentional; confirm against the caller).
    Checkpoints go to `checkpoint_dir`, the best model to `best_model_dir`.
    """
    # Load the three dataset splits.
    df_train = data_loading.load_split(data_dir, source, 'train', n=n_train)
    df_valid = data_loading.load_split(data_dir, source_test, 'valid', n=n_valid)
    df_test = data_loading.load_split(data_dir, source_test, 'test', n=n_test)

    # Assemble the simpletransformers training configuration.
    args = ClassificationArgs(
        num_train_epochs=n_epochs,
        evaluate_during_training=evaluate_during_training,
        evaluate_during_training_steps=evaluate_during_training_steps,
        best_model_dir=best_model_dir,
        manual_seed=0,
        train_batch_size=train_batch_size,
        eval_batch_size=eval_batch_size,
        overwrite_output_dir=overwrite_output_dir,
        n_gpu=n_gpu,
        output_dir=checkpoint_dir,
        reprocess_input_data=reprocess_input,
        learning_rate=learning_rate,
    )

    # Build the classifier from the pretrained roberta-large weights.
    classifier = ClassificationModel("roberta", model_name="roberta-large",
                                     args=args, use_cuda=True)

    # Train, tracking F1 / accuracy / equal-error-rate during evaluation.
    classifier.train_model(df_train,
                           eval_df=df_valid,
                           f1=sklearn.metrics.f1_score,
                           acc=sklearn.metrics.accuracy_score,
                           eer=eer)

    # Final held-out evaluation with the same metrics.
    result, model_outputs, wrong_predictions = classifier.eval_model(
        df_test,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score,
        eer=eer,
    )
def main(
    source=source,
    data_dir='data',
    load_model_dir="outputs/eval2/best_model_openai_finetune_1",
    checkpoint_dir="outputs/eval2/test_xl-1542M-nucleus_eval2_analytic",
    n_train=250000,
    n_valid=10000,
    n_test=np.inf,
    reprocess_input=True,
):
    """Evaluate a previously fine-tuned detector checkpoint on the test split.

    Only the test split is consumed; `n_train`/`n_valid` are accepted for
    interface compatibility but unused here.
    """
    transformers_logger.info(f'source: {source}, checkpoint_dir: {checkpoint_dir}')

    df_test = data_loading.load_split(data_dir, source, 'test', n=n_test)

    eval_args = ClassificationArgs(
        output_dir=checkpoint_dir,
        num_train_epochs=2,
        evaluate_during_training=True,
        save_steps=25000,
        evaluate_during_training_steps=25000,
        manual_seed=0,
        train_batch_size=256,
        eval_batch_size=256,
        overwrite_output_dir=True,
        reprocess_input_data=reprocess_input,
        n_gpu=2,
        no_cache=True,
    )

    # Load fine-tuned weights from disk rather than a hub model name.
    detector = ClassificationModel(
        "roberta",
        load_model_dir,
        args=eval_args,
        use_cuda=True,
    )

    result, model_outputs, wrong_predictions = detector.eval_model(
        df_test,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score,
        eer=eer,
    )
def main(
    source='xl-1542M-k40;xl-1542M',
    data_dir='data',
    load_model_dir="outputs/checkpoint-15626-epoch-2",
    checkpoint_dir="outputs",
    n_train=250000,
    n_valid=10000,
    n_test=100,
    reprocess_input=False,
):
    """Run the analytic scorer over test texts, printing each score next to
    its gold label.

    Several parameters (`load_model_dir`, `checkpoint_dir`, `n_train`,
    `n_valid`, `reprocess_input`) are accepted for interface compatibility
    but unused in this probe.
    """
    print('loading data')
    texts, labels = data_loading.load_split(data_dir, source, 'test', n=n_test)
    print('Done loading data')

    # Score each sample and show the result alongside the gold label.
    for sample, gold in zip(texts, labels):
        print(analytics.analytics(sample), gold)
def main(source=source,
         data_dir='data',
         checkpoint_dir="outputs/eval2/openai_finetune_1",
         best_model_dir='outputs/eval2/best_model_openai_finetune_1',
         model_name="roberta-large-openai-detector",
         n_train=300000,
         n_valid=8000,
         n_test=10000,
         n_epochs=5,
         learning_rate=1e-06,
         train_batch_size=64,
         eval_batch_size=64,
         evaluate_during_training=True,
         evaluate_during_training_steps=400,
         reprocess_input=True,
         overwrite_output_dir=True,
         n_gpu=2):
    """Fine-tune a detector starting from `model_name` and evaluate it.

    Training data comes from the module-level `source`; validation and test
    data come from the module-level `source_test`.
    """
    # Echo the full run configuration, one value per line.
    print(
        f'{source}\n{data_dir}\n{checkpoint_dir}\n{best_model_dir}\n{model_name}\n{n_train}\n{n_valid}\n{n_test}\n{n_epochs}\n{learning_rate}\n{train_batch_size}\n{eval_batch_size}\n{evaluate_during_training}\n{evaluate_during_training_steps}\n{reprocess_input}\n{overwrite_output_dir}\n{n_gpu}\n'
    )

    # Load the three dataset splits.
    df_train = data_loading.load_split(data_dir, source, 'train', n=n_train)
    df_valid = data_loading.load_split(data_dir, source_test, 'valid', n=n_valid)
    df_test = data_loading.load_split(data_dir, source_test, 'test', n=n_test)

    # Assemble the simpletransformers training configuration.
    args = ClassificationArgs(
        num_train_epochs=n_epochs,
        evaluate_during_training=evaluate_during_training,
        evaluate_during_training_steps=evaluate_during_training_steps,
        best_model_dir=best_model_dir,
        manual_seed=0,
        train_batch_size=train_batch_size,
        eval_batch_size=eval_batch_size,
        overwrite_output_dir=overwrite_output_dir,
        n_gpu=n_gpu,
        output_dir=checkpoint_dir,
        reprocess_input_data=reprocess_input,
        learning_rate=learning_rate,
    )

    classifier = ClassificationModel("roberta", model_name=model_name,
                                     args=args, use_cuda=True)

    # Train, tracking F1 / accuracy / equal-error-rate during evaluation.
    classifier.train_model(df_train,
                           eval_df=df_valid,
                           f1=sklearn.metrics.f1_score,
                           acc=sklearn.metrics.accuracy_score,
                           eer=eer)

    # Final held-out evaluation with the same metrics.
    result, model_outputs, wrong_predictions = classifier.eval_model(
        df_test,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score,
        eer=eer,
    )
def main(
    source=source,
    data_dir='data',
    checkpoint_dir="outputs/" + experiment_name,
    n_train=np.inf,
    n_valid=5000,
    n_epochs=10,
    n_test=np.inf,
    reprocess_input=True,
    small=True,
):
    """Fine-tune roberta-large on marker-stripped texts, optionally augmented
    with short random subsequences (`small=True`).

    Train texts come from the module-level `source`; valid and test texts
    from the module-level `source_test`. NOTE(review): the validation set is
    loaded from the 'test' split — confirm this is intentional. Labels are
    inverted (0 <-> 1) for all three splits — presumably to match the target
    label convention; verify against data_loading.
    """

    def _strip_markers(texts):
        # Keep only what follows any 'Article: '/'Body: '/'Abstract: '
        # marker, mutating `texts` in place. Each marker is split against
        # the ORIGINAL string (not the already-stripped one), matching the
        # behavior of the duplicated inline loops this helper replaces.
        for i, text in enumerate(texts):
            for key in ['Article: ', 'Body: ', 'Abstract: ']:
                if key in text:
                    texts[i] = text.split(key)[-1]

    def _flip(labels):
        # Invert 0/1 labels.
        return [int(not label) for label in labels]

    def _sample_sequences(texts, labels):
        # Augment with random token windows of length 16..256 from every
        # text long enough to contain them; each window inherits its source
        # text's label.
        small_texts, small_labels = [], []
        for text, label in zip(texts, labels):
            toks = text.split()
            for seq_len in [16, 32, 64, 128, 256]:
                if len(toks) > seq_len:
                    start_idx = random.randrange(len(toks) - seq_len)
                    subseq = toks[start_idx:start_idx + seq_len]
                    small_texts.append(" ".join(subseq))
                    small_labels.append(label)
        return texts + small_texts, labels + small_labels

    train_texts, train_labels = data_loading.load_split(data_dir, source, 'train', n=n_train)
    valid_texts, valid_labels = data_loading.load_split(data_dir, source_test, 'test', n=n_valid)
    test_texts, test_labels = data_loading.load_split(data_dir, source_test, 'test', n=n_test)

    # Identical preprocessing for all three splits (was triplicated inline).
    _strip_markers(train_texts)
    train_labels = _flip(train_labels)
    _strip_markers(valid_texts)
    valid_labels = _flip(valid_labels)
    _strip_markers(test_texts)
    test_labels = _flip(test_labels)

    if small:
        train_texts, train_labels = _sample_sequences(train_texts, train_labels)

    # Preparing train / eval / test data.
    train_df = pd.DataFrame(data={'text': train_texts, 'labels': train_labels})
    valid_df = pd.DataFrame(data={'text': valid_texts, 'labels': valid_labels})
    test_df = pd.DataFrame(data={'text': test_texts, 'labels': test_labels})

    # Optional model configuration.
    model_args = ClassificationArgs(
        num_train_epochs=n_epochs,
        evaluate_during_training=True,
        manual_seed=0,
        train_batch_size=16,
        eval_batch_size=32,
        overwrite_output_dir=True,
        n_gpu=2,
        output_dir=checkpoint_dir,
        reprocess_input_data=reprocess_input,
        cache_dir="cache_dir/" + experiment_name,
        best_model_dir='outputs/best_model_' + experiment_name,
        max_seq_length=256,
    )

    # Create a ClassificationModel.
    model = ClassificationModel("roberta", "roberta-large", args=model_args, use_cuda=True)

    # Train the model, tracking F1 / accuracy / equal-error-rate.
    model.train_model(train_df, eval_df=valid_df,
                      f1=sklearn.metrics.f1_score,
                      acc=sklearn.metrics.accuracy_score,
                      eer=eer)

    # Evaluate the model on the held-out test set.
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score,
        eer=eer,
    )
def main(
    data_dir,
    log_dir,
    source='xl-1542M-k40',
    n_train=500000,
    n_valid=10000,
    n_test=np.inf,
    n_jobs=None,
    n_jobs_custom=None,
    verbose=False,
    save_featureizer=False,
    save_model=False,
    save_features=False,
    load_featureizer=None,
    load_features=None,
    load_model=None,
    no_hyperparam_search=False,
    tfidf_features=False,
    custom_features=False,
    dual=False,
    max_iter=1000,
    test_only=False,
    min_df=5,
):
    """Train and evaluate a logistic-regression detector on tf-idf and/or
    custom features, with optional caching of features, featureizer and model.

    Writes accuracy stats to `log_dir/stats.json` and, depending on the
    `save_*` flags, pickled features / model / featureizer into `log_dir`.
    """
    start_time = time.time()
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    # Loading data.
    # NOTE(review): the 'valid' split feeds train_texts and the 'train'
    # split feeds valid_texts — looks swapped; confirm against the data
    # pipeline before changing (preserved as-is here).
    train_texts, train_labels = data_loading.load_split(data_dir, source, 'valid', n=n_train)
    valid_texts, valid_labels = data_loading.load_split(data_dir, source, 'train', n=n_valid)
    test_texts, test_labels = data_loading.load_split(data_dir, source, 'test', n=n_test)
    cur_time = time.time()
    print(f'{cur_time - start_time:.2f}\tFinished loading data.')
    start_time = cur_time

    # Extracting features.
    if not load_features:
        if not load_featureizer:
            transformers = []
            if tfidf_features:
                transformers.append((
                    'tfidf',
                    TfidfVectorizer(ngram_range=(1, 2), min_df=min_df, max_features=2**21)
                ))
            if custom_features:
                transformers.append((
                    'custom_features',
                    features.CustomFeatures(n_jobs=n_jobs_custom)
                ))
            assert len(transformers) >= 1, 'You should select at least one set of features to use.'
            # BUGFIX: the original passed min(n_jobs, len(transformers)),
            # which raises TypeError when n_jobs is None (the default).
            # Only clamp when n_jobs was actually supplied.
            union_jobs = n_jobs if n_jobs is None else min(n_jobs, len(transformers))
            vect = FeatureUnion(transformers, n_jobs=union_jobs)
            train_features = vect.fit_transform(train_texts)
        else:
            with open(os.path.join(load_featureizer, 'featureizer.pickle'), 'rb') as infile:
                vect = pickle.load(infile)
            if not test_only:
                train_features = vect.transform(train_texts)
        valid_features = vect.transform(valid_texts)
        test_features = vect.transform(test_texts)
    else:
        # Reuse features pickled by a previous run with save_features=True.
        with open(os.path.join(log_dir, 'train_features.pickle'), 'rb') as infile:
            train_features = pickle.load(infile)
        with open(os.path.join(log_dir, 'valid_features.pickle'), 'rb') as infile:
            valid_features = pickle.load(infile)
        with open(os.path.join(log_dir, 'test_features.pickle'), 'rb') as infile:
            test_features = pickle.load(infile)
    cur_time = time.time()
    print(f'{cur_time - start_time:.2f}\tFinished extracting features. Number of features: {test_features.shape[1]}')
    start_time = cur_time

    # Training the model.
    if not load_model:
        model = LogisticRegression(solver='liblinear', dual=dual, max_iter=max_iter)
        if not no_hyperparam_search:
            # Grid-search C using a predefined split: train rows are fold -1
            # (never used for validation), valid rows are fold 0.
            params = {'C': [1/64, 1/32, 1/16, 1/8, 1/4, 1/2, 1, 2, 4, 8, 16, 32, 64]}
            split = PredefinedSplit([-1]*n_train + [0]*n_valid)
            search = GridSearchCV(model, params, cv=split, n_jobs=n_jobs, verbose=verbose, refit=False)
            search.fit(sparse.vstack([train_features, valid_features]), train_labels + valid_labels)
            model = model.set_params(**search.best_params_)
            cur_time = time.time()
            print(f'{cur_time - start_time:.2f}\tFinished hyperparam search.')
            start_time = cur_time
        model.fit(train_features, train_labels)
    else:
        with open(os.path.join(load_model, 'model.pickle'), 'rb') as infile:
            model = pickle.load(infile)
    cur_time = time.time()
    print(f'{cur_time - start_time:.2f}\tFinished training model.')
    start_time = cur_time

    # Scoring the model.
    valid_accuracy = model.score(valid_features, valid_labels)*100.
    test_accuracy = model.score(test_features, test_labels)*100.
    data = {
        'source': source,
        'n_train': n_train,
        'valid_accuracy': valid_accuracy,
        'test_accuracy': test_accuracy,
    }
    cur_time = time.time()
    print(f'{cur_time - start_time:.2f}\tFinished evaluating model.')
    start_time = cur_time
    print(data)
    # BUGFIX: the original json.dump(data, open(...)) leaked the file handle;
    # write through a context manager instead.
    with open(os.path.join(log_dir, 'stats.json'), 'w') as outfile:
        json.dump(data, outfile)

    # Saving the features / model / featureizer as requested.
    if save_features:
        with open(os.path.join(log_dir, 'train_features.pickle'), 'wb') as outfile:
            pickle.dump(train_features, outfile)
        with open(os.path.join(log_dir, 'valid_features.pickle'), 'wb') as outfile:
            pickle.dump(valid_features, outfile)
        with open(os.path.join(log_dir, 'test_features.pickle'), 'wb') as outfile:
            pickle.dump(test_features, outfile)
        cur_time = time.time()
        print(f'{cur_time - start_time:.2f}\tFinished saving features.')
        start_time = cur_time
    if save_model:
        with open(os.path.join(log_dir, 'model.pickle'), 'wb') as outfile:
            pickle.dump(model, outfile)
        cur_time = time.time()
        print(f'{cur_time - start_time:.2f}\tFinished saving model.')
        start_time = cur_time
    if save_featureizer:
        with open(os.path.join(log_dir, 'featureizer.pickle'), 'wb') as outfile:
            pickle.dump(vect, outfile)
        cur_time = time.time()
        print(f'{cur_time - start_time:.2f}\tFinished saving featureizer.')
        start_time = cur_time
def main(
    source=source,
    data_dir='data',
    checkpoint_dir="outputs/eval2/openai_finetune",
    n_train=120000,
    n_valid=5000,
    n_epochs=20,
    n_test=5000,
    reprocess_input=False,
):
    """Fine-tune the pretrained OpenAI GPT-2 output detector and evaluate it.

    Train texts come from the module-level `source`; valid and test texts
    both come from the 'test' split of the module-level `source_test`
    (NOTE(review): valid uses the test split — confirm intentional). Labels
    are inverted (0 <-> 1) — presumably to match the detector's label
    convention; verify against data_loading.
    """
    train_texts, train_labels = data_loading.load_split(data_dir, source, 'train', n=n_train)
    valid_texts, valid_labels = data_loading.load_split(data_dir, source_test, 'test', n=n_valid)
    test_texts, test_labels = data_loading.load_split(data_dir, source_test, 'test', n=n_test)

    # Invert labels for all three splits. (Removed dead commented-out
    # marker-stripping code that was previously applied here.)
    train_labels = [int(not label) for label in train_labels]
    valid_labels = [int(not label) for label in valid_labels]
    test_labels = [int(not label) for label in test_labels]

    # Preparing train / eval / test data.
    train_df = pd.DataFrame(data={'text': train_texts, 'labels': train_labels})
    valid_df = pd.DataFrame(data={'text': valid_texts, 'labels': valid_labels})
    test_df = pd.DataFrame(data={'text': test_texts, 'labels': test_labels})

    # Optional model configuration.
    model_args = ClassificationArgs(
        num_train_epochs=n_epochs,
        evaluate_during_training=True,
        evaluate_during_training_steps=2000,
        best_model_dir='outputs/eval2/best_model_openai_finetune',
        manual_seed=0,
        train_batch_size=32,
        eval_batch_size=128,
        overwrite_output_dir=True,
        n_gpu=2,
        output_dir=checkpoint_dir,
        reprocess_input_data=reprocess_input,
        learning_rate=0.00001,
    )

    # Create a ClassificationModel from the pretrained detector weights.
    model = ClassificationModel("roberta", model_name="roberta-large-openai-detector",
                                args=model_args, use_cuda=True)

    # Train the model, tracking F1 / accuracy / equal-error-rate.
    model.train_model(train_df, eval_df=valid_df,
                      f1=sklearn.metrics.f1_score,
                      acc=sklearn.metrics.accuracy_score,
                      eer=eer)

    # Evaluate the model on the held-out test set.
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score,
        eer=eer,
    )
def main(
    sources=sources,
    data_dir='data',
    load_model_dir="outputs/analytic_checkpoint_v0",
    checkpoint_dir="outputs/hackathon_eval1",
    n_train=250000,
    n_valid=10000,
    n_test=np.inf,
    reprocess_input=False,
):
    """Evaluate the pretrained OpenAI detector on each source and dump
    per-row scores to `hackathon_outputs/<source>.csv`.

    `load_model_dir`, `n_train` and `n_valid` are accepted for interface
    compatibility but unused in this evaluation-only path.
    """
    # Optional model configuration.
    model_args = ClassificationArgs(
        output_dir=checkpoint_dir,
        num_train_epochs=2,
        evaluate_during_training=True,
        save_steps=25000,
        evaluate_during_training_steps=25000,
        manual_seed=0,
        train_batch_size=256,
        eval_batch_size=256,
        overwrite_output_dir=True,
        reprocess_input_data=reprocess_input,
        n_gpu=1,
        no_cache=True,
    )

    # Create a ClassificationModel from the pretrained detector weights.
    model = ClassificationModel(
        "roberta",
        # load_model_dir,  # alternative: evaluate a local fine-tuned checkpoint
        model_name="roberta-large-openai-detector",
        args=model_args,
        use_cuda=True,
    )

    for source in sources:
        print(source)
        test_texts, test_labels = data_loading.load_split(data_dir, source, 'test', n=n_test)

        # Strip any 'Article: '/'Body: '/'Abstract: ' marker, keeping only
        # the text after it (each marker splits the original string).
        for i, text in enumerate(test_texts):
            for key in ['Article: ', 'Body: ', 'Abstract: ']:
                if key in text:
                    test_texts[i] = text.split(key)[-1]
        # CONSISTENCY FIX: use int labels (int(not label)) like the sibling
        # scripts; bools True/False evaluate identically as 1/0.
        test_labels = [int(not label) for label in test_labels]

        # Preparing test data.
        test_df = pd.DataFrame(data={'text': test_texts, 'labels': test_labels})

        # Evaluate the model on this source.
        result, model_outputs, wrong_predictions = model.eval_model(
            test_df,
            f1=sklearn.metrics.f1_score,
            acc=sklearn.metrics.accuracy_score,
            eer=eer,
        )
        print(result)

        # Write one score per row; output[1] is presumably the class-1
        # score — confirm against the model's label ordering.
        with open('hackathon_outputs/' + source + '.csv', 'w') as outfile:
            outfile.write('row_id, score\n')
            for line, output in enumerate(model_outputs):
                outfile.write(f'{line}, {output[1]}\n')