import json
import logging
import random
import string

from happytransformer import HappyBERT
# `proc` (fictitious-name generation) and `run_pipeline` are project-local
# helpers; their imports sit elsewhere in the original file.


def main():
    random.seed(1012)

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO)
    logger = logging.getLogger(__name__)

    chars = string.ascii_lowercase
    number_of_entity_trials = 10

    names = proc.generate_pairs_of_random_names(number_of_pairs=100,
                                                name_dir="../data/other/filtered_names.csv")

    bert_base_cased = HappyBERT("bert-base-cased")

    with open("../data/truism_data/social_data_sentences_2.json", "r") as f:
        social_sents = json.load(f)

    with open("../data/truism_data/social_data_2.json", "r") as f:
        social_config = json.load(f)

    logger.info("finished reading in social data")

    output_df = run_pipeline(model=bert_base_cased,
                             fictitious_entities=names,
                             sentences=social_sents,
                             config=social_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv("../data/masked_word_result_data/bert_w_names/bert_social_perf_{}.csv".format(number_of_entity_trials),
                     index=False)

    logger.info("finished saving social results")
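# A minimal sketch (not part of the original file) of the kind of masked-token
# query run_pipeline presumably issues per truism sentence: substitute a pair
# of fictitious entities into a template, mask the target word, and rank
# HappyBERT's candidates. The sentence and entity names are hypothetical, and
# the "word"/"softmax" keys assume happytransformer 1.x's predict_mask format.
from happytransformer import HappyBERT

model = HappyBERT("bert-base-cased")
sentence = "Alice is stronger than Bob, so Alice can lift [MASK] weight than Bob."
candidates = model.predict_mask(sentence, num_results=10)
for candidate in candidates:
    print(candidate["word"], candidate["softmax"])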
""" Tests for the "answers_to_question" method that can be accessed through a HappyBERT object """ from happytransformer import HappyBERT happy_bert = HappyBERT('bert-large-uncased-whole-word-masking-finetuned-squad') PARAGRAPH = ( 'McGill is a university located in Montreal. ' 'It was founded in 1821, making it the eight oldest university in Canada. ' 'It is currently ranked 31st worldwide according to the QS Global World Ranking ' ) QA_PAIRS = [ ('When was McGill founded?', '1821'), ('Where is McGill located?', 'Montreal'), ('What is McGill\'s worldwide ranking?', '31st'), ] def test_qa_multi(): for question, expected_answer in QA_PAIRS: computed_answers = happy_bert.answers_to_question(question, PARAGRAPH, k=10) computed_answer = happy_bert.answer_question(question, PARAGRAPH) # k is being respected assert len(computed_answers) == 10 # both answering methods yield correct result assert computed_answers[0]["text"].lower() == expected_answer.lower() assert computed_answer.lower() == expected_answer.lower()
import glob

from flask import Flask, jsonify, request, make_response
from flask_cors import CORS

from autocomplete.autocomplete import get_next_word
from question.question import get_answer
from happytransformer import HappyBERT
from autocomplete.learn import train_custom
from complete_my_song.autocomplete_generator import generate_main
from lyrics_generation.generator import get_song

bert = HappyBERT()
app = Flask(__name__)
CORS(app)


@app.route('/complete-markov-artists', methods=['GET'])
def complete_markov_artists():
    json_files = glob.glob("./autocomplete/jsons/*.json")
    # strip the "./autocomplete/jsons/" prefix (21 chars) and the ".json"
    # suffix, then turn "some-artist" into "Some Artist"
    return jsonify(sorted(
        [file[21:-5].replace('-', ' ').title() for file in json_files]))


@app.route('/complete-gru-artists', methods=['GET'])
def complete_gru_artists():
    return jsonify([])
    # json_files = glob.glob("./autocomplete/jsons/*.json")
    # return jsonify(sorted(list(map(lambda file: file[21:-5].replace('-', ' ').title(), json_files))))
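# A hedged usage sketch (not part of the app): query the artist-listing route
# with the standard library, assuming the Flask development server is running
# on its default host and port (127.0.0.1:5000).
import json
from urllib.request import urlopen

with urlopen("http://127.0.0.1:5000/complete-markov-artists") as response:
    artists = json.load(response)
print(artists)  # a sorted list of title-cased artist names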
from happytransformer import HappyBERT


def eq_ish(x, y, epsilon):
    '''soft similarity check between two numbers'''
    return abs(y - x) < epsilon


def errors(func):
    '''determines whether calling the function raises an exception'''
    try:
        func()
    except Exception:
        return True
    return False


happy = HappyBERT()

SENTENCE_PAIRS = [
    ["How old are you?", "The Eiffel Tower is in Paris", False],
    ["How old are you?", "I am 40 years old", True],
]


def test_argument_errors():
    '''
    tests that the nsp module correctly rejects multi-sentence inputs
    '''
    two_sentences = "This is the first sentence. This is the second sentence"
    one_sentence = "This is one sentence."
    assert errors(lambda: happy.predict_next_sentence(two_sentences, one_sentence))
    assert errors(lambda: happy.predict_next_sentence(one_sentence, two_sentences))
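# A sketch of how SENTENCE_PAIRS could back a positive test (not in the
# original snippet), assuming predict_next_sentence returns a boolean by
# default, which the stored True/False labels suggest.
def test_sentence_pairs():
    for sentence_a, sentence_b, expected in SENTENCE_PAIRS:
        assert happy.predict_next_sentence(sentence_a, sentence_b) == expected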
import sys

from happytransformer import HappyBERT, HappyROBERTA, HappyXLNET
from transformers import GPT2LMHeadModel, GPT2Tokenizer

#--------------------------------------#
num_list = [
    "one", "two", "three", "four", "five", "six",
    "seven", "eight", "nine", "ten", "no", "zero"
]

if __name__ == "__main__":
    model_str = sys.argv[1]
    input_filename = sys.argv[2]
    output_filename = sys.argv[3]

    cuda = True
    model = None

    if model_str.startswith("reload_"):
        if model_str.startswith("reload_bert"):
            model = HappyBERT(model_str.replace("reload_bert:", ""))
        elif model_str.startswith("reload_roberta"):
            model = HappyROBERTA(model_str.replace("reload_roberta:", ""))
    else:
        if model_str.startswith("bert"):  # bert-base, bert-large
            model = HappyBERT(model_str + "-uncased")
        elif model_str.startswith("roberta"):  # roberta-base, roberta-large
            model = HappyROBERTA(model_str)
        elif model_str.startswith("xlnet"):  # ignore
            model = HappyXLNET(model_str + "-cased")
        elif model_str.startswith("gpt"):
            model = GPT2LMHeadModel.from_pretrained('gpt2')
            tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# print("Masked:", " ".join(tokenized_text)) # print("Predicted token:", predicted_token) # prediction = predicted_token # print("Other options:") # # just curious about what the next few options look like. # for i in range(10): # predictions[0, masked_index, predicted_index] = -11100000 # predicted_index = torch.argmax(predictions[0, masked_index]).item() # predicted_token = tokenizer.convert_ids_to_tokens([predicted_index]) # print(predicted_token) # response = jsonify(prediction) # # response.headers.add("Access-Control-Allow-Origin", "*") # return response if __name__ == '__main__': # modelfile = 'models/final_prediction.pickle' # model = p.load(open(modelfile, 'rb')) # modelpath = "bert-base-uncased" # tokenizer = BertTokenizer.from_pretrained(modelpath) # model = BertForMaskedLM.from_pretrained(modelpath) # app.run(debug=True, host='127.0.0.1') #initialize the LMs xlnet = HappyXLNET("xlnet-base-cased") roberta = HappyROBERTA("roberta-base") bert = HappyBERT("bert-base-uncased") app.run()
import json
import sys

from happytransformer import HappyBERT, HappyROBERTA, HappyXLNET

num_list = [
    "one", "two", "three", "four", "five", "six",
    "seven", "eight", "nine", "ten", "no", "zero"
]

if __name__ == "__main__":
    model_str = sys.argv[1]
    input_filename = sys.argv[2]
    output_filename = sys.argv[3]

    model = None

    if model_str.startswith("reload_"):
        if model_str.startswith("reload_bert"):
            model_str = model_str.replace("reload_bert:", "")
            model = HappyBERT(model_str)
        elif model_str.startswith("reload_roberta"):
            model_str = model_str.replace("reload_roberta:", "")
            model = HappyROBERTA(model_str)
    else:
        if model_str.startswith("bert"):  # bert-base, bert-large
            model = HappyBERT(model_str + "-uncased")
        elif model_str.startswith("roberta"):  # roberta-base, roberta-large
            model = HappyROBERTA(model_str)
        elif model_str.startswith("xlnet"):  # ignore
            model = HappyXLNET(model_str + "-cased")

    assert model is not None
# This test is here to see if we can
# minimize logging
from happytransformer import HappyBERT

transformer = HappyBERT()
predictions = transformer.predict_mask("Dogs make me [MASK] to eat", num_results=20)
# when running this, logs should be minimal
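# One way to keep those logs minimal (an assumption, not part of the test):
# happytransformer builds on Hugging Face transformers, so lowering the
# "transformers" logger's verbosity before constructing the model should
# silence most of its startup chatter.
import logging

logging.getLogger("transformers").setLevel(logging.ERROR)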