def sample_model( model_name='774M', seed=None, nsamples=10, batch_size=1, length=150, temperature=1, top_k=0, top_p=1, models_dir='../models', ): """ Run the sample_model :model_name=124M : String, which model to use :seed=None : Integer seed for random number generators, fix seed to reproduce results :nsamples=0 : Number of samples to return, if 0, continues to generate samples indefinately. :batch_size=1 : Number of batches (only affects speed/memory). :length=None : Number of tokens in generated text, if None (default), is determined by model hyperparameters :temperature=1 : Float value controlling randomness in boltzmann distribution. Lower temperature results in less random completions. As the temperature approaches zero, the model will become deterministic and repetitive. Higher temperature results in more random completions. :top_k=0 : Integer value controlling diversity. 1 means only 1 word is considered for each step (token), resulting in deterministic completions, while 40 means 40 words are considered at each step. 0 (default) is a special setting meaning no restrictions. 40 generally is a good value. :models_dir : path to parent folder containing model subfolders (i.e. contains the <model_name> folder) """ models_dir = os.path.expanduser(os.path.expandvars(models_dir)) enc = encoder.get_encoder(model_name, models_dir) hparams = model.default_hparams() with open(os.path.join(models_dir, model_name, 'hparams.json')) as f: hparams.override_from_dict(json.load(f)) if length is None: length = hparams.n_ctx elif length > hparams.n_ctx: raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx) with tf.Session(graph=tf.Graph()) as sess: context = tf.placeholder(tf.int32, [batch_size, None]) np.random.seed(seed) tf.set_random_seed(seed) output = sample.sample_sequence( hparams=hparams, length=length, context=context, # start_token=enc.encoder['<|endoftext|>'], batch_size=batch_size, temperature=temperature, top_k=top_k, top_p=top_p)[:, 1:] saver = tf.train.Saver() ckpt = tf.train.latest_checkpoint(os.path.join(models_dir, model_name)) saver.restore(sess, ckpt) # generated = 0 # while nsamples == 0 or generated < nsamples: # out = sess.run(output) # for i in range(batch_size): # generated += batch_size # text = enc.decode(out[i]) # print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) # print(text) context_tokens = enc.encode(rawtext) generated = 0 for _ in range(nsamples // batch_size): out = sess.run(output, feed_dict={ context: [context_tokens for _ in range(batch_size)] })[:, len(context_tokens):] for i in range(batch_size): generated += 1 text = enc.decode(out[i]) print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) print(text) print("=" * 80)
def interact_model( model_name='124M', seed=None, nsamples=1, batch_size=1, length=None, temperature=1, top_k=0, top_p=1, models_dir='models', ): """ Interactively run the model :model_name=124M : String, which model to use :seed=None : Integer seed for random number generators, fix seed to reproduce results :nsamples=1 : Number of samples to return total :batch_size=1 : Number of batches (only affects speed/memory). Must divide nsamples. :length=None : Number of tokens in generated text, if None (default), is determined by model hyperparameters :temperature=1 : Float value controlling randomness in boltzmann distribution. Lower temperature results in less random completions. As the temperature approaches zero, the model will become deterministic and repetitive. Higher temperature results in more random completions. :top_k=0 : Integer value controlling diversity. 1 means only 1 word is considered for each step (token), resulting in deterministic completions, while 40 means 40 words are considered at each step. 0 (default) is a special setting meaning no restrictions. 40 generally is a good value. :models_dir : path to parent folder containing model subfolders (i.e. contains the <model_name> folder) """ models_dir = os.path.expanduser(os.path.expandvars(models_dir)) if batch_size is None: batch_size = 1 assert nsamples % batch_size == 0 enc = encoder.get_encoder(model_name, models_dir) hparams = model.default_hparams() with open(os.path.join(models_dir, model_name, 'hparams.json')) as f: hparams.override_from_dict(json.load(f)) if length is None: length = hparams.n_ctx // 2 elif length > hparams.n_ctx: raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx) with tf.Session(graph=tf.Graph()) as sess: context = tf.placeholder(tf.int32, [batch_size, None]) np.random.seed(seed) tf.set_random_seed(seed) output = sample.sample_sequence( hparams=hparams, length=length, context=context, batch_size=batch_size, temperature=temperature, top_k=top_k, top_p=top_p ) saver = tf.train.Saver() ckpt = tf.train.latest_checkpoint(os.path.join(models_dir, model_name)) saver.restore(sess, ckpt) while True: raw_text = input("Model prompt >>> ") while not raw_text: print('Prompt should not be empty!') raw_text = input("Model prompt >>> ") context_tokens = enc.encode(raw_text)
def train_main(dataset,
               model_name='117M',
               seed=None,
               batch_size=1,
               sample_length=1023,
               sample_num=1,
               sample_every=100,
               run_name='run1',
               restore_from='latest',
               save_every=1000):
    enc = encoder.get_encoder(model_name)
    hparams = model.default_hparams()
    with open(os.path.join('models', model_name, 'hparams.json')) as f:
        hparams.override_from_dict(json.load(f))

    if sample_length is None:
        sample_length = hparams.n_ctx // 2
    elif sample_length > hparams.n_ctx:
        raise ValueError(
            "Can't get samples longer than window size: %s" % hparams.n_ctx)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        context = tf.placeholder(tf.int32, [batch_size, None])
        np.random.seed(seed)
        tf.set_random_seed(seed)

        output = model.model(hparams=hparams, X=context)
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=context[:, 1:], logits=output['logits'][:, :-1]))

        tf_sample = sample.sample_sequence(
            hparams=hparams,
            length=sample_length,
            context=context,
            batch_size=batch_size,
            temperature=1.0,
            top_k=40)

        train_vars = [v for v in tf.trainable_variables() if 'model' in v.name]
        opt = tf.train.AdamOptimizer().minimize(loss, var_list=train_vars)

        saver = tf.train.Saver(
            var_list=train_vars,
            max_to_keep=5,
            keep_checkpoint_every_n_hours=2)
        sess.run(tf.global_variables_initializer())

        if restore_from == 'latest':
            ckpt = tf.train.latest_checkpoint(
                os.path.join(CHECKPOINT_DIR, run_name))
            if ckpt is None:
                # Get fresh GPT weights if new run.
                ckpt = tf.train.latest_checkpoint(
                    os.path.join('models', model_name))
        elif restore_from == 'fresh':
            ckpt = tf.train.latest_checkpoint(
                os.path.join('models', model_name))
        else:
            ckpt = tf.train.latest_checkpoint(restore_from)
        print('Loading checkpoint', ckpt)
        saver.restore(sess, ckpt)

        print('Loading dataset...')
        chunks = load_dataset(enc, dataset)
        data_sampler = Sampler(chunks)
        print('dataset has', data_sampler.total_size, 'tokens')
        print('Training...')

        counter = 1
        if os.path.exists(os.path.join(CHECKPOINT_DIR, run_name, 'counter')):
            # Load the step number if we're resuming a run
            # Add 1 so we don't immediately try to save again
            with open(os.path.join(CHECKPOINT_DIR, run_name, 'counter'), 'r') as fp:
                counter = int(fp.read()) + 1

        def save():
            maketree(os.path.join(CHECKPOINT_DIR, run_name))
            print(
                'Saving',
                os.path.join(CHECKPOINT_DIR, run_name, 'model-{}').format(counter))
            saver.save(
                sess,
                os.path.join(CHECKPOINT_DIR, run_name, 'model'),
                global_step=counter)
            with open(os.path.join(CHECKPOINT_DIR, run_name, 'counter'), 'w') as fp:
                fp.write(str(counter) + '\n')

        def generate_samples():
            context_tokens = data_sampler.sample(1)
            all_text = []
            index = 0
            while index < sample_num:
                out = sess.run(
                    tf_sample,
                    feed_dict={context: batch_size * [context_tokens]})
                for i in range(min(sample_num - index, batch_size)):
                    text = enc.decode(out[i])
                    text = '======== SAMPLE {} ========\n{}\n'.format(index + 1, text)
                    all_text.append(text)
                    index += 1
                    print(text)
            maketree(os.path.join(SAMPLE_DIR, run_name))
            with open(
                    os.path.join(SAMPLE_DIR, run_name,
                                 'samples-{}').format(counter), 'w') as fp:
                fp.write('\n'.join(all_text))

        avg_loss = (0.0, 0.0)
        start_time = time.time()

        try:
            while True:
                if counter % save_every == 0:
                    save()
                if counter % sample_every == 0:
                    generate_samples()

                batch = [data_sampler.sample(1024) for _ in range(batch_size)]
                _, lv = sess.run((opt, loss), feed_dict={context: batch})

                avg_loss = (avg_loss[0] * 0.99 + lv, avg_loss[1] * 0.99 + 1.0)

                print(
                    '[{counter} | {time:2.2f}] loss={loss:2.2f} avg={avg:2.2f}'
                    .format(
                        counter=counter,
                        time=time.time() - start_time,
                        loss=lv,
                        avg=avg_loss[0] / avg_loss[1]))

                counter += 1
        except KeyboardInterrupt:
            print('interrupted')
            save()
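# The training loop above smooths the reported loss with an exponential moving
# average (decay 0.99) kept as a (decayed sum, decayed count) pair. A minimal,
# self-contained sketch of that bookkeeping with synthetic loss values; the
# `losses` list is illustrative only, not taken from a real run.
def smoothed_losses(losses, decay=0.99):
    avg_loss = (0.0, 0.0)  # (decayed sum of losses, decayed count)
    out = []
    for lv in losses:
        avg_loss = (avg_loss[0] * decay + lv, avg_loss[1] * decay + 1.0)
        out.append(avg_loss[0] / avg_loss[1])  # normalized running average
    return out

print(smoothed_losses([3.2, 3.0, 2.9, 2.7]))  # drifts toward the recent losses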
def get_encoder(self):
    logging.info("getting model's encoder")
    self.enc = encoder.get_encoder(self.model_name)
def sample_model(model_name='117M', seed=None, nsamples=0, batch_size=1,
                 length=None, temperature=1, top_k=0, top_p=0.0):
    """
    Run the sample_model
    :model_name=117M : String, which model to use
    :seed=None : Integer seed for random number generators, fix seed to
     reproduce results
    :nsamples=0 : Number of samples to return, if 0, continues to generate
     samples indefinitely.
    :batch_size=1 : Number of batches (only affects speed/memory).
    :length=None : Number of tokens in generated text, if None (default), is
     determined by model hyperparameters
    :temperature=1 : Float value controlling randomness in the Boltzmann
     distribution. Lower temperature results in less random completions. As the
     temperature approaches zero, the model will become deterministic and
     repetitive. Higher temperature results in more random completions.
    :top_k=0 : Integer value controlling diversity. 1 means only 1 word is
     considered for each step (token), resulting in deterministic completions,
     while 40 means 40 words are considered at each step. 0 (default) is a
     special setting meaning no restrictions. 40 generally is a good value.
    :top_p=0.0 : Float value controlling diversity. Implements nucleus sampling,
     overriding top_k if set to a value > 0. A good setting is 0.9.
    """
    enc = encoder.get_encoder(model_name)
    hparams = model.default_hparams()
    with open(os.path.join('models', model_name, 'hparams.json')) as f:
        hparams.override_from_dict(json.load(f))

    if length is None:
        length = hparams.n_ctx
    elif length > hparams.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx)

    with tf.Session(graph=tf.Graph()) as sess:
        np.random.seed(seed)
        tf.set_random_seed(seed)

        output = sample.sample_sequence(
            hparams=hparams, length=length,
            start_token=enc.encoder['<|endoftext|>'],
            batch_size=batch_size,
            temperature=temperature, top_k=top_k, top_p=top_p
        )[:, 1:]

        saver = tf.train.Saver()
        ckpt = tf.train.latest_checkpoint(os.path.join('models', model_name))
        saver.restore(sess, ckpt)

        generated = 0
        while nsamples == 0 or generated < nsamples:
            out = sess.run(output)
            for i in range(batch_size):
                generated += batch_size
                text = enc.decode(out[i])
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                print(text)
        return text
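# sample_model above exposes temperature, top_k and top_p (nucleus sampling) but
# delegates the actual filtering to sample.sample_sequence. A minimal NumPy
# sketch of what those knobs do to a single step's logits; the toy `logits`
# vector is illustrative and not taken from the model.
import numpy as np

def filter_logits(logits, temperature=1.0, top_k=0, top_p=0.0):
    logits = np.asarray(logits, dtype=np.float64) / temperature
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()
    keep = np.ones_like(probs, dtype=bool)
    if top_p > 0.0:
        # Nucleus sampling: keep the smallest set of tokens whose cumulative
        # probability reaches top_p (always keeping the most likely token).
        order = np.argsort(probs)[::-1]
        cutoff = np.searchsorted(np.cumsum(probs[order]), top_p) + 1
        keep = np.zeros_like(probs, dtype=bool)
        keep[order[:cutoff]] = True
    elif top_k > 0:
        keep = np.zeros_like(probs, dtype=bool)
        keep[np.argsort(probs)[::-1][:top_k]] = True
    probs = np.where(keep, probs, 0.0)
    return probs / probs.sum()

print(filter_logits([2.0, 1.0, 0.5, -1.0], top_p=0.9))  # last token is zeroed out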
def interact_model( model_name='117M', seed=None, nsamples=1, batch_size=1, length=None, temperature=1, top_k=0, top_p=0.0, output_file='output.txt', test_cutoff=999, bleu_cutoff=0.4, ): """ Interactively run the model :model_name=117M : String, which model to use :seed=None : Integer seed for random number generators, fix seed to reproduce results :nsamples=1 : Number of samples to return total :batch_size=1 : Number of batches (only affects speed/memory). Must divide nsamples. :length=None : Number of tokens in generated text, if None (default), is determined by model hyperparameters :temperature=1 : Float value controlling randomness in boltzmann distribution. Lower temperature results in less random completions. As the temperature approaches zero, the model will become deterministic and repetitive. Higher temperature results in more random completions. :top_k=0 : Integer value controlling diversity. 1 means only 1 word is considered for each step (token), resulting in deterministic completions, while 40 means 40 words are considered at each step. 0 (default) is a special setting meaning no restrictions. 40 generally is a good value. :top_p=0.0 : Float value controlling diversity. Implements nucleus sampling, overriding top_k if set to a value > 0. A good setting is 0.9. """ if batch_size is None: batch_size = 1 assert nsamples % batch_size == 0 enc = encoder.get_encoder(model_name) hparams = model.default_hparams() with open(os.path.join('models', model_name, 'hparams.json')) as f: hparams.override_from_dict(json.load(f)) if length is None: length = hparams.n_ctx // 2 elif length > hparams.n_ctx: raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx) with tf.Session(graph=tf.Graph()) as sess: context = tf.placeholder(tf.int32, [batch_size, None]) np.random.seed(seed) tf.set_random_seed(seed) output = sample.sample_sequence(hparams=hparams, length=length, context=context, batch_size=batch_size, temperature=temperature, top_k=top_k, top_p=top_p) saver = tf.train.Saver() ckpt = tf.train.latest_checkpoint(os.path.join('models', model_name)) saver.restore(sess, ckpt) data = load_codesearch_net_lite(_TEST_FILE) avg_bleu = 0 with open(output_file, "w") as out: for i, row in data.iterrows(): code = row['code'] input_snippet = extract_definition_and_documentation(code) output_code = generate_output(enc, nsamples, sess, context, output, input_snippet) output_code = output_code[:output_code.find("<|endoftext|>")] output_code = output_code[:output_code.find("<END>")] BLEUscore = nltk.translate.bleu_score.sentence_bleu( [code], output_code) avg_bleu += BLEUscore out.write(f"\ni = {i}, bleu = {BLEUscore}") print( f"i = {i}, bleu = {BLEUscore}, avg_bleu = {avg_bleu/(i + 1)}" ) if BLEUscore > bleu_cutoff: out.write( f"\ninput_snippet = {input_snippet}, output_code = {output_code}" ) out.flush() if i > test_cutoff: break avg_bleu /= i print(f"Bleu score = {avg_bleu}") out.write(f"\nBleu score = {avg_bleu}")
def interact_model( model_name='124M', seed=None, nsamples=1, batch_size=1, length=None, temperature=1, top_k=0, top_p=1, models_dir='models', constant=0.0, counter=0, ): """ Interactively run the model :model_name=124M : String, which model to use :seed=None : Integer seed for random number generators, fix seed to reproduce results :nsamples=1 : Number of samples to return total :batch_size=1 : Number of batches (only affects speed/memory). Must divide nsamples. :length=None : Number of tokens in generated text, if None (default), is determined by model hyperparameters :temperature=1 : Float value controlling randomness in boltzmann distribution. Lower temperature results in less random completions. As the temperature approaches zero, the model will become deterministic and repetitive. Higher temperature results in more random completions. :top_k=0 : Integer value controlling diversity. 1 means only 1 word is considered for each step (token), resulting in deterministic completions, while 40 means 40 words are considered at each step. 0 (default) is a special setting meaning no restrictions. 40 generally is a good value. :models_dir : path to parent folder containing model subfolders (i.e. contains the <model_name> folder) """ models_dir = str(os.path.dirname( os.path.abspath(__file__))) + '/models_gpt' models_dir = os.path.expanduser(os.path.expandvars(models_dir)) if batch_size is None: batch_size = 1 assert nsamples % batch_size == 0 enc = encoder.get_encoder(model_name, models_dir) hparams = model.default_hparams() with open(os.path.join(models_dir, model_name, 'hparams.json')) as f: hparams.override_from_dict(json.load(f)) if length is None: length = hparams.n_ctx // 2 elif length > hparams.n_ctx: raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx) length = 30 #100 word_set = np.load( str(os.path.dirname(os.path.abspath(__file__))) + '/look_ups_gpt-2/word_sets_num.npy') numbers = word_set[counter] evaluations = np.zeros((8)) if constant > 0: text_all = open( str(os.path.dirname(os.path.abspath(__file__))) + "/results/snippets_with_anchoring.txt", "a+") else: text_all = open( str(os.path.dirname(os.path.abspath(__file__))) + "/results/snippets_no_anchoring.txt", "a+") text_all.write( '==================================================================================================' ) text_all.write('\n') file1 = open( str(os.path.dirname(os.path.abspath(__file__))) + "/look_ups_gpt-2/word_sets.txt", "r+") line = file1.readlines() text_all.write(line[counter]) text_all.write('\n') text_all.write('\n') with tf.Session(graph=tf.Graph()) as sess: context = tf.placeholder(tf.int32, [batch_size, None]) glovers = load_words_and_glove() converter_table = np.load( str(os.path.dirname(os.path.abspath(__file__))) + '/look_ups_gpt-2/converter_table.npy') container = related_words() np.random.seed(seed) tf.set_random_seed(seed) weight = constant output, probabilites = sample.sample_sequence_glove_all_top_five_gpu( hparams=hparams, length=length, context=context, batch_size=batch_size, temperature=temperature, top_k=top_k, top_p=top_p, glove=[ glovers[numbers[0], :], glovers[numbers[1], :], glovers[numbers[2], :], glovers[numbers[3], :], glovers[numbers[4], :] ], #[glovers[0,:], glovers[98,:], glovers[2,:], glovers[19,:], glovers[85,:]] #converter_table[14836,:] #words[98,:] #converter_table[5536,:] #glover weight=weight) saver = tf.train.Saver() ckpt = tf.train.latest_checkpoint(os.path.join(models_dir, model_name)) saver.restore(sess, ckpt) holder = 0 counterer = 0 
perplexities = np.zeros((10)) Nr_words = np.zeros((10)) Nr_related = np.zeros((10)) Nr_related_with = np.zeros((10)) text_length = np.zeros((10)) Nr_main = np.zeros((10)) Nr_main_related_without = np.zeros((10)) Nr_main_related_with = np.zeros((10)) while holder < 10: Harry, counterer = Harry_sentences_no_capital(counterer, 2) context_tokens = enc.encode(Harry) generated = 0 for _ in range(nsamples // batch_size): out, proba = sess.run( [output, probabilites], feed_dict={ context: [context_tokens for _ in range(batch_size)] }) out = out[:, len(context_tokens):] for i in range(batch_size): main = [[numbers[0]]] counter_main_sim_without = isSubset( container[numbers[0]][1:], out[i]) counter_main_sim_with = isSubset(container[numbers[0]][0:], out[i]) counter_main = counter_main_sim_with - counter_main_sim_without checker = tokens_from_words(numbers) counter_tot = 0 counter_sim = 0 counter_sim_with = 0 cond = False counter = isSubset(checker, out[i]) counter_tot += counter if counter > 0: cond = True for num in numbers: counter_sim += isSubset(container[num][1:], out[i]) counter_sim_with += isSubset(container[num][0:], out[i]) perplexitiy = np.power(proba, (-1 / length)) generated += 1 text = enc.decode(out[i]) text_all.write(text) text_all.write('\n') perplexities[holder] = perplexitiy Nr_words[holder] = counter_tot Nr_related[holder] = counter_sim text_length[holder] = text.count(' ') Nr_main[holder] = counter_main Nr_main_related_with[holder] = counter_main_sim_with Nr_main_related_without[holder] = counter_main_sim_without Nr_related_with[holder] = counter_sim_with holder += 1 evaluations[0] = np.mean(perplexities) evaluations[1] = np.mean(Nr_main) evaluations[2] = np.mean(Nr_main_related_with) evaluations[3] = np.mean(Nr_main_related_without) evaluations[4] = np.mean(Nr_words) evaluations[5] = np.mean(Nr_related_with) evaluations[6] = np.mean(Nr_related) evaluations[7] = np.mean(text_length) text_all.write( '==================================================================================================' ) text_all.close() return evaluations
def interact_model( model_name='mixed', seed=None, nsamples=1, batch_size=1, #length=None, length=20, temperature=1, top_k=0, ): """ Interactively run the model :model_name=117M : String, which model to use :seed=None : Integer seed for random number generators, fix seed to reproduce results :nsamples=1 : Number of samples to return total :batch_size=1 : Number of batches (only affects speed/memory). Must divide nsamples. :length=None : Number of tokens in generated text, if None (default), is determined by model hyperparameters :temperature=1 : Float value controlling randomness in boltzmann distribution. Lower temperature results in less random completions. As the temperature approaches zero, the model will become deterministic and repetitive. Higher temperature results in more random completions. :top_k=0 : Integer value controlling diversity. 1 means only 1 word is considered for each step (token), resulting in deterministic completions, while 40 means 40 words are considered at each step. 0 (default) is a special setting meaning no restrictions. 40 generally is a good value. """ if batch_size is None: batch_size = 1 assert nsamples % batch_size == 0 enc = encoder.get_encoder(model_name) hparams = model.default_hparams() with open(os.path.join('models', model_name, 'hparams.json')) as f: hparams.override_from_dict(json.load(f)) if length is None: length = hparams.n_ctx // 2 elif length > hparams.n_ctx: raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx) with tf.Session(graph=tf.Graph()) as sess: context = tf.placeholder(tf.int32, [batch_size, None]) np.random.seed(seed) tf.set_random_seed(seed) output = sample.sample_sequence(hparams=hparams, length=length, context=context, batch_size=batch_size, temperature=temperature, top_k=top_k) saver = tf.train.Saver() ckpt = tf.train.latest_checkpoint(os.path.join('models', model_name)) saver.restore(sess, ckpt) class Sample(Resource): def get(self, name): #raw_text = input("Model prompt >>> ") raw_text = "here's some text" context_tokens = enc.encode(raw_text) generated = 0 for _ in range(nsamples // batch_size): out = sess.run( output, feed_dict={ context: [context_tokens for _ in range(batch_size)] })[:, len(context_tokens):] for i in range(batch_size): generated += 1 text = enc.decode(out[i]) print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) print(text) print("=" * 80) return (text) def post(self, name): parser = reqparse.RequestParser() parser.add_argument("prompt") args = parser.parse_args() samples = [] context_tokens = enc.encode(args["prompt"]) generated = 0 for _ in range(nsamples // batch_size): out = sess.run( output, feed_dict={ context: [context_tokens for _ in range(batch_size)] })[:, len(context_tokens):] for i in range(batch_size): generated += 1 text = enc.decode(out[i]) # print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) sample = { "name": "handey", "prompt": args["prompt"], "text": text } samples.append(sample) return samples, 201 api.add_resource(Sample, "/sample/<string:name>") app.run(host='0.0.0.0', port=8000)
def replytalk(update, context): # carry global vars global length global sess global output global model_name global batch_size global nsamples global tfcontext global output global sample_context global persistent_context global my_name user = update.message.from_user logger.info("REPLYTALK received of %s: %s", user.first_name, update.message.text) if not sample_context: # initialize. try to inject some context to hint the model sample_context = 'Conversation of ' + my_name + ', and a person from internet called ' + user.first_name + '.\n' sample_context = sample_context + persistent_context + '\n\n' sample_context = sample_context + my_name + ' - Hi ' + user.first_name + '\n' raw_text = update.message.text sample_context = sample_context + user.first_name + ' - ' + raw_text + '\n' enc = encoder.get_encoder(model_name) context_tokens = encoder.get_encoder(model_name).encode(sample_context) logger.info("sample_context: " + sample_context) logger.info("sample_context_len: " + str(len(context_tokens))) out = sess.run(output, feed_dict={tfcontext: [context_tokens for _ in range(1)]})[:, len(context_tokens):] text = enc.decode(out[0]) logger.info("Model run complete") # parse the response somehow logger.info("model response" + text) logger.info("first line response" + text.split('\n')[0]) model_response_text = '' if len(text.split('\n')[0]) < 5 or len(compress(text.split('\n')[0])) < 5: model_response_text = text.split('\n')[1].lstrip() #+ '\n' else: model_response_text = text.split('\n')[0].lstrip() #+ '\n' logger.info("guessed response" + model_response_text) # if model response starts with correspondent name... if (model_response_text.startswith(user.first_name)): # v002+ just look for the first line beginning with my name for line in text.split('\n'): if line.startswith(my_name + ' - '): model_response_text = line.split('-')[1] logger.info("guessed response (2)" + model_response_text) if '<|endoftext|>' in model_response_text: model_response_text = model_response_text.split('<|endoftext|>')[0] # sometimes my name is mentioned on line 1 need to clean that if model_response_text.startswith(my_name + ' - '): model_response_text = model_response_text.split(my_name + ' - ')[1] logger.info("final response " + model_response_text) update.message.reply_text(model_response_text, reply_markup=ReplyKeyboardRemove()) sample_context = sample_context + my_name + ' - ' + model_response_text + '\n' # truncate the context linecount = 0 count = 0 for line in sample_context.splitlines(): linecount += 1 logger.info("ctx length " + str(linecount) + " " + str(len(context_tokens)) + " tokens") if linecount > 30 or len(context_tokens) > 800: #sample_context_new = ''; sample_context_new = persistent_context + '\n\n' for line in sample_context.splitlines(): count += 1 if count > (linecount - 30): sample_context_new = sample_context_new + line + '\n' sample_context = sample_context_new return REPLYTALK
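# replytalk above truncates the rolling chat transcript to roughly the last 30
# lines once the encoded context grows past ~800 tokens, always re-prepending
# the persistent preamble. A standalone sketch of that policy; `count_tokens`
# is a hypothetical stand-in for len(encoder.encode(...)).
def truncate_context(sample_context, persistent_context, max_lines=30,
                     max_tokens=800, count_tokens=lambda s: len(s.split())):
    lines = sample_context.splitlines()
    if len(lines) <= max_lines and count_tokens(sample_context) <= max_tokens:
        return sample_context
    kept = lines[-max_lines:]  # keep only the newest turns
    return persistent_context + '\n\n' + '\n'.join(kept) + '\n'

history = '\n'.join('speaker - line %d' % i for i in range(40)) + '\n'
print(truncate_context(history, 'Conversation of two friends.').count('\n'))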
def interact_model( model_name='345M', seed=None, nsamples=1, batch_size=1, length=None, temperature=1, top_k=0, ): """ Interactively run the model :model_name=117M : String, which model to use :seed=None : Integer seed for random number generators, fix seed to reproduce results :nsamples=1 : Number of samples to return total :batch_size=1 : Number of batches (only affects speed/memory). Must divide nsamples. :length=None : Number of tokens in generated text, if None (default), is determined by model hyperparameters :temperature=1 : Float value controlling randomness in boltzmann distribution. Lower temperature results in less random completions. As the temperature approaches zero, the model will become deterministic and repetitive. Higher temperature results in more random completions. :top_k=0 : Integer value controlling diversity. 1 means only 1 word is considered for each step (token), resulting in deterministic completions, while 40 means 40 words are considered at each step. 0 (default) is a special setting meaning no restrictions. 40 generally is a good value. """ if batch_size is None: batch_size = 1 assert nsamples % batch_size == 0 enc = encoder.get_encoder(model_name) hparams = model.default_hparams() with open(os.path.join('models', model_name, 'hparams.json')) as f: hparams.override_from_dict(json.load(f)) if length is None: length = hparams.n_ctx // 2 elif length > hparams.n_ctx: raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx) with tf.Session(graph=tf.Graph()) as sess: context = tf.placeholder(tf.int32, [batch_size, None]) np.random.seed(seed) tf.set_random_seed(seed) output = sample.sample_sequence(hparams=hparams, length=length, context=context, batch_size=batch_size, temperature=temperature, top_k=top_k) saver = tf.train.Saver() ckpt = tf.train.latest_checkpoint(os.path.join('models', model_name)) saver.restore(sess, ckpt) while True: raw_text = sys.stdin.read() while not raw_text: raw_text = sys.stdin.read() context_tokens = enc.encode(raw_text) generated = 0 for _ in range(nsamples // batch_size): out = sess.run(output, feed_dict={ context: [context_tokens for _ in range(batch_size)] })[:, len(context_tokens):] for i in range(batch_size): generated += 1 text = enc.decode(out[i]) print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) print(text) print("=" * 80)
def text_generator(state_dict):
    parser = argparse.ArgumentParser()
    parser.add_argument("--text", type=str, required=True)
    parser.add_argument("--quiet", type=bool, default=False)
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument('--unconditional', action='store_true',
                        help='If true, unconditional generation.')
    parser.add_argument("--batch_size", type=int, default=-1)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=float, default=0.7)
    parser.add_argument("--top_k", type=int, default=40)
    args = parser.parse_args()

    if args.quiet is False:
        print(args)

    if args.batch_size == -1:
        args.batch_size = 1
    assert args.nsamples % args.batch_size == 0

    seed = random.randint(0, 2147483647)
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load Model
    enc = get_encoder()
    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    model = load_weight(model, state_dict)
    model.to(device)
    model.eval()

    if args.length == -1:
        args.length = config.n_ctx // 2
    elif args.length > config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % config.n_ctx)

    print(args.text)
    context_tokens = enc.encode(args.text)

    generated = 0
    for _ in range(args.nsamples // args.batch_size):
        out = sample_sequence(
            model=model,
            length=args.length,
            context=context_tokens if not args.unconditional else None,
            start_token=enc.encoder['<|endoftext|>'] if args.unconditional else None,
            batch_size=args.batch_size,
            temperature=args.temperature,
            top_k=args.top_k,
            device=device)
        out = out[:, len(context_tokens):].tolist()
        for i in range(args.batch_size):
            generated += 1
            text = enc.decode(out[i])
            if args.quiet is False:
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
            print(text)
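# text_generator above parses its own CLI flags and only needs the raw PyTorch
# state dict passed in. A hedged usage sketch; the checkpoint filename
# 'gpt2-pytorch_model.bin' is an assumption, substitute your own weights file.
if __name__ == '__main__':
    import torch
    state_dict = torch.load('gpt2-pytorch_model.bin', map_location='cpu')
    text_generator(state_dict)  # e.g. python main.py --text "Once upon a time" --length 100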
def generate(sess, run_name='run1', checkpoint_dir='checkpoint', model_name=None, model_dir='models', sample_dir='samples', return_as_list=False, truncate=None, destination_path=None, sample_delim='=' * 20 + '\n', prefix=None, seed=None, nsamples=1, batch_size=1, length=1023, temperature=0.7, top_k=0, top_p=0.0, include_prefix=True): """Generates text from a model loaded into memory. Adapted from https://github.com/openai/gpt-2/blob/master/src/interactive_conditional_samples.py """ if batch_size is None: batch_size = 1 assert nsamples % batch_size == 0 if nsamples == 1: sample_delim = '' if prefix == '': prefix = None if model_name: checkpoint_path = os.path.join(model_dir, model_name) else: checkpoint_path = os.path.join(checkpoint_dir, run_name) enc = encoder.get_encoder(checkpoint_path) hparams = model.default_hparams() with open(os.path.join(checkpoint_path, 'hparams.json')) as f: hparams.override_from_dict(json.load(f)) if prefix: context = tf.compat.v1.placeholder(tf.int32, [batch_size, None]) context_tokens = enc.encode(prefix) np.random.seed(seed) tf.compat.v1.set_random_seed(seed) output = sample.sample_sequence( hparams=hparams, length=min(length, 1023 - (len(context_tokens) if prefix else 0)), start_token=enc.encoder['<|endoftext|>'] if not prefix else None, context=context if prefix else None, batch_size=batch_size, temperature=temperature, top_k=top_k, top_p=top_p)[:, 1:] if destination_path: f = open(destination_path, 'w') generated = 0 gen_texts = [] while generated < nsamples: if not prefix: out = sess.run(output) else: out = sess.run(output, feed_dict={context: batch_size * [context_tokens]}) for i in range(batch_size): generated += 1 gen_text = enc.decode(out[i]) if prefix: gen_text = enc.decode(context_tokens[:1]) + gen_text if truncate: truncate_esc = re.escape(truncate) if prefix and not include_prefix: prefix_esc = re.escape(prefix) pattern = '(?:{})(.*?)(?:{})'.format( prefix_esc, truncate_esc) else: pattern = '(.*?)(?:{})'.format(truncate_esc) trunc_text = re.search(pattern, gen_text, re.S) if trunc_text: gen_text = trunc_text.group(1) gen_text = gen_text.lstrip('\n') if destination_path: f.write("{}\n{}".format(gen_text, sample_delim)) if not return_as_list and not destination_path: print("{}\n{}".format(gen_text, sample_delim), end='') gen_texts.append(gen_text) if destination_path: f.close() if return_as_list: return gen_texts
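# generate() above optionally cuts each sample at a `truncate` delimiter and can
# drop the prompt via a non-greedy regex. A standalone sketch of that
# post-processing on a toy string; the sample text is illustrative only.
import re

def truncate_sample(gen_text, truncate='<|endoftext|>', prefix=None, include_prefix=True):
    truncate_esc = re.escape(truncate)
    if prefix and not include_prefix:
        pattern = '(?:{})(.*?)(?:{})'.format(re.escape(prefix), truncate_esc)
    else:
        pattern = '(.*?)(?:{})'.format(truncate_esc)
    match = re.search(pattern, gen_text, re.S)
    return match.group(1).lstrip('\n') if match else gen_text

sample_text = 'Prompt: a story.\nThe end.<|endoftext|>garbage after the delimiter'
print(truncate_sample(sample_text))  # everything before <|endoftext|>
print(truncate_sample(sample_text, prefix='Prompt: a story.', include_prefix=False))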
def finetune(sess, dataset, steps=-1, model_name='124M', model_dir='models', combine=50000, batch_size=1, learning_rate=0.0001, accumulate_gradients=5, restore_from='latest', run_name='run1', checkpoint_dir='checkpoint', sample_every=100, sample_length=1023, sample_num=1, multi_gpu=False, save_every=1000, print_every=1, max_checkpoints=1, use_memory_saving_gradients=False, only_train_transformer_layers=False, optimizer='adam', overwrite=False, val_dataset=None, val_batch_size=2, val_batch_count=40, val_every=0): """Finetunes the model on the given dataset. Adapted from https://github.com/nshepperd/gpt-2/blob/finetuning/train.py. See that file for parameter definitions. """ # assert model_name not in ['774M', '1558M'] or multi_gpu, "Currently, a modern single GPU cannot finetune the 774M GPT-2 model or larger." SAMPLE_DIR = 'samples' checkpoint_path = os.path.join(checkpoint_dir, run_name) def maketree(path): try: os.makedirs(path) except: pass maketree(checkpoint_path) files = [f for f in os.listdir(checkpoint_path)] for file in ['hparams.json', 'encoder.json', 'vocab.bpe']: try: shutil.copyfile(os.path.join(model_dir, model_name, file), os.path.join(checkpoint_path, file)) except FileNotFoundError as fnf_error: print( "You need to download the GPT-2 model first via download_gpt2()" ) raise (fnf_error) enc = encoder.get_encoder(checkpoint_path) hparams = model.default_hparams() with open(os.path.join(checkpoint_path, 'hparams.json')) as f: hparams.override_from_dict(json.load(f)) if sample_length > hparams.n_ctx: raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx) if model_name not in ['117M', '124M']: use_memory_saving_gradients = True only_train_transformer_layers = True accumulate_gradients = 1 context = tf.compat.v1.placeholder(tf.int32, [batch_size, None]) gpus = [] if multi_gpu: gpus = get_available_gpus() output = model.model(hparams=hparams, X=context, gpus=gpus) loss = tf.reduce_mean( input_tensor=tf.nn.sparse_softmax_cross_entropy_with_logits( labels=context[:, 1:], logits=output['logits'][:, :-1])) # validation code if val_every > 0: val_context = tf.placeholder(tf.int32, [val_batch_size, None]) val_output = model.model(hparams=hparams, X=val_context, reuse=True) # added reuse=True val_loss = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits( labels=val_context[:, 1:], logits=val_output['logits'][:, :-1])) val_loss_summary = tf.summary.scalar('val_loss', val_loss) tf_sample = sample.sample_sequence(hparams=hparams, length=sample_length, context=context, batch_size=batch_size, temperature=1.0, top_k=40) all_vars = [ v for v in tf.compat.v1.trainable_variables() if 'model' in v.name ] train_vars = [v for v in all_vars if '/h' in v.name ] if only_train_transformer_layers else all_vars if optimizer == 'adam': opt = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate) elif optimizer == 'sgd': opt = tf.compat.v1.train.GradientDescentOptimizer( learning_rate=learning_rate) if accumulate_gradients > 1: if use_memory_saving_gradients: exit( "Memory saving gradients are not implemented for gradient accumulation yet." 
) opt = AccumulatingOptimizer(opt=opt, var_list=train_vars) opt_reset = opt.reset() opt_compute = opt.compute_gradients(loss) opt_apply = opt.apply_gradients() summary_loss = tf.compat.v1.summary.scalar('loss', opt_apply) else: if use_memory_saving_gradients: opt_grads = memory_saving_gradients.gradients(loss, train_vars) else: opt_grads = tf.gradients(ys=loss, xs=train_vars) opt_grads = list(zip(opt_grads, train_vars)) opt_apply = opt.apply_gradients(opt_grads) summary_loss = tf.compat.v1.summary.scalar('loss', loss) summary_log = tf.compat.v1.summary.FileWriter(checkpoint_path) saver = tf.compat.v1.train.Saver(var_list=all_vars, max_to_keep=max_checkpoints) sess.run(tf.compat.v1.global_variables_initializer()) if restore_from == 'latest': ckpt = tf.train.latest_checkpoint(checkpoint_path) if ckpt is None: # Get fresh GPT weights if new run. ckpt = tf.train.latest_checkpoint( os.path.join(model_dir, model_name)) elif restore_from == 'fresh': ckpt = tf.train.latest_checkpoint(os.path.join(model_dir, model_name)) else: ckpt = tf.train.latest_checkpoint(restore_from) print('Loading checkpoint', ckpt) saver.restore(sess, ckpt) print('Loading dataset...') chunks = load_dataset(enc, dataset, combine) data_sampler = Sampler(chunks) # validation code if val_every > 0: if val_dataset: val_chunks = load_dataset(enc, val_dataset, combine) else: val_chunks = chunks print('dataset has', data_sampler.total_size, 'tokens') print('Training...') # validation code if val_every > 0: # Sample from validation set once with fixed seed to make # it deterministic during training as well as across runs. val_data_sampler = Sampler(val_chunks, seed=1) val_batches = [[ val_data_sampler.sample(1024) for _ in range(val_batch_size) ] for _ in range(val_batch_count)] counter = 1 counter_path = os.path.join(checkpoint_path, 'counter') if os.path.exists(counter_path) and restore_from == 'latest': # Load the step number if we're resuming a run # Add 1 so we don't immediately try to save again with open(counter_path, 'r') as fp: counter = int(fp.read()) + 1 counter_base = counter def save(): maketree(checkpoint_path) print('Saving', os.path.join(checkpoint_path, 'model-{}').format(counter - 1)) saver.save(sess, os.path.join(checkpoint_path, 'model'), global_step=counter - 1) with open(counter_path, 'w') as fp: fp.write(str(counter - 1) + '\n') def generate_samples(): context_tokens = data_sampler.sample(1) all_text = [] index = 0 while index < sample_num: out = sess.run(tf_sample, feed_dict={context: batch_size * [context_tokens]}) for i in range(min(sample_num - index, batch_size)): text = enc.decode(out[i]) text = '======== SAMPLE {} ========\n{}\n'.format( index + 1, text) all_text.append(text) index += 1 print(text) maketree(os.path.join(SAMPLE_DIR, run_name)) with open( os.path.join(SAMPLE_DIR, run_name, 'samples-{}').format(counter), 'w') as fp: fp.write('\n'.join(all_text)) # validation code def validation(): print('Calculating validation loss...') losses = [] for batch in tqdm(val_batches): losses.append(sess.run(val_loss, feed_dict={val_context: batch})) v_val_loss = np.mean(losses) v_summary = sess.run(val_loss_summary, feed_dict={val_loss: v_val_loss}) summary_log.add_summary(v_summary, counter) summary_log.flush() print('[{counter} | {time:2.2f}] validation loss = {loss:2.2f}'.format( counter=counter, time=time.time() - start_time, loss=v_val_loss)) return v_val_loss def sample_batch(): return [data_sampler.sample(1024) for _ in range(batch_size)] if overwrite and restore_from == 'latest': for file in files: if 
file.startswith('model') or file.startswith('events'): os.remove(os.path.join(checkpoint_path, file)) save() avg_loss = (0.0, 0.0) start_time = time.time() #Trying out a change to finetune that saves only when validation loss decreases if steps: steps = int(steps) try: while True: if steps > 0 and counter == (counter_base + steps): #save() return # if (counter - 1) % save_every == 0 and counter > 1: # save() if (counter - 1) % sample_every == 0 and counter > 1: generate_samples() # validation code if val_every > 0 and counter == 1: v_val_loss = validation() save() elif val_every > 0 and counter == counter_base: v_val_loss = validation() elif val_every > 0 and (counter % val_every == 0): new_v_val_loss = validation() if new_v_val_loss < v_val_loss: v_val_loss = new_v_val_loss save() if accumulate_gradients > 1: sess.run(opt_reset) for _ in range(accumulate_gradients): sess.run(opt_compute, feed_dict={context: sample_batch()}) (v_loss, v_summary) = sess.run((opt_apply, summary_loss)) else: (_, v_loss, v_summary) = sess.run( (opt_apply, loss, summary_loss), feed_dict={context: sample_batch()}) summary_log.add_summary(v_summary, counter) if (counter % print_every == 0) or counter == 1: avg_loss = (avg_loss[0] * 0.99 + v_loss, avg_loss[1] * 0.99 + 1.0) print( '[{counter} | {time:2.2f}] loss={loss:2.2f} avg={avg:2.2f}' .format(counter=counter, time=time.time() - start_time, loss=v_loss, avg=avg_loss[0] / avg_loss[1])) counter += 1 except KeyboardInterrupt: print('interrupted') save()
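# finetune() above was modified to checkpoint only when the validation loss
# improves. A minimal, framework-free sketch of that decision rule; `validate`
# and `save` are hypothetical callables standing in for validation() and save().
def maybe_save(best_loss, validate, save):
    new_loss = validate()
    if best_loss is None or new_loss < best_loss:
        save()
        return new_loss  # new best
    return best_loss     # keep the previous best, skip saving

best = None
for fake_val_loss in [3.1, 2.8, 2.9, 2.6]:
    best = maybe_save(best, validate=lambda v=fake_val_loss: v,
                      save=lambda: print('checkpoint saved'))
print('best validation loss:', best)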
def makeModel(text, leng, k): try: model_name = '774M' seed = None nsamples = 1 batch_size = 1 length = int(leng) temperature = 1 top_k = int(k) top_p = 1 models_dir = 'models' raw_text = text print('makeModel') models_dir = os.path.expanduser(os.path.expandvars(models_dir)) if batch_size is None: batch_size = 1 assert nsamples % batch_size == 0 enc = encoder.get_encoder(model_name, models_dir) hparams = model.default_hparams() with open(os.path.join(models_dir, model_name, 'hparams.json')) as f: hparams.override_from_dict(json.load(f)) if length is None: length = hparams.n_ctx // 2 elif length > hparams.n_ctx: raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx) with tf.Session(graph=tf.Graph()) as sess: context = tf.placeholder(tf.int32, [batch_size, None]) np.random.seed(seed) tf.set_random_seed(seed) output = sample.sample_sequence(hparams=hparams, length=length, context=context, batch_size=batch_size, temperature=temperature, top_k=top_k, top_p=top_p) saver = tf.train.Saver() ckpt = tf.train.latest_checkpoint( os.path.join(models_dir, model_name)) saver.restore(sess, ckpt) print('raw_text in make model : ' + raw_text) context_tokens = enc.encode(raw_text) generated = 0 for _ in range(nsamples // batch_size): out = sess.run(output, feed_dict={ context: [context_tokens for _ in range(batch_size)] })[:, len(context_tokens):] for i in range(batch_size): generated += 1 text = enc.decode(out[i]) print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) print(text) return text.encode('ascii', 'ignore').decode('ascii') except Exception as e: print(e) return 500
def encode_main(in_text, out_npz, model_name='117M'):
    enc = encoder.get_encoder(model_name)
    print('Reading files')
    chunks = load_dataset(enc, in_text)
    print('Writing', out_npz)
    np.savez_compressed(out_npz, *chunks)
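# encode_main() above stores each token chunk as a separate array inside a
# compressed .npz archive. A hedged sketch of loading it back; 'dataset.npz' is
# an assumed output filename, not one taken from the snippet.
import numpy as np

with np.load('dataset.npz') as npz:
    chunks = [npz[name] for name in npz.files]  # one token array per chunk
print(sum(len(c) for c in chunks), 'tokens across', len(chunks), 'chunks')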
def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


# Divides a list into chunks
def chunks(l, n):
    out = []
    for i in range(0, len(l), n):
        out.append(l[i:i + n])
    return out


if not os.path.exists(log_dir):
    os.mkdir(log_dir)

enc = encoder.get_encoder(encoder_path)
file_chunks = chunks(files, files_per)
print("Got {} files, divided into {} chunks.".format(str(len(files)), str(len(file_chunks))))


def create_file(args):
    i, chunk = args
    s = name + "_" + str(i) + ".tfrecords"
    if os.path.exists(os.path.join(log_dir, s)):
        # Hack-y: a file of the same name in the log dir is a sign the record
        # file is complete, so skip it
        return
    if os.path.exists(os.path.join(output_dir, s)):
        # Unfinished file, remove
        os.remove(os.path.join(output_dir, s))
    with tf.python_io.TFRecordWriter(os.path.join(output_dir, s)) as writer:
        good_files = 0
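# The TFRecord writer above pairs with _bytes_feature to serialize encoded
# documents. A hedged sketch of wrapping one byte string into a tf.train.Example;
# the 'text' feature key is an assumption, not taken from the snippet.
example = tf.train.Example(features=tf.train.Features(feature={
    'text': _bytes_feature(b'<|endoftext|> some encoded document bytes')
}))
serialized = example.SerializeToString()  # what writer.write(...) would receive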
from encoder import get_encoder

encoder = get_encoder()


def pre_inference(payload, signature, metadata):
    context = encoder.encode(payload["text"])
    return {"context": [context]}


def post_inference(prediction, signature, metadata):
    response = prediction["sample"]
    return encoder.decode(response)
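# The two hooks above encode the request text before prediction and decode the
# sampled tokens afterwards. A hedged sketch of the round trip with a dummy
# model that simply echoes the context back; DummyModel is hypothetical.
class DummyModel:
    def predict(self, model_input):
        return {"sample": model_input["context"][0]}

payload = {"text": "The quick brown fox"}
model_input = pre_inference(payload, signature=None, metadata=None)
prediction = DummyModel().predict(model_input)
print(post_inference(prediction, signature=None, metadata=None))  # prints the prompt back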
def interact_model(model_name='model', seed=99, nsamples=5, batch_size=5, length=8, temperature=0, top_k=10, top_p=.85, models_dir=''): models_dir = os.path.expanduser(os.path.expandvars(models_dir)) if batch_size is None: batch_size = 1 assert nsamples % batch_size == 0 enc = encoder.get_encoder(model_name, models_dir) hparams = model.default_hparams() with open(os.path.join(models_dir, model_name, 'hparams.json')) as f: hparams.override_from_dict(json.load(f)) if length is None: length = hparams.n_ctx // 2 elif length > hparams.n_ctx: raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx) gpu_options = tf.compat.v1.GPUOptions(allow_growth=True) config = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=0, inter_op_parallelism_threads=0, allow_soft_placement=True, gpu_options=gpu_options) with tf.compat.v1.Session(graph=tf.Graph(), config=config) as sess: context = tf.compat.v1.placeholder(tf.int32, [batch_size, None]) np.random.seed(seed) tf.compat.v1.set_random_seed(seed) # p = tf.random.uniform((1,), minval=.68, maxval=.98, dtype=tf.dtypes.float32, name='random_p_logits') output = sample.sample_sequence(hparams=hparams, length=length, context=context, batch_size=batch_size, temperature=temperature, top_k=top_k, top_p=top_p) saver = tf.compat.v1.train.Saver() ckpt = tf.train.latest_checkpoint(os.path.join(models_dir, model_name)) saver.restore(sess, ckpt) class Autocomplete(Resource): def get(self): return '' def post(self): body = request.get_json(force=True) if body['text'] == "": return context_tokens = enc.encode(body['text']) generated = 0 predictions = [] for _ in range(nsamples // batch_size): feed_dict = { context: [context_tokens for _ in range(batch_size)] } out = sess.run(output, feed_dict=feed_dict)[:, len(context_tokens):] for i in range(batch_size): generated += 1 text = enc.decode(out[i]) predictions.append(str(text)) return Response(json.dumps({'result': predictions}), status=200, mimetype='application/json') if __name__ == '__main__': app = Flask(__name__) api = Api(app) api.add_resource(Autocomplete, '/autocomplete') app.run('0.0.0.0', port=3030, debug=True, use_reloader=False)
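# The Flask service above exposes POST /autocomplete on port 3030 and returns
# {'result': [...]} with one continuation per sample. A minimal client sketch;
# the host and port match the app.run(...) call above, the prompt is illustrative.
import requests

resp = requests.post('http://localhost:3030/autocomplete',
                     json={'text': 'def fibonacci(n):'})
for prediction in resp.json()['result']:
    print(repr(prediction))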
     distribution. Lower temperature results in less random completions. As the
     temperature approaches zero, the model will become deterministic and
     repetitive. Higher temperature results in more random completions.
    :top_k=0 : Integer value controlling diversity. 1 means only 1 word is
     considered for each step (token), resulting in deterministic completions,
     while 40 means 40 words are considered at each step. 0 (default) is a
     special setting meaning no restrictions. 40 generally is a good value.
    :models_dir : path to parent folder containing model subfolders
     (i.e. contains the <model_name> folder)
    """
    models_dir = os.path.expanduser(os.path.expandvars(models_dir))
    if batch_size is None:
        batch_size = 1
    assert nsamples % batch_size == 0

    enc = encoder.get_encoder(model_name, models_dir)
    hparams = model.default_hparams()
    with open(os.path.join(models_dir, model_name, 'hparams.json')) as f:
        hparams.override_from_dict(json.load(f))

    if length is None:
        length = hparams.n_ctx // 2
    elif length > hparams.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx)

    with tf.Session(graph=tf.Graph()) as sess:
        context = tf.placeholder(tf.int32, [batch_size, None])
        np.random.seed(seed)
        tf.set_random_seed(seed)
        output = sample.sample_sequence(
            hparams=hparams,
            length=length,
    'files', None,
    'Name of a file that specifies the images in the dataset, one per line. E.g. --files my_list_of_images.txt')
flags.DEFINE_integer('shards', 2048, 'Number of tfrecord files to generate')
flags.DEFINE_integer('nprocs', 8, 'Number of processes to work in parallel')
flags.DEFINE_boolean('directory_labels', False,
                     'Use the directory name of each file as a label')
flags.DEFINE_string('crop_method', 'none', '<random, distorted, middle, none>')
flags.DEFINE_integer('resize', -1, 'Resize to a specific resolution')
flags.DEFINE_string('doc2vec_embeddings', None, 'Use a doc2vec model for embeddings')
flags.DEFINE_string('tags_csv', None, 'Include tags from a csv')

FLAGS = flags.FLAGS
tokenizer = encoder.get_encoder()


def _check_or_create_dir(directory):
    """Check if directory exists otherwise create it."""
    if not tf.gfile.Exists(directory):
        tf.gfile.MakeDirs(directory)


def _int64_feature(value):
    """Wrapper for inserting int64 features into Example proto."""
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
def alite_graph( model_name='124M', seed=None, nsamples=1, batch_size=1, length=10, temperature=1, # .5 usually has numbered steps, .7 usually does not beam_width=6, max_contexts=800, max_expansions=1000, top_k=None, top_p=1, models_dir='models', input_samples=[], ): """ Interactively run the model :model_name=124M : String, which model to use :seed=None : Integer seed for random number generators, fix seed to reproduce results :nsamples=1 : Number of samples to return total :batch_size=1 : Number of batches (only affects speed/memory). Must divide nsamples. :length=None : Number of tokens in generated text, if None (default), is determined by model hyperparameters :temperature=1 : Float value controlling randomness in boltzmann distribution. Lower temperature results in less random completions. As the temperature approaches zero, the model will become deterministic and repetitive. Higher temperature results in more random completions. :top_k=40 : Integer value controlling diversity. 1 means only 1 word is considered for each step (token), resulting in deterministic completions, while 40 means 40 words are considered at each step. 0 (default) is a special setting meaning no restrictions. 40 generally is a good value. :models_dir : path to parent folder containing model subfolders (i.e. contains the <model_name> folder) """ models_dir = os.path.expanduser(os.path.expandvars(models_dir)) if batch_size is None: batch_size = 1 assert nsamples % batch_size == 0 top_k = beam_width #Set the top_k to the beam_width to only find the beam_width number of logits enc = encoder.get_encoder(model_name, models_dir) hparams = model.default_hparams() with open(os.path.join(models_dir, model_name, 'hparams.json')) as f: hparams.override_from_dict(json.load(f)) if length is None: length = hparams.n_ctx // 2 elif length > hparams.n_ctx: raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx) with tf.Session(graph=tf.Graph()) as sess: context = tf.placeholder(tf.int32, [batch_size, None]) np.random.seed(seed) tf.set_random_seed(seed) logits = sample.get_logits(hparams=hparams, length=length, context=context, batch_size=batch_size, temperature=temperature, top_k=top_k, top_p=top_p) saver = tf.train.Saver() ckpt = tf.train.latest_checkpoint(os.path.join(models_dir, model_name)) saver.restore(sess, ckpt) input_iter = 0 start_time = time.perf_counter() while True: raw_text = "" if (input_iter < len(input_samples)): raw_text = input_samples[input_iter] input_iter += 1 print(raw_text) elif (len(input_samples) == 0): raw_text = input("Model prompt >>> ") while not raw_text: print('Prompt should not be empty!') raw_text = input("Model prompt >>> ") context_tokens = enc.encode(raw_text) generated = 0 #Create a graph to map the contexts g = Graph(directed=True) g.add_vertex(str(context_tokens)) generated = 0 times_run = 0 for _ in range(nsamples // batch_size): for i in range(batch_size): generated += 1 max_length = len(context_tokens) + length contexts = [context_tokens] # o_file.write(str(contexts) + '\n') print(contexts) probability_map = {} full_probability_map = {} all_contexts = [] all_contexts.append(context_tokens) while True: #This will probably check if the context lengths are less than max_length new_contexts = [] #Get highest probability context max_key = str(contexts[0]) if (bool(probability_map)): max_key = max(probability_map.keys(), key=(lambda k: probability_map[k])) else: probability_map[str(context_tokens)] = 0 full_probability_map[str(context_tokens)] = 0 if 
(len(probability_map) >= max_contexts): break if (times_run >= max_expansions): break #Find the highest probability context con = max_key str_values = con.strip('][').split(', ') new_values = [] for val in str_values: new_values.append(int(val)) con = new_values out_logits = sess.run( logits, feed_dict={ context: [con for _ in range(batch_size)] }) times_run += 1 # Normalize the outputs out_logits = out_logits - np.max(out_logits) eo_logits = np.exp(out_logits) + 1e-20 out_logits = np.log(eo_logits / (np.sum(eo_logits))) logit_indeces = [] logit_probs = [] for logit_index in range(len(out_logits[0])): if (out_logits.item(logit_index) > np.min(out_logits)): #We should get (beam width) # of logit indeces and probabilities logit_indeces.append(logit_index) logit_probs.append( out_logits[0].item(logit_index)) new_contexts = [] for i in range(len(logit_indeces)): temp_context = con.copy() temp_context.append(logit_indeces[i]) all_contexts.append(temp_context) if str(con) in probability_map.keys(): g.add_vertex(str(temp_context)) parent_context = temp_context[ 0:len(temp_context) - 1] g.add_edge(str(temp_context), str(parent_context)) probability_map[str( temp_context )] = probability_map[str(con)] + logit_probs[i] full_probability_map[str( temp_context )] = probability_map[str(con)] + logit_probs[i] # o_file.write("Probability for " + str(temp_context) + " is " + str(logit_probs[i]) + " + " + str(probability_map[str(con)]) + " = " + str(probability_map[str(temp_context)]) + "\n") # print("Probability for " + str(temp_context) + " is " + str(logit_probs[i]) + " + " + str(probability_map[str(con)]) + " = " + str(probability_map[str(temp_context)])) else: g.add_vertex(str(temp_context)) parent_context = temp_context[ 0:len(temp_context) - 1] g.add_edge(str(temp_context), str(parent_context)) probability_map[str( temp_context)] = logit_probs[i] full_probability_map[str( temp_context)] = logit_probs[i] # o_file.write("Probability for " + str(temp_context) + " is " + str(logit_probs[i]) + " = " + str(probability_map[str(temp_context)]) + "\n") # print("Probability for " + str(temp_context) + " is " + str(logit_probs[i]) + " = " + str(probability_map[str(temp_context)])) new_contexts.append(temp_context) if str(con) in probability_map.keys(): del probability_map[str(con)] contexts = new_contexts new_probs = {} for con in contexts: if str(con) in probability_map: new_probs[str(con)] = probability_map[str(con)] # for con in contexts: # print(enc.decode(con) + " --- Probability: ---" + str(probability_map[str(con)])) string_contexts = list(probability_map.keys()) #print(string_contexts) new_contexts = [] for con in string_contexts: str_values = con.strip('][').split(', ') new_values = [] for val in str_values: new_values.append(int(val)) new_contexts.append(new_values) contexts = new_contexts all_strings = [] for context in all_contexts: con_string = enc.decode(context) all_strings.append(con_string) for context in contexts: con_string = enc.decode(context) print(con_string) # print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40 + '\n') # print(text) nr_vertices = g.vcount() print(nr_vertices) es = EdgeSeq(g) E = [e.tuple for e in es] lay = g.layout_auto() v_label = list(map(str, range(nr_vertices))) position = {k: lay[k] for k in range(nr_vertices)} Y = [lay[k][1] for k in range(nr_vertices)] M = max(Y) L = len(position) Xn = [position[k][0] for k in range(L)] Yn = [2 * M - position[k][1] for k in range(L)] Xe = [] Ye = [] for edge in E: Xe += [ position[edge[0]][0], position[edge[1]][0], None 
] Ye += [ 2 * M - position[edge[0]][1], 2 * M - position[edge[1]][1], None ] labels = v_label hover_annotations = [ all_strings[x] + ' - Probability: ' + str(full_probability_map[str(all_contexts[x])]) for x in range(len(all_strings)) ] def make_annotations(pos, text, font_size=10, font_color='rgb(250,250,250)'): L = len(pos) if len(text) != L: raise ValueError( 'The lists pos and text must have the same len' ) annotations = [] for k in range(L): annotations.append( dict( text=str( text[k] ), # or replace labels with a different list for the text within the circle x=pos[k][0], y=2 * M - position[k][1], xref='x1', yref='y1', font=dict(color=font_color, size=font_size), showarrow=False)) return annotations fig = go.Figure() fig.add_trace( go.Scatter(x=Xe, y=Ye, mode='lines', line=dict(color='rgb(210,210,210)', width=1), hoverinfo='none')) fig.add_trace( go.Scatter( x=Xn, y=Yn, mode='markers', name='bla', marker=dict( symbol='circle-dot', size=18, color='#6175c1', #'#DB4551', line=dict(color='rgb(50,50,50)', width=1)), text=hover_annotations, hoverinfo='text', opacity=0.8)) axis = dict( showline= False, # hide axis line, grid, ticklabels and title zeroline=False, showgrid=False, showticklabels=False, ) fig.update_layout( title='Tree - Nodes: ' + str(len(all_contexts)) + ' Beam width: ' + str(beam_width), annotations=make_annotations(position, v_label), font_size=12, showlegend=False, xaxis=axis, yaxis=axis, margin=dict(l=40, r=40, b=85, t=100), hovermode='closest', plot_bgcolor='rgb(248,248,248)') fig.show() time_elapsed = time.perf_counter() - start_time print('\n' + str(time_elapsed) + " seconds elapsed" + '\n' + '-' * 60 + '\n') return print("=" * 80)
def interact_model( raw_text, model_name='345MChinese', seed=None, nsamples=1, batch_size=1, length=None, temperature=0.9, top_k=40, top_p=0.0, generated=0, ): """ Interactively run the model :model_name=117M : String, which model to use :seed=None : Integer seed for random number generators, fix seed to reproduce results :nsamples=1 : Number of samples to return total :batch_size=1 : Number of batches (only affects speed/memory). Must divide nsamples. :length=None : Number of tokens in generated text, if None (default), is determined by model hyperparameters :temperature=1 : Float value controlling randomness in boltzmann distribution. Lower temperature results in less random completions. As the temperature approaches zero, the model will become deterministic and repetitive. Higher temperature results in more random completions. :top_k=0 : Integer value controlling diversity. 1 means only 1 word is considered for each step (token), resulting in deterministic completions, while 40 means 40 words are considered at each step. 0 (default) is a special setting meaning no restrictions. 40 generally is a good value. :top_p=0.0 : Float value controlling diversity. Implements nucleus sampling, overriding top_k if set to a value > 0. A good setting is 0.9. """ if batch_size is None: batch_size = 1 assert nsamples % batch_size == 0 enc = encoder.get_encoder(model_name) hparams = model.default_hparams() with open(os.path.join('../models', model_name, 'hparams.json')) as f: hparams.override_from_dict(json.load(f)) if length is None: length = hparams.n_ctx // 2 elif length > hparams.n_ctx: raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx) with tf.Session(graph=tf.Graph()) as sess: context = tf.placeholder(tf.int32, [batch_size, None]) np.random.seed(seed) tf.set_random_seed(seed) output = sample.sample_sequence(hparams=hparams, length=length, context=context, batch_size=batch_size, temperature=temperature, top_k=top_k, top_p=top_p) saver = tf.train.Saver() ckpt = tf.train.latest_checkpoint(os.path.join('../models', model_name)) saver.restore(sess, ckpt) while True: # raw_text = input("Model prompt >>> ") while not raw_text: print('You should input text.') raw_text = input("Model prompt >>> ") print("raw_text:", raw_text) context_tokens = enc.encode(raw_text) result = [] text_orgin = raw_text[13:] generate = 0 for _ in range(nsamples // batch_size): generated += 1 generate += 1 out = sess.run(output, feed_dict={ context: [context_tokens for _ in range(batch_size)] })[:, len(context_tokens):] for i in range(batch_size): text = enc.decode(out[i]) if '<|endoftext|>' in text: pos = text.index('<|endoftext|>') else: pos = text.index('.', 1000) pos = pos + 1 text = text_orgin + text[:pos] result.append(text) # print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) path1 = '/home/stu/pkq/gpt-2/samples/story_final_return' path2 = path1[:-7] path3 = str(generate) + ".txt" path4 = raw_text + " " + str(generated) + ".txt" path_return = os.path.join(path1, path3) path = os.path.join(path2, path4) # saved all the generated texts in path with open(path, "w") as f1: f1.write(text) # saved all the generated texts that start with the raw_text in path_return to return to the client with open(path_return, "w") as f2: f2.write(text) # print(text) # print("=" * 80) return result
def sample_model(
    model_name='117M',
    seed=None,
    nsamples=0,
    batch_size=1,
    length=None,
    temperature=1,
    top_k=0,
):
    """
    Run the sample_model
    :model_name=117M : String, which model to use
    :seed=None : Integer seed for random number generators, fix seed to reproduce results
    :nsamples=0 : Number of samples to return, if 0, continues to generate samples indefinitely.
    :batch_size=1 : Number of batches (only affects speed/memory).
    :length=None : Number of tokens in generated text, if None (default), is determined by model hyperparameters
    :temperature=1 : Float value controlling randomness in boltzmann distribution. Lower temperature results in less random completions. As the temperature approaches zero, the model will become deterministic and repetitive. Higher temperature results in more random completions.
    :top_k=0 : Integer value controlling diversity. 1 means only 1 word is considered for each step (token), resulting in deterministic completions, while 40 means 40 words are considered at each step. 0 (default) is a special setting meaning no restrictions. 40 generally is a good value.
    """
    enc = encoder.get_encoder(model_name)
    hparams = model.default_hparams()
    with open(os.path.join('models', model_name, 'hparams.json')) as f:
        hparams.override_from_dict(json.load(f))

    if length is None:
        length = hparams.n_ctx
    elif length > hparams.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx)

    with tf.Session(graph=tf.Graph()) as sess:
        np.random.seed(seed)
        tf.set_random_seed(seed)

        output = sample.sample_sequence(
            hparams=hparams, length=length,
            start_token=enc.encoder['<|endoftext|>'],
            batch_size=batch_size,
            temperature=temperature, top_k=top_k
        )[:, 1:]

        saver = tf.train.Saver()
        ckpt = tf.train.latest_checkpoint(os.path.join('models', model_name))
        saver.restore(sess, ckpt)

        generated = 0
        while nsamples == 0 or generated < nsamples:
            out = sess.run(output)
            for i in range(batch_size):
                generated += batch_size
                text = enc.decode(out[i])
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                print(text)
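# In the upstream GPT-2 sample scripts this kind of entry point is exposed on
# the command line via python-fire; a minimal sketch of that wiring (assumes
# the fire dependency is installed and this file is run as a script):
import fire

if __name__ == '__main__':
    # e.g. python generate_unconditional_samples.py --model_name=117M --nsamples=3 --top_k=40
    fire.Fire(sample_model)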
def interact_model(
    model_name='117M',
    seed=None,
    nsamples=1,
    batch_size=None,
    length=None,
    temperature=1,
    top_k=0,
):
    if batch_size is None:
        batch_size = 1
    assert nsamples % batch_size == 0

    np.random.seed(seed)
    tf.set_random_seed(seed)

    enc = encoder.get_encoder(model_name)
    hparams = model.default_hparams()
    with open(os.path.join('models', model_name, 'hparams.json')) as f:
        hparams.override_from_dict(json.load(f))

    if length is None:
        length = hparams.n_ctx // 2
    elif length > hparams.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx)

    print("GPT-2 Parameters")
    print("================")
    print("Model Name : " + str(model_name))
    print("Seed = " + str(seed))
    print("N Samples = " + str(nsamples))
    print("Batch Size = " + str(batch_size))
    print("Length = " + str(length))
    print("Temperature = " + str(temperature))
    print("Top K = " + str(top_k))

    with tf.Session(graph=tf.Graph()) as sess:
        context = tf.placeholder(tf.int32, [batch_size, None])
        output = sample.sample_sequence(
            hparams=hparams, length=length,
            context=context,
            batch_size=batch_size,
            temperature=temperature, top_k=top_k
        )

        saver = tf.train.Saver()
        ckpt = tf.train.latest_checkpoint(os.path.join('models', model_name))
        saver.restore(sess, ckpt)

        # read the prompt from stdin; raw_text is needed for encoding below
        raw_text = input("Model prompt >>> ")
        context_tokens = enc.encode(raw_text)
        print("Context Tokens = " + str(context_tokens))

        generated = 0
        for _ in range(nsamples // batch_size):
            out = sess.run(output, feed_dict={
                context: [context_tokens for _ in range(batch_size)]
            })[:, len(context_tokens):]
            for i in range(batch_size):
                generated += 1
                text = enc.decode(out[i])
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                print(text)
        print("=" * 80)
def main(): args = parser.parse_args() enc = encoder.get_encoder(args.model_name) hparams = model.default_hparams() with open(os.path.join('models', args.model_name, 'hparams.json')) as f: hparams.override_from_dict(json.load(f)) if args.sample_length > hparams.n_ctx: raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx) if args.model_name == '345M': args.memory_saving_gradients = True if args.optimizer == 'adam': args.only_train_transformer_layers = True config = tf.ConfigProto() config.gpu_options.allow_growth = True config.graph_options.rewrite_options.layout_optimizer = rewriter_config_pb2.RewriterConfig.OFF with tf.Session(config=config) as sess: context = tf.placeholder(tf.int32, [args.batch_size, None]) context_in = randomize(context, hparams, args.noise) output = model.model(hparams=hparams, X=context_in) loss = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits( labels=context[:, 1:], logits=output['logits'][:, :-1])) if args.val_every > 0: val_context = tf.placeholder(tf.int32, [args.val_batch_size, None]) val_output = model.model(hparams=hparams, X=val_context) val_loss = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits( labels=val_context[:, 1:], logits=val_output['logits'][:, :-1])) val_loss_summary = tf.summary.scalar('val_loss', val_loss) tf_sample = sample.sample_sequence(hparams=hparams, length=args.sample_length, context=context, batch_size=args.batch_size, temperature=1.0, top_k=args.top_k, top_p=args.top_p) all_vars = [v for v in tf.trainable_variables() if 'model' in v.name] train_vars = [v for v in all_vars if '/h' in v.name ] if args.only_train_transformer_layers else all_vars if args.optimizer == 'adam': opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate) elif args.optimizer == 'sgd': opt = tf.train.GradientDescentOptimizer( learning_rate=args.learning_rate) else: exit('Bad optimizer:', args.optimizer) if args.accumulate_gradients > 1: if args.memory_saving_gradients: exit( "Memory saving gradients are not implemented for gradient accumulation yet." ) opt = AccumulatingOptimizer(opt=opt, var_list=train_vars) opt_reset = opt.reset() opt_compute = opt.compute_gradients(loss) opt_apply = opt.apply_gradients() summary_loss = tf.summary.scalar('loss', opt_apply) else: if args.memory_saving_gradients: opt_grads = memory_saving_gradients.gradients(loss, train_vars) else: opt_grads = tf.gradients(loss, train_vars) opt_grads = list(zip(opt_grads, train_vars)) opt_apply = opt.apply_gradients(opt_grads) summary_loss = tf.summary.scalar('loss', loss) summary_lr = tf.summary.scalar('learning_rate', args.learning_rate) summaries = tf.summary.merge([summary_lr, summary_loss]) summary_log = tf.summary.FileWriter( os.path.join(CHECKPOINT_DIR, args.run_name)) saver = tf.train.Saver(var_list=all_vars, max_to_keep=5, keep_checkpoint_every_n_hours=2) sess.run(tf.global_variables_initializer()) if args.restore_from == 'latest': ckpt = tf.train.latest_checkpoint( os.path.join(CHECKPOINT_DIR, args.run_name)) if ckpt is None: # Get fresh GPT weights if new run. 
ckpt = tf.train.latest_checkpoint( os.path.join('models', args.model_name)) elif args.restore_from == 'fresh': ckpt = tf.train.latest_checkpoint( os.path.join('models', args.model_name)) else: ckpt = tf.train.latest_checkpoint(args.restore_from) print('Loading checkpoint', ckpt) saver.restore(sess, ckpt) print('Loading dataset...') chunks = load_dataset(enc, args.dataset, args.combine, encoding=args.encoding) data_sampler = Sampler(chunks) if args.val_every > 0: if args.val_dataset: val_chunks = load_dataset(enc, args.val_dataset, args.combine, encoding=args.encoding) else: val_chunks = chunks print('dataset has', data_sampler.total_size, 'tokens') print('Training...') if args.val_every > 0: # Sample from validation set once with fixed seed to make # it deterministic during training as well as across runs. val_data_sampler = Sampler(val_chunks, seed=1) val_batches = [[ val_data_sampler.sample(1024) for _ in range(args.val_batch_size) ] for _ in range(args.val_batch_count)] counter = 1 counter_path = os.path.join(CHECKPOINT_DIR, args.run_name, 'counter') if os.path.exists(counter_path): # Load the step number if we're resuming a run # Add 1 so we don't immediately try to save again with open(counter_path, 'r') as fp: counter = int(fp.read()) + 1 def save(): maketree(os.path.join(CHECKPOINT_DIR, args.run_name)) print( 'Saving', os.path.join(CHECKPOINT_DIR, args.run_name, 'model-{}').format(counter)) saver.save(sess, os.path.join(CHECKPOINT_DIR, args.run_name, 'model'), global_step=counter) with open(counter_path, 'w') as fp: fp.write(str(counter) + '\n') def generate_samples(): print('Generating samples...') context_tokens = data_sampler.sample(1) all_text = [] index = 0 while index < args.sample_num: out = sess.run( tf_sample, feed_dict={context: args.batch_size * [context_tokens]}) for i in range(min(args.sample_num - index, args.batch_size)): text = enc.decode(out[i]) text = '======== SAMPLE {} ========\n{}\n'.format( index + 1, text) all_text.append(text) index += 1 print(text) maketree(os.path.join(SAMPLE_DIR, args.run_name)) with open(os.path.join(SAMPLE_DIR, args.run_name, 'samples-{}').format(counter), 'w', encoding=args.encoding) as fp: fp.write('\n'.join(all_text)) def validation(): print('Calculating validation loss...') losses = [] for batch in tqdm.tqdm(val_batches): losses.append( sess.run(val_loss, feed_dict={val_context: batch})) v_val_loss = np.mean(losses) v_summary = sess.run(val_loss_summary, feed_dict={val_loss: v_val_loss}) summary_log.add_summary(v_summary, counter) summary_log.flush() print('[{counter} | {time:2.2f}] validation loss = {loss:2.2f}'. 
format(counter=counter, time=time.time() - start_time, loss=v_val_loss)) def sample_batch(): return [data_sampler.sample(1024) for _ in range(args.batch_size)] avg_loss = (0.0, 0.0) start_time = time.time() try: while True: if counter % args.save_every == 0: save() if counter % args.sample_every == 0: generate_samples() if args.val_every > 0 and (counter % args.val_every == 0 or counter == 1): validation() if args.accumulate_gradients > 1: sess.run(opt_reset) for _ in range(args.accumulate_gradients): sess.run(opt_compute, feed_dict={context: sample_batch()}) (v_loss, v_summary) = sess.run((opt_apply, summaries)) else: (_, v_loss, v_summary) = sess.run( (opt_apply, loss, summaries), feed_dict={context: sample_batch()}) summary_log.add_summary(v_summary, counter) avg_loss = (avg_loss[0] * 0.99 + v_loss, avg_loss[1] * 0.99 + 1.0) print( '[{counter} | {time:2.2f}] loss={loss:2.2f} avg={avg:2.2f}' .format(counter=counter, time=time.time() - start_time, loss=v_loss, avg=avg_loss[0] / avg_loss[1])) counter += 1 except KeyboardInterrupt: print('interrupted') save()
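# The `avg` value printed each step in the training loop above is an
# exponentially decayed running mean of the loss (decay 0.99), not a plain
# average. A small standalone sketch of the same bookkeeping (the function
# name is mine, not from the script):
def decayed_average(losses, decay=0.99):
    # mirrors the (numerator, denominator) pair kept in avg_loss
    num, den = 0.0, 0.0
    for loss in losses:
        num = num * decay + loss
        den = den * decay + 1.0
    return num / den

# Recent steps dominate: decayed_average([4.0] * 100 + [2.0] * 100) is about 2.5,
# versus a plain mean of 3.0.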
tf.app.flags.DEFINE_string("gpu", "0", "Specify which gpu to use.") tf.app.flags.DEFINE_integer("batch_size", 10, "Number of batches (only affects speed/memory).") tf.app.flags.DEFINE_string("data_name", "roc", "Set `roc` to train the model on ROCStories corpus or \ `kg` to train the model on the knowledge bases or\ `multi_roc` to train the model on ROCStories with multi-task learning.") tf.app.flags.DEFINE_integer("n_class", 4, "Number of classes for the auxiliary classification task.") tf.app.flags.DEFINE_float("learning_rate", 1e-4, "Learning rate.") tf.app.flags.DEFINE_string("data_dir", "./data", "Data directory.") tf.app.flags.DEFINE_integer("length", 200, "Number of tokens in generated text.") tf.app.flags.DEFINE_float("temperature", 0.7, "Float value controlling randomness in boltzmann distribution. Lower temperature results in less random completions. As the temperature approaches zero, the model will become deterministic and repetitive. Higher temperature results in more random completions.") tf.app.flags.DEFINE_integer("top_k", 40, "Integer value controlling diversity.") FLAGS = tf.app.flags.FLAGS FLAGS.is_train = bool(FLAGS.is_train) FLAGS.cond = bool(FLAGS.cond) model_dir = os.path.expanduser(os.path.expandvars(FLAGS.model_dir)) enc = encoder.get_encoder(model_dir) PAD_ID = enc.encoder['<|endoftext|>'] hparams = model.default_hparams() with open(os.path.join(model_dir, 'hparams.json')) as f: hparams.override_from_dict(json.load(f)) def load_data(path, fname, enc, label): data = [] print('loading %s/%s ......' % (path, fname)) with open('%s/%s.txt' % (path, fname)) as f: tmp = [] for k, line in enumerate(f): i = k + 1 if i % 6 == 0: data.append({"st": tmp, "label": label}) tmp = []
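# Hypothetical command line for the flag block above (the script name is made
# up; the values shown are just the declared defaults written out):
#
#   python train_story.py --gpu 0 --data_name roc --n_class 4 --batch_size 10 \
#       --learning_rate 1e-4 --length 200 --temperature 0.7 --top_k 40
#
# Note: FLAGS.is_train, FLAGS.cond and FLAGS.model_dir are referenced above but
# their DEFINE_* calls are not part of this excerpt; they are presumably
# declared elsewhere in the same script.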
import os
import json

import numpy as np
import tensorflow as tf

# The snippet below uses the bare names model/sample/encoder; assuming the
# package layout hinted at by the original commented-out import
# (gpt.model, gpt.sample, gpt.encoder), this is the import it needs:
from gpt import model, sample, encoder

model_name = '1558M'
batch_size = 1
seed = None
nsamples = 1
length = 10
temperature = 1
top_k = 0

np.random.seed(seed)
tf.set_random_seed(seed)

enc = encoder.get_encoder(model_name)
hparams = model.default_hparams()
with open(os.path.join('models', model_name, 'hparams.json')) as f:
    hparams.override_from_dict(json.load(f))

if length is None:
    length = hparams.n_ctx // 2
elif length > hparams.n_ctx:
    raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx)

with tf.Session(graph=tf.Graph()) as sess:
    context = tf.placeholder(tf.int32, [1, None])
    output = sample.sample_sequence(
        hparams=hparams,
        length=length,
        context=context,
        batch_size=1,
os.makedirs(results_path, exist_ok=False) os.makedirs(saved_model_path, exist_ok=False) log_file = os.path.join(results_path, 'log.txt') # create data paths root_path = FLAGS.root_path gold_path_valid = os.path.join(root_path, FLAGS.domain, 'original_data', 'valid.summary') gold_path_test = os.path.join(root_path, FLAGS.domain, 'original_data', 'test.summary') field_vocab_file = os.path.join(root_path, "human_books_songs_films_field_vocab.txt") processed_data_dir = os.path.join(root_path, FLAGS.domain, "processed_data") # bpe vocab last_best = 0.0 enc = encoder.get_encoder("117M") eos = 50256 #TODO move to settings empty = 28920 #TODO move to settings def train(sess, preprocessed_data, model): # keep track of all input parameters write_log(log_file, "####################INPUT PARAMETERS###################") for attr in FLAGS.flag_values_dict(): value = FLAGS.flag_values_dict()[attr] write_log(log_file, attr + " = " + str(value)) write_log(log_file, "#######################################################") train_iterator = DataLoader(preprocessed_data.train_set, FLAGS.domain, batch_size=FLAGS.batch_size, shuffle=True, eos=eos, empty=empty)
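# The two TODO-flagged magic numbers above are BPE token ids. A quick sanity
# check one could run (a sketch, assuming the same "117M" encoder): 50256 is
# the id of '<|endoftext|>' in the standard GPT-2 vocabulary, while 28920
# ("empty") is presumably a project-specific padding token chosen during
# preprocessing.
check_enc = encoder.get_encoder("117M")
assert check_enc.encoder['<|endoftext|>'] == 50256   # eos
print(check_enc.decode([28920]))                     # shows what the "empty" token decodes to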
def main(): args = parser.parse_args() enc = encoder.get_encoder(args.model_name) hparams = model.default_hparams() hparams.res_dropout = args.dropout hparams.attn_dropout = args.dropout epsilon = -1e10 if args.dtype == 'float32': hparams.dtype = tf.float32 elif args.dtype == 'float16': hparams.dtype = tf.float16 epsilon = -65500 elif args.dtype == 'bfloat16': hparams.dtype = tf.bfloat16 epsilon = -65500 else: print('Unknown dtype', args.dtype) if args.float16: hparams.dtype = tf.bfloat16 epsilon = -65500 with open(os.path.join('models', args.model_name, 'hparams.json')) as f: hparams.override_from_dict(json.load(f)) if args.n_ctx >= 0: hparams.n_ctx = args.n_ctx if args.n_embd >= 0: hparams.n_embd = args.n_embd if args.n_head >= 0: hparams.n_head = args.n_head if args.n_layer >= 0: hparams.n_layer = args.n_layer if args.sample_length < 0: args.sample_length = hparams.n_ctx - 1 if args.sample_length > hparams.n_ctx: raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx) if args.sample_ctx < 0: args.sample_ctx = hparams.n_ctx if args.model_name == '345M': args.memory_saving_gradients = True if args.optimizer == 'adam': args.only_train_transformer_layers = True config = tf.ConfigProto() if args.allow_growth: config.gpu_options.allow_growth = True if args.disable_layout_optimizer: config.graph_options.rewrite_options.layout_optimizer = rewriter_config_pb2.RewriterConfig.OFF with tflex.Session(config=config, init_tpu=args.init_tpu) as sess: context = tf.placeholder(tf.int32, [args.batch_size, None]) context_in = randomize(context, hparams, args.noise) output = model.model(hparams=hparams, X=context_in) loss = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits( labels=context[:, 1:], logits=output['logits'][:, :-1])) if args.val_every > 0: val_context = tf.placeholder(tf.int32, [args.val_batch_size, None]) val_output = model.model(hparams=hparams, X=val_context) val_loss = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits( labels=val_context[:, 1:], logits=val_output['logits'][:, :-1])) val_loss_summary = tf.summary.scalar('val_loss', val_loss) tf_sample = sample.sample_sequence(hparams=hparams, length=args.sample_length, context=context, batch_size=args.batch_size, temperature=1.0, top_k=args.top_k, top_p=args.top_p, epsilon=epsilon) all_vars = [v for v in tf.trainable_variables() if 'model' in v.name] train_vars = [v for v in all_vars if '/h' in v.name ] if args.only_train_transformer_layers else all_vars parameter_count = sum([np.prod(v.shape.as_list()) for v in train_vars]) print("This model is using %d parameters (%.2fM)" % (parameter_count, parameter_count / (1024.0 * 1024.0))) with tf.variable_scope(tf.get_variable_scope().name, reuse=tf.AUTO_REUSE): global_step = tflex.get_variable('global_step') or tf.get_variable( 'global_step', shape=(), dtype=tf.int32, trainable=False) current_step = args.learning_rate_initial_step global_step.load(current_step, session=sess) if args.learning_rate_cos: lr = tflex_sgdr.sgdr_decay_with_warmup( args.learning_rate, global_step, warmup_steps=args.learning_rate_warmup, initial_period_steps=args.learning_rate_period, learning_rate_min=args.learning_rate_min) else: lr = tflex.get_variable('learn_rate') or tf.get_variable( 'learn_rate', shape=(), dtype=tf.float32, trainable=False) lr.load(args.learning_rate, session=sess) def update_lr(rate=None, step=None): if not args.learning_rate_cos: if step is None: step = global_step.eval(session=sess) if rate is None: rate = args.learning_rate if callable(rate): 
rate = rate(step) lr.load(rate, session=sess) return lr.eval(session=sess) @tflex.register_command def set_learning_rate(): print("Current learn rate: %0.8f" % update_lr()) print("New learn rate?") rate = input('') if not rate: print("Empty input; not changing anything.") else: try: rate = float(rate) except: print("Invalid input; must be a float") print("Setting learn rate to %0.8f" % rate) args.learning_rate = rate if args.optimizer == 'adam': opt = tf.train.AdamOptimizer(learning_rate=lr) elif args.optimizer == 'sgd': opt = tf.train.GradientDescentOptimizer(learning_rate=lr) elif args.optimizer == 'ada': import tensor2tensor.utils.optimize from tensor2tensor.utils import hparam import tensor2tensor.models.research from tensor2tensor.utils import registry ada_hparams = registry.hparams('afx_mimic_adam') ada_hparams.optimizer_adafactor_beta1 = 0.0 ada_hparams.optimizer_adafactor_factored = True opt = tensor2tensor.utils.optimize.adafactor(learning_rate=lr, hparams=ada_hparams) else: exit('Bad optimizer:', args.optimizer) #if tpu_addr: # # https://pulsejet.github.io/blog/posts/tpu-without-estimator/ # from tensorflow.contrib.tpu.python.tpu import tpu_function # tpu_function.get_tpu_context().set_number_of_shards(8) # opt = tf.contrib.tpu.CrossShardOptimizer(opt) if args.accumulate_gradients > 1: if args.memory_saving_gradients: exit( "Memory saving gradients are not implemented for gradient accumulation yet." ) opt = AccumulatingOptimizer(opt=opt, var_list=train_vars) opt_reset = opt.reset() opt_compute = opt.compute_gradients(loss) opt_apply = opt.apply_gradients() summary_loss = tf.summary.scalar('loss', opt_apply) else: if args.memory_saving_gradients: opt_grads = memory_saving_gradients.gradients(loss, train_vars) else: opt_grads = tf.gradients(loss, train_vars) opt_grads = list(zip(opt_grads, train_vars)) opt_apply = opt.apply_gradients(opt_grads) summary_loss = tf.summary.scalar('loss', loss) summary_lr = tf.summary.scalar('learning_rate', lr) summaries = tf.summary.merge([summary_lr, summary_loss]) summary_log = tf.summary.FileWriter( os.path.join(CHECKPOINT_DIR, args.run_name)) if args.save_graph: summary_log.add_graph(tf.get_default_graph()) saver = tflex.Saver(var_list=all_vars, max_to_keep=args.max_to_keep, keep_checkpoint_every_n_hours=2, reshape=args.truncate_weights) sess.run(tf.global_variables_initializer()) if args.restore_from == 'latest': ckpt = tflex.latest_checkpoint( os.path.join(CHECKPOINT_DIR, args.run_name)) if ckpt is None: # Get fresh GPT weights if new run. ckpt = tflex.latest_checkpoint( os.path.join('models', args.model_name)) elif args.restore_from == 'fresh': ckpt = tflex.latest_checkpoint( os.path.join('models', args.model_name)) else: ckpt = tflex.latest_checkpoint(args.restore_from) print('Loading snapshot %s...' 
% ckpt) t0 = time.time() if not args.fresh_model: saver.restore(sess, ckpt) t1 = time.time() print('Loaded in %f seconds' % (t1 - t0)) def make_sampler(dataset, enc, seed, combine): if os.path.isdir(dataset) or dataset.endswith('.npz'): chunks = load_dataset(enc, dataset, combine) data_sampler = Sampler(chunks, seed=seed) print('dataset has', data_sampler.total_size, 'tokens', len(chunks), 'chunks') else: data_sampler = TextSampler(dataset, enc, seed=seed) return data_sampler print('Loading dataset...') seed = None if args.seed < 0 else args.seed data_sampler = make_sampler(dataset=args.dataset, enc=enc, seed=seed, combine=args.combine) if args.val_every > 0: # Sample from validation set once with fixed seed to make # it deterministic during training as well as across runs. val_dataset = args.val_dataset if args.val_dataset else args.dataset val_data_sampler = make_sampler(dataset=val_dataset, enc=enc, seed=1, combine=args.combine) val_batches = [[ val_data_sampler.sample(hparams.n_ctx) for _ in range(args.val_batch_size) ] for _ in range(args.val_batch_count)] print('Training...') counter = 1 counter_path = os.path.join(CHECKPOINT_DIR, args.run_name, 'counter') if os.path.exists(counter_path): # Load the step number if we're resuming a run # Add 1 so we don't immediately try to save again with open(counter_path, 'r') as fp: counter = int(fp.read()) + 1 @tflex.register_command def save(): maketree(os.path.join(CHECKPOINT_DIR, args.run_name)) print( 'Saving', os.path.join(CHECKPOINT_DIR, args.run_name, 'model-{}').format(counter)) t0 = time.time() saver.save(sess, os.path.join(CHECKPOINT_DIR, args.run_name, 'model'), global_step=counter) t1 = time.time() print('Saved in %f seconds' % (t1 - t0)) with open(counter_path, 'w') as fp: fp.write(str(counter) + '\n') @tflex.register_command def generate_samples(): print('Generating samples...') context_tokens = data_sampler.sample(1) all_text = [] index = 0 while index < args.sample_num: out = sess.run( tf_sample, feed_dict={context: args.batch_size * [context_tokens]}) for i in range(min(args.sample_num - index, args.batch_size)): text = enc.decode(out[i]) text = '======== SAMPLE {} ========\n{}\n'.format( index + 1, text) print(text) all_text.append(text) index += 1 maketree(os.path.join(SAMPLE_DIR, args.run_name)) with open( os.path.join(SAMPLE_DIR, args.run_name, 'samples-{}').format(counter), 'w') as fp: fp.write('\n'.join(all_text)) @tflex.register_command def validation(): if args.val_every <= 0: return print('Calculating validation loss...') losses = [] for batch in tqdm.tqdm(val_batches): loss = sess.run(val_loss, feed_dict={val_context: batch}) losses.append(loss) v_val_loss = np.mean(losses) print('{n} loss={loss:2.4f} avg={avg:2.4f}'.format( n=len(losses), loss=loss, avg=v_val_loss)) print('losses', losses) v_summary = sess.run(val_loss_summary, feed_dict={val_loss: v_val_loss}) summary_log.add_summary(v_summary, counter) summary_log.flush() print( '{stamp} [{counter} | {time:2.4f}] validation loss = {loss:2.4f}' .format(stamp=timestamp(), counter=counter, time=time.time() - start_time, loss=v_val_loss)) start_time = time.time() def elapsed(): return time.time() - start_time def say(msg): print('{stamp} [{counter} | {time:2.4f}] {msg}'.format( counter=counter, time=elapsed(), msg=msg, stamp=timestamp())) def sample_batch(): #return [data_sampler.sample(args.sample_ctx) for _ in range(args.batch_size)] #say('Sampling batch...') r = [] times = [] for _ in range(args.batch_size): start = time.time() sample = 
data_sampler.sample(args.sample_ctx) end = time.time() elapsed = (end - start) r += [sample] times += [elapsed] total = sum(times) avg = total / len(times) #say('Sampled %d batches in %.4f seconds (avg per batch: %.4f)' % (args.batch_size, total, avg)) return r prev_time = time.time() avg_loss = (0.0, 0.0) if args.debug_before_training: import pdb pdb.set_trace() last_saved_time = elapsed() while True: try: now = elapsed() if args.save_time > 0 and (( (now - last_saved_time) / 60.0) >= args.save_time): save() last_saved_time = now elif args.save_every > 0 and (counter % args.save_every == 0): save() if args.sample_every > 0 and counter % args.sample_every == 0: generate_samples() if args.val_every > 0 and (counter % args.val_every == 0 or counter == 1): validation() v_rate = update_lr() if args.accumulate_gradients > 1: #say('Running opt_reset...') sess.run(opt_reset) for _ in range(args.accumulate_gradients): batch = sample_batch() say('Running opt_compute...') sess.run(opt_compute, feed_dict={context: batch}) say('Running opt_apply...') (v_loss, v_summary) = sess.run((opt_apply, summaries)) else: batch = sample_batch() say('Running opt_apply...') (_, v_loss, v_summary) = sess.run( (opt_apply, loss, summaries), feed_dict={context: batch}) if args.float16: v_loss = tf.to_float(v_loss).eval() summary_log.add_summary(v_summary, counter) summary_log.flush() avg_loss = (avg_loss[0] * 0.99 + v_loss, avg_loss[1] * 0.99 + 1.0) now = time.time() print( '{stamp} [{counter} | {time:2.4f} | {delta:2.2f}s | {ops:2.6f}tokens/s] loss={loss:2.4f} avg={avg:2.4f} rate={rate:0.7f} step={step}' .format( stamp=timestamp(), counter=counter, time=now - start_time, delta=now - prev_time, ops=args.sample_ctx * args.batch_size / (now - prev_time), rate=v_rate, loss=v_loss, avg=avg_loss[0] / avg_loss[1], step=current_step, )) counter += 1 current_step += 1 global_step.load(current_step, session=sess) tflex.check_commands_with_args( session=sess, stamp=timestamp(), counter=counter, time=now - start_time, delta=now - prev_time, ops=args.batch_size / (now - prev_time), rate=v_rate, loss=v_loss, avg=avg_loss[0] / avg_loss[1], avg_loss=avg_loss, step=current_step, train_vars=train_vars, all_vars=all_vars, args=args, data_sampler=data_sampler, ckpt=ckpt, saver=saver, ) if tflex.should_quit(): break prev_time = now if args.debug_print_all_vars: print('all variables:') print('name/shape/parameter_count') param_count = 0 for x in tf.all_variables(): shape = x.shape.as_list() count = np.prod(shape) print(x.name, shape, count) param_count += count print('Total parameters:', param_count) args.debug_print_all_vars = False if args.debug_print_trainable_vars: print('trainable variables:') print('name/shape/parameter_count') param_count = 0 for x in tf.trainable_variables(): shape = x.shape.as_list() count = np.prod(shape) print(x.name, shape, count) param_count += count print('Total parameters:', param_count) args.debug_print_trainable_vars = False except KeyboardInterrupt: print('interrupted') if args.save_on_ctrlc: save() if args.debug_on_ctrlc: import pdb pdb.set_trace() else: break
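# Both training scripts above implement gradient accumulation as
# reset -> N x compute -> apply (opt_reset / opt_compute / opt_apply).
# A framework-free sketch of the same pattern (names are mine; whether
# gradients are averaged or summed is a detail handled inside the project's
# AccumulatingOptimizer -- this sketch averages):
def accumulated_step(micro_batches, grad_fn, apply_fn):
    total = None                          # opt_reset
    for batch in micro_batches:           # opt_compute, run accumulate_gradients times
        grads = grad_fn(batch)
        total = grads if total is None else [t + g for t, g in zip(total, grads)]
    apply_fn([t / len(micro_batches) for t in total])   # opt_apply: one weight update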
def interact_model(model_name='117M', seed=None, nsamples=1, batch_size=1, length=None, temperature=1, seed_word="I am", top_k=0, top_p=0.0): """ Interactively run the model :model_name=117M : String, which model to use :seed=None : Integer seed for random number generators, fix seed to reproduce results :nsamples=1 : Number of samples to return total :batch_size=1 : Number of batches (only affects speed/memory). Must divide nsamples. :length=None : Number of tokens in generated text, if None (default), is determined by model hyperparameters :temperature=1 : Float value controlling randomness in boltzmann distribution. Lower temperature results in less random completions. As the temperature approaches zero, the model will become deterministic and repetitive. Higher temperature results in more random completions. :top_k=0 : Integer value controlling diversity. 1 means only 1 word is considered for each step (token), resulting in deterministic completions, while 40 means 40 words are considered at each step. 0 (default) is a special setting meaning no restrictions. 40 generally is a good value. :top_p=0.0 : Float value controlling diversity. Implements nucleus sampling, overriding top_k if set to a value > 0. A good setting is 0.9. """ if batch_size is None: batch_size = 1 assert nsamples % batch_size == 0 enc = encoder.get_encoder(model_name) hparams = model.default_hparams() with open(os.path.join('models', model_name, 'hparams.json')) as f: hparams.override_from_dict(json.load(f)) if length is None: length = hparams.n_ctx // 2 elif length > hparams.n_ctx: raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx) with tf.Session(graph=tf.Graph()) as sess: context = tf.placeholder(tf.int32, [batch_size, None]) np.random.seed(seed) tf.set_random_seed(seed) output = sample.sample_sequence(hparams=hparams, length=length, context=context, batch_size=batch_size, temperature=temperature, top_k=top_k, top_p=top_p) saver = tf.train.Saver() ckpt = tf.train.latest_checkpoint(os.path.join('models', model_name)) saver.restore(sess, ckpt) raw_text = seed_word if not raw_text: print("You should provide a seed sentence or word.") quit() context_tokens = enc.encode(raw_text) out = sess.run(output, feed_dict={ context: [context_tokens for _ in range(batch_size)] })[:, len(context_tokens):] text = enc.decode(out[0]) # treat output text output_text = text.split('.')[0] + (".") #output_text = text print(output_text)
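# Illustration of the post-processing above (not part of the script): keeping
# only the text up to the first period also truncates at abbreviations such as
# "Mr." or "U.S.", so outputs may be cut earlier than intended.
example = "He walked home. Then it started to rain. The end"
print(example.split('.')[0] + ".")   # -> "He walked home."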
def interact_model( model_name='117M', seed=None, nsamples=1, batch_size=1, length=None, temperature=1, top_k=0, ): """ Interactively run the model :model_name=117M : String, which model to use :seed=None : Integer seed for random number generators, fix seed to reproduce results :nsamples=1 : Number of samples to return total :batch_size=1 : Number of batches (only affects speed/memory). Must divide nsamples. :length=None : Number of tokens in generated text, if None (default), is determined by model hyperparameters :temperature=1 : Float value controlling randomness in boltzmann distribution. Lower temperature results in less random completions. As the temperature approaches zero, the model will become deterministic and repetitive. Higher temperature results in more random completions. :top_k=0 : Integer value controlling diversity. 1 means only 1 word is considered for each step (token), resulting in deterministic completions, while 40 means 40 words are considered at each step. 0 (default) is a special setting meaning no restrictions. 40 generally is a good value. """ if batch_size is None: batch_size = 1 assert nsamples % batch_size == 0 enc = encoder.get_encoder(model_name) hparams = model.default_hparams() with open(os.path.join('models', model_name, 'hparams.json')) as f: hparams.override_from_dict(json.load(f)) if length is None: length = hparams.n_ctx // 2 elif length > hparams.n_ctx: raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx) with tf.Session(graph=tf.Graph()) as sess: context = tf.placeholder(tf.int32, [batch_size, None]) np.random.seed(seed) tf.set_random_seed(seed) output = sample.sample_sequence( hparams=hparams, length=length, context=context, batch_size=batch_size, temperature=temperature, top_k=top_k ) saver = tf.train.Saver() ckpt = tf.train.latest_checkpoint(os.path.join('models', model_name)) saver.restore(sess, ckpt) while True: raw_text = input("Model prompt >>> ") while not raw_text: print('Prompt should not be empty!') raw_text = input("Model prompt >>> ") context_tokens = enc.encode(raw_text) generated = 0 for _ in range(nsamples // batch_size): out = sess.run(output, feed_dict={ context: [context_tokens for _ in range(batch_size)] })[:, len(context_tokens):] for i in range(batch_size): generated += 1 text = enc.decode(out[i]) print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) print(text) print("=" * 80)
def interact_model(
    model_name='117M',
    seed=None,
    nsamples=1,
    batch_size=None,
    length=1000,
    temperature=0.8,
    top_k=40,
):
    if batch_size is None:
        batch_size = 1
    assert nsamples % batch_size == 0

    enc = encoder.get_encoder(model_name)
    hparams = model.default_hparams()
    with open(os.path.join('models', model_name, 'hparams.json')) as f:
        hparams.override_from_dict(json.load(f))

    if length is None:
        length = hparams.n_ctx // 2
    elif length > hparams.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx)

    with tf.Session(graph=tf.Graph()) as sess:
        context = tf.placeholder(tf.int32, [batch_size, None])
        np.random.seed(seed)
        tf.set_random_seed(seed)
        output = sample.sample_sequence(hparams=hparams,
                                        length=length,
                                        context=context,
                                        batch_size=batch_size,
                                        temperature=temperature,
                                        top_k=top_k)

        saver = tf.train.Saver()
        ckpt = tf.train.latest_checkpoint(os.path.join('models', model_name))
        saver.restore(sess, ckpt)

        while True:
            food = random_food()
            raw_text = food + " Recipe:"
            image = ""  # fall back to no image if the lookup fails or only SVGs are found
            try:
                wiki = wikipedia.page(food)
                for url in wiki.images:
                    if "svg" not in url:
                        # embed the first non-SVG image from the article
                        image = "<img style='max-height:300px;' src='" + url + "'>"
                        break
            except Exception:
                image = ""
            context_tokens = enc.encode(raw_text)
            generated = 0
            for _ in range(nsamples // batch_size):
                out = sess.run(output, feed_dict={
                    context: [context_tokens for _ in range(batch_size)]
                })[:, len(context_tokens):]
                for i in range(batch_size):
                    generated += 1
                    text = enc.decode(out[i])
                    text = text.split("<|endoftext|>")[0]
                    text = text.replace("\n", "<br>")
                    text = ("<div style='width:66%;position:absolute;left:16%'><h1>"
                            + raw_text + "</h1>" + image + "<br>" + str(text) + "</div>")
                    with open("/var/www/recipe/index.html", "w") as text_file:
                        text_file.write(text)