def cur_text_summary(para_str_in):
    # Example prompt:
    # prompt = 'News villagers , fishermen and hotel residents found the dolphins \' carcasses on friday
    # and alerted officials . it was not immediately clear what killed the 400 dolphins , though
    # scientists ruled out poisoning . narriman jidawi , a marine biologist at the institute of marine
    # science in zanzibar , said their carcasses were strewn along a 4km stretch of nungwi . but the
    # bottleneck dolphins , which live in deep offshore waters , had empty stomachs , meaning that they
    # could have been disoriented and were swimming for some time to reorient themselves . they did not
    # starve to death and were not poisoned , jidawi said . in the united states , experts were
    # investigating the possibility that sonar from us submarines could have been responsible for a
    # similar incident in marathon , florida , where 68 deep-water dolphins stranded themselves in
    # march 2005 . a us navy task force patrols the east africa coast . a navy official was not
    # immediately available for comment , but the service rarely comments on the location of submarines
    # at sea . the deaths are a blow to the tourism industry in zanzibar , where thousands of visitors
    # go to watch and swim with wild dolphins TL;DR:'
    prompt = para_str_in.split('\\n')  # split on newlines if provided
    summary_str_out = ''

    # tokenize the provided prompt
    split_prompt = ' \n '.join(bpe.apply(prompt))
    split_prompt = split_prompt.split(' ')
    if not any(split_prompt[0] == x for x in CONTROL_CODES.keys()):
        print(
            "WARNING! You are not starting your generation from a control code so you won't get good results"
        )
    text = [word2idx[w] for w in split_prompt]

    # pad with 0s and create a mini-batch of 1 (arbitrary, for ease of code)
    # !!! len(text) must be less than args.generate_num
    padded_text = text + [0] * (args.generate_num - len(text))
    tokens_generated = np.tile(padded_text, (1, 1))
    try:
        # print(len(text) - 1, args.generate_num - 1)
        for token in range(len(text) - 1, args.generate_num - 1):
            # get the logits from the prediction function
            # the logic here is a bit convoluted because we allow generation past 512 tokens;
            # this is done by sliding the window over (past 512 tokens) and continuing prediction
            # I'm sure this can be simplified (TODO)
            if token <= seq_length:
                prompt_logits = predict_fn(
                    {'input_1': tokens_generated[:, :seq_length]}
                )['tied_embedding_softmax'].squeeze() / (
                    temperature if temperature > 0 else 1.)
                _token = token if token < seq_length else -1
            else:
                _token = -1
                start = token - seq_length + 2
                end = token + 1
                prompt_logits = predict_fn(
                    {'input_1': np.hstack((tokens_generated[:, 0:1],
                                           tokens_generated[:, start:end]))}
                )['tied_embedding_softmax'].squeeze() / (
                    temperature if temperature > 0 else 1.)
            # if penalty (for repetition) is non-zero,
            # discount the logits of already generated tokens
            if penalty > 0:
                penalized_so_far = set()
                for _ in range(token + 1):
                    generated_token = tokens_generated[0][_]
                    # don't penalize newlines
                    # you could also choose not to penalize frequent words
                    # (which incidentally are sorted in the vocab file),
                    # but I don't do that here
                    # if it prints too many newlines instead of continuing to generate text,
                    # you might want to comment this out
                    if idx2word[generated_token] == '\n':
                        continue
                    if generated_token in penalized_so_far:
                        continue
                    penalized_so_far.add(generated_token)
                    prompt_logits[_token][generated_token] /= penalty

            # disallow some tokens
            prompt_logits[_token][word2idx['<unk>']] = -1e8

            # sometimes, when generating from Reddit,
            # the model tries to generate the Score (Reddit karma) immediately after the Title;
            # to disallow this, we can just prevent it from generating Score
            prompt_logits[_token][word2idx['Sco@@']] = -1e8

            # compute probabilities from logits
            prompt_probs = np.exp(prompt_logits[_token])
            prompt_probs = prompt_probs / sum(prompt_probs)
            pruned_list = np.argsort(prompt_probs)[::-1]

            # if you are using nucleus sampling, compute the nucleus size
            if nucleusprob > 0.:
                minimum_topk = 1
                nucleus = max(
                    np.where(
                        np.cumsum(np.sort(prompt_probs)[::-1]) > nucleusprob)[0][0],
                    minimum_topk)
            elif topk > 0:
                # we are overloading notation here:
                # if you choose to specify a topk instead of a nucleus,
                # we hardcode the nucleus to be exactly that many tokens
                nucleus = topk
            else:
                # if you specify neither nucleus nor topk,
                # we use the whole list
                nucleus = len(pruned_list)
            pruned_list = pruned_list[:nucleus]

            # if you want to disallow more complex tokens, you can do so here;
            # for instance, to disallow anything containing the string `http`,
            # you can delete them from the pruned_list
            # (you can comment this out; I'm keeping it in for demonstration purposes)
            tokens_to_disallow = []
            for _ in range(len(pruned_list)):
                if 'http' in idx2word[pruned_list[_]]:
                    tokens_to_disallow.append(_)
            pruned_list = np.delete(pruned_list, tokens_to_disallow)

            if args.topn > 0:
                print('TOPN :: top-n alternatives:',
                      [idx2word[_] for _ in pruned_list[:args.topn]])

            # if temperature is 0, just pick the first (most probable) token
            if temperature == 0:
                idx = pruned_list[0]
            else:
                # otherwise, sample from the pruned_list using the logits
                chosen_idx = int(
                    tf.random.categorical(
                        np.expand_dims(prompt_logits[_token][pruned_list], 0),
                        num_samples=1).numpy())
                idx = pruned_list[chosen_idx]

            if args.topn > 0:
                print('TOPN :: chosen word:', idx2word[idx])

            # assign the token for generation
            tokens_generated[0][token + 1] = idx

            # clear screen if you want to
            # os.system("clear")

            # tokens_generated_so_far = ' '.join([idx2word[c] for c in tokens_generated[0].squeeze()[:token + 2]])
            # tokens_generated_so_far = re.sub('(@@ )', '', string=tokens_generated_so_far)
            # tokens_generated_so_far = re.sub('(@@ ?$)', '', string=tokens_generated_so_far)
            tokens_generated_only = ' '.join([
                idx2word[c]
                for c in tokens_generated[0].squeeze()[len(text):token + 2]
            ])
            tokens_generated_only = re.sub('(@@ )', '', string=tokens_generated_only)
            tokens_generated_only = re.sub('(@@ ?$)', '', string=tokens_generated_only)
            summary_str_out = tokens_generated_only

            # stop once the generated token contains '.';
            # use find() rather than '.' == idx2word[idx] because the token may be a subword
            if idx2word[idx].find('.') >= 0:
                print(idx2word[idx], "has triggered the stop flag '.'")
                break

        # if not args.print_once:
        #     # idx == tokens_generated[0].squeeze()[token+1]
        #     print('************************************************')
        #     print(tokens_generated_only)
        #     print()
        print(summary_str_out)
    except KeyboardInterrupt:  # Exception as e:
        print("cur str_in has triggered a KeyboardInterrupt.")
        return ""
    return summary_str_out
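
# ---------------------------------------------------------------------------
# A minimal sketch of the sliding-window indexing used above once generation
# passes `seq_length` tokens: the first column (the control code) is kept and
# the most recent `seq_length - 1` tokens are appended after it.
# `sliding_window` is an illustrative helper, not part of this script; it
# assumes the same (1, generate_num) layout as `tokens_generated`.
# ---------------------------------------------------------------------------
def sliding_window(tokens, token, seq_length):
    start = token - seq_length + 2
    end = token + 1
    return np.hstack((tokens[:, 0:1], tokens[:, start:end]))

# e.g. sliding_window(np.arange(20).reshape(1, 20), token=12, seq_length=8)
# -> array([[ 0,  6,  7,  8,  9, 10, 11, 12]])  (control code + last 7 tokens)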
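
# ---------------------------------------------------------------------------
# A self-contained sketch of the nucleus / top-k truncation performed inside
# the generation loop, extracted here for clarity. The name
# `truncate_candidates` and the toy probabilities are illustrative only; the
# loop above operates on `prompt_probs` directly.
# ---------------------------------------------------------------------------
def truncate_candidates(probs, nucleusprob=0., topk=0, minimum_topk=1):
    """Return candidate token indices sorted by descending probability, cut
    to the smallest prefix whose cumulative probability exceeds `nucleusprob`
    (or to `topk` entries, or to the full list if neither is set)."""
    pruned_list = np.argsort(probs)[::-1]
    if nucleusprob > 0.:
        nucleus = max(
            np.where(np.cumsum(np.sort(probs)[::-1]) > nucleusprob)[0][0],
            minimum_topk)
    elif topk > 0:
        nucleus = topk
    else:
        nucleus = len(pruned_list)
    return pruned_list[:nucleus]

# e.g. truncate_candidates(np.array([0.5, 0.3, 0.1, 0.07, 0.03]),
#                          nucleusprob=0.9)  # -> array([0, 1, 2])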
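
# ---------------------------------------------------------------------------
# A small sketch of the BPE detokenization used above: generated tokens are
# joined with spaces, then the '@@ ' continuation markers are stripped to
# recover whole words. `detokenize_bpe` and the toy tokens are illustrative
# only.
# ---------------------------------------------------------------------------
def detokenize_bpe(tokens):
    joined = ' '.join(tokens)
    joined = re.sub('(@@ )', '', string=joined)
    joined = re.sub('(@@ ?$)', '', string=joined)
    return joined

# e.g. detokenize_bpe(['dol@@', 'phins', 'swam', 'a@@', 'way'])
# -> 'dolphins swam away'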
"Outdoor people", "Outdoor a man", "Outdoor car", "Outdoora church", "Outdoor stop" ] i = 0 while i < len(prompt_used): # prompt = raw_input('ENTER PROMPT: ') if not use_py3 else input('ENTER PROMPT: ') prompt = prompt_used[i] print("PROMPT CHOOSEN :\n") print(prompt + '\n') prompt = prompt.split('\\n') # split on newlines if provided # tokenize provided prompt split_prompt = ' \n '.join(bpe.apply(prompt)) split_prompt = split_prompt.split(' ') if not any(split_prompt[0] == x for x in CONTROL_CODES.keys()): print( "WARNING! You are not starting your generation from a control code so you won't get good results" ) text = [word2idx[i] for i in split_prompt] # pad with 0s and create a mini-batch of 2 (arbitrary, for ease of code) padded_text = text + [0] * (args.generate_num - len(text)) tokens_generated = np.tile(padded_text, (1, 1)) try: for token in range(len(text) - 1, args.generate_num - 1): # get the logits from the prediction function # the logic here is a bit convoluted because we are allowing generation past 512 tokens # this is done by sliding the window over (past 512 tokens) and continuing prediction # I'm sure this can be simplified (TODO) if token <= seq_length:
print()
for i, prompt in enumerate(prompts):
    print("-----------Starting caption {} of {}-----------".format(
        i, tot_prompts))
    prompt = prompt.split('\\n')  # split on newlines if provided

    # tokenize the provided prompt
    split_prompt = ' \n '.join(bpe.apply(prompt))
    split_prompt = split_prompt.split(' ')
    # accept the standard control codes plus the captioning/formality codes
    if not any(split_prompt[0] == x
               for x in list(CONTROL_CODES.keys()) +
               ["caption", "formal", "informal"]):
        print(
            "WARNING! You are not starting your generation from a control code so you won't get good results"
        )
    text = [word2idx[w] for w in split_prompt]

    # pad with 0s and create a mini-batch of 1 (arbitrary, for ease of code)
    padded_text = text + [0] * (args.generate_num - len(text))
    tokens_generated = np.tile(padded_text, (1, 1))
    try:
        for token in range(len(text) - 1, args.generate_num - 1):
            # get the logits from the prediction function
            # the logic here is a bit convoluted because we allow generation past 512 tokens;
            # this is done by sliding the window over (past 512 tokens) and continuing prediction
            # I'm sure this can be simplified (TODO)