Example #1
def cur_text_summary(para_str_in):
    # prompt = 'News villagers , fishermen and hotel residents found the dolphins \' carcasses on friday and alerted officials .it was not immediately clear what killed the 400 dolphins , though scientists ruled out poisoning .narriman jidawi , a marine biologist at the institute of marine science in zanzibar , said their carcasses were strewn along a 4km stretch of nungwi .but the bottleneck dolphins , which live in deep offshore waters , had empty stomachs , meaning that they could have been disoriented and were swimming for some time to reorient themselves .they did not starve to death and were not poisoned , jidawi said .in the united states , experts were investigating the possibility that sonar from us submarines could have been responsible for a similar incident in marathon , florida , where 68 deep-water dolphins stranded themselves in march 2005 .a us navy task force patrols the east africa coast .a navy official was not immediately available for comment , but the service rarely comments on the location of submarines at sea .the deaths are a blow to the tourism industry in zanzibar , where thousands of visitors go to watch and swim with wild dolphins TL;DR:'
    prompt = para_str_in.split('\\n')  # split on literal '\n' markers if provided
    summary_str_out = ''
    # tokenize provided prompt
    split_prompt = ' \n '.join(bpe.apply(prompt))
    split_prompt = split_prompt.split(' ')
    if not any(split_prompt[0] == x for x in CONTROL_CODES.keys()):
        print(
            "WARNING! You are not starting your generation from a control code so you won't get good results"
        )
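    # (CONTROL_CODES maps CTRL control codes such as 'News', 'Wikipedia',
    # 'Books' or 'Reviews' to their token ids; the prompt's first token is
    # expected to be one of them.)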
    text = [word2idx[i] for i in split_prompt]

    # pad with 0s up to generate_num tokens (mini-batch of size 1)
    padded_text = text + [0] * (
        args.generate_num - len(text)
    )  # note: len(text) must be less than args.generate_num
    tokens_generated = np.tile(padded_text, (1, 1))
    try:
        # print(len(text) - 1, args.generate_num - 1)
        for token in range(len(text) - 1, args.generate_num - 1):
            # get the logits from the prediction function
            # the logic here is a bit convoluted because we are allowing generation past 512 tokens
            # this is done by sliding the window over (past 512 tokens) and continuing prediction
            # I'm sure this can be simplified (TODO)
            if token <= seq_length:
                prompt_logits = predict_fn(
                    {'input_1': tokens_generated[:, :seq_length]
                     })['tied_embedding_softmax'].squeeze() / (
                         temperature if temperature > 0 else 1.)
                _token = token if token < seq_length else -1
            else:
                _token = -1
                start = token - seq_length + 2
                end = token + 1
                prompt_logits = predict_fn({
                    'input_1': np.hstack((tokens_generated[:, 0:1],
                                          tokens_generated[:, start:end]))
                })['tied_embedding_softmax'].squeeze() / (
                    temperature if temperature > 0 else 1.)
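            # Sliding-window illustration: once `token` reaches seq_length, the
            # model input is the control code (column 0) concatenated with the
            # most recent seq_length - 1 generated tokens, so prediction can
            # continue past the model's fixed context size.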

            # if penalty (for repetition) is non-zero,
            # discount the logits from already generated tokens
            if penalty > 0:
                penalized_so_far = set()
                for _ in range(token + 1):
                    generated_token = tokens_generated[0][_]
                    # don't penalize newlines
                    # you could also choose not to penalize frequent words
                    # (which incidentally are sorted in the vocab file)
                    # but I don't do that
                    # if it prints too many new lines instead of continuing generating text,
                    # you might want to comment this out
                    if idx2word[generated_token] == '\n':
                        continue
                    if generated_token in penalized_so_far:
                        continue
                    penalized_so_far.add(generated_token)
                    prompt_logits[_token][generated_token] /= penalty
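            # (This mirrors the CTRL paper's penalized sampling: dividing the
            # logit of an already-generated token by penalty > 1 makes it less
            # likely to be re-sampled. Caveat: for negative logits, division
            # actually increases them, which is why some implementations
            # multiply instead when the logit is negative.)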

            # disallow some tokens
            prompt_logits[_token][word2idx['<unk>']] = -1e8

            # sometimes, when generating from reddit,
            # it tries to generate the Score (reddit Karma) immediately after generating the Title:
            # to disallow this, we can just prevent it from generating Score
            prompt_logits[_token][word2idx['Sco@@']] = -1e8

            # compute probabilities from logits
            prompt_probs = np.exp(prompt_logits[_token])
            prompt_probs = prompt_probs / sum(prompt_probs)
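            # note: subtracting prompt_logits[_token].max() before np.exp would
            # make this softmax numerically stable for large logits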
            pruned_list = np.argsort(prompt_probs)[::-1]
            # if you are using nucleus prob, then compute the nucleus probability size
            if nucleusprob > 0.:
                minimum_topk = 1
                nucleus = max(
                    np.where(
                        np.cumsum(np.sort(prompt_probs)[::-1]) > nucleusprob)
                    [0][0], minimum_topk)
            elif topk > 0:
                # we are over-loading notation here
                # if you choose to specify a topk instead of a nucleus,
                # we will hardcode the nucleus to be just that
                nucleus = topk
            else:
                # if you specify neither nucleus nor topk,
                # then we will use the whole list
                nucleus = len(pruned_list)
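            # worked example: for sorted probs [0.5, 0.3, 0.1, ...] and
            # nucleusprob = 0.7, the cumulative sum [0.5, 0.8, ...] first
            # exceeds 0.7 at index 1, so nucleus = 1 and only the single most
            # probable token survives the cut below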

            pruned_list = pruned_list[:nucleus]
            # if you want to disallow more complex tokens, you can do so here
            # for instance, if you want to disallow anything containing `http`,
            # you can delete them from the pruned_list
            # you can comment this out; I'm keeping it in for demonstration purposes
            tokens_to_disallow = []
            for _ in range(len(pruned_list)):
                if 'http' in idx2word[pruned_list[_]]:
                    tokens_to_disallow.append(_)
            pruned_list = np.delete(pruned_list, tokens_to_disallow)

            if args.topn > 0:
                print('TOPN :: top-n alternatives:',
                      [idx2word[_] for _ in pruned_list[:args.topn]])

            # if temperature is 0
            # just pick the first (most probable) token
            if temperature == 0:
                idx = pruned_list[0]
            else:
                # else,
                # sample from the pruned_list with the logits
                chosen_idx = int(
                    tf.random.categorical(np.expand_dims(
                        prompt_logits[_token][pruned_list], 0),
                                          num_samples=1).numpy())
                idx = pruned_list[chosen_idx]

            if args.topn > 0:
                print('TOPN :: chosen word:', idx2word[idx])

            # assign the token for generation
            tokens_generated[0][token + 1] = idx

            # clear screen if you want to
            # os.system("clear")
            # tokens_generated_so_far = ' '.join([idx2word[c] for c in tokens_generated[0].squeeze()[:token + 2]])
            # tokens_generated_so_far = re.sub('(@@ )', '', string=tokens_generated_so_far)
            # tokens_generated_so_far = re.sub('(@@ ?$)', '', string=tokens_generated_so_far)

            tokens_generated_only = ' '.join([
                idx2word[c]
                for c in tokens_generated[0].squeeze()[len(text):token + 2]
            ])
            tokens_generated_only = re.sub('(@@ )',
                                           '',
                                           string=tokens_generated_only)
            tokens_generated_only = re.sub('(@@ ?$)',
                                           '',
                                           string=tokens_generated_only)
            summary_str_out = tokens_generated_only
            if idx2word[idx].find(
                    '.') >= 0:  # may be a subword containing '.', not only '.' == idx2word[idx]
                print(idx2word[idx], " has triggered the stop flag -- .")
                break
            # if not args.print_once:
            #     # idx == tokens_generated[0].squeeze()[token+1]
            #     print('************************************************')
            #     print(tokens_generated_only)
            #     print()
        print(summary_str_out)
    except KeyboardInterrupt:  # Exception as e:
        print("cur str_in has triggered KeyboardInterrupt.")
        return ""
    return summary_str_out
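
A minimal usage sketch for cur_text_summary (hypothetical; it assumes the CTRL globals used above -- bpe, word2idx, idx2word, predict_fn, args, seq_length, temperature, penalty, nucleusprob, topk -- are already initialized, and that 'News' is a valid control code):

article = ("News villagers , fishermen and hotel residents found the "
           "dolphins ' carcasses on friday and alerted officials . TL;DR:")
summary = cur_text_summary(article)  # generates until a '.' token or generate_num
print('SUMMARY:', summary)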
    "Outdoor people", "Outdoor a man", "Outdoor car", "Outdoora church",
    "Outdoor stop"
]
i = 0
while i < len(prompt_used):
    # prompt = raw_input('ENTER PROMPT: ') if not use_py3 else input('ENTER PROMPT: ')
    prompt = prompt_used[i]
    print("PROMPT CHOOSEN :\n")
    print(prompt + '\n')
    prompt = prompt.split('\\n')  # split on literal '\n' markers if provided

    # tokenize provided prompt
    split_prompt = ' \n '.join(bpe.apply(prompt))
    split_prompt = split_prompt.split(' ')

    if not any(split_prompt[0] == x for x in CONTROL_CODES.keys()):
        print(
            "WARNING! You are not starting your generation from a control code so you won't get good results"
        )
    text = [word2idx[i] for i in split_prompt]

    # pad with 0s up to generate_num tokens (mini-batch of size 1)
    padded_text = text + [0] * (args.generate_num - len(text))
    tokens_generated = np.tile(padded_text, (1, 1))
    try:
        for token in range(len(text) - 1, args.generate_num - 1):
            # get the logits from the prediction function
            # the logic here is a bit convoluted because we are allowing generation past 512 tokens
            # this is done by sliding the window over (past 512 tokens) and continuing prediction
            # I'm sure this can be simplified (TODO)
            if token <= seq_length:
Example #3
print()

for i, prompt in enumerate(prompts):

    print("-----------Starting caption {} of {}-----------".format(
        i, tot_prompts))

    prompt = prompt.split('\\n')  # split on literal '\n' markers if provided

    # tokenize provided prompt
    split_prompt = ' \n '.join(bpe.apply(prompt))
    split_prompt = split_prompt.split(' ')

    if not any(
            split_prompt[0] == x
            for x in list(CONTROL_CODES.keys()) + ["caption", "formal", "informal"]):
        print(
            "WARNING! You are not starting your generation from a control code so you won't get good results"
        )
    text = [word2idx[i] for i in split_prompt]

    # pad with 0s up to generate_num tokens (mini-batch of size 1)
    padded_text = text + [0] * (args.generate_num - len(text))
    tokens_generated = np.tile(padded_text, (1, 1))

    try:
        for token in range(len(text) - 1, args.generate_num - 1):
            # get the logits from the prediction function
            # the logic here is a bit convoluted because we are allowing generation past 512 tokens
            # this is done by sliding the window over (past 512 tokens) and continuing prediction
            # I'm sure this can be simplified (TODO)