def main(config):
    """
    Creates .stitched files from .suggested files.

    input: a .suggested file of logs (in ParlaiDialog format) from Mturk task 2,
    each of which starts with an initial prompt or topic request, and ends with a
    y that corresponds to the y_exp given in the previous turn
    output: a .stitched file (in self-feeding format) with the original mistake by
    the bot replaced with the mturked y (based on y_exp)
    """
    episodes = list(extract_parlai_episodes(config['infile']))
    examples = []
    for episode in episodes:
        history = []
        last = len(episode) - 1
        for turn, parley in enumerate(episode):
            if turn == 0:
                # Don't include the topic request
                history.append(parley.response)
            elif turn == last - 2:
                # third to last was mistake and negative feedback
                pass
            elif turn == last - 1:
                # penultimate turn was suggestion request and y_exp
                pass
            elif turn == last:
                # ultimate turn was verbatim request and y
                examples.append(
                    Parley(
                        context=add_person_tokens(history, last_speaker=1),
                        response=parley.response,  # y
                    )
                )
            else:
                # normal turn; just add to history
                history.append(parley.context)
                history.append(parley.response)

    # Write new episodes to self-feeding format
    with PathManager.open(config['outfile'], 'w') as outfile:
        for ex in examples:
            outfile.write(json.dumps(ex.to_dict()) + '\n')

    print(
        f"Extracted {len(examples)} self-feeding episodes out of "
        f"{len(episodes)} parlai episodes and wrote them to {config['outfile']}."
    )
def _setup_data(self, path):  # Make private method for ParlAIDialogTeacher
    """
    Reads data in the fbdialog format.

    Returns ``((x,y,r,c), new_episode?)`` tuples.

    :param path: path to a file of json-encoded parleys, one per line
        (self-feeding format). Populates ``self.episodes`` (one single-turn
        episode per parley) and ``self.num_exs``.
    """
    print("[ Loading Self-Feeding text data:" + path + "]")
    self.episodes = []
    self.num_exs = 0
    self.max_train = self.opt.get('max_train', 0)
    with PathManager.open(path, 'r') as f:
        # Stream line by line instead of f.readlines(): avoids materializing
        # the entire (potentially large) file in memory at once.
        for line in f:
            if self.max_train and self.num_exs >= self.max_train:
                break
            parley = json.loads(line)
            # NOTE: History is trimmed here, not by TorchAgent (except in
            # interactive mode)
            if self.opt['history_size'] == 0:
                parley['context'] = '__null__'
            elif self.opt['history_size'] > 0:
                # Split on speaker tokens (__p1__/__p2__), keep only the most
                # recent `history_size` utterances, then re-add the tokens.
                utterances = re.split(r'__p\d__', parley['context'])[1:]
                trimmed = utterances[-(self.opt['history_size']):]
                parley['context'] = add_person_tokens(trimmed, last_speaker=1)
            # WARNING: STRIPPING AWAY MEMORIES
            parley['memories'] = []
            episode = {
                'text': parley['context'],
                'labels': [parley['response']],
                'label_candidates': parley.get('candidates', []),
                'reward': parley.get('reward', 0),
                'episode_done': True,
            }
            # Convert integer labels (e.g., polarization dataset) to strings
            episode['labels'] = [str(label) for label in episode['labels']]
            self.num_exs += 1
            self.episodes.append([episode])
def main(config):
    """
    Creates .identity files from .sliced files.

    input: a .sliced file of logs (in ParlaiDialog format) from Mturk task 1,
    each of which starts with an initial prompt or topic request, and ends with
    a y_exp
    output: an .identity file (in self-feeding format) with y_exps used as
    though they were ys
    """
    episodes = list(extract_parlai_episodes(config['infile']))
    examples = []
    for episode in episodes:
        history = []
        last = len(episode) - 1
        for turn, parley in enumerate(episode):
            if turn == 0:
                # Don't include the topic request
                history.append(parley.response)
            elif turn == last - 1:
                # penultimate turn was mistake and negative feedback
                pass
            elif turn == last:
                # ultimate turn was correction request and y_exp
                examples.append(
                    Parley(
                        context=add_person_tokens(history, last_speaker=1),
                        response=parley.response,  # y_exp
                    )
                )
            else:
                # normal turn; just add to history
                history.append(parley.context)
                history.append(parley.response)

    # Write new episodes to self-feeding format
    with PathManager.open(config['outfile'], 'w') as outfile:
        for ex in examples:
            outfile.write(json.dumps(ex.to_dict()) + '\n')

    print(
        f"Extracted {len(examples)} self-feeding episodes out of "
        f"{len(episodes)} parlai episodes and wrote them to {config['outfile']}."
    )
def main(config):
    """
    Creates .unfiltered files from .sliced files.

    input: a .sliced file of logs (in ParlaiDialog format) from Mturk task 1,
    each of which starts with an initial prompt or topic request, and ends with
    a y_exp
    output: a .unfiltered file (in self-feeding format) with every utterance
    output by bot used as a label (i.e., act as though the bot was a human and
    we want to train in a normal supervised way).
    """
    episodes = list(extract_parlai_episodes(config['infile']))
    examples = []
    for episode in episodes:
        history = []
        last = len(episode) - 1
        for turn, parley in enumerate(episode):
            if turn == 0:
                # Don't include the topic request
                history.append(parley.response)
            elif turn == last:
                # ultimate turn was correction request and explanation
                pass
            else:
                # Every bot utterance becomes a label; then extend the history.
                examples.append(
                    Parley(
                        context=add_person_tokens(history, last_speaker=1),
                        response=parley.context,  # What the bot said
                    )
                )
                history.append(parley.context)
                history.append(parley.response)

    # Write new episodes to self-feeding format
    with PathManager.open(config['outfile'], 'w') as outfile:
        for ex in examples:
            outfile.write(json.dumps(ex.to_dict()) + '\n')

    print(
        f"Extracted {len(examples)} self-feeding episodes out of "
        f"{len(episodes)} parlai episodes and wrote them to {config['outfile']}."
    )
def main(opt):
    """
    Extracts training data for the negative response classifier (NRC) from Mturk logs.

    input: file of logs (in ParlaiDialog format) from Mturk task 1 with
    turn-by-turn quality ratings 1-5
    output: file of episodes (self-feeding format) w/ +1/-1 ratings indicating
    positive/negative example
    """
    examples = []
    # Ratings treated as positive/negative; the two sets must not overlap.
    positives = opt['positives'].split(',')
    negatives = opt['negatives'].split(',')
    assert len(set(positives).intersection(set(negatives))) == 0

    num_episodes = 0
    num_parleys = 0
    for episode in extract_parlai_episodes(opt['infile']):
        num_episodes += 1
        history = []
        for parley in episode:
            num_parleys += 1
            # Update history (not including stock control flow responses)
            context = parley.context
            if context.startswith(INITIAL_PROMPT):
                # Conversation prompt, first utterance
                # Begin history
                history = [parley.response]
            elif context.startswith(EXP_REQUEST):
                # Asked for y_exp, got y_exp
                # Messed up, so blast history
                examples.append(
                    Parley(
                        context=add_person_tokens(history[:-2], last_speaker=1),
                        response=parley.response,  # y_exp
                    )
                )
                history = []
            elif context.startswith(NEWTOPIC):
                # Asked for new topic, got a first utterance
                # Begin new history
                history = [parley.response]
            elif context.startswith(RAT_REQUEST):
                # Asked for rating, got one-word rating
                # Nothing to update in history
                pass
            elif CONTINUE in context:
                # if response was negative, history will get blasted in EXP_REQUEST
                # if we're here, response was neutral/positive, so continue the history
                history.append(context[context.rindex(':') + 1:])
                history.append(parley.response)
            else:
                # normal turn: maintain the history
                history.append(context)
                history.append(parley.response)

    with PathManager.open(opt['outfile'], 'w') as outfile:
        for ex in examples:
            outfile.write(json.dumps(ex.to_dict()) + '\n')

    print(f"Extracted {len(examples)} ratings out of {num_episodes} episodes "
          f"({num_parleys} parleys) and wrote them to {opt['outfile']} with "
          f"histsz == {opt['history_size']}.")
def main(opt):
    """
    Extracts training data for the negative response classifier (NRC) from Mturk logs.

    input: file of logs (in ParlaiDialog format) from Mturk task 1 with
    turn-by-turn quality ratings 1-5
    output: file of episodes (self-feeding format) w/ +1/-1 ratings indicating
    positive/negative example
    """
    examples = []
    positives = opt['positives'].split(',')
    negatives = opt['negatives'].split(',')
    assert len(set(positives).intersection(set(negatives))) == 0

    num_episodes = 0
    num_parleys = 0
    for episode in extract_parlai_episodes(opt['infile']):
        num_episodes += 1
        history = []
        for parley in episode:
            num_parleys += 1
            # Update history (not including stock control flow responses)
            if parley.context.startswith(INITIAL_PROMPT):
                # Conversation prompt, first utterance
                # Begin history
                history = [parley.response]
            elif parley.context.startswith(EXP_REQUEST):
                # Asked for y_exp, got y_exp
                # Messed up, so blast history
                history = []
            elif parley.context.startswith(NEWTOPIC):
                # Asked for new topic, got a first utterance
                # Begin new history
                history = [parley.response]
            elif parley.context.startswith(RAT_REQUEST):
                # Concatenate history and add speaker tokens as necessary
                # history_size refers to the total number of utterances
                # (history_size == 0 means predict sentiment from '__null__')
                # response that's being classified (so if history_size == 0 then
                # classify based only on the response w/o any extra context).
                # Note that the response being classified should always be preceded by
                # __p1__ (the human), not __p2__ (the bot).
                histsz = opt['history_size']
                if histsz < 0:
                    utterances = history
                elif histsz == 0:
                    utterances = ['__null__']
                else:
                    utterances = history[-histsz:]
                context = add_person_tokens(utterances, last_speaker=1)

                # Map the one-word rating to a +1/-1 label; unrecognized
                # ratings (label 0) produce no example.
                if parley.response in positives:
                    label = 1
                elif parley.response in negatives:
                    label = -1
                else:
                    label = 0
                if label:
                    examples.append(Parley(context, label))
            elif CONTINUE in parley.context:
                # if response was negative, history will get blasted in EXP_REQUEST
                # if we're here, response was neutral/positive, so continue the history
                history.append(parley.context[parley.context.rindex(':') + 1:])
                history.append(parley.response)
            else:
                history.append(parley.context)
                history.append(parley.response)

    with PathManager.open(opt['outfile'], 'w') as outfile:
        for ex in examples:
            outfile.write(json.dumps(ex.to_dict()) + '\n')

    print(f"Extracted {len(examples)} ratings out of {num_episodes} episodes "
          f"({num_parleys} parleys) and wrote them to {opt['outfile']} with "
          f"histsz == {opt['history_size']}.")
def main(opt):
    """
    Extracts training data for the negative response classifier (NRC) from Mturk logs.

    input: file of logs (in ParlaiDialog format) from Mturk task 1 with
    turn-by-turn quality ratings 1-5
    output: file of episodes (self-feeding format) w/ +1/-1 ratings indicating
    positive/negative example
    """
    examples = []
    num_episodes = 0
    num_parleys = 0
    for episode in extract_parlai_episodes(opt['infile']):
        num_episodes += 1
        history = []
        for parley in episode:
            num_parleys += 1
            # Update history (not including stock control flow responses)
            if parley.context.startswith((INITIAL_PROMPT, NEWTOPIC)):
                # a prompt, first utterance
                # Begin history
                history = [parley.response]
                # NOTE: we now allow these one-utterance episodes to be examples
            elif parley.context.startswith((EXP_REQUEST, RAT_REQUEST)):
                asked_for_exp = parley.context.startswith(EXP_REQUEST)
                # If 'filter_accusation' is on and the last example added was a human,
                # toss the previous example, which is when the human expressed
                # dissatisfaction
                if (opt['mode'] == 'human'
                        and opt['filter_accusation']
                        and asked_for_exp
                        and examples):
                    examples.pop()
                # If 'filter_mistake' is on and the last example in the queue was a bot,
                # toss it too, since that's when the bot messed up
                if (opt['mode'] == 'bot'
                        and opt['filter_mistake']
                        and asked_for_exp
                        and examples):
                    examples.pop()
                # Asked for y_exp or rating, got it
                # Messed up, so blast history
                history = []
                continue
            elif CONTINUE in parley.context:
                # if response was negative, history will get blasted in EXP_REQUEST
                # if we're here, response was neutral/positive, so continue the history
                history.append(parley.context[parley.context.rindex(':') + 1:])
                history.append(parley.response)
            else:
                # normal turn: maintain the history
                history.append(parley.context)
                history.append(parley.response)

            if opt['mode'] == 'bot' and len(history) >= 2:
                if len(history) == 2:
                    example = Parley(context='__null__', response=history[0])
                else:
                    example = Parley(
                        context=add_person_tokens(history[:-2], last_speaker=1),
                        response=history[-2],  # What the bot said
                    )
                examples.append(example)

            if opt['mode'] == 'human':
                if len(history) == 1:
                    example = Parley(context='__null__', response=history[0])
                else:
                    example = Parley(
                        # this is not technically true:
                        # the last speaker was the bot (__p2__),
                        # not the human (__p1__), but in all our data, __p1__ is always
                        # the speaking partner of the learner
                        context=add_person_tokens(history[:-1], last_speaker=1),
                        response=history[-1],  # What the bot said
                    )
                examples.append(example)

    with PathManager.open(opt['outfile'], 'w') as outfile:
        for ex in examples:
            outfile.write(json.dumps(ex.to_dict()) + '\n')

    print(f"Extracted {len(examples)} examples out of {num_episodes} episodes "
          f"({num_parleys} parleys) and wrote them to {opt['outfile']} with "
          f"histsz == {opt['history_size']}.")