def get_training_instances(listener=False):
    '''Build a shuffled list of training Instances from the Munroe corpus.

    If `listener` is true, each instance maps a color name to a color;
    otherwise it maps a color to its name.
    '''
    h, s, v = munroecorpus.get_training_handles()
    insts = []
    for name in h:
        for color in load_colors(h[name], s[name], v[name]):
            if listener:
                insts.append(Instance(input=name, output=color))
            else:
                insts.append(Instance(input=color, output=name))
    rng.shuffle(insts)
    return insts
def get_eval_instances(handles, listener=False):
    '''Build a shuffled list of evaluation Instances from open datafile handles.

    `handles` maps color names to datafile handles. If `listener` is true,
    instances map names to color tuples; otherwise color tuples to names.
    '''
    # .items() instead of the Python 2-only .iteritems(): this file already
    # uses Python 3 features (f-strings) elsewhere, and .items() works on both.
    insts = [(Instance(input=name, output=tuple(color))
              if listener else
              Instance(input=tuple(color), output=name))
             for name, handle in handles.items()
             for color in munroecorpus.open_datafile(handle)]
    rng.shuffle(insts)
    return insts
def triples_to_insts(data, listener=False):
    '''Convert (name, color, context) triples into Instances.

    Listener instances map name -> color with the context as alternative
    outputs; speaker instances map color -> name with the context as
    alternative inputs.
    '''
    insts = []
    for name, color, context in data:
        if listener:
            insts.append(Instance(input=name, output=color, alt_outputs=context))
        else:
            insts.append(Instance(input=color, alt_inputs=context, output=name))
    return insts
def pairs_to_insts(data, listener=False):
    '''Convert (name, color) pairs into Instances, with direction set by `listener`.'''
    insts = []
    for name, color in data:
        if listener:
            insts.append(Instance(input=name, output=color))
        else:
            insts.append(Instance(input=color, output=name))
    return insts
def trials_to_insts(trials, listener=False):
    '''Convert trial objects into Instances.

    Each trial contributes one instance pairing its descriptions (plus
    sorted attribute annotations) with the indices of its target entities,
    over the full tuple of entity attribute sets as the context.
    '''
    insts = []
    for trial in trials:
        descs = tuple(d.string_description for d in trial.descriptions)
        # Deduplicate and sort each description's attributes for a canonical form.
        desc_attrs = tuple(tuple(sorted({str(a) for a in d.attribute_set}))
                           for d in trial.descriptions)
        target_ids = tuple(idx for idx, ent in enumerate(trial.entities)
                           if ent.is_target())
        referents = tuple(tuple(str(a) for a in ent.attributes)
                          for ent in trial.entities)
        if listener:
            inst = Instance(input=descs, annotated_input=desc_attrs,
                            output=target_ids, alt_outputs=referents,
                            source=trial.filenames)
        else:
            inst = Instance(input=target_ids, alt_inputs=referents,
                            output=descs, annotated_output=desc_attrs,
                            source=trial.filenames)
        insts.append(inst)
    return insts
def hawkins_target(listener=False):
    '''Like hawkins_context, but resolve the target index to its actual color.

    The full original instance dict is kept as each new instance's source.
    '''
    converted = []
    for inst in hawkins_context(listener=listener):
        if listener:
            new_inst = Instance(output=inst.alt_outputs[inst.output],
                                input=inst.input,
                                source=inst.__dict__)
        else:
            new_inst = Instance(input=inst.alt_inputs[inst.input],
                                output=inst.output,
                                source=inst.__dict__)
        converted.append(new_inst)
    return converted
def filtered(listener=False):
    '''Load (and cache in module globals) the filtered human corpus.

    Returns listener-direction instances (message -> target index over a
    color context) when `listener` is true, otherwise their inverted
    speaker-direction counterparts. Also populates FILTERED_SPLIT with the
    dataset indices where each game id in FILTERED_SPLIT_IDS begins.
    '''
    global FILTERED_DATASET_LISTENER, FILTERED_DATASET_SPEAKER, FILTERED_SPLIT
    # Return the cached datasets if this has already been loaded.
    if FILTERED_DATASET_LISTENER is not None:
        if listener:
            return FILTERED_DATASET_LISTENER
        else:
            return FILTERED_DATASET_SPEAKER
    FILTERED_SPLIT = []
    FILTERED_DATASET_LISTENER = []
    # Maps (gameid, roundNum) -> index of that round's instance in the
    # dataset. NOTE(review): declared as defaultdict(list) but only ever
    # assigned int indices; a plain dict would suffice.
    instances = defaultdict(list)
    with open('behavioralAnalysis/humanOutput/filteredCorpus.csv', 'r') as infile:
        for row in csv.DictReader(infile):
            # Only speaker utterances become instance inputs.
            if row['role'] != 'speaker':
                continue
            key = (row['gameid'], row['roundNum'])
            # Record a split boundary the first time each split game id appears.
            if len(FILTERED_SPLIT) < len(FILTERED_SPLIT_IDS) and \
               key[0] == FILTERED_SPLIT_IDS[len(FILTERED_SPLIT)]:
                FILTERED_SPLIT.append(len(FILTERED_DATASET_LISTENER))
            message = row['contents']
            if key in instances:
                # Same round seen again: append this message to the existing
                # instance's input (joined with ' ~ ') by rebuilding it.
                current_dict = dict(
                    FILTERED_DATASET_LISTENER[instances[key]].__dict__)
                current_dict['input'] = ' ~ '.join(
                    (current_dict['input'], message))
                FILTERED_DATASET_LISTENER[instances[key]] = Instance(
                    **current_dict)
                continue
            instances[key] = len(FILTERED_DATASET_LISTENER)
            target_idx, alt_colors = context_from_row(row)
            FILTERED_DATASET_LISTENER.append(
                Instance(input=message,
                         output=target_idx,
                         alt_outputs=alt_colors,
                         source=key + (row['condition'], )))
    # Speaker-direction dataset is the elementwise inversion of the listener one.
    FILTERED_DATASET_SPEAKER = [
        inst.inverted() for inst in FILTERED_DATASET_LISTENER
    ]
    if listener:
        return FILTERED_DATASET_LISTENER  # [:206]
    else:
        return FILTERED_DATASET_SPEAKER
def reference_game(insts, gen_func, listener=False):
    '''Convert plain instances into reference-game instances in place.

    For each instance, samples `options.num_distractors` distractor colors
    with `gen_func` and inserts the true color at a random position; the
    instance is replaced with one whose output (listener) or input (speaker)
    is the index of the true color within that context.

    Destructively replaces each element of `insts` and returns the same list.

    NOTE(review): this function is shadowed by a later `reference_game`
    definition in this module that builds a new list instead of mutating.
    '''
    options = config.options()
    # enumerate instead of range(len(...)): same in-place replacement, clearer.
    for i, inst in enumerate(insts):
        color = inst.output if listener else inst.input
        distractors = [gen_func(color) for _ in range(options.num_distractors)]
        # Random position for the true color among the distractors.
        answer = rng.randint(0, len(distractors) + 1)
        context = distractors[:answer] + [color] + distractors[answer:]
        if listener:
            insts[i] = Instance(inst.input, answer, alt_outputs=context)
        else:
            insts[i] = Instance(answer, inst.output, alt_inputs=context)
    return insts
def reference_game(insts, gen_func, listener=False):
    '''Build reference-game instances from plain instances.

    Samples distractor colors with `gen_func`, places the true color at a
    random index in the context, and returns a new list of instances whose
    output (listener) or input (speaker) is that index.
    '''
    options = config.options()
    num_distractors = options.num_distractors
    converted = []
    for inst in insts:
        target = inst.output if listener else inst.input
        distractors = [gen_func(target) for _ in range(num_distractors)]
        answer = rng.randint(0, len(distractors) + 1)
        context = distractors[:answer] + [target] + distractors[answer:]
        if listener:
            converted.append(Instance(inst.input, answer, alt_outputs=context))
        else:
            converted.append(Instance(answer, inst.output, alt_inputs=context))
    return converted
def hawkins_context(listener=False, speakerID='speaker', suffix=''):
    '''Load reference-game instances from the Hawkins message and click CSVs.

    Messages sent by `speakerID` are grouped per (gameid, roundNum) and
    joined with ' ~ '; each click row contributes one instance pairing the
    joined message with the target index over the three context colors.
    Duplicate (gameid, roundNum) click rows after the first are skipped.
    '''
    messages = defaultdict(list)
    with open('hawkins_data/colorReferenceMessage%s.csv' % suffix,
              'r') as infile:
        for row in csv.DictReader(infile):
            if row['sender'] == speakerID:
                # NOTE(review): str.decode is Python 2-only; this line would
                # fail on Python 3 csv output — confirm intended interpreter.
                message = row['contents'].decode(
                    'utf-8')  # TODO: clean, tokenize?
                messages[(row['gameid'], row['roundNum'])].append(message)
    result = []
    with open('hawkins_data/colorReferenceClicks%s.csv' % suffix,
              'r') as infile:
        reader = csv.DictReader(infile)
        seen = set()
        for row in reader:
            key = (row['gameid'], row['roundNum'])
            # Keep only the first click row per round.
            if key in seen:
                # print('Duplicate key: %s' % (key,))
                continue
            seen.add(key)
            # (hsv color, location, status) for each of the three patches.
            context = [(hsl_to_hsv(
                (row['%sColH' % i], row['%sColS' % i], row['%sColL' % i])),
                        row['%sLocS' % i], row['%sStatus' % i])
                       for i in ('click', 'alt1', 'alt2')]
            # Order patches by their on-screen location field.
            context.sort(key=lambda c: c[1])
            target_idx = [
                i for i, (_, _, status) in enumerate(context)
                if status == 'target'
            ]
            # Exactly one patch must be marked as the target.
            assert len(target_idx) == 1, context
            target_idx = target_idx[0]
            alt_colors = [c for (c, _, _) in context]
            message = ' ~ '.join(messages[key])
            # For the Chinese filtered corpus, drop rounds with no real text.
            if suffix == 'Chinese_filtered' and not message.replace(
                    '~', '').strip():
                continue
            if listener:
                inst = Instance(input=message,
                                output=target_idx,
                                alt_outputs=alt_colors,
                                source=key + (row['condition'], ))
            else:
                inst = Instance(input=target_idx,
                                alt_inputs=alt_colors,
                                output=message,
                                source=key + (row['condition'], ))
            result.append(inst)
    return result
def bilingual_tag_instance(inst, lang, listener=False, unicodify=False):
    '''Return a copy of `inst` with a language tag attached to one side.

    The listener side tags the input, the speaker side tags the output.
    Strings become 'lang:value'; non-strings become a (lang, value) pair.
    With `unicodify`, the tagged side must be a string and is converted to
    unicode first (Python 2 semantics).
    '''
    new_inp = inst.input
    new_out = inst.output
    tagged = new_inp if listener else new_out
    if unicodify:
        assert isinstance(tagged, basestring), repr(tagged)
        tagged = unicode(tagged)
    if isinstance(tagged, basestring):
        tagged = ':'.join((lang, tagged))
    else:
        tagged = (lang, tagged)
    if listener:
        new_inp = tagged
    else:
        new_out = tagged
    return Instance(input=new_inp, output=new_out,
                    alt_inputs=inst.alt_inputs,
                    alt_outputs=inst.alt_outputs,
                    source=inst.source)
def dataset(filename):
    '''Yield Instances from a JSON-lines file (gzipped if it ends in .gz).

    Yields nothing when `filename` is falsy.
    '''
    if not filename:
        return
    open_fn = gzip.open if filename.endswith('.gz') else open
    with open_fn(filename, 'r') as infile:
        for line in infile:
            record = json.loads(line.strip())
            yield Instance(**record)
def hawkins_context(listener=False):
    '''Load reference-game instances from the Hawkins message and click CSVs.

    Speaker messages are grouped per (gameid, roundNum) and joined with
    ' ~ '; each click row contributes one instance pairing that message
    with the target index over the three context colors.
    '''
    messages = defaultdict(list)
    with open('hawkins_data/colorReferenceMessage.csv', 'r') as infile:
        for row in csv.DictReader(infile):
            if row['sender'] == 'speaker':
                message = row['contents']  # TODO: clean, tokenize?
                messages[(row['gameid'], row['roundNum'])].append(message)
    result = []
    with open('hawkins_data/colorReferenceClicks.csv', 'r') as infile:
        reader = csv.DictReader(infile)
        for row in reader:
            # (hsv color, location, status) for each of the three patches.
            context = [(hsl_to_hsv(
                (row['%sColH' % i], row['%sColS' % i], row['%sColL' % i])),
                        row['%sLocS' % i], row['%sStatus' % i])
                       for i in ('click', 'alt1', 'alt2')]
            # Order patches by their on-screen location field.
            context.sort(key=lambda c: c[1])
            target_idx = [
                i for i, (_, _, status) in enumerate(context)
                if status == 'target'
            ]
            # Exactly one patch must be marked as the target.
            assert len(target_idx) == 1, context
            target_idx = target_idx[0]
            alt_colors = [c for (c, _, _) in context]
            key = (row['gameid'], row['roundNum'])
            message = ' ~ '.join(messages[key])
            if listener:
                inst = Instance(input=message,
                                output=target_idx,
                                alt_outputs=alt_colors,
                                source=key)
            else:
                inst = Instance(input=target_idx,
                                alt_inputs=alt_colors,
                                output=message,
                                source=key)
            result.append(inst)
    return result
def cycle_shuffled(insts):
    '''
    A generator that cycles through insts, but in a random order each
    time through the list.

    Note: destructively modifies the order of the list! (but not the
    instances themselves)
    '''
    first_pass = True
    while insts:
        for inst in insts:
            if not first_pass:
                # After the first pass, yield a copy tagged as a repeat
                # in its source rather than the original instance.
                inst = Instance(**inst.__dict__)
                if isinstance(inst.source, tuple):
                    inst.source = inst.source + ('repeat', )
                elif isinstance(inst.source, dict):
                    inst.source = dict(inst.source)
                    inst.source['repeat'] = True
                elif inst.source is None:
                    inst.source = {'repeat': True}
            yield inst
        rng.shuffle(insts)
        first_pass = False
def cycle_shuffled(insts):
    '''
    A generator that cycles through insts, but in a random order each time
    through the list.

    Note: destructively modifies the order of the list! (but not the
    instances themselves)
    '''
    repeat = False
    while insts:
        for inst in insts:
            if repeat:
                # From the second pass on, yield a copy whose source is
                # tagged as a repeat instead of the original instance.
                inst = Instance(**inst.__dict__)
                if isinstance(inst.source, tuple):
                    inst.source += ('repeat',)
                elif isinstance(inst.source, dict):
                    # Copy the dict so the original instance is untouched.
                    inst.source = dict(inst.source)
                    inst.source['repeat'] = True
                elif inst.source is None:
                    inst.source = {'repeat': True}
            yield inst
        rng.shuffle(insts)
        repeat = True
def next_action():
    '''Load (and cache in module globals) the next-action prediction dataset.

    Each instance pairs the dialogue so far for a round (joined with ' ~ ',
    listener turns prefixed with '| ') with the action taken on the
    following row (ACTION_CHOOSE / ACTION_SPEAK / ACTION_NONE), over that
    round's color context. Also records split boundaries in
    NEXT_ACTION_SPLIT, mirroring filtered().
    '''
    global NEXT_ACTION_DATASET, NEXT_ACTION_SPLIT
    # Return the cached dataset if this has already been loaded.
    if NEXT_ACTION_DATASET is not None:
        return NEXT_ACTION_DATASET
    NEXT_ACTION_SPLIT = []
    NEXT_ACTION_DATASET = []
    previous = []
    prev_key = None
    prev_context = None
    with open('behavioralAnalysis/humanOutput/filteredCorpus.csv',
              'r') as infile:
        for row in csv.DictReader(infile):
            key = (row['gameid'], row['roundNum'])
            # Record a split boundary the first time each split game id appears.
            if len(NEXT_ACTION_SPLIT) < len(FILTERED_SPLIT_IDS) and \
               key[0] == FILTERED_SPLIT_IDS[len(NEXT_ACTION_SPLIT)]:
                NEXT_ACTION_SPLIT.append(len(NEXT_ACTION_DATASET))
            # Label the *previous* state with the action this row represents:
            # a new round means the previous round ended with a choice;
            # a listener message here means the next move was to speak.
            if key != prev_key:
                action = ACTION_CHOOSE
            elif row['role'] == 'listener':
                action = ACTION_SPEAK
            else:
                action = ACTION_NONE
            # Listener turns are marked with a '| ' prefix in the transcript.
            new_message = ('| ' if row['role'] == 'listener'
                           else '') + row['contents']
            prev_message = ' ~ '.join(previous)
            context = context_from_row(row)
            if prev_key is not None:
                # NOTE(review): target_idx is unpacked but unused; only the
                # context colors go into the instance.
                target_idx, alt_colors = prev_context
                NEXT_ACTION_DATASET.append(
                    Instance(input=prev_message,
                             output=action,
                             alt_outputs=alt_colors,
                             source=prev_key + (row['condition'],
                                                len(previous))))
            if key != prev_key:
                previous = []
            previous.append(new_message)
            prev_key = key
            prev_context = context
    # NOTE(review): the final round's state is never emitted (no flush after
    # the loop) — confirm this is intentional.
    return NEXT_ACTION_DATASET
def sample(model_pkl_file, device, insts_file):
    '''Load a pickled model and print one sampled prediction per instance.

    `insts_file` is a JSON-lines file of Instance kwargs; predictions are
    sampled (random=True) under the given device context and printed as JSON.
    '''
    dev_insts = []
    with open(insts_file, 'r') as infile:
        for line in infile:
            line = line.strip()
            if line:
                dev_insts.append(Instance(**json.loads(line)))
    with thutils.device_context(device):
        with open(model_pkl_file, 'rb') as infile:
            # NOTE: unpickling executes arbitrary code; only load trusted
            # model files.
            model = pickle.load(infile)
        # Removed a leftover `import pdb; pdb.set_trace()` debugger
        # breakpoint that halted every run here.
        samples = model.predict(dev_insts, random=True, verbosity=0)
        for _, sample in zip(dev_insts, samples):
            print(json.dumps(sample))
def sample_unicode(model_pkl_file, device):
    '''Load a pickled model and print sampled outputs for unicode dev instances.

    Reads up to 256 instances from data/unicode_dev.json (JSON lines, one
    Instance kwargs dict per line); each input is a hex codepoint, printed
    alongside its character and the model's sampled prediction.
    '''
    dev_insts = []
    with open('data/unicode_dev.json', 'r') as infile:
        for raw_line in infile:
            stripped = raw_line.strip()
            if stripped:
                dev_insts.append(Instance(**json.loads(stripped)))
    dev_insts = dev_insts[:256]
    with thutils.device_context(device):
        with open(model_pkl_file, 'rb') as infile:
            model = pickle.load(infile)
        samples = model.predict(dev_insts, random=True)
        for inst, sample in zip(dev_insts, samples):
            char = chr(int(inst.input, 16))
            print(f'{char} U+{inst.input} {sample}')
def get_input_instance(self, game, dialogue, invert=False):
    '''Build the model input Instance from a game state and dialogue history.

    The input starts with interleaved item counts and rewards, followed by
    each dialogue entry terminated with '<eos>' (the final terminator is
    dropped). With `invert`, rewards are inferred for the other player and
    the YOU:/THEM: speaker tags are swapped.
    '''
    if invert:
        # NOTE(review): reads self.dialogue rather than the `dialogue`
        # parameter here — confirm this is intentional.
        rewards = self.infer_their_rewards(game, self.dialogue)
    else:
        rewards = game[1]
    counts = game[0]
    pieces = [f'{counts[0]} {rewards[0]} {counts[1]} {rewards[1]} '
              f'{counts[2]} {rewards[2]}']
    for entry in dialogue:
        if invert:
            # Swap speaker tags via a temporary placeholder.
            entry = (entry.replace('YOU:', 'XYOU:')
                          .replace('THEM:', 'YOU:')
                          .replace('XYOU:', 'THEM:'))
        pieces.append(f'{entry} <eos>')
    joined = ' '.join(pieces)
    if dialogue:
        # Drop the trailing end-of-sentence marker.
        joined = joined[:-len(' <eos>')]
    result = Instance(joined, '')
    if self.options.verbosity >= 6:
        print(result.__dict__)
    return result
def score_all(self, colors, descriptions, format='rgb'):
    '''
    Return a list of log probabilities (base e) for the descriptions in
    `descriptions`, conditioned on the corresponding colors in `colors`,
    which are expressed in the colorspace given by `format` (one of 'rgb',
    'hsv', 'hsl'). `descriptions` and `colors` must have the same length.

    >>> cd = ColorDescriber()
    >>> cd.score_all([(255, 0, 0), (0, 0, 255)], ['red', 'blue'])  # doctest: +ELLIPSIS
    [-0.23..., -0.26...]
    '''
    converters = {
        'hsv': lambda c: c,
        'hsl': hsl_to_hsv,
        'rgb': rgb_to_hsv,
    }
    to_hsv = converters[format]
    insts = [Instance(to_hsv(color), desc)
             for color, desc in zip(colors, descriptions)]
    return self.model.score(insts)
def describe_all(self, colors, format='rgb', sample=False):
    '''
    Return a list of descriptions, one for each color in `colors`, which
    is expressed in the colorspace given by `format` (one of 'rgb', 'hsv',
    'hsl'). If `sample` is `True`, return descriptions sampled from the
    model's probability distribution; otherwise return the most likely,
    common descriptions.

    >>> cd = ColorDescriber()
    >>> cd.describe_all([(255, 0, 0), (0, 0, 255)])
    ['red', 'blue']
    '''
    converters = {
        'hsv': lambda c: c,
        'hsl': hsl_to_hsv,
        'rgb': rgb_to_hsv,
    }
    to_hsv = converters[format]
    insts = [Instance(to_hsv(color)) for color in colors]
    return self.model.predict(insts, random=sample)
def hawkins_context(listener=False):
    '''Load speaker-direction contexts from the Hawkins clicks CSV.

    Each row yields an instance whose input is the target's index within
    the three context colors (sorted by on-screen location) and whose
    output is empty. Only the speaker direction is supported.
    '''
    assert not listener
    instances = []
    with open('hawkins_data/colorReferenceClicks.csv', 'r') as infile:
        for row in csv.DictReader(infile):
            entries = []
            for role in ('click', 'alt1', 'alt2'):
                hsv = hsl_to_hsv((row['%sColH' % role],
                                  row['%sColS' % role],
                                  row['%sColL' % role]))
                entries.append((hsv, row['%sLocS' % role],
                                row['%sStatus' % role]))
            # Order patches by their on-screen location field.
            entries.sort(key=lambda e: e[1])
            targets = [idx for idx, (_, _, status) in enumerate(entries)
                       if status == 'target']
            assert len(targets) == 1, entries
            alt_colors = [color for (color, _, _) in entries]
            instances.append(Instance(input=targets[0],
                                      alt_inputs=alt_colors,
                                      output=''))
    return instances
def score_all(self, colors, descriptions, format='rgb'):
    '''
    Return a list of log probabilities (base e) for the descriptions in
    `descriptions`, conditioned on the corresponding colors in `colors`,
    which are expressed in the colorspace given by `format` (one of 'rgb',
    'hsv', 'hsl'). `descriptions` and `colors` must have the same length.

    >>> cd = ColorDescriber()
    >>> cd.score_all([(255, 0, 0), (0, 0, 255)], ['red', 'blue'])  # doctest: +ELLIPSIS
    [-0.23..., -0.26...]
    '''
    convert = {
        'hsv': (lambda c: c),
        'hsl': hsl_to_hsv,
        'rgb': rgb_to_hsv,
    }[format]
    insts = [Instance(convert(c), d) for c, d in zip(colors, descriptions)]
    return self.model.score(insts)
def foobar_train():
    '''Return a trivial training set: 1000 distinct foo -> bar instances.'''
    insts = []
    for _ in range(1000):
        insts.append(Instance(input='foo', output='bar'))
    return insts
def process_all(self, inputs):
    '''Wrap each raw input in an Instance and return the model's predictions.'''
    wrapped = [Instance(inp) for inp in inputs]
    return self.model.predict(wrapped, verbosity=0)
def _responses(insts):
    '''Keep only the response half (element 0) of each instance's output pair.'''
    projected = []
    for inst in insts:
        projected.append(Instance(inst.input, inst.output[0],
                                  source=inst.source))
    return projected
def _selections(insts):
    '''Keep only the selection half (element 1) of each instance's output pair.'''
    projected = []
    for inst in insts:
        projected.append(Instance(inst.input, inst.output[1],
                                  source=inst.source))
    return projected
def fill_score_instance(self, inst, rewards, counts):
    '''Return a copy of `inst` whose output interleaves counts and rewards.'''
    fields = dict(inst.__dict__)
    fields['output'] = (f'{counts[0]} {rewards[0]} '
                        f'{counts[1]} {rewards[1]} '
                        f'{counts[2]} {rewards[2]}')
    return Instance(**fields)
def json_file_test(listener='ignored'):
    '''Load test Instances from the JSON-lines file named by --test_data_file.

    The `listener` argument is accepted for interface compatibility and
    ignored.
    '''
    options = config.options()
    with open(options.test_data_file, 'r') as infile:
        records = [json.loads(line.strip()) for line in infile]
    return [Instance(**record) for record in records]
# Quick smoke-test script: load a pickled speaker model and score one
# hand-built (color, description) instance.
# NOTE(review): cPickle is Python 2-only; under Python 3 this would be
# `import pickle`.
import cPickle as pickle
from stanza.research.instance import Instance

if __name__ == '__main__':
    with open('runs/speaker_fourier_3d0L/quickpickle.p', 'rb') as infile:
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        model = pickle.load(infile)
    # Score a single HSV color paired with the description 'green'.
    print(model.score([Instance((120., 100., 100.), 'green')]))