def main():
    """
    Generate an endless stream of sentences, alternating bad and good.

    example:
    "sarah discovered the vase that the dog might take ." vs.
    "sarah discovered what the dog might take the vase ."

    Each pass of the loop draws one set of slot fillers and yields four
    sentences: bad then good for template1, followed by bad then good for
    template2 (both templates are defined outside this function and are
    indexed by 'b' and 'g').
    """

    nouns_s = get_legal_words(tag='NN')

    excluded_verbs_base = ('put', 'run', 'say', 'be', 'give', 'tell', 'live')
    verbs_base = get_legal_words(tag='VB', exclude=excluded_verbs_base)

    excluded_verbs_past = ('started', 'let', 'told')
    verbs_past = get_legal_words(tag='VBD', exclude=excluded_verbs_past)

    # read the word lists inside "with" so that the file handles are closed
    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_, min_size=10, max_size=len(names_))

    with (configs.Dirs.legal_words / 'animates.txt').open() as f:
        animates_ = f.read().split()
    animates = find_counterbalanced_subset(animates_, min_size=8, max_size=len(animates_))

    # base-form verbs that are followed by a preposition in the templates
    vb2vb_with_preposition = {
        'play': 'play with',
        'point': 'point to',
        'turn': 'turn to',
        'work': 'work with',
    }

    def add_preposition_after_vb(v: str):
        """Return the verb followed by its preposition, or the verb unchanged."""
        return vb2vb_with_preposition.get(v, v)

    while True:

        # random choices
        slot2filler = {
            'name': random.choice(names),
            'nn1': random.choice(nouns_s),
            'nn2': random.choice(animates),
            'vbd': random.choice(verbs_past),
            'vbd2': random.choice(verbs_past),  # used in template2 only
            'vb': random.choice(verbs_base),  # used in template 1 only
        }

        slot2filler['vb'] = add_preposition_after_vb(slot2filler['vb'])

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good
def main():
    """
    Generate an endless stream of sentences, alternating bad and good.

    example:
    "sam found one purple dog and karen revealed more ." vs.
    "sam found one dog and karen revealed more purple."

    Each pass of the loop draws one set of slot fillers and yields four
    sentences: bad then good for template1, followed by bad then good for
    template2 (both templates are defined outside this function and are
    indexed by 'b' and 'g').
    """

    vbds = get_legal_words(tag='VBD')
    adjectives = get_legal_words(tag='JJ')

    # read the word lists inside "with" so that the file handles are closed
    with (configs.Dirs.legal_words / 'nouns_mass.txt').open() as f:
        nouns_mass = f.read().split()
    # mass nouns are excluded from the singular nouns
    nouns_s = get_legal_words(tag='NN', exclude=tuple(nouns_mass))

    with (configs.Dirs.legal_words / 'animates.txt').open() as f:
        animates_ = f.read().split()
    animates = find_counterbalanced_subset(animates_, min_size=8, max_size=len(animates_))

    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_, min_size=10, max_size=len(names_))

    determiners = ['a', 'the', 'this', 'some', 'that'] + ['your', 'his', 'her']
    number_words = ['several', 'more', 'two', 'three', 'a lot more']

    while True:

        # random choices
        slot2filler = {
            'name1': random.choice(names),
            'name2': random.choice(names),
            'nn': random.choice(nouns_s),
            'nn2': random.choice(animates),
            'vbd': random.choice(vbds),
            'det': random.choice(determiners),
            'jj': random.choice(adjectives),
            'number': random.choice(number_words),
        }

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good
def get_legal_words(tag: str,
                    second_tag: Optional[str] = None,  # also counterbalance list of other word forms (e.g. plural)
                    seed: int = configs.Data.seed,
                    exclude: Optional[Tuple[str, ...]] = None,
                    verbose: bool = False,
                    ) -> Union[List[str], List[Tuple[str, str]]]:
    """
    Load the words marked legal for a tag and return a counterbalanced subset.

    The words are read from "<tag>.csv" in the legal-words directory; only rows
    whose "is_legal" column is truthy are kept, and any word in "exclude" is dropped.

    When "second_tag" is given, a second form is derived for every word
    ('NNP' -> plural via inflect, 'VB*' -> inflection of the verb lemma).
    Words whose second form is missing from the vocab, or equal to the
    first form, are dropped.

    The surviving words (and second forms, if any) are handed to
    find_counterbalanced_subset(), whose result is returned as-is.

    Raises AttributeError if "second_tag" is neither None, 'NNP', nor a 'VB*' tag.
    """

    print(f'Obtaining counterbalanced subset of legal words with tag={tag} and second_tag={second_tag}')

    # load the table for the requested tag, and keep the legal words in file order
    legal_table = pd.read_csv(configs.Dirs.legal_words / f'{tag}.csv')
    legal_mask = legal_table['is_legal'].astype(bool).tolist()
    candidates = legal_table['word'][legal_mask].tolist()

    # drop explicitly excluded words, if any were requested
    if exclude:
        candidates = [word for word in candidates if word not in exclude]

    if second_tag is None:
        # nothing to pair up: use the candidates directly
        first_forms = candidates
        second_forms = None
    else:
        # derive the second form of each candidate
        if second_tag == 'NNP':
            pluralizer = inflect.engine()
            alternates = [pluralizer.plural(word) for word in candidates]
        elif second_tag.startswith('VB'):
            lemmas = [getLemma(word, upos='VERB')[0] for word in candidates]
            alternates = [getInflection(lemma, tag=second_tag)[0] for lemma in lemmas]  # requires lemma as input
        else:
            raise AttributeError('Invalid arg to second_tag')

        # keep a pair only if the 2nd form is in the vocab and differs from the 1st form
        first_forms = []
        second_forms = []
        for w1, w2 in zip(candidates, alternates):
            if w2 not in vocab or w2 == w1:
                continue
            first_forms.append(w1)
            second_forms.append(w2)
            if verbose:
                print(f'Included {w1:<12} and {w2:<12}')

        assert first_forms
        assert second_forms

    # find subset of words such that their total corpus frequencies are approx equal across corpora
    sample_size = configs.Data.tag2num_words[tag]
    return find_counterbalanced_subset(first_forms,
                                       min_size=sample_size,
                                       max_size=sample_size+100,
                                       second_forms=second_forms,
                                       seed=seed,
                                       verbose=verbose,
                                       )
def main():
    """
    Generate an endless stream of sentences, alternating bad and good.

    example:
    "he made the van this challenge ." vs.
    "the van made he this challenge ."

    Each pass of the loop draws one set of slot fillers and yields two
    sentences: bad then good for template1 (defined outside this function
    and indexed by 'b' and 'g').
    """

    # hand-picked past-tense verbs
    vbds = [
        'brought',
        'made',
        'built',
        'gave',
        'showed',
    ]

    nouns_s = get_legal_words(tag='NN')

    # read the word list inside "with" so that the file handle is closed
    with (configs.Dirs.legal_words / 'animates.txt').open() as f:
        animates_ = f.read().split()
    animates = find_counterbalanced_subset(animates_, min_size=8, max_size=len(animates_))

    personal_pronouns_obj = ['me', 'him', 'her', 'us', 'them']  # in the objective case
    personal_pronouns_subj = ['i', 'he', 'she', 'we', 'they']  # in the subjective case

    determiners = ['a', 'one', 'this', 'that', 'the', 'my', 'his', 'her']

    vowels = {'a', 'e', 'i', 'o', 'u'}

    while True:

        vbd = random.choice(vbds)  # template 1

        # random choices
        slot2filler = {
            'nn': random.choice(animates),
            'nn2': random.choice(nouns_s),
            'det': random.choice(determiners),
            'prp_obj': random.choice(personal_pronouns_obj),
            'prp_subj': random.choice(personal_pronouns_subj),
            'vbd': vbd,
        }

        # "a" vs. "an", decided by the first letter of the second noun
        if slot2filler['det'] == 'a' and slot2filler['nn2'][0] in vowels:
            slot2filler['det'] += 'n'

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good
def main():
    """
    Generate an endless stream of sentences, alternating bad and good.

    example:
    "a documentary was there looking at dogs ." vs. "there was a documentary looking at dogs ."

    The two sentences of a pair contain the same words; only the position of
    "there", the copula, the quantifier and the subject differs.

    note: this task is too difficult for babyBERTa
    """

    nouns_s_and_p = get_legal_words(tag='NN', second_tag='NNP')
    adjectives = get_legal_words(tag='JJ')

    strong_quantifiers = ['each', 'most', 'all', 'every']
    plural_quantifiers = {'most', 'all'}

    copula_p = ['were', 'are', "were not", "aren't"]
    copula_s = ['was', 'is', "was not", "isn't"]

    vowels = {'a', 'e', 'i', 'o', 'u'}

    # each gerund is followed by a linker, which can be a preposition or determiner phrase
    gerund2linker = {
        'looking': 'like a',
        'becoming': 'some kind of a',
        'falling': 'on the',
        'leaving': 'us by the',
        'eating': 'one piece of this',
        'increasing': 'the size of the',
        'moving': 'to the',
        'opening': 'the door to a',
        'existing': 'without a',
        'containing': 'a',
        'standing': 'on top of a',
        'changing': 'the',
        'surrounding': 'the',
        'adding': 'to the',
        'acting': 'like a',
    }

    # the candidate gerunds are the keys of the table above, in the same order
    all_gerunds = list(gerund2linker)
    gerunds = find_counterbalanced_subset(all_gerunds, min_size=8, max_size=len(all_gerunds))

    while True:

        # random choices
        object_pair = random.choice(nouns_s_and_p)
        adj = random.choice(adjectives)
        quantifier = random.choice(strong_quantifiers)
        subject_pair = random.choice(nouns_s_and_p)
        gerund = random.choice(gerunds)

        noun_s = object_pair[0]
        subj_s, subj_p = subject_pair

        # plural vs. singular copula (and matching subject)
        needs_plural = quantifier in plural_quantifiers
        copula = random.choice(copula_p if needs_plural else copula_s)
        subj1 = subj_p if needs_plural else subj_s

        # "a" vs. "an"
        linker = gerund2linker[gerund]
        if linker.endswith('a') and adj[0] in vowels:
            linker = linker + 'n'

        # the end of the sentence is shared by the bad and the good sentence
        shared_tail = (gerund, linker, adj, noun_s)

        # contrast is about word order
        yield template1.format('there', copula, quantifier, subj1, *shared_tail)  # bad
        yield template1.format(quantifier, subj1, copula, 'there', *shared_tail)  # good
def main():
    """
    Generate an endless stream of sentences, alternating bad and good.

    example:
    "no cat can jump on more than two dogs ." vs.
    "no cat jump on at least two dogs ."

    Each pass of the loop yields two sentences built from template1 (defined
    outside this function): the bad one uses "at least"/"at most",
    the good one uses "more than"/"fewer than". All other words are shared.
    """

    nouns_s_and_p = get_legal_words(tag='NN', second_tag='NNP')

    # read the word lists inside "with" so that the file handles are closed
    with (configs.Dirs.legal_words / "number_words.txt").open() as f:
        number_words_ = f.read().split()
    number_words = find_counterbalanced_subset(number_words_, min_size=6, max_size=len(number_words_))

    quantifiers_g_b = [
        ('more than', 'at least'),
        ('fewer than', 'at most'),
    ]

    with (configs.Dirs.legal_words / 'animates.txt').open() as f:
        animates_ = f.read().split()
    animates = find_counterbalanced_subset(animates_, min_size=8, max_size=len(animates_))

    verbs_ = [
        'become',
        'catch',
        'leave',
        'increase',
        'move',
        'open',
        'exist',
        'contain',
        'stand',
        'change',
        'surround',
        'carry',
        'act',
    ]
    verbs = find_counterbalanced_subset(verbs_, min_size=8, max_size=len(verbs_))

    # a linker can be a preposition or determiner phrase
    verb2linker = {
        'become': None,
        'catch': None,
        'leave': None,
        'increase': 'the size of',
        'move': 'to',
        'open': 'the door to',
        'exist': 'without',
        'contain': None,
        'stand': 'on top of',
        'change': None,
        'surround': None,
        'carry': None,
        'act': 'like',
    }

    auxiliaries = ['can', 'could']

    while True:

        # random choices
        animate = random.choice(animates)
        noun_s, noun_p = random.choice(nouns_s_and_p)
        number_word = random.choice(number_words)
        quantifier_g, quantifier_b = random.choice(quantifiers_g_b)
        verb = random.choice(verbs)
        aux = random.choice(auxiliaries)

        # append the linker, if the verb has one
        verb_and_optional_linker = verb
        if verb2linker[verb] is not None:
            verb_and_optional_linker += ' ' + verb2linker[verb]

        # only "one" takes the singular noun
        if number_word == 'one':
            noun = noun_s
        else:
            noun = noun_p

        yield template1.format(animate, aux, verb_and_optional_linker, quantifier_b, number_word, noun)  # bad
        yield template1.format(animate, aux, verb_and_optional_linker, quantifier_g, number_word, noun)  # good
def main():
    """
    Generate an endless stream of sentences, alternating bad and good.

    example:
    "is the bell ringing ?" vs "is the bell rings ?"

    Each pass of the loop yields four sentences: bad then good for template1
    (VBG vs. VBZ), followed by bad then good for template2 (VB vs. VBZ).
    Both templates are defined outside this function and are indexed by 'b' and 'g'.
    """

    # counterbalance both forms of verb as different forms are the contrast
    vbgs_and_vbzs = get_legal_words(tag='VBG', second_tag='VBZ',
                                    exclude=('facing', 'naming', 'training', 'setting', 'meaning'))
    vbs_and_vbzs = get_legal_words(tag='VB', second_tag='VBZ')

    nouns_s = get_legal_words(tag='NN')

    # read the word lists inside "with" so that the file handles are closed
    with (configs.Dirs.legal_words / 'animates.txt').open() as f:
        animates_ = f.read().split()
    animates = find_counterbalanced_subset(animates_, min_size=8, max_size=len(animates_))

    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_, min_size=10, max_size=len(names_))

    def add_argument_after_vb(v: str,
                              arg1: str,
                              arg2: str,
                              ) -> str:
        """Return the verb followed by the argument(s) it needs, or the verb unchanged."""
        if v in {'saying', 'says', 'say'}:
            return f'{v} something'
        elif v in {'using', 'uses', 'use'}:
            return f'{v} {arg1}'
        elif v in {'telling', 'tells', 'tell'}:
            return f'{v} me about {arg1}'
        elif v in {'making', 'makes', 'make'}:
            return f'{v} {arg1} something'
        elif v in {'planning', 'plans', 'plan'}:
            return f'{v} to do something with {arg1}'
        elif v in {'taking', 'takes', 'take'}:
            return f'{v} {arg1} away'
        elif v in {'giving', 'gives', 'give'}:
            return f'{v} {arg1} {arg2}'
        elif v in {'falling', 'falls', 'fall'}:
            return f'{v} in {arg1}'
        elif v in {'showing', 'shows', 'show'}:
            return f'{v} {arg1} to {arg2}'
        elif v in {'seeing', 'sees', 'see'}:
            # the argument is either a pronoun or already starts with "the",
            # so no extra determiner is inserted (this avoids "how the the dog works")
            return f'{v} how {arg1} works'
        elif v in {'finding', 'finds', 'find'}:
            return f'{v} {arg1}'
        elif v in {'coming', 'comes', 'come'}:
            return f'{v} to {arg1}'
        elif v in {'getting', 'gets', 'get'}:
            return f'{v} {arg1}'
        elif v in {'depending', 'depends', 'depend'}:
            return f'{v} on {arg1}'
        else:
            return v

    # the candidate lists do not change between iterations, so build them once.
    # note: pronouns don't get determiners, but nouns do
    arguments = ['you', 'him', 'her', 'it'] + [f'the {nn}' for nn in nouns_s]
    nouns_and_animates = nouns_s + animates

    while True:

        vbg1, vbz1 = random.choice(vbgs_and_vbzs)  # template 1
        vb2, vbz2 = random.choice(vbs_and_vbzs)  # template 2

        # sample argument once, so that the same argument is used by both bad and good sentences.
        argument1 = random.choice(arguments)
        argument2 = random.choice(arguments)

        # random choices
        slot2filler = {
            'name': random.choice(names),
            'nn': random.choice(nouns_and_animates),
            'vb2': add_argument_after_vb(vb2, argument1, argument2),
            'vbz2': add_argument_after_vb(vbz2, argument1, argument2),
            'vbg1': add_argument_after_vb(vbg1, argument1, argument2),
            'vbz1': add_argument_after_vb(vbz1, argument1, argument2),
        }

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good
def main():
    """
    Generate an endless stream of sentences, alternating bad and good.

    example:
    "there was a documentary about dogs ." vs. "there was each documentary about dogs ."

    Each pass of the loop yields four sentences: bad then good for template1,
    followed by bad then good for template2 (both templates are defined outside
    this function). Within a pair, only the quantifier differs.
    """

    nouns_s_and_p = get_legal_words(tag='NN', second_tag='NNP')
    adjectives = get_legal_words(tag='JJ')

    quantifiers_good = ['a', 'no', 'some', 'many', 'few']
    quantifiers_bad = ['each', 'most', 'all', 'every']

    template1_subjects_s_and_p = [
        ('movie', 'movies'),
        ('book', 'books'),
        ('story', 'stories'),
        ('sign', 'signs'),
    ]

    vowels = {'a', 'e', 'i', 'o', 'u'}

    copula_p = ['were', 'are', "were not", "aren't"]
    copula_s = ['was', 'is', 'was not', "isn't"]

    # read the word list inside "with" so that the file handle is closed
    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_, min_size=10, max_size=len(names_))

    while True:

        # random choices
        noun_s, noun_p = random.choice(nouns_s_and_p)
        adj = random.choice(adjectives)
        quantifier_b = random.choice(quantifiers_bad)
        quantifier_g = random.choice(quantifiers_good)
        subj_s, sub_p = random.choice(template1_subjects_s_and_p)
        name = random.choice(names)

        # plural vs. singular copula
        if quantifier_g in {'some', 'many', 'few'}:
            copula = random.choice(copula_p)
            subj1 = sub_p  # for template 1
            subj2 = noun_p  # for template 2
        else:
            copula = random.choice(copula_s)
            subj1 = subj_s
            subj2 = noun_s

        # "a" vs. "an": decided separately for each template, because the two
        # templates use different subjects (a shared decision can produce "an movie").
        # the bad quantifiers never include "a", so only the good one is adjusted.
        quantifier_g1 = quantifier_g
        quantifier_g2 = quantifier_g
        if quantifier_g == 'a':
            if subj1[0] in vowels:
                quantifier_g1 = 'an'
            if subj2[0] in vowels:
                quantifier_g2 = 'an'

        # prevent double negation.
        # the contracted copulas have no space before "n't" (e.g. "isn't"),
        # so the contraction is removed without a leading space
        if quantifier_g == 'no' and ('not' in copula or "n't" in copula):
            copula = copula.replace(' not', '')
            copula = copula.replace("n't", '')

        yield template1.format(copula, quantifier_b, subj1, adj, noun_p)  # bad
        yield template1.format(copula, quantifier_g1, subj1, adj, noun_p)  # good

        yield template2.format(copula, quantifier_b, subj2, name)  # bad
        yield template2.format(copula, quantifier_g2, subj2, name)  # good
def main():
    """
    Generate an endless stream of sentences, alternating bad and good.

    example:
    "sam questioned the dog that can hurt sara ." vs "sam questioned who the dog can hurt sara."

    Each pass of the loop draws one set of slot fillers and yields four
    sentences: bad then good for template1, followed by bad then good for
    template2 (both templates are defined outside this function and are
    indexed by 'b' and 'g').
    """

    nouns_s_and_p = get_legal_words(tag='NN', second_tag='NNP')

    excluded_verbs_base = ('put', 'run', 'say', 'be', 'give', 'tell', 'live')
    verbs_base = get_legal_words(tag='VB', exclude=excluded_verbs_base)

    excluded_verbs_past = ('started', 'let', 'told')
    verbs_past = get_legal_words(tag='VBD', exclude=excluded_verbs_past)

    excluded_verbs_gerund = ('saying', )
    verbs_gerund = get_legal_words(tag='VBG', exclude=excluded_verbs_gerund)

    # read the word list inside "with" so that the file handle is closed
    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_, min_size=10, max_size=len(names_))

    def add_preposition_after_vb(v: str):
        """Return the verb followed by its preposition, or the verb unchanged."""
        if v in {'acting', 'act'}:
            return f'{v} like'
        elif v in {'standing', 'stand', 'falling', 'fall', 'depending', 'depend'}:
            return f'{v} on'
        elif v in {'asking', 'ask', 'writing', 'write', 'thinking', 'think'}:
            return f'{v} about'
        elif v in {'swimming', 'swim', 'sleeping', 'sleep'}:
            return f'{v} in'
        elif v in {'driving', 'drive', 'coming', 'come', 'related', 'relate'}:
            return f'{v} to'
        elif v in {'flying', 'fly', 'working', 'work'}:
            return f'{v} with'
        else:
            return v

    while True:

        # random choices.
        # note: the singular and the plural noun are drawn independently
        slot2filler = {
            'name': random.choice(names),
            'nn': random.choice(nouns_s_and_p)[0],
            'nns': random.choice(nouns_s_and_p)[1],
            'vbd': random.choice(verbs_past),
            'vbg': random.choice(verbs_gerund),  # used in template2 only
            'vb': random.choice(verbs_base),  # used in template 1 only
        }

        slot2filler['vb'] = add_preposition_after_vb(slot2filler['vb'])
        slot2filler['vbd'] = add_preposition_after_vb(slot2filler['vbd'])
        slot2filler['vbg'] = add_preposition_after_vb(slot2filler['vbg'])

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good
def main():
    """
    Generate an endless stream of sentences, alternating bad and good.

    example:
    "katherine will help herself do something" vs. "katerine will help himself do something"

    Each pass of the loop yields eight sentences: bad then good for each of
    template1 to template4 (all defined outside this function and indexed by
    'b' and 'g'). Negation may be added to the auxiliary, but only after
    templates 1 and 2 have been yielded, so it can only affect templates 3 and 4.
    """

    excluded_verbs_base = ('say', )
    verbs_base = get_legal_words(tag='VB', exclude=excluded_verbs_base)

    nouns_s = get_legal_words(tag='NN')

    # read the word list inside "with" so that the file handle is closed
    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_, min_size=10, max_size=len(names_))

    auxiliaries = ['can', 'could', 'will', 'would', 'must', 'should']

    def add_misc_after_prp(prp: str,
                           v: str,
                           arg1: str,
                           ) -> str:
        """Return the reflexive pronoun followed by whatever the verb requires after it."""
        if v in {'take', }:
            return f'{prp} to {arg1}'
        elif v in {'make', 'give', 'put'}:
            return f'{prp} {arg1}'
        elif v in {'work', }:
            return f'{prp} on {arg1}'
        elif v in {'turn', }:
            return f'{prp} around'
        elif v in {'tell', }:
            return f'{prp} about {arg1}'
        else:
            return prp

    def add_preposition_after_vb(v: str) -> str:
        """Return the verb followed by its preposition, or the verb unchanged."""
        if v in {'work', 'study', 'live'}:
            return f'{v} with'
        elif v in {'point', 'run'}:
            return f'{v} to'
        elif v in {'be', }:
            return f'{v} like'
        else:
            return v

    # the candidate lists do not change between iterations, so build them once.
    # names_m and names_f are defined outside this function
    subjects_m = [name for name in names if name in names_m] + ['he', 'the man', 'a man', 'that man']
    subjects_f = [name for name in names if name in names_f] + ['she', 'the woman', 'a woman', 'that woman']
    arguments_put = ['in danger', 'in this situation']
    arguments_other = [f'the {nn}' for nn in nouns_s]

    while True:

        vb = random.choice(verbs_base)

        # random choices
        slot2filler = {
            'aux': random.choice(auxiliaries),
            'nn_m': random.choice(subjects_m),
            'nn_f': random.choice(subjects_f),
        }

        # sample argument once, so that the same argument is used by both bad and good sentences.
        # note: pronouns don't get determiners, but nouns do
        if vb == 'put':
            argument1 = random.choice(arguments_put)
        else:
            argument1 = random.choice(arguments_other)

        # first, add some miscellaneous component
        slot2filler['prp_m'] = add_misc_after_prp('himself', vb, argument1)
        slot2filler['prp_f'] = add_misc_after_prp('herself', vb, argument1)

        # second, add a preposition
        slot2filler['vb'] = add_preposition_after_vb(vb)

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good

        # use negation only in non-question, in templates 3, and 4
        if random.random() < 0.5:
            slot2filler['aux'] += ' ' + 'not'
        if random.random() < 0.1:
            slot2filler['aux'] = 'did not'

        yield template3['b'].format(**slot2filler)  # bad
        yield template3['g'].format(**slot2filler)  # good

        yield template4['b'].format(**slot2filler)  # bad
        yield template4['g'].format(**slot2filler)  # good
def main():
    """
    Generate an endless stream of sentences, alternating bad and good.

    example:
    "who should sarah hug after shocking the dog ?" vs "who should sarah hug the dog after shocking ?"

    Each pass of the loop draws one set of slot fillers and yields four
    sentences: bad then good for template1, followed by bad then good for
    template2 (both templates are defined outside this function and are
    indexed by 'b' and 'g').

    note: this task is too difficult for babyBERTa
    """

    nouns_s = get_legal_words(tag='NN')

    excluded_verbs_base = ('run', 'say', 'be', 'give', 'tell', 'live', 'force')
    verbs_base = get_legal_words(tag='VB', exclude=excluded_verbs_base)

    excluded_verbs_gerund = ('saying', )
    verbs_gerund = get_legal_words(tag='VBG', exclude=excluded_verbs_gerund)

    # read the word list inside "with" so that the file handle is closed
    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_, min_size=10, max_size=len(names_))

    pps = ['after', 'before', 'while', 'without']

    # verbs that are followed by a preposition in the templates.
    # note: "asking" previously had a second, unreachable entry ("asking for");
    # only the entry that was actually in effect is kept
    vb2vb_with_preposition = {
        'related': 'related to',
        'acting': 'acting like',
        'put': 'put on',
        'work': 'work for',
        'sleeping': 'sleeping in',
        'standing': 'standing on',
        'depending': 'depending on',
        'flying': 'flying over',
        'falling': 'falling on',
        'asking': 'asking about',
        'swimming': 'swimming in',
        'coming': 'coming to',
    }

    def add_preposition_after_vb(v: str):
        """Return the verb followed by its preposition, or the verb unchanged."""
        return vb2vb_with_preposition.get(v, v)

    while True:

        # random choices
        slot2filler = {
            'name': random.choice(names),
            'nn': random.choice(nouns_s),
            'pp': random.choice(pps),
            'vb': random.choice(verbs_base),
            'vbg': random.choice(verbs_gerund),
        }

        slot2filler['vb'] = add_preposition_after_vb(slot2filler['vb'])
        slot2filler['vbg'] = add_preposition_after_vb(slot2filler['vbg'])

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good
def main():
    """
    Generate an endless stream of sentences, alternating bad and good.

    example:
    "give me the frog ." vs. "the frog gives me ."

    Each pass of the loop yields six sentences: bad then good for each of
    template1, template2 and template3 (all defined outside this function
    and indexed by 'b' and 'g').
    """

    # hand-picked verb pairs for template 1 and template 2
    vbs_and_vbzs_1 = [
        ('give', 'gives'),
    ]

    vbs_and_vbzs_2 = [
        ('ask', 'asked'),  # "asks" is not in vocab
        ('tell', 'tells'),
    ]

    prps_obj = ['me', 'you', 'him', 'her', 'them']
    conjunctions = ['when', 'but', 'with', 'and']

    # TODO use a counterbalanced verb list

    # each entry is (verb, good continuation, bad continuation)
    vbzs3_and_continuations = [  # contains a mix of past and present tense forms
        # past tense form
        ('saw', '{prp_obj} there', '{prp_obj} by'),
        ('created', '{prp_obj}', '{det}'),
        ('told', '{prp_obj} about that', '{prp_obj} about {det}'),
        ('wrote', '{prp_obj} something', '{prp_obj} {det}'),
        ('wanted', '{prp_obj}', 'to'),
        ('asked', 'about {prp_obj}', '{prp_obj} about'),
        ('sold', '{prp_obj} that', 'that to'),
        ('changed', '{prp_obj}', '{det}'),
        # present tense form
        ('looks', 'at {prp_obj}', 'at {det}'),
        ('plays', 'with {prp_obj}', 'with {det}'),
        ('thinks', 'about {prp_obj}', 'about {det}'),
        ('moves', 'fast', 'to'),
        ('works', 'well', '{conjunction}'),
    ]

    adjectives = get_legal_words(tag='JJ')
    nouns_s = get_legal_words(tag='NN')

    # read the word list inside "with" so that the file handle is closed
    with (configs.Dirs.legal_words / 'animates.txt').open() as f:
        animates_ = f.read().split()
    animates = find_counterbalanced_subset(animates_, min_size=8, max_size=len(animates_))

    personal_pronouns_obj = ['me', 'him', 'her', 'us', 'them']  # in the objective case
    personal_pronouns_subj = ['i', 'he', 'she', 'we', 'they']  # in the subjective case

    determiners = ['a', 'one', 'the', 'my', 'his', 'some']  # do not include "this" or "that" or "her"

    vowels = {'a', 'e', 'i', 'o', 'u'}

    while True:

        vb1, vbz1 = random.choice(vbs_and_vbzs_1)  # template 1
        vb2, vbz2 = random.choice(vbs_and_vbzs_2)  # template 2
        vbz3, cont_g, cont_b = random.choice(vbzs3_and_continuations)  # template 3

        # good and bad continuations
        prp_obj = random.choice(prps_obj)
        conjunction = random.choice(conjunctions)
        cont_g = cont_g.format(prp_obj=prp_obj)
        cont_b = cont_b.format(prp_obj=prp_obj,
                               det=random.choice(determiners),
                               conjunction=conjunction)

        # random choices
        slot2filler = {
            'det': random.choice(determiners),
            'jj': random.choice(adjectives),
            'nn': random.choice(animates),
            'nn2': random.choice(nouns_s),
            'prp_obj': random.choice(personal_pronouns_obj),
            'prp_subj': random.choice(personal_pronouns_subj),
            'vb1': vb1,
            'vbz1': vbz1,
            'vb2': vb2,
            'vbz2': vbz2,
            'vbz3': vbz3,
            'cont_g': cont_g,
            'cont_b': cont_b,
        }

        # "a" vs. "an", decided by the first letter of the second noun
        if slot2filler['det'] == 'a' and slot2filler['nn2'][0] in vowels:
            slot2filler['det'] += 'n'

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good

        yield template3['b'].format(**slot2filler)  # bad
        yield template3['g'].format(**slot2filler)  # good
def main():
    """
    Generate an endless stream of sentences, alternating bad and good.

    example:
    "who must sarah and the dog kiss ?" vs "who must sarah kiss and the dog ?"

    Each pass of the loop draws one set of slot fillers. Unless the drawn
    verbs form an excluded combination, eight sentences are yielded: bad then
    good for each of template1 to template4 (all defined outside this function
    and indexed by 'b' and 'g'). Excluded combinations yield nothing.
    """

    excluded_verbs_base = ('run', 'be', 'live', 'force', 'order')
    verbs_base = get_legal_words(tag='VB', exclude=excluded_verbs_base)

    excluded_verbs_gerund = ('',)
    verbs_gerund = get_legal_words(tag='VBG', exclude=excluded_verbs_gerund)

    # read the word lists inside "with" so that the file handles are closed
    with (configs.Dirs.legal_words / 'animates.txt').open() as f:
        animates_ = f.read().split()
    animates = find_counterbalanced_subset(animates_, min_size=8, max_size=len(animates_))

    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_, min_size=10, max_size=len(names_))

    # verbs that are followed by a preposition in the templates.
    # note: "work" previously had a second, unreachable entry (verb + pronoun);
    # only the entry that was actually in effect is kept
    vb2vb_with_preposition = {
        'related': 'related to',
        'put': 'put on',
        'work': 'work for',
        'acting': 'acting like',
        'sleeping': 'sleeping in',
        'falling': 'falling on',
        'looking': 'looking for',
        'running': 'running to',
        'talking': 'talking about',
        'thinking': 'thinking about',
        'reaching': 'reaching for',
    }

    def add_preposition_after_vb(v: str,
                                 arg: str,
                                 ):
        """Return the verb followed by its preposition, or the verb unchanged ("arg" is not used)."""
        return vb2vb_with_preposition.get(v, v)

    pronouns = ["him", "her", "them", "us"]

    while True:

        # random choices
        slot2filler = {
            'name': random.choice(names),
            'nn': random.choice(animates),
            'vb': random.choice(verbs_base),
            'vbg': random.choice(verbs_gerund),
        }

        # the pronoun does not end up in any sentence, but it is still drawn,
        # so that the sequence of random draws is the same as before
        arg = random.choice(pronouns)

        slot2filler['vb'] = add_preposition_after_vb(slot2filler['vb'], arg)
        slot2filler['vbg'] = add_preposition_after_vb(slot2filler['vbg'], arg)

        # exclude bad combinations that involve "who", e.g. "saying who"
        if slot2filler['vbg'] not in {'saying', 'drinking', 'eating', 'open'}\
                and slot2filler['vb'] not in {'need', 'feel', 'open'}:

            if slot2filler['vb'] == 'tell':
                slot2filler['vb'] = 'tell something'

            yield template1['b'].format(**slot2filler)  # bad
            yield template1['g'].format(**slot2filler)  # good

            yield template2['b'].format(**slot2filler)  # bad
            yield template2['g'].format(**slot2filler)  # good

            yield template3['b'].format(**slot2filler)  # bad
            yield template3['g'].format(**slot2filler)  # good

            yield template4['b'].format(**slot2filler)  # bad
            yield template4['g'].format(**slot2filler)  # good
def main():
    """
    Generate an endless stream of sentences, alternating bad and good.

    example:
    "sarah laughs" vs. "sarah gives"

    Each pass of the loop draws one set of slot fillers and yields up to four
    sentences: bad then good for template1, followed by bad then good for
    template2 (both templates are defined outside this function and are
    indexed by 'b' and 'g'). A template is skipped when the drawn subject
    would not agree with the drawn verb or auxiliary.
    """

    # we need a lot of verbs here, so temporarily reduce restrictions imposed by counterbalancing
    tmp1 = configs.Data.tag2num_words['VB']
    tmp2 = configs.Data.tag2num_words['VBZ']
    tmp3 = configs.Data.tag2num_words['VBD']
    tmp4 = configs.Data.bias_tolerance
    try:
        configs.Data.tag2num_words['VB'] = 30
        configs.Data.tag2num_words['VBZ'] = 50
        configs.Data.tag2num_words['VBD'] = 50
        configs.Data.bias_tolerance = 7000

        vbs = get_legal_words(tag='VB', exclude=('fit', 'come', 'point'))
        vbzs = get_legal_words(tag='VBZ', exclude=('points', ))
        vbds = get_legal_words(tag='VBD', exclude=('fit', 'dropped', 'signed', 'formed', 'managed'))
    finally:
        # restore the original settings, even if obtaining the words failed
        configs.Data.tag2num_words['VB'] = tmp1
        configs.Data.tag2num_words['VBZ'] = tmp2
        configs.Data.tag2num_words['VBD'] = tmp3
        configs.Data.bias_tolerance = tmp4

    # read the word lists inside "with" so that the file handles are closed
    with (configs.Dirs.legal_words / 'animates.txt').open() as f:
        animates_ = f.read().split()
    animates = find_counterbalanced_subset(animates_, min_size=8, max_size=len(animates_))

    prps_s = ['she', 'he', 'it']
    prps_p = ['we', 'they']
    prps = prps_s + prps_p

    aux_s = ['does']
    auxiliaries = ['could', 'can', 'would', 'will', 'did'] + aux_s

    determiners = ['the', 'this', 'some', 'that', 'every'] + ['your', 'his', 'her']

    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_, min_size=10, max_size=len(names_))

    # note: repeated entries in the lists below make those verbs more likely to be drawn
    vbs_intransitive = [
        'run',
        'work',
        'turn',
        'eat',
        'live',
        'read',
        'trade',
        'play',
        'know',
        'study',
        'think',
        'change',
    ]

    vbzs_intransitive = [
        'moves',
        'lives',
        'lies',
        'knows',
        'waves',
        'changes',
        'works',
        'dies',
        'leads',
        'appears',
        'thinks',
        'falls',
        'matters',
        'turns',
        'stands',
        'stands',
        'runs',
        'calls',
        'races',
    ]

    vbds_intransitive = [
        'occurred',
        'married',
        'moved',
        'looked',
        'changed',
        'finished',
        'grew',
        'broke',
        'started',
        'improved',
        'worked',
        'thought',
        'came',
        'tried',
        'read',
        'lost',
        'knew',
        'lived',
        'accepted',
        'developed',
        'joined',
        'joined',
        'decided',
        'learned',
        'occurred',
        'happened',
        'fell',
        'refused',
        'returned',
    ]

    # every legal verb that is not listed as intransitive is treated as transitive
    vbs_transitive = [v for v in vbs if v not in vbs_intransitive]
    vbzs_transitive = [v for v in vbzs if v not in vbzs_intransitive]

    vbzs_or_vbds_intransitive = vbzs_intransitive + vbds_intransitive
    vbzs_or_vbds_transitive = [v for v in vbzs + vbds if v not in vbzs_or_vbds_intransitive]

    # the candidate subjects do not change between iterations, so build them once
    subjects = animates + names + prps

    while True:

        # random choices
        slot2filler = {
            'nn1': random.choice(subjects),
            'aux': random.choice(auxiliaries),
            'vbz_or_vbd_intransitive': random.choice(vbzs_or_vbds_intransitive),
            'vbz_or_vbd_transitive': random.choice(vbzs_or_vbds_transitive),
            'vb_intransitive': random.choice(vbs_intransitive),
            'vb_transitive': random.choice(vbs_transitive),
        }

        # handle exception: "occurred" and "happened" cannot have animate subject
        if slot2filler['vbz_or_vbd_intransitive'] in ['occurred', 'happened'] and\
                slot2filler['nn1'] not in ['it', 'that', 'this']:
            continue

        # add determiner to animate noun
        if slot2filler['nn1'] in animates:
            slot2filler['nn1'] = random.choice(determiners) + ' ' + slot2filler['nn1']

        is_subject_plural = slot2filler['nn1'] in prps_p

        # do not use template 1 with plural pronoun and VBZ.
        # the two verb checks are grouped explicitly: without the grouping, "and" binds
        # tighter than "or", and template 1 would be skipped for every transitive VBZ,
        # whether or not the subject is plural
        is_any_verb_vbz = (slot2filler['vbz_or_vbd_intransitive'] in vbzs_intransitive
                           or slot2filler['vbz_or_vbd_transitive'] in vbzs_transitive)
        if not (is_subject_plural and is_any_verb_vbz):
            yield template1['b'].format(**slot2filler)  # bad
            yield template1['g'].format(**slot2filler)  # good

        # do not use template 2 with plural pronoun and singular aux (e.g. "does")
        if not (is_subject_plural and slot2filler['aux'] in aux_s):
            yield template2['b'].format(**slot2filler)  # bad
            yield template2['g'].format(**slot2filler)  # good
def main():
    """
    Generate an endless stream of sentences, alternating bad and good.

    example:
    "only sarah could ever talk." vs. "even sarah could ever talk"

    Each pass of the loop draws one set of slot fillers and yields four
    sentences: bad then good for template1, followed by bad then good for
    template2 (both templates are defined outside this function and are
    indexed by 'b' and 'g').
    """

    vbzs = get_legal_words(tag='VBZ', exclude=('happens', 'says', ))
    vbs = get_legal_words(tag='VB')

    nouns_s = get_legal_words(tag='NN')

    # read the word lists inside "with" so that the file handles are closed
    with (configs.Dirs.legal_words / 'animates.txt').open() as f:
        animates_ = f.read().split()
    animates = find_counterbalanced_subset(animates_, min_size=8, max_size=len(animates_))

    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_, min_size=10, max_size=len(names_))

    determiners = ['the', 'this', 'some', 'that'] + ['your', 'his', 'her']

    auxiliaries = ['could', 'can', 'would', 'will']

    def add_argument_after_vb(v: str,
                              argument1: str,
                              ) -> str:
        """Return the verb followed by the preposition or argument it needs, or the verb unchanged."""
        if v in {'thinks', 'reads'}:
            return f'{v} about'
        elif v in {'lives', 'falls', 'is', 'be'}:
            return f'{v} in'
        elif v in {'stands', 'turns'}:
            return f'{v} on'
        elif v in {'acts', 'looks'}:
            return f'{v} like'
        elif v in {'goes', 'comes'}:
            return f'{v} to'
        elif v in {'gives', 'plays', 'play', 'shows', 'show', 'tells', 'tell'}:
            return f'{v} {argument1}'
        else:
            return v

    # the candidate lists do not change between iterations, so build them once
    subjects = names + animates
    arguments = ['him', 'her']

    while True:

        arg1 = random.choice(arguments)
        vbz = random.choice(vbzs)
        vb = random.choice(vbs)

        # random choices
        slot2filler = {
            'nn1': random.choice(subjects),
            'nn2': random.choice(nouns_s),
            'vbz': add_argument_after_vb(vbz, arg1),
            'vb': add_argument_after_vb(vb, arg1),
            'det': random.choice(determiners),
            'aux': random.choice(auxiliaries)
        }

        # add determiner to animate noun
        if slot2filler['nn1'] in animates:
            slot2filler['nn1'] = random.choice(determiners) + ' ' + slot2filler['nn1']

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good
def main():
    """Yield an endless stream of paired sentences, bad variant first.

    example:
    "has sam ever worried sarah ?" vs. "jane has ever worried sarah ."

    Each loop iteration samples one set of slot fillers and formats it into
    ``template1`` and ``template2`` (both defined outside this function),
    yielding the ``'b'`` (bad) sentence immediately before the ``'g'`` (good)
    sentence of each template. Iterations that pair the auxiliaries
    "did"/"does" with the verb "be" are discarded.
    """
    vbs = get_legal_words(tag='VB')
    nouns_s = get_legal_words(tag='NN')

    # read_text() opens, reads and closes the file; the previous
    # `.open().read()` never closed the handle.
    animates_ = (configs.Dirs.legal_words / 'animates.txt').read_text().split()
    animates = find_counterbalanced_subset(animates_, min_size=8, max_size=len(animates_))

    names_ = (configs.Dirs.legal_words / 'names.txt').read_text().split()
    names = find_counterbalanced_subset(names_, min_size=10, max_size=len(names_))

    auxiliaries = ['does', 'will', 'could', 'did', 'should', 'would']
    determiners = ['a', 'the', 'this', 'some', 'that'] + ['your', 'his', 'her']

    def add_argument_after_vb(v: str,
                              arg1: str,
                              arg2: str,
                              ) -> str:
        """Return verb `v` completed with the argument(s) it needs, if any."""
        if v in {'say'}:
            return f'{v} something'
        elif v in {'read'}:
            return f'{v} a book'
        elif v in {'play'}:
            return f'{v} with {arg1}'
        elif v in {
            'use', 'find', 'get', 'be', 'order', 'need',
            'have', 'control', 'want', 'free', 'keep'
        }:
            return f'{v} {arg1}'
        elif v in {'tell'}:
            return f'{v} me about {arg1}'
        elif v in {'plan'}:
            return f'{v} to do something with {arg1}'
        elif v in {'take'}:
            return f'{v} {arg1} away'
        elif v in {'give', 'show', 'present'}:
            return f'{v} {arg1} to {arg2}'
        elif v in {'put'}:
            return f'{v} {arg1} on {arg2}'
        elif v in {'fall'}:
            return f'{v} in {arg1}'
        elif v in {'see'}:
            # Noun arguments already start with "the ", so the template must
            # not add another determiner (was: "see how the the X works").
            # NOTE(review): the pronouns 'you'/'him'/'her' still read oddly
            # in this frame; consider restricting 'see' to noun arguments.
            return f'{v} how {arg1} works'
        elif v in {'come'}:
            return f'{v} to {arg1}'
        else:
            return v

    # note: pronouns don't get determiners, but nouns do.
    # The candidate list is loop-invariant, so build it once.
    arguments = ['you', 'him', 'her', 'it'] + [f'the {nn}' for nn in nouns_s]

    while True:

        # sample argument once, so that the same argument is used by both bad and good sentences.
        argument1 = random.choice(arguments)
        argument2 = random.choice(arguments)

        vb = random.choice(vbs)

        # random choices
        slot2filler = {
            'name': random.choice(names),
            'nn': random.choice(animates),
            'vb': add_argument_after_vb(vb, argument1, argument2),
            'aux': random.choice(auxiliaries),
            'det': random.choice(determiners),
        }

        # "did be" / "does be" is not a usable combination; resample
        if slot2filler['aux'] in {'did', 'does'} and vb == 'be':
            continue

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good
def main():
    """Yield an endless stream of paired sentences, bad variant first.

    example:
    "we can help him do something" vs. "we can help himself do something"

    Each loop iteration samples an (object pronoun, possessive form) pair,
    an auxiliary, an animate noun and a base-form verb, decorates the
    pronouns and the verb with verb-specific material, and formats the
    result into ``template1`` (defined outside this function), yielding the
    ``'b'`` (bad) sentence before the ``'g'`` (good) one.

    NOTE(review): the example above shows a reflexive pronoun, whereas the
    code pairs object pronouns with possessive forms; confirm which contrast
    ``template1`` is meant to express.
    """

    # counterbalance both forms of verb as different forms are the contrast

    excluded_verbs_base = ('say', 'live')
    verbs_base = get_legal_words(tag='VB', exclude=excluded_verbs_base)

    nouns_s = get_legal_words(tag='NN')

    # NOTE(review): ('us', 'our') uses a determiner form while the other
    # pairs use 'hers'/'theirs'; confirm whether 'ours' was intended.
    prps_obj_and_poss = [
        ('him', 'his'),
        ('her', 'hers'),
        ('us', 'our'),
        ('them', 'theirs'),
    ]

    # read_text() opens, reads and closes the file; the previous
    # `.open().read()` never closed the handle.
    animates_ = (configs.Dirs.legal_words / 'animates.txt').read_text().split()
    animates = find_counterbalanced_subset(animates_, min_size=8, max_size=len(animates_))

    auxiliaries = ['can', 'could', 'will', 'would', 'must', 'should']

    def add_misc_after_prp(prp: str,
                           v: str,
                           arg1: str,
                           ) -> str:
        """Return pronoun `prp` followed by the extra material verb `v` needs, if any."""
        if v in {'take'}:
            return f'{prp} to {arg1}'
        elif v in {'make'}:
            return f'{prp} do {arg1}'
        elif v in {'work', 'put'}:
            return f'{prp} on {arg1}'
        elif v in {'turn'}:
            return f'{prp} around'
        elif v in {'tell'}:
            return f'{prp} about {arg1}'
        else:
            return prp

    def add_preposition_after_vb(v: str) -> str:
        """Return verb `v` followed by the preposition it needs, if any."""
        if v in {'work', 'study'}:
            return f'{v} with'
        elif v in {'point', 'run'}:
            return f'{v} to'
        elif v in {'be'}:
            return f'{v} like'
        else:
            return v

    # nouns get a determiner; the candidate list is loop-invariant, so build it once
    arguments = [f'the {nn}' for nn in nouns_s]

    while True:

        prp_obj, prp_poss = random.choice(prps_obj_and_poss)

        # random choices
        slot2filler = {
            'aux': random.choice(auxiliaries),
            'prp_poss': prp_poss,
            'prp_obj': prp_obj,
            'nn': random.choice(animates),
            'vb': random.choice(verbs_base),
        }

        # sample argument once, so that the same argument is used by both bad and good sentences.
        argument1 = random.choice(arguments)

        # first, add some miscellaneous component
        # (must run before 'vb' is rewritten below, since it keys on the bare verb)
        slot2filler['prp_poss'] = add_misc_after_prp(prp_poss, slot2filler['vb'], argument1)
        slot2filler['prp_obj'] = add_misc_after_prp(prp_obj, slot2filler['vb'], argument1)

        # lastly, add a preposition
        slot2filler['vb'] = add_preposition_after_vb(slot2filler['vb'])

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good
def main():
    """Yield an endless stream of paired sentences, bad variant first.

    example:
    "a big dog fell down the stairs ." vs. "a big dog fallen down the stairs ."

    Each accepted loop iteration yields four sentences built from
    ``template`` (defined outside this function), filled positionally with
    a name, a verb form, an argument and a modifier:

    1. participle without auxiliary (bad), 2. simple past (good),
    3. "had" + simple past (bad), 4. "had" + participle (good).

    An iteration is discarded when either verb form is missing from the
    vocabulary, when both forms are identical, or when the sampled argument
    is the empty string.
    """

    vocab = get_vocab_words()

    modifiers = ['over there', 'some time ago', 'this morning', 'at home', 'last night']

    # read_text() opens, reads and closes the file; the previous
    # `.open().read()` never closed the handle.
    names_ = (configs.Dirs.legal_words / 'names.txt').read_text().split()
    names = find_counterbalanced_subset(names_, min_size=10, max_size=len(names_))

    # (simple past, past participle, candidate arguments).
    # An entry whose only argument is '' is never emitted (see the loop below).
    vbds_vbns_args = [
        ('arose', 'arisen', ['']),

        # optional arguments
        ('knew', 'known', ['a lot of things', 'she could do it']),
        ('saw', 'seen', ['a bird', 'a shape', 'something']),
        ('began', 'begun', ['to work']),
        ('fell', 'fallen', ['down the stairs']),
        ('flew', 'flown', ['into the sky', 'away']),
        ('drove', 'driven', ['out of the garage', 'down the road', 'with one wheel', 'without looking']),
        ('grew', 'grown', ['quickly', ]),
        ('hid', 'hidden', ['from view', 'behind the bush']),
        ('rose', 'risen', ['from bed']),
        ('swore', 'sworn', ['not to do it again']),
        ('drank', 'drunk', ['some juice', 'the soup', 'your coffee']),
        ('ate', 'eaten', ['a lot', 'more than me', 'some ice cream']),
        ('drew', 'drawn', ['a picture', 'a map', 'a round circle']),
        ('wrote', 'written', ['a story', 'a note', 'into a book', 'with a large pen']),
        ('sang', 'sung', ['a nice song', 'in the theater', 'with a pretty voice', 'my favorite song']),
        ('spoke', 'spoken', ['very fast', 'to me', 'about many things', 'without thinking']),
        ('came', 'come', ['to the store', 'just in time', 'when we needed her', 'too late']),

        # transitive
        ('was', 'been', ['here', 'alone', 'afraid']),
        ('beat', 'beaten', ['the dough', 'a little boy', 'their pet']),
        ('became', 'become', ['angry', 'very different', 'someone else']),
        ('bit', 'bitten', ['her own tongue', 'into the cake', 'off a big chunk']),
        ('blew', 'blown', ['out the candle', 'away the dirt', ]),
        ('chose', 'chosen', ['the best option', 'the good one', ]),
        # trailing space removed from the last argument: it put a stray
        # space into every sentence that used it
        ('did', 'done', ['nothing wrong', 'something bad', 'the best she could']),
        ('forgave', 'forgiven', ['her', 'the child', 'him']),
        ('gave', 'given', ['a book to a student', 'something sweet to the baby', 'money to the man']),
        ('rode', 'ridden', ['a horse', 'a cart', 'in the front seat', 'away']),
        ('shook', 'shaken', ['the plate', 'the table', 'the bowl']),
        ('strode', 'stridden', ['']),
        ('took', 'taken', ['a paper', 'some food', 'the bell', 'it', 'them']),
        ('threw', 'thrown', ['the trash out', 'the paper ball', 'some away', 'his ball']),
    ]

    while True:

        # random choices (all drawn before the checks below, so a rejected
        # iteration still consumes the same number of random draws)
        name = random.choice(names)
        mod = random.choice(modifiers)
        vbd, vbn, args = random.choice(vbds_vbns_args)
        arg = random.choice(args)

        # both forms must be in the vocabulary and must differ, otherwise
        # the pair carries no contrast
        if (vbd not in vocab or vbn not in vocab) or vbd == vbn:
            continue

        # verbs without a usable argument are skipped
        if arg == '':
            continue

        # vbd is correct
        yield template.format(name, vbn, arg, mod)  # bad
        yield template.format(name, vbd, arg, mod)  # good

        # vbn is correct
        yield template.format(name, 'had ' + vbd, arg, mod)  # bad
        yield template.format(name, 'had ' + vbn, arg, mod)  # good