def main():
    """
    endlessly yield sentences that contrast demonstrative-noun number agreement.

    example:
    "look at this house" vs. "look at this houses"
    """
    singular_dems = ["this", "that"]
    plural_dems = ["these", "those"]

    noun_pairs = get_legal_words(tag='NN', second_tag='NNP')
    adjectives = get_legal_words(tag='JJ')

    while True:

        # one noun pair and one adjective are shared by all sentences of a round
        noun_s, noun_p = random.choice(noun_pairs)
        adj = random.choice(adjectives)

        # singular demonstrative: the plural noun is the mismatch
        for dem in singular_dems:
            for template in (template1, template2):
                yield template.format(dem, noun_p, adj)  # bad
                yield template.format(dem, noun_s, adj)  # good

        # plural demonstrative: the singular noun is the mismatch
        for dem in plural_dems:
            for template in (template1, template2):
                yield template.format(dem, noun_s, adj)  # bad
                yield template.format(dem, noun_p, adj)  # good
def main():
    """
    endlessly yield sentences, alternating bad and good, two pairs per round.

    example:
    "sarah discovered the vase that the dog might take ." vs.
    "sarah discovered what the dog might take the vase ."
    """
    nouns_s = get_legal_words(tag='NN')

    excluded_verbs_base = ('put', 'run', 'say', 'be', 'give', 'tell', 'live')
    verbs_base = get_legal_words(tag='VB', exclude=excluded_verbs_base)

    excluded_verbs_past = ('started', 'let', 'told')
    verbs_past = get_legal_words(tag='VBD', exclude=excluded_verbs_past)

    # read word lists inside a context manager so the file handles are closed deterministically
    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_, min_size=10, max_size=len(names_))

    with (configs.Dirs.legal_words / 'animates.txt').open() as f:
        animates_ = f.read().split()
    animates = find_counterbalanced_subset(animates_, min_size=8, max_size=len(animates_))

    # base verbs that are followed by a preposition in the generated sentences
    vb2vb_with_preposition = {
        'play': 'play with',
        'point': 'point to',
        'turn': 'turn to',
        'work': 'work with',
    }

    def add_preposition_after_vb(v: str) -> str:
        """return the verb followed by its preposition, or the verb unchanged if it takes none."""
        return vb2vb_with_preposition.get(v, v)

    while True:

        # random choices
        slot2filler = {
            'name': random.choice(names),
            'nn1': random.choice(nouns_s),
            'nn2': random.choice(animates),
            'vbd': random.choice(verbs_past),
            'vbd2': random.choice(verbs_past),  # used in template2 only
            'vb': random.choice(verbs_base),  # used in template 1 only
        }

        slot2filler['vb'] = add_preposition_after_vb(slot2filler['vb'])

        # the same fillers are used by the bad and the good sentence of a pair
        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good
def main():
    """
    endlessly yield questions that contrast number agreement between "do"/"does" and a noun.

    example:
    "where does the dog go?" vs. "where does the dogs go?"
    """
    noun_pairs = get_legal_words(tag='NN', second_tag='NNP')
    adjectives = get_legal_words(tag='JJ')

    doing_singular = ["does"]
    doing_plural = ["do"]

    while True:

        # random choices (adjective first, then the noun pair)
        adj = random.choice(adjectives)
        noun_s, noun_p = random.choice(noun_pairs)

        # (template, extra format arguments) - only template4 is given the adjective
        frames = (
            (template1, ()),
            (template2, ()),
            (template3, ()),
            (template4, (adj,)),
            (template5, ()),
            (template6, ()),
        )

        # singular auxiliary: the plural noun is the mismatch
        for aux in doing_singular:
            for template, extra in frames:
                yield template.format(aux, noun_p, *extra)  # bad
                yield template.format(aux, noun_s, *extra)  # good

        # plural auxiliary: the singular noun is the mismatch
        for aux in doing_plural:
            for template, extra in frames:
                yield template.format(aux, noun_s, *extra)  # bad
                yield template.format(aux, noun_p, *extra)  # good
def main():
    """
    endlessly yield sentences that contrast subject-copula agreement across a relative clause.

    example:
    "the dog that i like is green" vs. "the dogs that i like is green"
    """
    noun_pairs = get_legal_words(tag='NN', second_tag='NNP')
    adjectives = get_legal_words(tag='JJ')

    copulas_singular = ["is", "was"]
    copulas_plural = ["are", "were"]

    pronouns_1p_2p = ['i', 'you', 'we']
    pronouns_3p = ['he', 'she', 'it']
    # both pronoun groups must contribute the same number of sentences
    assert len(pronouns_3p) == len(pronouns_1p_2p)

    while True:

        # random choices
        noun_s, noun_p = random.choice(noun_pairs)
        adj = random.choice(adjectives)

        # each entry: copulas, mismatching noun, matching noun, subject-relative template
        conditions = (
            (copulas_singular, noun_p, noun_s, template2a),
            (copulas_plural, noun_s, noun_p, template2b),
        )

        for copulas, noun_bad, noun_good, template_subject_relative in conditions:
            for copula in copulas:

                # object-relative
                for pronoun in pronouns_1p_2p:
                    yield template1a.format(noun_bad, pronoun, copula, adj)  # bad
                    yield template1a.format(noun_good, pronoun, copula, adj)  # good

                for pronoun in pronouns_3p:
                    yield template1b.format(noun_bad, pronoun, copula, adj)
                    yield template1b.format(noun_good, pronoun, copula, adj)

                # subject-relative
                yield template_subject_relative.format(noun_bad, copula, adj)
                yield template_subject_relative.format(noun_good, copula, adj)
def main():
    """
    endlessly yield sentences, alternating bad and good, two pairs per round.

    example:
    "sam found one purple dog and karen revealed more ." vs.
    "sam found one dog and karen revealed more purple."
    """
    vbds = get_legal_words(tag='VBD')
    adjectives = get_legal_words(tag='JJ')

    # read word lists inside a context manager so the file handles are closed deterministically
    with (configs.Dirs.legal_words / 'nouns_mass.txt').open() as f:
        nouns_mass = f.read().split()
    # mass nouns are excluded from the singular nouns
    nouns_s = get_legal_words(tag='NN', exclude=tuple(nouns_mass))

    with (configs.Dirs.legal_words / 'animates.txt').open() as f:
        animates_ = f.read().split()
    animates = find_counterbalanced_subset(animates_, min_size=8, max_size=len(animates_))

    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_, min_size=10, max_size=len(names_))

    determiners = ['a', 'the', 'this', 'some', 'that'] + ['your', 'his', 'her']
    number_words = ['several', 'more', 'two', 'three', 'a lot more']

    while True:

        # random choices
        slot2filler = {
            'name1': random.choice(names),
            'name2': random.choice(names),
            'nn': random.choice(nouns_s),
            'nn2': random.choice(animates),
            'vbd': random.choice(vbds),
            'det': random.choice(determiners),
            'jj': random.choice(adjectives),
            'number': random.choice(number_words),
        }

        # the same fillers are used by the bad and the good sentence of a pair
        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good
def main():
    """
    endlessly yield sentences that contrast demonstrative-noun agreement across an adjective.

    example:
    "look at this green house ." vs. "look at this green houses ."
    "this green house went there ." vs. "this green houses went there."
    """
    singular_dems = ["this", "that"]
    plural_dems = ["these", "those"]

    noun_pairs = get_legal_words(tag='NN', second_tag='NNP')
    adjectives = get_legal_words(tag='JJ')

    while True:

        # random choices (adjective first, then the noun pair)
        adj = random.choice(adjectives)
        noun_s, noun_p = random.choice(noun_pairs)

        # singular demonstrative: the plural noun is the mismatch
        for dem in singular_dems:
            for template in (template1, template2, template3, template4):
                yield template.format(dem, adj, noun_p)  # odd numbered line: bad
                yield template.format(dem, adj, noun_s)  # even numbered line: good

        # plural demonstrative: the singular noun is the mismatch
        for dem in plural_dems:
            for template in (template1, template2, template3, template4):
                yield template.format(dem, adj, noun_s)  # odd numbered line: bad
                yield template.format(dem, adj, noun_p)  # even numbered line: good
def main():
    """
    endlessly yield sentences that contrast subject-copula agreement across a prepositional phrase.

    example:
    "the dog on the mats is brown" vs "the dog on the mats are brown"

    considerations:
    1. use equal proportion of sentences containing plural vs. singular subject nouns
    2. use equal proportion of sentences containing plural vs. singular object nouns
    3. subject with object number is counterbalanced such that:
        -singular subjects occur with 50:50 singular:plural objects
        -plural subjects occur with 50:50 singular:plural objects
    """
    noun_pairs = get_legal_words(tag='NN', second_tag='NNP')
    adjectives = get_legal_words(tag='JJ')

    copulas_singular = ["is", "was"]
    copulas_plural = ["are", "were"]

    while True:

        # subject and object are drawn independently; both numbers of each are used below
        sub_s, sub_p = random.choice(noun_pairs)
        obj_s, obj_p = random.choice(noun_pairs)

        # one template and one adjective per round
        template = random.choice([template1, template2])
        adj = random.choice(adjectives)

        # the contrast is always between subject number and copula number;
        # every contrast is produced once per object number (singular first, then plural)
        for copula in copulas_singular:
            for obj in (obj_s, obj_p):
                yield template.format(sub_p, obj, copula, adj)  # bad
                yield template.format(sub_s, obj, copula, adj)  # good

        for copula in copulas_plural:
            for obj in (obj_s, obj_p):
                yield template.format(sub_s, obj, copula, adj)  # bad
                yield template.format(sub_p, obj, copula, adj)  # good
def main():
    """
    endlessly yield sentences, alternating bad and good, one pair per round.

    example:
    "he made the van this challenge ." vs. "the van made he this challenge ."
    """
    vbds = [
        'brought',
        'made',
        'built',
        'gave',
        'showed',
    ]

    nouns_s = get_legal_words(tag='NN')

    # read the word list inside a context manager so the file handle is closed deterministically
    with (configs.Dirs.legal_words / 'animates.txt').open() as f:
        animates_ = f.read().split()
    animates = find_counterbalanced_subset(animates_, min_size=8, max_size=len(animates_))

    personal_pronouns_obj = ['me', 'him', 'her', 'us', 'them']  # in the objective case
    personal_pronouns_subj = ['i', 'he', 'she', 'we', 'they']  # in the subjective case

    determiners = ['a', 'one', 'this', 'that', 'the', 'my', 'his', 'her']

    vowels = {'a', 'e', 'i', 'o', 'u'}

    while True:

        vbd = random.choice(vbds)  # template 1

        # random choices
        slot2filler = {
            'nn': random.choice(animates),
            'nn2': random.choice(nouns_s),
            'det': random.choice(determiners),
            'prp_obj': random.choice(personal_pronouns_obj),
            'prp_subj': random.choice(personal_pronouns_subj),
            'vbd': vbd,
        }

        # "a" vs. "an", decided by the first letter of nn2
        if slot2filler['det'] == 'a' and slot2filler['nn2'][0] in vowels:
            slot2filler['det'] += 'n'

        # the same fillers are used by the bad and the good sentence
        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good
def main():
    """
    endlessly yield sentence pairs whose only difference is word order around "there".

    example:
    "a documentary was there looking at dogs ." vs. "there was a documentary looking at dogs ."

    note: this task is too difficult for babyBERTa
    """
    noun_pairs = get_legal_words(tag='NN', second_tag='NNP')
    adjectives = get_legal_words(tag='JJ')

    quantifiers = ['each', 'most', 'all', 'every']
    plural_quantifiers = {'most', 'all'}

    copula_p = ['were', 'are', "were not", "aren't"]
    copula_s = ['was', 'is', "was not", "isn't"]

    vowels = {'a', 'e', 'i', 'o', 'u'}

    gerunds_ = [
        'looking',
        'becoming',
        'falling',
        'leaving',
        'eating',
        'increasing',
        'moving',
        'opening',
        'existing',
        'containing',
        'standing',
        'changing',
        'surrounding',
        'adding',
        'acting',
    ]
    gerunds = find_counterbalanced_subset(gerunds_, min_size=8, max_size=len(gerunds_))

    # a linker can be a preposition or determiner phrase
    gerund2linker = {
        'looking': 'like a',
        'becoming': 'some kind of a',
        'falling': 'on the',
        'leaving': 'us by the',
        'eating': 'one piece of this',
        'increasing': 'the size of the',
        'moving': 'to the',
        'opening': 'the door to a',
        'existing': 'without a',
        'containing': 'a',
        'standing': 'on top of a',
        'changing': 'the',
        'surrounding': 'the',
        'adding': 'to the',
        'acting': 'like a',
    }

    while True:

        # random choices (the plural form of the first noun pair is not used)
        noun_s, _ = random.choice(noun_pairs)
        adj = random.choice(adjectives)
        quantifier = random.choice(quantifiers)
        subj_s, subj_p = random.choice(noun_pairs)
        gerund = random.choice(gerunds)

        # plural vs. singular copula and subject, depending on the quantifier
        is_plural = quantifier in plural_quantifiers
        copula = random.choice(copula_p if is_plural else copula_s)
        subj1 = subj_p if is_plural else subj_s  # for template 1

        # "a" vs. "an": the adjective directly follows the linker
        linker = gerund2linker[gerund]
        if linker.endswith('a') and adj[0] in vowels:
            linker += 'n'

        # the four trailing slots are identical in both sentences
        tail = (gerund, linker, adj, noun_s)

        # contrast is about word order
        yield template1.format('there', copula, quantifier, subj1, *tail)  # bad
        yield template1.format(quantifier, subj1, copula, 'there', *tail)  # good
def main():
    """
    endlessly yield sentences, alternating bad and good, that differ only in the quantifier.

    example:
    "there was a documentary about dogs ." vs. "there was each documentary about dogs ."
    """
    nouns_s_and_p = get_legal_words(tag='NN', second_tag='NNP')
    adjectives = get_legal_words(tag='JJ')

    quantifiers_good = ['a', 'no', 'some', 'many', 'few']
    quantifiers_bad = ['each', 'most', 'all', 'every']

    template1_subjects_s_and_p = [
        ('movie', 'movies'),
        ('book', 'books'),
        ('story', 'stories'),
        ('sign', 'signs'),
    ]

    vowels = {'a', 'e', 'i', 'o', 'u'}

    copula_p = ['were', 'are', "were not", "aren't"]
    copula_s = ['was', 'is', 'was not', "isn't"]

    # read the word list inside a context manager so the file handle is closed deterministically
    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_, min_size=10, max_size=len(names_))

    def agree_article(quantifier: str, subject: str) -> str:
        """return "an" instead of "a" when the subject starts with a vowel, else the quantifier unchanged."""
        if quantifier == 'a' and subject[0] in vowels:
            return 'an'
        return quantifier

    while True:

        # random choices
        noun_s, noun_p = random.choice(nouns_s_and_p)
        adj = random.choice(adjectives)
        quantifier_b = random.choice(quantifiers_bad)
        quantifier_g = random.choice(quantifiers_good)
        subj_s, sub_p = random.choice(template1_subjects_s_and_p)
        name = random.choice(names)

        # plural vs. singular copula
        if quantifier_g in {'some', 'many', 'few'}:
            copula = random.choice(copula_p)
            subj1 = sub_p  # for template 1
            subj2 = noun_p  # for template 2
        else:
            copula = random.choice(copula_s)
            subj1 = subj_s
            subj2 = noun_s

        # prevent double negation ("isn't no ..."): strip both the " not" and the "n't" form.
        # note: the contracted forms contain no space before "n't"
        if quantifier_g == 'no':
            copula = copula.replace(' not', '').replace("n't", '')

        # "a" vs. "an" is decided separately for each template, because the two templates
        # use different subjects; a shared quantifier would leak "an" from one into the other
        yield template1.format(copula, agree_article(quantifier_b, subj1), subj1, adj, noun_p)  # bad
        yield template1.format(copula, agree_article(quantifier_g, subj1), subj1, adj, noun_p)  # good

        yield template2.format(copula, agree_article(quantifier_b, subj2), subj2, name)  # bad
        yield template2.format(copula, agree_article(quantifier_g, subj2), subj2, name)  # good
def main():
    """
    endlessly yield sentences, alternating bad and good, two pairs per round.

    example:
    "sam questioned the dog that can hurt sara ." vs "sam questioned who the dog can hurt sara."
    """
    nouns_s_and_p = get_legal_words(tag='NN', second_tag='NNP')

    excluded_verbs_base = ('put', 'run', 'say', 'be', 'give', 'tell', 'live')
    verbs_base = get_legal_words(tag='VB', exclude=excluded_verbs_base)

    excluded_verbs_past = ('started', 'let', 'told')
    verbs_past = get_legal_words(tag='VBD', exclude=excluded_verbs_past)

    excluded_verbs_gerund = ('saying', )
    verbs_gerund = get_legal_words(tag='VBG', exclude=excluded_verbs_gerund)

    # read the word list inside a context manager so the file handle is closed deterministically
    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_, min_size=10, max_size=len(names_))

    def add_preposition_after_vb(v: str) -> str:
        """return the verb followed by its preposition, or the verb unchanged if it is not listed."""
        if v in {'acting', 'act'}:
            return f'{v} like'
        elif v in {'standing', 'stand', 'falling', 'fall', 'depending', 'depend'}:
            return f'{v} on'
        elif v in {'asking', 'ask', 'writing', 'write', 'thinking', 'think'}:
            return f'{v} about'
        elif v in {'swimming', 'swim', 'sleeping', 'sleep'}:
            return f'{v} in'
        elif v in {'driving', 'drive', 'coming', 'come', 'related', 'relate'}:
            return f'{v} to'
        elif v in {'flying', 'fly', 'working', 'work'}:
            return f'{v} with'
        else:
            return v

    while True:

        # random choices
        slot2filler = {
            'name': random.choice(names),
            'nn': random.choice(nouns_s_and_p)[0],  # first member of one noun pair
            'nns': random.choice(nouns_s_and_p)[1],  # second member of another noun pair
            'vbd': random.choice(verbs_past),
            'vbg': random.choice(verbs_gerund),  # used in template2 only
            'vb': random.choice(verbs_base),  # used in template 1 only
        }

        # all three verb slots go through the same preposition lookup
        for slot in ('vb', 'vbd', 'vbg'):
            slot2filler[slot] = add_preposition_after_vb(slot2filler[slot])

        # the same fillers are used by the bad and the good sentence of a pair
        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good
def main():
    """
    endlessly yield sentences, alternating bad and good, one pair per round.

    example:
    "we can help him do something" vs. "we can help himself do something"
    """
    excluded_verbs_base = ('say', 'live')
    verbs_base = get_legal_words(tag='VB', exclude=excluded_verbs_base)

    nouns_s = get_legal_words(tag='NN')

    prps_obj_and_poss = [
        ('him', 'his'),
        ('her', 'hers'),
        ('us', 'our'),
        ('them', 'theirs'),
    ]

    # read the word list inside a context manager so the file handle is closed deterministically
    with (configs.Dirs.legal_words / 'animates.txt').open() as f:
        animates_ = f.read().split()
    animates = find_counterbalanced_subset(animates_, min_size=8, max_size=len(animates_))

    auxiliaries = ['can', 'could', 'will', 'would', 'must', 'should']

    # note: pronouns don't get determiners, but nouns do.
    # built once, because the candidates do not change between rounds
    arguments = [f'the {nn}' for nn in nouns_s]

    def add_misc_after_prp(prp: str,
                           v: str,
                           arg1: str,
                           ) -> str:
        """return the pronoun followed by extra material that depends on the (bare) verb."""
        if v in {'take'}:
            return f'{prp} to {arg1}'
        elif v in {'make'}:
            return f'{prp} do {arg1}'
        elif v in {'work', 'put'}:
            return f'{prp} on {arg1}'
        elif v in {'turn'}:
            return f'{prp} around'
        elif v in {'tell'}:
            return f'{prp} about {arg1}'
        else:
            return prp

    def add_preposition_after_vb(v: str) -> str:
        """return the verb followed by its preposition, or the verb unchanged if it is not listed."""
        if v in {'work', 'study'}:
            return f'{v} with'
        elif v in {'point', 'run'}:
            return f'{v} to'
        elif v in {'be'}:
            return f'{v} like'
        else:
            return v

    while True:

        prp_obj, prp_poss = random.choice(prps_obj_and_poss)

        # random choices
        slot2filler = {
            'aux': random.choice(auxiliaries),
            'prp_poss': prp_poss,
            'prp_obj': prp_obj,
            'nn': random.choice(animates),
            'vb': random.choice(verbs_base),
        }

        # sample argument once, so that the same argument is used by both bad and good sentences.
        argument1 = random.choice(arguments)

        # first, add some miscellaneous component (must see the verb before a preposition is attached)
        slot2filler['prp_poss'] = add_misc_after_prp(prp_poss, slot2filler['vb'], argument1)
        slot2filler['prp_obj'] = add_misc_after_prp(prp_obj, slot2filler['vb'], argument1)

        # lastly, add a preposition
        slot2filler['vb'] = add_preposition_after_vb(slot2filler['vb'])

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good
def main():
    """
    endlessly yield sentences, alternating bad and good, two pairs per round.

    example:
    "only sarah could ever talk." vs. "even sarah could ever talk"
    """
    vbzs = get_legal_words(tag='VBZ', exclude=('happens', 'says', ))
    vbs = get_legal_words(tag='VB')
    nouns_s = get_legal_words(tag='NN')

    # read word lists inside a context manager so the file handles are closed deterministically
    with (configs.Dirs.legal_words / 'animates.txt').open() as f:
        animates_ = f.read().split()
    animates = find_counterbalanced_subset(animates_, min_size=8, max_size=len(animates_))

    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_, min_size=10, max_size=len(names_))

    determiners = ['the', 'this', 'some', 'that'] + ['your', 'his', 'her']

    auxiliaries = ['could', 'can', 'would', 'will']

    # candidates for the nn1 slot; built once, because they do not change between rounds
    names_and_animates = names + animates

    def add_argument_after_vb(v: str,
                              argument1: str,
                              ) -> str:
        """return the verb followed by a preposition or by argument1, or unchanged if it is not listed."""
        if v in {'thinks', 'reads'}:
            return f'{v} about'
        elif v in {'lives', 'falls', 'is', 'be'}:
            return f'{v} in'
        elif v in {'stands', 'turns'}:
            return f'{v} on'
        elif v in {'acts', 'looks'}:
            return f'{v} like'
        elif v in {'goes', 'comes'}:
            return f'{v} to'
        elif v in {'gives', 'plays', 'play', 'shows', 'show', 'tells', 'tell'}:
            return f'{v} {argument1}'
        else:
            return v

    while True:

        arg1 = random.choice(['him', 'her'])
        vbz = random.choice(vbzs)
        vb = random.choice(vbs)

        # random choices
        slot2filler = {
            'nn1': random.choice(names_and_animates),
            'nn2': random.choice(nouns_s),
            'vbz': add_argument_after_vb(vbz, arg1),
            'vb': add_argument_after_vb(vb, arg1),
            'det': random.choice(determiners),
            'aux': random.choice(auxiliaries)
        }

        # add determiner to animate noun
        if slot2filler['nn1'] in animates:
            slot2filler['nn1'] = random.choice(determiners) + ' ' + slot2filler['nn1']

        # the same fillers are used by the bad and the good sentence of a pair
        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good
def main():
    """
    endlessly yield sentences, alternating bad and good, two pairs per round.

    example:
    "has sam ever worried sarah ?" vs. "jane has ever worried sarah ."
    """
    vbs = get_legal_words(tag='VB')
    nouns_s = get_legal_words(tag='NN')

    # read word lists inside a context manager so the file handles are closed deterministically
    with (configs.Dirs.legal_words / 'animates.txt').open() as f:
        animates_ = f.read().split()
    animates = find_counterbalanced_subset(animates_, min_size=8, max_size=len(animates_))

    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_, min_size=10, max_size=len(names_))

    auxiliaries = ['does', 'will', 'could', 'did', 'should', 'would']

    determiners = ['a', 'the', 'this', 'some', 'that'] + ['your', 'his', 'her']

    # note: pronouns don't get determiners, but nouns do.
    # built once, because the candidates do not change between rounds
    arguments = ['you', 'him', 'her', 'it'] + [f'the {nn}' for nn in nouns_s]

    def add_argument_after_vb(v: str,
                              arg1: str,
                              arg2: str,
                              ) -> str:
        """return the verb followed by its argument frame, or unchanged if it is not listed."""
        if v in {'say'}:
            return f'{v} something'
        elif v in {'read'}:
            return f'{v} a book'
        elif v in {'play'}:
            return f'{v} with {arg1}'
        elif v in {'use', 'find', 'get', 'be', 'order', 'need', 'have', 'control', 'want', 'free', 'keep'}:
            return f'{v} {arg1}'
        elif v in {'tell'}:
            return f'{v} me about {arg1}'
        elif v in {'plan'}:
            return f'{v} to do something with {arg1}'
        elif v in {'take'}:
            return f'{v} {arg1} away'
        elif v in {'give', 'show', 'present'}:
            return f'{v} {arg1} to {arg2}'
        elif v in {'put'}:
            return f'{v} {arg1} on {arg2}'
        elif v in {'fall'}:
            return f'{v} in {arg1}'
        elif v in {'see'}:
            # NOTE(review): arg1 is a pronoun or already starts with "the", so this frame
            # produces e.g. "how the the dog works" - confirm whether that is intended
            return f'{v} how the {arg1} works'
        elif v in {'come'}:
            return f'{v} to {arg1}'
        else:
            return v

    while True:

        # sample argument once, so that the same argument is used by both bad and good sentences.
        argument1 = random.choice(arguments)
        argument2 = random.choice(arguments)

        vb = random.choice(vbs)

        # random choices
        slot2filler = {
            'name': random.choice(names),
            'nn': random.choice(animates),
            'vb': add_argument_after_vb(vb, argument1, argument2),
            'aux': random.choice(auxiliaries),
            'det': random.choice(determiners),
        }

        # skip the combination of "did"/"does" with the verb "be"
        if slot2filler['aux'] in {'did', 'does'} and vb == 'be':
            continue

        # the same fillers are used by the bad and the good sentence of a pair
        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good
def main():
    """
    endlessly yield passive sentences that contrast the past participle with the simple past form.

    example:
    "maybe the black dog was taken by him ." vs. "maybe the black dog was took by him ."

    a paradigm that uses just adjectives results in use of 2 adjectives ("taken", "broken") only,
    when the vocab size is 8192 - hence, we do not use this paradigm.
    instead, we use the passive construction which allows us to include the verb "given".
    """
    vocab = get_vocab_words()

    modifiers = ['maybe', 'i think', 'we hope that', 'he said that']

    nouns_s = get_legal_words(tag='NN')
    adjectives = get_legal_words(tag='JJ')

    determiners = ['the', 'this', 'one', 'your']

    # (simple past, past participle)
    vds_vns = [
        ('wore', 'worn'),
        ('broke', 'broken'),
        ('hid', 'hidden'),
        ('forgot', 'forgotten'),
        ('took', 'taken'),
        ('ate', 'eaten'),
        ('drank', 'drunk'),
        ('saw', 'seen'),
        ('chose', 'chosen'),
        ('threw', 'thrown'),
        ('beat', 'beaten'),

        # ditransitive
        ('forbade', 'forbidden'),
        ('gave', 'given'),
    ]

    while True:

        # random choices
        noun = random.choice(nouns_s)
        det = random.choice(determiners)
        adj = random.choice(adjectives)
        mod = random.choice(modifiers)

        # get two contrasting irregular inflected forms.
        # past participle (vn) is always correct
        vd, vn = random.choice(vds_vns)

        # both forms must be in the vocabulary, and they must differ
        usable = vn in vocab and vd in vocab and vn != vd
        if not usable:
            continue

        # exceptional case: "given" has its own pair of templates
        templates = (template3, template4) if vn == 'given' else (template1, template2)

        for template in templates:
            yield template.format(mod, det, adj, noun, vd)  # bad
            yield template.format(mod, det, adj, noun, vn)  # good
def main():
    """
    endlessly yield sentences, alternating bad and good, four pairs per round.

    example:
    "sarah thinks about herself listening to the dog." vs.
    "sarah thinks about herself listened to that girl."

    note: names_m and names_f are not defined in this function; they must exist at module scope.
    """
    vbds1_and_vbgs1 = get_legal_words(tag='VBD', second_tag='VBG',
                                      exclude=('told', 'forgot', 'thought', 'said', 'happened'))
    vbzs2_and_vbgs2 = get_legal_words(tag='VBZ', second_tag='VBG',
                                      exclude=('tells', 'forgets', 'thinks', 'says', 'happens'))
    nouns_s = get_legal_words(tag='NN')

    vowels = {'a', 'e', 'i', 'o', 'u'}

    determiners = ['a', 'the', 'this', 'some', 'that']

    vbs = [
        'thinks about',
        'thought about',
        'did not think about',
        'could think about',
        'must think about',
        'must not think about',
    ]

    # particle/preposition -> verb forms that take it
    particle2verbs = {
        'on': ('falling', 'fell'),
        'to': ('came', 'come', 'comes', 'coming',
               'went', 'go', 'goes', 'going',
               'wrote', 'write', 'writes', 'writing',
               'ran', 'run', 'runs', 'running'),
        'in': ('lived', 'live', 'lives', 'living'),
        'at': ('looked', 'look', 'looks', 'looking'),
        'for': ('reached', 'reach', 'reaches', 'reaching'),
        'off': ('showed', 'show', 'shows', 'showing'),
        'up': ('set', 'sets', 'setting'),
        'away': ('put', 'puts', 'putting'),
    }
    verb2particle = {verb: particle
                     for particle, verbs in particle2verbs.items()
                     for verb in verbs}

    def add_preposition_after_vb(v: str) -> str:
        """return the verb followed by its particle, or the verb unchanged if it is not listed."""
        if v in verb2particle:
            return f'{v} {verb2particle[v]}'
        return v

    while True:

        vbd1, vbg1 = random.choice(vbds1_and_vbgs1)
        vbz2, vbg2 = random.choice(vbzs2_and_vbgs2)

        # random choices
        slot2filler = {
            'nn_m': random.choice(names_m),
            'nn_f': random.choice(names_f),
            'nn2': random.choice(nouns_s),
            'vbd1': add_preposition_after_vb(vbd1),
            'vbg1': add_preposition_after_vb(vbg1),
            'vbz2': add_preposition_after_vb(vbz2),
            'vbg2': add_preposition_after_vb(vbg2),
            'vb': random.choice(vbs),
            'det': random.choice(determiners),
            'prp_reflexive_m': 'himself',
            'prp_reflexive_f': 'herself',
        }

        # "a" vs. "an", decided by the first letter of nn2
        if slot2filler['det'] == 'a' and slot2filler['nn2'][0] in vowels:
            slot2filler['det'] += 'n'

        # the same fillers are used by the bad and the good sentence of every pair
        for template in (template1, template2, template3, template4):
            yield template['b'].format(**slot2filler)  # bad
            yield template['g'].format(**slot2filler)  # good
def main():
    """
    endlessly yield sentences, alternating bad and good, that contrast
    a transitive verb without object with an intransitive verb.

    example:
    "sarah laughs" vs. "sarah gives"
    """
    # we need a lot of verbs here, so temporarily reduce restrictions imposed by counterbalancing.
    # the original settings are restored in "finally", so that a failure while collecting words
    # does not leave the shared configuration modified
    saved_num_words = {tag: configs.Data.tag2num_words[tag] for tag in ('VB', 'VBZ', 'VBD')}
    saved_bias_tolerance = configs.Data.bias_tolerance
    try:
        configs.Data.tag2num_words['VB'] = 30
        configs.Data.tag2num_words['VBZ'] = 50
        configs.Data.tag2num_words['VBD'] = 50
        configs.Data.bias_tolerance = 7000

        vbs = get_legal_words(tag='VB', exclude=('fit', 'come', 'point'))
        vbzs = get_legal_words(tag='VBZ', exclude=('points', ))
        vbds = get_legal_words(tag='VBD', exclude=('fit', 'dropped', 'signed', 'formed', 'managed'))
    finally:
        for tag, num_words in saved_num_words.items():
            configs.Data.tag2num_words[tag] = num_words
        configs.Data.bias_tolerance = saved_bias_tolerance

    # read word lists inside a context manager so the file handles are closed deterministically
    with (configs.Dirs.legal_words / 'animates.txt').open() as f:
        animates_ = f.read().split()
    animates = find_counterbalanced_subset(animates_, min_size=8, max_size=len(animates_))

    prps_s = ['she', 'he', 'it']
    prps_p = ['we', 'they']
    prps = prps_s + prps_p

    aux_s = ['does']
    auxiliaries = ['could', 'can', 'would', 'will', 'did'] + aux_s

    determiners = ['the', 'this', 'some', 'that', 'every'] + ['your', 'his', 'her']

    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_, min_size=10, max_size=len(names_))

    # NOTE(review): some entries below are listed twice ('stands', 'joined', 'occurred'),
    # which doubles their sampling weight; left unchanged to preserve the distribution

    vbs_intransitive = [
        'run',
        'work',
        'turn',
        'eat',
        'live',
        'read',
        'trade',
        'play',
        'know',
        'study',
        'think',
        'change',
    ]

    vbzs_intransitive = [
        'moves',
        'lives',
        'lies',
        'knows',
        'waves',
        'changes',
        'works',
        'dies',
        'leads',
        'appears',
        'thinks',
        'falls',
        'matters',
        'turns',
        'stands',
        'stands',
        'runs',
        'calls',
        'races',
    ]

    vbds_intransitive = [
        'occurred',
        'married',
        'moved',
        'looked',
        'changed',
        'finished',
        'grew',
        'broke',
        'started',
        'improved',
        'worked',
        'thought',
        'came',
        'tried',
        'read',
        'lost',
        'knew',
        'lived',
        'accepted',
        'developed',
        'joined',
        'joined',
        'decided',
        'learned',
        'occurred',
        'happened',
        'fell',
        'refused',
        'returned',
    ]

    # every verb that is not listed as intransitive is treated as transitive
    vbs_transitive = [v for v in vbs if v not in vbs_intransitive]
    vbzs_transitive = [v for v in vbzs if v not in vbzs_intransitive]

    vbzs_or_vbds_intransitive = vbzs_intransitive + vbds_intransitive
    vbzs_or_vbds_transitive = [v for v in vbzs + vbds if v not in vbzs_or_vbds_intransitive]

    # candidates for the nn1 slot; built once, because they do not change between rounds
    subjects = animates + names + prps

    while True:

        # random choices
        slot2filler = {
            'nn1': random.choice(subjects),
            'aux': random.choice(auxiliaries),
            'vbz_or_vbd_intransitive': random.choice(vbzs_or_vbds_intransitive),
            'vbz_or_vbd_transitive': random.choice(vbzs_or_vbds_transitive),
            'vb_intransitive': random.choice(vbs_intransitive),
            'vb_transitive': random.choice(vbs_transitive),
        }

        # handle exception: "occurred" and "happened" cannot have animate subject
        if slot2filler['vbz_or_vbd_intransitive'] in ['occurred', 'happened'] and\
                slot2filler['nn1'] not in ['it', 'that', 'this']:
            continue

        # add determiner to animate noun
        if slot2filler['nn1'] in animates:
            slot2filler['nn1'] = random.choice(determiners) + ' ' + slot2filler['nn1']

        # do not use template 1 with plural pronoun and VBZ.
        # the explicit grouping matters: without it, "and" binds tighter than "or", and
        # template 1 would be skipped for every VBZ transitive verb, whatever the subject
        has_vbz = (slot2filler['vbz_or_vbd_intransitive'] in vbzs_intransitive
                   or slot2filler['vbz_or_vbd_transitive'] in vbzs_transitive)
        if not (slot2filler['nn1'] in prps_p and has_vbz):
            yield template1['b'].format(**slot2filler)  # bad
            yield template1['g'].format(**slot2filler)  # good

        # do not use template 2 with plural pronoun and singular aux (e.g. "does")
        if not (slot2filler['nn1'] in prps_p and slot2filler['aux'] in aux_s):
            yield template2['b'].format(**slot2filler)  # bad
            yield template2['g'].format(**slot2filler)  # good
def main():
    """
    endlessly yield questions that contrast number agreement between a copula and a noun.

    example:
    "where is the house?" vs "where is the houses?"

    todo "where is the house?" vs "where are the house?"
    """
    noun_pairs = get_legal_words(tag='NN', second_tag='NNP')
    adjectives = get_legal_words(tag='JJ')

    copulas_singular = ["is", "was"]
    copulas_plural = ["are", "were"]

    while True:

        # random choices
        noun_s, noun_p = random.choice(noun_pairs)
        adj = random.choice(adjectives)

        # (template, extra format arguments) - only template4 is given the adjective
        frames_before = (
            (template1, ()),
            (template2, ()),
            (template3, ()),
            (template4, (adj,)),
        )
        frames_after = (
            (template7, ()),
            (template8, ()),
        )
        # template 5 is specific to singular copula, template 6 is specific to plural copula
        frames_singular = frames_before + ((template5, ()),) + frames_after
        frames_plural = frames_before + ((template6, ()),) + frames_after

        # singular copula: the plural noun comes first
        for copula in copulas_singular:
            for template, extra in frames_singular:
                yield template.format(copula, noun_p, *extra)
                yield template.format(copula, noun_s, *extra)

        # plural copula: the singular noun comes first
        for copula in copulas_plural:
            for template, extra in frames_plural:
                yield template.format(copula, noun_s, *extra)
                yield template.format(copula, noun_p, *extra)
def main():
    """
    Yield verb-form contrast sentences, forever.

    example:
    "is the bell ringing ?" vs "is the bell rings ?"

    Every round yields four sentences in this order: template 1 bad,
    template 1 good, template 2 bad, template 2 good. The same sampled
    arguments are attached to every verb form of a round.
    """

    # counterbalance both forms of verb as different forms are the contrast
    vbgs_and_vbzs = get_legal_words(
        tag='VBG',
        second_tag='VBZ',
        exclude=('facing', 'naming', 'training', 'setting', 'meaning'),
    )
    vbs_and_vbzs = get_legal_words(tag='VB', second_tag='VBZ')
    nouns_s = get_legal_words(tag='NN')

    # read the word lists inside `with` blocks so the file handles get closed
    with (configs.Dirs.legal_words / 'animates.txt').open() as f:
        animates_ = f.read().split()
    animates = find_counterbalanced_subset(animates_, min_size=8, max_size=len(animates_))

    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_, min_size=10, max_size=len(names_))

    # the candidate pools never change, so build them once instead of per round.
    # note: pronouns don't get determiners, but nouns do
    arguments = ['you', 'him', 'her', 'it'] + [f'the {nn}' for nn in nouns_s]
    subjects = nouns_s + animates

    def add_argument_after_vb(v: str,
                              arg1: str,
                              arg2: str,
                              ) -> str:
        """Return the verb `v` followed by whatever complement that verb needs (if any)."""
        if v in {'saying', 'says', 'say'}:
            return f'{v} something'
        elif v in {'using', 'uses', 'use'}:
            return f'{v} {arg1}'
        elif v in {'telling', 'tells', 'tell'}:
            return f'{v} me about {arg1}'
        elif v in {'making', 'makes', 'make'}:
            return f'{v} {arg1} something'
        elif v in {'planning', 'plans', 'plan'}:
            return f'{v} to do something with {arg1}'
        elif v in {'taking', 'takes', 'take'}:
            return f'{v} {arg1} away'
        elif v in {'giving', 'gives', 'give'}:
            return f'{v} {arg1} {arg2}'
        elif v in {'falling', 'falls', 'fall'}:
            return f'{v} in {arg1}'
        elif v in {'showing', 'shows', 'show'}:
            return f'{v} {arg1} to {arg2}'
        elif v in {'seeing', 'sees', 'see'}:
            # arg1 already carries its own determiner ("the dog") or is a pronoun,
            # so no extra "the" is inserted here (it used to give "how the the dog works")
            return f'{v} how {arg1} works'
        elif v in {'finding', 'finds', 'find'}:
            return f'{v} {arg1}'
        elif v in {'coming', 'comes', 'come'}:
            return f'{v} to {arg1}'
        elif v in {'getting', 'gets', 'get'}:
            return f'{v} {arg1}'
        elif v in {'depending', 'depends', 'depend'}:
            return f'{v} on {arg1}'
        else:
            return v

    while True:

        vbg1, vbz1 = random.choice(vbgs_and_vbzs)  # template 1
        vb2, vbz2 = random.choice(vbs_and_vbzs)  # template 2

        # sample argument once, so that the same argument is used by both bad and good sentences.
        argument1 = random.choice(arguments)
        argument2 = random.choice(arguments)

        # random choices
        slot2filler = {
            'name': random.choice(names),
            'nn': random.choice(subjects),
            'vb2': add_argument_after_vb(vb2, argument1, argument2),
            'vbz2': add_argument_after_vb(vbz2, argument1, argument2),
            'vbg1': add_argument_after_vb(vbg1, argument1, argument2),
            'vbz1': add_argument_after_vb(vbz1, argument1, argument2),
        }

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good
def main():
    """
    Yield argument-structure contrast sentences, forever.

    example:
    "give me the frog ." vs. "the frog gives me ."

    Every round yields six sentences: the bad and then the good variant of
    template 1, template 2 and template 3, in that order.
    """

    # counterbalance both forms of verb as different forms are the contrast

    vbs_and_vbzs_1 = [
        ('give', 'gives'),
    ]

    vbs_and_vbzs_2 = [
        ('ask', 'asked'),  # "asks" is not in vocab
        ('tell', 'tells'),
    ]

    # pronouns and conjunctions that are substituted into the continuations below
    prps_obj = ['me', 'you', 'him', 'her', 'them']
    conjunctions = ['when', 'but', 'with', 'and']

    # TODO use a counterbalanced verb list
    # each entry is (verb, good continuation, bad continuation)
    vbzs3_and_continuations = [  # contains a mix of past and present tense forms
        # past tense form
        ('saw', '{prp_obj} there', '{prp_obj} by'),
        ('created', '{prp_obj}', '{det}'),
        ('told', '{prp_obj} about that', '{prp_obj} about {det}'),
        ('wrote', '{prp_obj} something', '{prp_obj} {det}'),
        ('wanted', '{prp_obj}', 'to'),
        ('asked', 'about {prp_obj}', '{prp_obj} about'),
        ('sold', '{prp_obj} that', 'that to'),
        ('changed', '{prp_obj}', '{det}'),
        # present tense form
        ('looks', 'at {prp_obj}', 'at {det}'),
        ('plays', 'with {prp_obj}', 'with {det}'),
        ('thinks', 'about {prp_obj}', 'about {det}'),
        ('moves', 'fast', 'to'),
        ('works', 'well', '{conjunction}'),
    ]

    adjectives = get_legal_words(tag='JJ')
    nouns_s = get_legal_words(tag='NN')

    # read the word list inside a `with` block so the file handle gets closed
    with (configs.Dirs.legal_words / 'animates.txt').open() as f:
        animates_ = f.read().split()
    animates = find_counterbalanced_subset(animates_, min_size=8, max_size=len(animates_))

    personal_pronouns_obj = ['me', 'him', 'her', 'us', 'them']  # in the objective case
    personal_pronouns_subj = ['i', 'he', 'she', 'we', 'they']  # in the subjective case

    determiners = ['a', 'one', 'the', 'my', 'his', 'some']  # do not include "this" or "that" or "her"

    vowels = {'a', 'e', 'i', 'o', 'u'}

    while True:

        vb1, vbz1 = random.choice(vbs_and_vbzs_1)  # template 1
        vb2, vbz2 = random.choice(vbs_and_vbzs_2)  # template 2
        vbz3, cont_g, cont_b = random.choice(vbzs3_and_continuations)  # template 3

        # good and bad continuations
        # (str.format ignores keyword arguments a continuation does not mention)
        prp_obj = random.choice(prps_obj)
        conjunction = random.choice(conjunctions)
        cont_g = cont_g.format(prp_obj=prp_obj)
        cont_b = cont_b.format(prp_obj=prp_obj,
                               det=random.choice(determiners),
                               conjunction=conjunction)

        # random choices
        slot2filler = {
            'det': random.choice(determiners),
            'jj': random.choice(adjectives),
            'nn': random.choice(animates),
            'nn2': random.choice(nouns_s),
            'prp_obj': random.choice(personal_pronouns_obj),
            'prp_subj': random.choice(personal_pronouns_subj),
            'vb1': vb1,
            'vbz1': vbz1,
            'vb2': vb2,
            'vbz2': vbz2,
            'vbz3': vbz3,
            'cont_g': cont_g,
            'cont_b': cont_b,
        }

        # "a" becomes "an" when the second noun starts with a vowel letter
        if slot2filler['det'] == 'a' and slot2filler['nn2'][0] in vowels:
            slot2filler['det'] += 'n'

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good

        yield template3['b'].format(**slot2filler)  # bad
        yield template3['g'].format(**slot2filler)  # good
def main():
    """
    Yield quantifier contrast sentences, forever.

    example:
    "no cat can jump on more than two dogs ." vs. "no cat can jump on at least two dogs ."

    Every round yields two sentences built from the same template and the same
    fillers; only the quantifier differs (bad quantifier first, good second).
    """

    nouns_s_and_p = get_legal_words(tag='NN', second_tag='NNP')

    # read the word lists inside `with` blocks so the file handles get closed
    with (configs.Dirs.legal_words / "number_words.txt").open() as f:
        number_words_ = f.read().split()
    number_words = find_counterbalanced_subset(number_words_, min_size=6, max_size=len(number_words_))

    # (good, bad) quantifier pairs
    quantifiers_g_b = [
        ('more than', 'at least'),
        ('fewer than', 'at most'),
    ]

    with (configs.Dirs.legal_words / 'animates.txt').open() as f:
        animates_ = f.read().split()
    animates = find_counterbalanced_subset(animates_, min_size=8, max_size=len(animates_))

    verbs_ = [
        'become',
        'catch',
        'leave',
        'increase',
        'move',
        'open',
        'exist',
        'contain',
        'stand',
        'change',
        'surround',
        'carry',
        'act',
    ]
    verbs = find_counterbalanced_subset(verbs_, min_size=8, max_size=len(verbs_))

    # a linker can be a preposition or determiner phrase
    verb2linker = {
        'become': None,
        'catch': None,
        'leave': None,
        'increase': 'the size of',
        'move': 'to',
        'open': 'the door to',
        'exist': 'without',
        'contain': None,
        'stand': 'on top of',
        'change': None,
        'surround': None,
        'carry': None,
        'act': 'like',
    }

    auxiliaries = ['can', 'could']

    while True:

        # random choices
        animate = random.choice(animates)
        noun_s, noun_p = random.choice(nouns_s_and_p)
        number_word = random.choice(number_words)
        quantifier_g, quantifier_b = random.choice(quantifiers_g_b)
        verb = random.choice(verbs)
        aux = random.choice(auxiliaries)

        # append the linker only for verbs that have one
        linker = verb2linker[verb]
        verb_and_optional_linker = verb if linker is None else verb + ' ' + linker

        # only "one" takes the singular noun
        noun = noun_s if number_word == 'one' else noun_p

        yield template1.format(animate, aux, verb_and_optional_linker, quantifier_b, number_word, noun)  # bad
        yield template1.format(animate, aux, verb_and_optional_linker, quantifier_g, number_word, noun)  # good
def main():
    """
    Yield adjunct-island contrast sentences, forever.

    example:
    "who should sarah hug after shocking the dog ?" vs "who should sarah hug the dog after shocking ?"

    note: this task is too difficult for babyBERTa

    Every round yields four sentences in this order: template 1 bad,
    template 1 good, template 2 bad, template 2 good.
    """

    nouns_s = get_legal_words(tag='NN')

    excluded_verbs_base = ('run', 'say', 'be', 'give', 'tell', 'live', 'force')
    verbs_base = get_legal_words(tag='VB', exclude=excluded_verbs_base)

    excluded_verbs_gerund = ('saying', )
    verbs_gerund = get_legal_words(tag='VBG', exclude=excluded_verbs_gerund)

    # read the word list inside a `with` block so the file handle gets closed
    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_, min_size=10, max_size=len(names_))

    pps = ['after', 'before', 'while', 'without']

    # verbs that need a preposition before their object.
    # 'asking' maps to 'asking about': a second, unreachable 'asking for' branch was dropped.
    verb2phrase = {
        'related': 'related to',
        'acting': 'acting like',
        'put': 'put on',
        'work': 'work for',
        'sleeping': 'sleeping in',
        'standing': 'standing on',
        'depending': 'depending on',
        'flying': 'flying over',
        'falling': 'falling on',
        'asking': 'asking about',
        'swimming': 'swimming in',
        'coming': 'coming to',
    }

    def add_preposition_after_vb(v: str) -> str:
        """Return `v` with its preposition appended, or `v` unchanged if it needs none."""
        return verb2phrase.get(v, v)

    while True:

        # random choices
        slot2filler = {
            'name': random.choice(names),
            'nn': random.choice(nouns_s),
            'pp': random.choice(pps),
            'vb': random.choice(verbs_base),
            'vbg': random.choice(verbs_gerund),
        }

        slot2filler['vb'] = add_preposition_after_vb(slot2filler['vb'])
        slot2filler['vbg'] = add_preposition_after_vb(slot2filler['vbg'])

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good
def main():
    """
    Yield reflexive-pronoun gender-agreement sentences, forever.

    example:
    "katherine will help herself do something" vs. "katherine will help himself do something"

    Every round yields eight sentences: bad then good for templates 1 and 2,
    followed by bad then good for templates 3 and 4. Negation is only ever
    added to the auxiliary before templates 3 and 4 are filled.
    """

    excluded_verbs_base = ('say', )
    verbs_base = get_legal_words(tag='VB', exclude=excluded_verbs_base)
    nouns_s = get_legal_words(tag='NN')

    # read the word list inside a `with` block so the file handle gets closed
    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_, min_size=10, max_size=len(names_))

    auxiliaries = ['can', 'could', 'will', 'would', 'must', 'should']

    # the candidate pools never change, so build them once instead of per round
    subjects_m = [name for name in names if name in names_m] + ['he', 'the man', 'a man', 'that man']
    subjects_f = [name for name in names if name in names_f] + ['she', 'the woman', 'a woman', 'that woman']
    noun_arguments = [f'the {nn}' for nn in nouns_s]

    def add_misc_after_prp(prp: str,
                           v: str,
                           arg1: str,
                           ) -> str:
        """Return the reflexive `prp` followed by the material verb `v` requires after it (if any)."""
        if v in {'take', }:
            return f'{prp} to {arg1}'
        elif v in {'make', 'give', 'put'}:
            return f'{prp} {arg1}'
        elif v in {'work', }:
            return f'{prp} on {arg1}'
        elif v in {'turn', }:
            return f'{prp} around'
        elif v in {'tell', }:
            return f'{prp} about {arg1}'
        else:
            return prp

    def add_preposition_after_vb(v: str) -> str:
        """Return `v` with its preposition appended, or `v` unchanged if it needs none."""
        if v in {'work', 'study', 'live'}:
            return f'{v} with'
        elif v in {'point', 'run'}:
            return f'{v} to'
        elif v in {'be', }:
            return f'{v} like'
        else:
            return v

    while True:

        vb = random.choice(verbs_base)

        # random choices
        slot2filler = {
            'aux': random.choice(auxiliaries),
            'nn_m': random.choice(subjects_m),
            'nn_f': random.choice(subjects_f),
        }

        # sample argument once, so that the same argument is used by both bad and good sentences.
        # note: pronouns don't get determiners, but nouns do
        if vb == 'put':
            argument1 = random.choice(['in danger', 'in this situation'])
        else:
            argument1 = random.choice(noun_arguments)

        # first, add some miscellaneous component
        slot2filler['prp_m'] = add_misc_after_prp('himself', vb, argument1)
        slot2filler['prp_f'] = add_misc_after_prp('herself', vb, argument1)

        # second, add a preposition
        slot2filler['vb'] = add_preposition_after_vb(vb)

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good

        # use negation only in non-question, in templates 3, and 4
        if random.random() < 0.5:
            slot2filler['aux'] += ' ' + 'not'
        # NOTE(review): this draw is independent of the one above, so "did not" can
        # also replace a non-negated auxiliary - confirm that this is intended
        if random.random() < 0.1:
            slot2filler['aux'] = 'did not'

        yield template3['b'].format(**slot2filler)  # bad
        yield template3['g'].format(**slot2filler)  # good

        yield template4['b'].format(**slot2filler)  # bad
        yield template4['g'].format(**slot2filler)  # good
def main():
    """
    Yield coordinate-structure contrast sentences, forever.

    example:
    "who must sarah and the dog kiss ?" vs "who must sarah kiss and the dog ?"

    Rounds whose verb choices are on the exclusion lists are discarded; every
    other round yields eight sentences: bad then good for templates 1 to 4.
    """

    excluded_verbs_base = ('run', 'be', 'live', 'force', 'order')
    verbs_base = get_legal_words(tag='VB', exclude=excluded_verbs_base)

    excluded_verbs_gerund = ('',)
    verbs_gerund = get_legal_words(tag='VBG', exclude=excluded_verbs_gerund)

    # read the word lists inside `with` blocks so the file handles get closed
    with (configs.Dirs.legal_words / 'animates.txt').open() as f:
        animates_ = f.read().split()
    animates = find_counterbalanced_subset(animates_, min_size=8, max_size=len(animates_))

    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_, min_size=10, max_size=len(names_))

    # verbs that need a preposition before their object.
    # 'work' maps to 'work for': a second, unreachable 'work' branch was dropped.
    verb2phrase = {
        'related': 'related to',
        'put': 'put on',
        'work': 'work for',
        'acting': 'acting like',
        'sleeping': 'sleeping in',
        'falling': 'falling on',
        'looking': 'looking for',
        'running': 'running to',
        'talking': 'talking about',
        'thinking': 'thinking about',
        'reaching': 'reaching for',
    }

    def add_preposition_after_vb(v: str) -> str:
        """Return `v` with its preposition appended, or `v` unchanged if it needs none."""
        return verb2phrase.get(v, v)

    # verb fillers that combine badly with "who", e.g. "saying who"
    bad_vbgs = {'saying', 'drinking', 'eating', 'open'}
    bad_vbs = {'need', 'feel', 'open'}

    while True:

        # random choices
        slot2filler = {
            'name': random.choice(names),
            'nn': random.choice(animates),
            'vb': random.choice(verbs_base),
            'vbg': random.choice(verbs_gerund),
        }

        # No verb phrase uses this pronoun, but the draw is kept so that a seeded
        # run consumes the random stream exactly as before.
        random.choice(["him", "her", "them", "us"])

        slot2filler['vb'] = add_preposition_after_vb(slot2filler['vb'])
        slot2filler['vbg'] = add_preposition_after_vb(slot2filler['vbg'])

        # exclude bad combinations that involve "who", e.g. "saying who"
        if slot2filler['vbg'] in bad_vbgs or slot2filler['vb'] in bad_vbs:
            continue

        if slot2filler['vb'] == 'tell':
            slot2filler['vb'] = 'tell something'

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good

        yield template3['b'].format(**slot2filler)  # bad
        yield template3['g'].format(**slot2filler)  # good

        yield template4['b'].format(**slot2filler)  # bad
        yield template4['g'].format(**slot2filler)  # good