Exemplo n.º 1
0
def main():
    """
    Endlessly generate bad/good sentence pairs for two templates.

    Every pass through the loop yields four sentences: the 'b' (bad) and then
    the 'g' (good) variant of template1, followed by the same pair for template2.
    All four sentences of one pass share the same randomly chosen slot fillers.

    example:
    "sarah discovered the vase that the dog might take ." vs. "sarah discovered what the dog might take the vase ."

    """

    nouns_s = get_legal_words(tag='NN')

    excluded_verbs_base = ('put', 'run', 'say', 'be', 'give', 'tell', 'live')
    verbs_base = get_legal_words(tag='VB', exclude=excluded_verbs_base)

    excluded_verbs_past = ('started', 'let', 'told')
    verbs_past = get_legal_words(tag='VBD', exclude=excluded_verbs_past)

    # read the word lists inside a context manager so the file handles get closed
    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_,
                                        min_size=10,
                                        max_size=len(names_))

    with (configs.Dirs.legal_words / 'animates.txt').open() as f:
        animates_ = f.read().split()
    animates = find_counterbalanced_subset(animates_,
                                           min_size=8,
                                           max_size=len(animates_))

    # base verbs that are followed by a fixed preposition
    vb2vb_with_preposition = {
        'play': 'play with',
        'point': 'point to',
        'turn': 'turn to',
        'work': 'work with',
    }

    def add_preposition_after_vb(v: str):
        # verbs without an entry in the table are returned unchanged
        return vb2vb_with_preposition.get(v, v)

    while True:

        # random choices
        slot2filler = {
            'name': random.choice(names),
            'nn1': random.choice(nouns_s),
            'nn2': random.choice(animates),
            'vbd': random.choice(verbs_past),
            'vbd2': random.choice(verbs_past),  # used in template2 only
            'vb': random.choice(verbs_base),  # used in template 1 only
        }

        slot2filler['vb'] = add_preposition_after_vb(slot2filler['vb'])

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good
Exemplo n.º 2
0
Arquivo: n_bar.py Projeto: phueb/Zorro
def main():
    """
    Endlessly generate bad/good sentence pairs for two templates.

    Every pass through the loop yields the 'b' (bad) and then the 'g' (good)
    variant of template1, followed by the same pair for template2. All four
    sentences of one pass share the same randomly chosen slot fillers.

    example:
    "sam found one purple dog and karen revealed more ." vs. "sam found one dog and karen revealed more purple."
    """

    vbds = get_legal_words(tag='VBD')
    adjectives = get_legal_words(tag='JJ')

    # read the word lists inside a context manager so the file handles get closed
    with (configs.Dirs.legal_words / 'nouns_mass.txt').open() as f:
        nouns_mass = f.read().split()
    # mass nouns are excluded from the singular nouns
    nouns_s = get_legal_words(tag='NN', exclude=tuple(nouns_mass))

    with (configs.Dirs.legal_words / 'animates.txt').open() as f:
        animates_ = f.read().split()
    animates = find_counterbalanced_subset(animates_,
                                           min_size=8,
                                           max_size=len(animates_))

    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_,
                                        min_size=10,
                                        max_size=len(names_))

    determiners = ['a', 'the', 'this', 'some', 'that'] + ['your', 'his', 'her']

    number_words = ['several', 'more', 'two', 'three',
                    'a lot more']  # , 'some']

    while True:

        # random choices
        slot2filler = {
            'name1': random.choice(names),
            'name2': random.choice(names),
            'nn': random.choice(nouns_s),
            'nn2': random.choice(animates),
            'vbd': random.choice(vbds),
            'det': random.choice(determiners),
            'jj': random.choice(adjectives),
            'number': random.choice(number_words),
        }

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good
Exemplo n.º 3
0
Arquivo: words.py Projeto: phueb/Zorro
def get_legal_words(tag: str,
                    second_tag: Optional[str] = None,  # also counterbalance list of other word forms (e.g. plural)
                    seed: int = configs.Data.seed,
                    exclude: Optional[Tuple[str, ...]] = None,
                    verbose: bool = False,
                    ) -> Union[List[str], List[Tuple[str, str]]]:
    """
    Load the legal words for `tag` and return a counterbalanced subset of them.

    Words are read from `<tag>.csv` in `configs.Dirs.legal_words`; only rows whose
    `is_legal` column is truthy are kept, and words listed in `exclude` are dropped.

    When `second_tag` is given, a second form is derived for every word
    ('NNP' -> plural via inflect, any tag starting with 'VB' -> inflection of the
    verb lemma), and a word is kept only if its second form is in `vocab` and
    differs from the first form.

    Returns whatever `find_counterbalanced_subset` returns for the remaining words.

    Raises AttributeError when `second_tag` is neither None, 'NNP', nor a 'VB*' tag.
    """

    print(f'Obtaining counterbalanced subset of legal words with tag={tag} and second_tag={second_tag}')

    # load the table of words for the requested tag and keep the legal ones, in file order
    legal_table = pd.read_csv(configs.Dirs.legal_words / f'{tag}.csv')
    is_legal = legal_table['is_legal'].astype(bool).tolist()
    candidates = legal_table['word'][is_legal].tolist()

    # drop explicitly excluded words, if any were given
    if exclude:
        candidates = [word for word in candidates if word not in exclude]

    if second_tag is None:
        # no second forms requested: use the candidates as they are
        first_forms = candidates
        second_forms = None
    else:
        # derive the second form of every candidate
        if second_tag == 'NNP':
            engine = inflect.engine()
            derived = [engine.plural(word) for word in candidates]
        elif second_tag.startswith('VB'):
            lemmas = [getLemma(word, upos='VERB')[0] for word in candidates]
            derived = [getInflection(lemma, tag=second_tag)[0] for lemma in lemmas]  # requires lemma as input
        else:
            raise AttributeError('Invalid arg to second_tag')

        # keep a pair only if the 2nd form is in vocab and is not identical to the 1st form
        kept_pairs = [(w1, w2) for w1, w2 in zip(candidates, derived)
                      if w2 in vocab and w2 != w1]
        if verbose:
            for w1, w2 in kept_pairs:
                print(f'Included {w1:<12} and {w2:<12}')
        first_forms = [w1 for w1, _ in kept_pairs]
        second_forms = [w2 for _, w2 in kept_pairs]
        assert first_forms
        assert second_forms

    # find subset of words such that their total corpus frequencies are approx equal across corpora
    sample_size = configs.Data.tag2num_words[tag]
    return find_counterbalanced_subset(first_forms,
                                       min_size=sample_size,
                                       max_size=sample_size + 100,
                                       second_forms=second_forms,
                                       seed=seed,
                                       verbose=verbose,
                                       )
Exemplo n.º 4
0
def main():
    """
    Endlessly generate bad/good sentence pairs for one template.

    Every pass through the loop yields the 'b' (bad) and then the 'g' (good)
    variant of template1, both filled with the same randomly chosen words.

    example:
    "he made the van this challenge ." vs. "the van made he this challenge ."
    """

    # counterbalance both forms of verb as different forms are the contrast

    vbds = [
        'brought',
        'made',
        'built',
        'gave',
        'showed',
    ]

    nouns_s = get_legal_words(tag='NN')

    # read the word list inside a context manager so the file handle gets closed
    with (configs.Dirs.legal_words / 'animates.txt').open() as f:
        animates_ = f.read().split()
    animates = find_counterbalanced_subset(animates_,
                                           min_size=8,
                                           max_size=len(animates_))

    personal_pronouns_obj = ['me', 'him', 'her', 'us',
                             'them']  # in the objective case
    personal_pronouns_subj = ['i', 'he', 'she', 'we',
                              'they']  # in the subjective case

    determiners = ['a', 'one', 'this', 'that', 'the', 'my', 'his', 'her']

    vowels = {'a', 'e', 'i', 'o', 'u'}

    while True:

        vbd = random.choice(vbds)  # template 1

        # random choices
        slot2filler = {
            'nn': random.choice(animates),
            'nn2': random.choice(nouns_s),
            'det': random.choice(determiners),
            'prp_obj': random.choice(personal_pronouns_obj),
            'prp_subj': random.choice(personal_pronouns_subj),
            'vbd': vbd,
        }

        # "a" vs. "an": decided by the first letter of the noun in slot 'nn2'
        if slot2filler['det'] == 'a' and slot2filler['nn2'][0] in vowels:
            slot2filler['det'] += 'n'

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good
Exemplo n.º 5
0
def main():
    """
    Endlessly generate sentence pairs that differ only in word order.

    Every pass through the loop fills template1 twice with the same words:
    first with 'there' in front of the copula (labelled bad), then with the
    quantified subject in front of it (labelled good).

    example:
    "a documentary was there looking at dogs ." vs. "there was a documentary looking at dogs ."

    note: this task is too difficult for babyBERTa
    """

    nouns_s_and_p = get_legal_words(tag='NN', second_tag='NNP')
    adjectives = get_legal_words(tag='JJ')

    quantifiers = ['each', 'most', 'all', 'every']
    plural_quantifiers = {'most', 'all'}

    copula_p = ['were', 'are', "were not", "aren't"]
    copula_s = ['was', 'is', "was not", "isn't"]

    vowels = {'a', 'e', 'i', 'o', 'u'}

    # each gerund is followed by a linker, which can be a preposition or determiner phrase
    gerund2linker = {
        'looking': 'like a',
        'becoming': 'some kind of a',
        'falling': 'on the',
        'leaving': 'us by the',
        'eating': 'one piece of this',
        'increasing': 'the size of the',
        'moving': 'to the',
        'opening': 'the door to a',
        'existing': 'without a',
        'containing': 'a',
        'standing': 'on top of a',
        'changing': 'the',
        'surrounding': 'the',
        'adding': 'to the',
        'acting': 'like a',
    }

    # the candidate gerunds are the keys of the table above, in the order they are listed
    gerunds_ = list(gerund2linker)
    gerunds = find_counterbalanced_subset(gerunds_,
                                          min_size=8,
                                          max_size=len(gerunds_))

    while True:

        # random choices
        noun_s, noun_p = random.choice(nouns_s_and_p)
        adj = random.choice(adjectives)
        quantifier = random.choice(quantifiers)
        subject_singular, subject_plural = random.choice(nouns_s_and_p)
        gerund = random.choice(gerunds)

        # the quantifier decides whether copula and subject are plural or singular
        needs_plural = quantifier in plural_quantifiers
        copula = random.choice(copula_p if needs_plural else copula_s)
        subj1 = subject_plural if needs_plural else subject_singular  # for template 1

        # "a" vs. "an": a linker ending in "a" gets an "n" before a vowel-initial adjective
        linker = gerund2linker[gerund]
        if adj[0] in vowels and linker.endswith('a'):
            linker = linker + 'n'

        # the part after the first four slots is shared by both sentences
        shared_tail = (gerund, linker, adj, noun_s)

        # contrast is about word order
        yield template1.format('there', copula, quantifier, subj1, *shared_tail)  # bad
        yield template1.format(quantifier, subj1, copula, 'there', *shared_tail)  # good
Exemplo n.º 6
0
def main():
    """
    Endlessly generate sentence pairs that differ only in the quantifier.

    Every pass through the loop fills template1 twice with the same words:
    first with the bad quantifier ('at least' / 'at most'), then with the
    matching good quantifier ('more than' / 'fewer than').

    example:
    "no cat can jump on more than two dogs ." vs. "no cat can jump on at least two dogs ."
    """

    nouns_s_and_p = get_legal_words(tag='NN', second_tag='NNP')

    # read the word lists inside a context manager so the file handles get closed
    with (configs.Dirs.legal_words / "number_words.txt").open() as f:
        number_words_ = f.read().split()
    number_words = find_counterbalanced_subset(number_words_,
                                               min_size=6,
                                               max_size=len(number_words_))

    quantifiers_g_b = [
        ('more than', 'at least'),
        ('fewer than', 'at most'),
    ]

    with (configs.Dirs.legal_words / 'animates.txt').open() as f:
        animates_ = f.read().split()
    animates = find_counterbalanced_subset(animates_,
                                           min_size=8,
                                           max_size=len(animates_))

    verbs_ = [
        'become',
        'catch',
        'leave',
        'increase',
        'move',
        'open',
        'exist',
        'contain',
        'stand',
        'change',
        'surround',
        'carry',
        'act',
    ]
    verbs = find_counterbalanced_subset(verbs_,
                                        min_size=8,
                                        max_size=len(verbs_))

    # a linker can be a preposition or determiner phrase
    # (None means the verb is used without a linker)
    verb2linker = {
        'become': None,
        'catch': None,
        'leave': None,
        'increase': 'the size of',
        'move': 'to',
        'open': 'the door to',
        'exist': 'without',
        'contain': None,
        'stand': 'on top of',
        'change': None,
        'surround': None,
        'carry': None,
        'act': 'like',
    }

    while True:

        # random choices
        animate = random.choice(animates)
        noun_s, noun_p = random.choice(nouns_s_and_p)
        number_word = random.choice(number_words)
        quantifier_g, quantifier_b = random.choice(quantifiers_g_b)
        verb = random.choice(verbs)
        aux = random.choice(['can', 'could'])

        verb_and_optional_linker = verb
        if verb2linker[verb] is not None:
            verb_and_optional_linker += ' ' + verb2linker[verb]

        # only the number word "one" takes the singular noun
        if number_word == 'one':
            noun = noun_s
        else:
            noun = noun_p

        yield template1.format(animate, aux, verb_and_optional_linker,
                               quantifier_b, number_word, noun)  # bad
        yield template1.format(animate, aux, verb_and_optional_linker,
                               quantifier_g, number_word, noun)  # good
Exemplo n.º 7
0
def main():
    """
    Endlessly generate bad/good sentence pairs that contrast two verb forms.

    Every pass through the loop yields the 'b' (bad) and then the 'g' (good)
    variant of template1, followed by the same pair for template2. All four
    sentences of one pass share the same randomly chosen slot fillers.

    example:
    "is the bell ringing ?" vs "is the bell rings ?"
    """

    # counterbalance both forms of verb as different forms are the contrast
    vbgs_and_vbzs = get_legal_words(tag='VBG',
                                    second_tag='VBZ',
                                    exclude=('facing', 'naming', 'training',
                                             'setting', 'meaning'))
    vbs_and_vbzs = get_legal_words(tag='VB', second_tag='VBZ')

    nouns_s = get_legal_words(tag='NN')

    # read the word lists inside a context manager so the file handles get closed
    with (configs.Dirs.legal_words / 'animates.txt').open() as f:
        animates_ = f.read().split()
    animates = find_counterbalanced_subset(animates_,
                                           min_size=8,
                                           max_size=len(animates_))

    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_,
                                        min_size=10,
                                        max_size=len(names_))

    def add_argument_after_vb(
        v: str,
        arg1: str,
        arg2: str,
    ) -> str:
        # append the argument(s) a verb needs; verbs not listed are returned unchanged
        if v in {'saying', 'says', 'say'}:
            return f'{v} something'
        elif v in {'using', 'uses', 'use'}:
            return f'{v} {arg1}'
        elif v in {'telling', 'tells', 'tell'}:
            return f'{v} me about {arg1}'
        elif v in {'making', 'makes', 'make'}:
            return f'{v} {arg1} something'
        elif v in {'planning', 'plans', 'plan'}:
            return f'{v} to do something with {arg1}'
        elif v in {'taking', 'takes', 'take'}:
            return f'{v} {arg1} away'
        elif v in {'giving', 'gives', 'give'}:
            return f'{v} {arg1} {arg2}'
        elif v in {'falling', 'falls', 'fall'}:
            return f'{v} in {arg1}'
        elif v in {'showing', 'shows', 'show'}:
            return f'{v} {arg1} to {arg2}'
        elif v in {'seeing', 'sees', 'see'}:
            return f'{v} how the {arg1} works'
        elif v in {'finding', 'finds', 'find'}:
            return f'{v} {arg1}'
        elif v in {'coming', 'comes', 'come'}:
            return f'{v} to {arg1}'
        elif v in {'getting', 'gets', 'get'}:
            return f'{v} {arg1}'
        elif v in {'depending', 'depends', 'depend'}:
            return f'{v} on {arg1}'
        else:
            return v

    # candidate pools do not change between iterations, so build them once.
    # note: pronouns don't get determiners, but nouns do
    arguments = ['you', 'him', 'her', 'it'] + [f'the {nn}' for nn in nouns_s]
    nouns_and_animates = nouns_s + animates

    while True:

        vbg1, vbz1 = random.choice(vbgs_and_vbzs)  # template 1
        vb2, vbz2 = random.choice(vbs_and_vbzs)  # template 2

        # sample argument once, so that the same argument is used by both bad and good sentences.
        argument1 = random.choice(arguments)
        argument2 = random.choice(arguments)

        # random choices
        slot2filler = {
            'name': random.choice(names),
            'nn': random.choice(nouns_and_animates),
            'vb2': add_argument_after_vb(vb2, argument1, argument2),
            'vbz2': add_argument_after_vb(vbz2, argument1, argument2),
            'vbg1': add_argument_after_vb(vbg1, argument1, argument2),
            'vbz1': add_argument_after_vb(vbz1, argument1, argument2),
        }

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good
Exemplo n.º 8
0
def main():
    """
    Endlessly generate sentence pairs that differ only in the quantifier.

    Every pass through the loop fills template1 with a bad and then a good
    quantifier, and afterwards does the same for template2. The copula and the
    number of the subject are chosen to agree with the good quantifier.

    example:
    "there was a documentary about dogs ." vs. "there was each documentary about dogs ."

    """

    nouns_s_and_p = get_legal_words(tag='NN', second_tag='NNP')
    adjectives = get_legal_words(tag='JJ')

    quantifiers_good = ['a', 'no', 'some', 'many', 'few']
    quantifiers_bad = ['each', 'most', 'all', 'every']

    template1_subjects_s_and_p = [
        ('movie', 'movies'),
        ('book', 'books'),
        ('story', 'stories'),
        ('sign', 'signs'),
    ]

    vowels = {'a', 'e', 'i', 'o', 'u'}

    copula_p = ['were', 'are', "were not", "aren't"]
    copula_s = ['was', 'is', 'was not', "isn't"]

    # read the word list inside a context manager so the file handle gets closed
    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_,
                                        min_size=10,
                                        max_size=len(names_))

    def match_article(quantifier: str, noun: str) -> str:
        # "a" becomes "an" before a noun that starts with a vowel letter
        if quantifier == 'a' and noun[0] in vowels:
            return 'an'
        return quantifier

    while True:

        # random choices
        noun_s, noun_p = random.choice(nouns_s_and_p)
        adj = random.choice(adjectives)
        quantifier_b = random.choice(quantifiers_bad)
        quantifier_g = random.choice(quantifiers_good)
        subj_s, sub_p = random.choice(template1_subjects_s_and_p)
        name = random.choice(names)

        # plural vs. singular copula
        if quantifier_g in {'some', 'many', 'few'}:
            copula = random.choice(copula_p)
            subj1 = sub_p  # for template 1
            subj2 = noun_p  # for template 2
        else:
            copula = random.choice(copula_s)
            subj1 = subj_s
            subj2 = noun_s

        # prevent double negation.
        # the contracted forms ("isn't", "aren't") have no space before "n't",
        # so the contraction itself must be stripped
        if quantifier_g == 'no':
            copula = copula.replace(' not', '').replace("n't", '')

        # "a" vs. "an" is decided separately for each template, because the
        # quantifier is followed by subj1 in template 1 but by subj2 in template 2
        yield template1.format(copula, match_article(quantifier_b, subj1),
                               subj1, adj, noun_p)  # bad
        yield template1.format(copula, match_article(quantifier_g, subj1),
                               subj1, adj, noun_p)  # good

        yield template2.format(copula, match_article(quantifier_b, subj2),
                               subj2, name)  # bad
        yield template2.format(copula, match_article(quantifier_g, subj2),
                               subj2, name)  # good
Exemplo n.º 9
0
def main():
    """
    Endlessly generate bad/good sentence pairs for two templates.

    Every pass through the loop yields the 'b' (bad) and then the 'g' (good)
    variant of template1, followed by the same pair for template2. All four
    sentences of one pass share the same randomly chosen slot fillers.

    example:
    "sam questioned the dog that can hurt sara ." vs "sam questioned who the dog can hurt sara."

    """

    nouns_s_and_p = get_legal_words(tag='NN', second_tag='NNP')

    excluded_verbs_base = ('put', 'run', 'say', 'be', 'give', 'tell', 'live')
    verbs_base = get_legal_words(tag='VB', exclude=excluded_verbs_base)

    excluded_verbs_past = ('started', 'let', 'told')
    verbs_past = get_legal_words(tag='VBD', exclude=excluded_verbs_past)

    excluded_verbs_gerund = ('saying', )
    verbs_gerund = get_legal_words(tag='VBG', exclude=excluded_verbs_gerund)

    # read the word list inside a context manager so the file handle gets closed
    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_,
                                        min_size=10,
                                        max_size=len(names_))

    def add_preposition_after_vb(v: str):
        # append the preposition a verb needs; verbs not listed are returned unchanged
        if v in {'acting', 'act'}:
            return f'{v} like'
        elif v in {
                'standing', 'stand', 'falling', 'fall', 'depending', 'depend'
        }:
            return f'{v} on'
        elif v in {'asking', 'ask', 'writing', 'write', 'thinking', 'think'}:
            return f'{v} about'
        elif v in {'swimming', 'swim', 'sleeping', 'sleep'}:
            return f'{v} in'
        elif v in {'driving', 'drive', 'coming', 'come', 'related', 'relate'}:
            return f'{v} to'
        elif v in {'flying', 'fly', 'working', 'work'}:
            return f'{v} with'
        else:
            return v

    while True:

        # random choices
        # ('nn' and 'nns' are drawn independently, so they may come from different nouns)
        slot2filler = {
            'name': random.choice(names),
            'nn': random.choice(nouns_s_and_p)[0],
            'nns': random.choice(nouns_s_and_p)[1],
            'vbd': random.choice(verbs_past),
            'vbg': random.choice(verbs_gerund),  # used in template2 only
            'vb': random.choice(verbs_base),  # used in template 1 only
        }

        slot2filler['vb'] = add_preposition_after_vb(slot2filler['vb'])
        slot2filler['vbd'] = add_preposition_after_vb(slot2filler['vbd'])
        slot2filler['vbg'] = add_preposition_after_vb(slot2filler['vbg'])

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good
Exemplo n.º 10
0
def main():
    """
    Endlessly generate bad/good sentence pairs for four templates.

    Every pass through the loop yields the 'b' (bad) and then the 'g' (good)
    variant of template1 and template2, then possibly negates the auxiliary,
    and yields the same pairs for template3 and template4.

    example:
    "katherine will help herself do something" vs. "katherine will help himself do something"
    """

    excluded_verbs_base = ('say', )
    verbs_base = get_legal_words(tag='VB', exclude=excluded_verbs_base)

    nouns_s = get_legal_words(tag='NN')

    # read the word list inside a context manager so the file handle gets closed
    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_,
                                        min_size=10,
                                        max_size=len(names_))

    auxiliaries = ['can', 'could', 'will', 'would', 'must', 'should']

    def add_misc_after_prp(
        prp: str,
        v: str,
        arg1: str,
    ) -> str:
        # extend the reflexive pronoun with whatever the verb needs after it;
        # for verbs not listed the pronoun is returned unchanged
        if v in {'take'}:
            return f'{prp} to {arg1}'

        elif v in {'make', 'give', 'put'}:
            return f'{prp} {arg1}'

        elif v in {'work'}:
            return f'{prp} on {arg1}'

        elif v in {'turn'}:
            return f'{prp} around'

        elif v in {'tell'}:
            return f'{prp} about {arg1}'
        else:
            return prp

    def add_preposition_after_vb(v: str) -> str:
        # append the preposition a verb needs; verbs not listed are returned unchanged
        if v in {'work', 'study', 'live'}:
            return f'{v} with'
        elif v in {'point', 'run'}:
            return f'{v} to'
        elif v in {'be'}:
            return f'{v} like'
        else:
            return v

    # candidate pools do not change between iterations, so build them once
    subjects_m = ([name for name in names if name in names_m] +
                  ['he', 'the man', 'a man', 'that man'])
    subjects_f = ([name for name in names if name in names_f] +
                  ['she', 'the woman', 'a woman', 'that woman'])
    # note: pronouns don't get determiners, but nouns do
    noun_arguments = [f'the {nn}' for nn in nouns_s]

    while True:

        vb = random.choice(verbs_base)

        # random choices
        slot2filler = {
            'aux': random.choice(auxiliaries),
            'nn_m': random.choice(subjects_m),
            'nn_f': random.choice(subjects_f),
        }

        # sample argument once, so that the same argument is used by both bad and good sentences.
        if vb == 'put':
            argument1 = random.choice(['in danger', 'in this situation'])
        else:
            argument1 = random.choice(noun_arguments)

        # first, add some miscellaneous component
        slot2filler['prp_m'] = add_misc_after_prp('himself', vb, argument1)
        slot2filler['prp_f'] = add_misc_after_prp('herself', vb, argument1)

        # second, add a preposition
        slot2filler['vb'] = add_preposition_after_vb(vb)

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good

        # use negation only in non-question, in templates 3, and 4
        if random.random() < 0.5:
            slot2filler['aux'] += ' ' + 'not'
        if random.random() < 0.1:
            slot2filler['aux'] = 'did not'

        yield template3['b'].format(**slot2filler)  # bad
        yield template3['g'].format(**slot2filler)  # good

        yield template4['b'].format(**slot2filler)  # bad
        yield template4['g'].format(**slot2filler)  # good
Exemplo n.º 11
0
def main():
    """
    Endlessly generate bad/good sentence pairs for two templates.

    Every pass through the loop yields the 'b' (bad) and then the 'g' (good)
    variant of template1, followed by the same pair for template2. All four
    sentences of one pass share the same randomly chosen slot fillers.

    example:
    "who should sarah hug after shocking the dog ?" vs "who should sarah hug the dog after shocking ?"

    note: this task is too difficult for babyBERTa
    """

    nouns_s = get_legal_words(tag='NN')

    excluded_verbs_base = ('run', 'say', 'be', 'give', 'tell', 'live', 'force')
    verbs_base = get_legal_words(tag='VB', exclude=excluded_verbs_base)

    excluded_verbs_gerund = ('saying', )
    verbs_gerund = get_legal_words(tag='VBG', exclude=excluded_verbs_gerund)

    # read the word list inside a context manager so the file handle gets closed
    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_,
                                        min_size=10,
                                        max_size=len(names_))

    pps = ['after', 'before', 'while', 'without']

    # verbs that are followed by a fixed preposition.
    # ('asking' used to have a second, unreachable branch returning 'asking for';
    # the first match, 'asking about', is what was always returned)
    verb2verb_with_preposition = {
        'related': 'related to',
        'acting': 'acting like',
        'put': 'put on',
        'work': 'work for',
        'sleeping': 'sleeping in',
        'standing': 'standing on',
        'depending': 'depending on',
        'flying': 'flying over',
        'falling': 'falling on',
        'asking': 'asking about',
        'swimming': 'swimming in',
        'coming': 'coming to',
    }

    def add_preposition_after_vb(v: str):
        # verbs without an entry in the table are returned unchanged
        return verb2verb_with_preposition.get(v, v)

    while True:

        # random choices
        slot2filler = {
            'name': random.choice(names),
            'nn': random.choice(nouns_s),
            'pp': random.choice(pps),
            'vb': random.choice(verbs_base),
            'vbg': random.choice(verbs_gerund),
        }

        slot2filler['vb'] = add_preposition_after_vb(slot2filler['vb'])
        slot2filler['vbg'] = add_preposition_after_vb(slot2filler['vbg'])

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good
Exemplo n.º 12
0
def main():
    """
    Endlessly generate bad/good sentence pairs for three templates.

    Every pass through the loop yields the 'b' (bad) and then the 'g' (good)
    variant of template1, template2 and template3, in that order. All six
    sentences of one pass share the same randomly chosen slot fillers.

    example:
    "give me the frog ." vs. "the frog gives me ."
    """

    # counterbalance both forms of verb as different forms are the contrast
    vbs_and_vbzs_1 = [
        ('give', 'gives'),
    ]

    vbs_and_vbzs_2 = [
        ('ask', 'asked'),  # "asks" is not in vocab
        ('tell', 'tells'),
    ]

    prps_obj = ['me', 'you', 'him', 'her', 'them']

    conjunctions = ['when', 'but', 'with', 'and']

    # TODO use a counterbalanced verb list

    # each entry is (verb, good continuation, bad continuation);
    # the continuations are format strings that are filled in inside the loop
    vbzs3_and_continuations = [  # contains a mix of past and present tense forms
        # past tense form
        ('saw', '{prp_obj} there', '{prp_obj} by'),
        ('created', '{prp_obj}', '{det}'),
        ('told', '{prp_obj} about that', '{prp_obj} about {det}'),
        ('wrote', '{prp_obj} something', '{prp_obj} {det}'),
        ('wanted', '{prp_obj}', 'to'),
        ('asked', 'about {prp_obj}', '{prp_obj} about'),
        ('sold', '{prp_obj} that', 'that to'),
        ('changed', '{prp_obj}', '{det}'),
        # present tense form
        ('looks', 'at {prp_obj}', 'at {det}'),
        ('plays', 'with {prp_obj}', 'with {det}'),
        ('thinks', 'about {prp_obj}', 'about {det}'),
        ('moves', 'fast', 'to'),
        ('works', 'well', '{conjunction}'),
    ]

    adjectives = get_legal_words(tag='JJ')

    nouns_s = get_legal_words(tag='NN')

    # read the word list inside a context manager so the file handle gets closed
    with (configs.Dirs.legal_words / 'animates.txt').open() as f:
        animates_ = f.read().split()
    animates = find_counterbalanced_subset(animates_,
                                           min_size=8,
                                           max_size=len(animates_))

    personal_pronouns_obj = ['me', 'him', 'her', 'us',
                             'them']  # in the objective case
    personal_pronouns_subj = ['i', 'he', 'she', 'we',
                              'they']  # in the subjective case

    determiners = ['a', 'one', 'the', 'my', 'his',
                   'some']  # do not include "this" or "that" or "her"

    vowels = {'a', 'e', 'i', 'o', 'u'}

    while True:

        vb1, vbz1 = random.choice(vbs_and_vbzs_1)  # template 1
        vb2, vbz2 = random.choice(vbs_and_vbzs_2)  # template 2
        vbz3, cont_g, cont_b = random.choice(
            vbzs3_and_continuations)  # template 3

        # good and bad continuations
        prp_obj = random.choice(prps_obj)
        conjunction = random.choice(conjunctions)
        cont_g = cont_g.format(prp_obj=prp_obj)
        # str.format ignores keyword arguments the continuation does not use
        cont_b = cont_b.format(prp_obj=prp_obj,
                               det=random.choice(determiners),
                               conjunction=conjunction)

        # random choices
        slot2filler = {
            'det': random.choice(determiners),
            'jj': random.choice(adjectives),
            'nn': random.choice(animates),
            'nn2': random.choice(nouns_s),
            'prp_obj': random.choice(personal_pronouns_obj),
            'prp_subj': random.choice(personal_pronouns_subj),
            'vb1': vb1,
            'vbz1': vbz1,
            'vb2': vb2,
            'vbz2': vbz2,
            'vbz3': vbz3,
            'cont_g': cont_g,
            'cont_b': cont_b,
        }

        # "a" vs. "an": decided by the first letter of the noun in slot 'nn2'
        if slot2filler['det'] == 'a' and slot2filler['nn2'][0] in vowels:
            slot2filler['det'] += 'n'

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good

        yield template3['b'].format(**slot2filler)  # bad
        yield template3['g'].format(**slot2filler)  # good
Exemplo n.º 13
0
def main():
    """
    Generate minimal sentence pairs endlessly, always yielding the bad
    sentence immediately before its good counterpart.

    example:
    "who must sarah and the dog kiss ?" vs "who must sarah kiss and the dog ?"

    Each iteration draws one name, one animate noun, one base-form verb and one
    gerund, and fills the module-level templates with them. Pairs from
    template1 and template2 are only produced when the verb choices pass the
    "who" filter below; pairs from template3 and template4 are produced on
    every iteration.
    """

    def read_words(file_name: str) -> list:
        """Return the whitespace-separated words stored in a legal-words file."""
        # a context manager closes the handle, which a bare open().read() leaks
        with (configs.Dirs.legal_words / file_name).open() as f:
            return f.read().split()

    excluded_verbs_base = ('run', 'be', 'live', 'force', 'order')
    verbs_base = get_legal_words(tag='VB', exclude=excluded_verbs_base)

    excluded_verbs_gerund = ('',)
    verbs_gerund = get_legal_words(tag='VBG', exclude=excluded_verbs_gerund)

    animates_ = read_words('animates.txt')
    animates = find_counterbalanced_subset(animates_, min_size=8, max_size=len(animates_))

    names_ = read_words('names.txt')
    names = find_counterbalanced_subset(names_, min_size=10, max_size=len(names_))

    # verbs that must be followed by a fixed preposition
    verb2phrase = {
        'related': 'related to',
        'put': 'put on',
        'work': 'work for',
        'acting': 'acting like',
        'sleeping': 'sleeping in',
        'falling': 'falling on',
        'looking': 'looking for',
        'running': 'running to',
        'talking': 'talking about',
        'thinking': 'thinking about',
        'reaching': 'reaching for',
    }

    def add_preposition_after_vb(v: str,
                                 arg: str,
                                 ) -> str:
        """
        Return `v` followed by its fixed preposition, or `v` unchanged.

        `arg` is accepted but not used: the only branch that interpolated it was
        a second test for 'work' that could never be reached because the
        'work for' rule matched first. That dead branch has been removed.
        """
        return verb2phrase.get(v, v)

    object_pronouns = ["him", "her", "them", "us"]
    vbgs_bad_with_who = {'saying', 'drinking', 'eating', 'open'}
    vbs_bad_with_who = {'need', 'feel', 'open'}

    while True:

        # random choices
        slot2filler = {
            'name': random.choice(names),
            'nn': random.choice(animates),
            'vb': random.choice(verbs_base),
            'vbg': random.choice(verbs_gerund),
        }

        # the draw is kept so that the sequence of random numbers consumed per
        # iteration (and therefore seeded output) stays the same
        arg = random.choice(object_pronouns)
        slot2filler['vb'] = add_preposition_after_vb(slot2filler['vb'], arg)
        slot2filler['vbg'] = add_preposition_after_vb(slot2filler['vbg'], arg)

        # exclude bad combinations that involve "who", e.g. "saying who"
        if slot2filler['vbg'] not in vbgs_bad_with_who \
                and slot2filler['vb'] not in vbs_bad_with_who:

            # note: this rewrite stays in slot2filler, so it also reaches
            # template3 and template4 below whenever this branch is taken
            if slot2filler['vb'] == 'tell':
                slot2filler['vb'] = 'tell something'

            yield template1['b'].format(**slot2filler)  # bad
            yield template1['g'].format(**slot2filler)  # good

            yield template2['b'].format(**slot2filler)  # bad
            yield template2['g'].format(**slot2filler)  # good

        yield template3['b'].format(**slot2filler)  # bad
        yield template3['g'].format(**slot2filler)  # good

        yield template4['b'].format(**slot2filler)  # bad
        yield template4['g'].format(**slot2filler)  # good
Exemplo n.º 14
0
def main():
    """
    Generate minimal sentence pairs endlessly, always yielding the bad
    sentence immediately before its good counterpart.

    example:
    "sarah laughs" vs. "sarah gives"

    Verbs returned by get_legal_words() that are not in the hand-written
    intransitive lists below are treated as transitive. template1 is filled with
    a VBZ/VBD verb, template2 with an auxiliary plus a base-form verb.
    """

    def read_words(file_name: str) -> list:
        """Return the whitespace-separated words stored in a legal-words file."""
        # a context manager closes the handle, which a bare open().read() leaks
        with (configs.Dirs.legal_words / file_name).open() as f:
            return f.read().split()

    # we need a lot of verbs here, so temporarily reduce restrictions imposed by counterbalancing
    tmp1 = configs.Data.tag2num_words['VB']
    tmp2 = configs.Data.tag2num_words['VBZ']
    tmp3 = configs.Data.tag2num_words['VBD']
    tmp4 = configs.Data.bias_tolerance
    try:
        configs.Data.tag2num_words['VB'] = 30
        configs.Data.tag2num_words['VBZ'] = 50
        configs.Data.tag2num_words['VBD'] = 50
        configs.Data.bias_tolerance = 7000

        vbs = get_legal_words(tag='VB', exclude=('fit', 'come', 'point'))
        vbzs = get_legal_words(tag='VBZ', exclude=('points', ))
        vbds = get_legal_words(tag='VBD', exclude=('fit', 'dropped', 'signed', 'formed', 'managed'))
    finally:
        # restore the shared configuration even if a lookup above fails,
        # otherwise the relaxed limits would remain in effect afterwards
        configs.Data.tag2num_words['VB'] = tmp1
        configs.Data.tag2num_words['VBZ'] = tmp2
        configs.Data.tag2num_words['VBD'] = tmp3
        configs.Data.bias_tolerance = tmp4

    animates_ = read_words('animates.txt')
    animates = find_counterbalanced_subset(animates_, min_size=8, max_size=len(animates_))

    prps_s = ['she', 'he', 'it']
    prps_p = ['we', 'they']
    prps = prps_s + prps_p

    aux_s = ['does']
    auxiliaries = ['could', 'can', 'would', 'will', 'did'] + aux_s

    determiners = ['the', 'this', 'some', 'that', 'every'] + ['your', 'his', 'her']

    names_ = read_words('names.txt')
    names = find_counterbalanced_subset(names_, min_size=10, max_size=len(names_))

    vbs_intransitive = [
        'run',
        'work',
        'turn',
        'eat',
        'live',
        'read',
        'trade',
        'play',
        'know',
        'study',
        'think',
        'change',
    ]

    # NOTE(review): 'stands' is listed twice, which doubles its sampling
    # weight in random.choice() below - confirm whether that is intended
    vbzs_intransitive = [
        'moves',
        'lives',
        'lies',
        'knows',
        'waves',
        'changes',
        'works',
        'dies',
        'leads',
        'appears',
        'thinks',
        'falls',
        'matters',
        'turns',
        'stands',
        'stands',
        'runs',
        'calls',
        'races',
    ]

    # NOTE(review): 'joined' and 'occurred' are listed twice, which doubles
    # their sampling weight in random.choice() below - confirm whether intended
    vbds_intransitive = [
        'occurred',
        'married',
        'moved',
        'looked',
        'changed',
        'finished',
        'grew',
        'broke',
        'started',
        'improved',
        'worked',
        'thought',
        'came',
        'tried',
        'read',
        'lost',
        'knew',
        'lived',
        'accepted',
        'developed',
        'joined',
        'joined',
        'decided',
        'learned',
        'occurred',
        'happened',
        'fell',
        'refused',
        'returned',
    ]

    # anything not listed as intransitive is treated as transitive
    vbs_transitive = [v for v in vbs if v not in vbs_intransitive]

    vbzs_transitive = [v for v in vbzs if v not in vbzs_intransitive]

    vbzs_or_vbds_intransitive = vbzs_intransitive + vbds_intransitive
    vbzs_or_vbds_transitive = [v for v in vbzs + vbds if v not in vbzs_or_vbds_intransitive]

    # the pool of possible subjects never changes, so build it once
    subjects = animates + names + prps

    while True:

        # random choices
        slot2filler = {
            'nn1': random.choice(subjects),
            'aux': random.choice(auxiliaries),
            'vbz_or_vbd_intransitive': random.choice(vbzs_or_vbds_intransitive),
            'vbz_or_vbd_transitive': random.choice(vbzs_or_vbds_transitive),
            'vb_intransitive': random.choice(vbs_intransitive),
            'vb_transitive': random.choice(vbs_transitive),
        }

        # handle exception: "occurred" and "happened" cannot have animate subject
        if slot2filler['vbz_or_vbd_intransitive'] in ['occurred', 'happened'] and\
                slot2filler['nn1'] not in ['it', 'that', 'this']:
            continue

        # add determiner to animate noun
        if slot2filler['nn1'] in animates:
            slot2filler['nn1'] = random.choice(determiners) + ' ' + slot2filler['nn1']

        # do not use template 1 with plural pronoun and VBZ.
        # `and` binds tighter than `or`, so the two VBZ tests are grouped
        # explicitly; without the parentheses a VBZ transitive verb suppressed
        # template 1 for every subject, not just plural pronouns.
        plural_subject_with_vbz = slot2filler['nn1'] in prps_p and (
            slot2filler['vbz_or_vbd_intransitive'] in vbzs_intransitive or
            slot2filler['vbz_or_vbd_transitive'] in vbzs_transitive)
        if not plural_subject_with_vbz:

            yield template1['b'].format(**slot2filler)  # bad
            yield template1['g'].format(**slot2filler)  # good

        # do not use template 2 with plural pronoun and singular aux (e.g. "does")
        if not (slot2filler['nn1'] in prps_p and
                slot2filler['aux'] in aux_s):

            yield template2['b'].format(**slot2filler)  # bad
            yield template2['g'].format(**slot2filler)  # good
Exemplo n.º 15
0
def main():
    """
    Generate minimal sentence pairs endlessly, always yielding the bad
    sentence immediately before its good counterpart.

    example:
    "only sarah could ever talk." vs. "even sarah could ever talk"

    Each iteration draws a subject (a name, or an animate noun that then gets a
    determiner), a noun, a VBZ verb, a base-form verb, a determiner and an
    auxiliary, and fills the module-level template1 and template2 with them.
    """

    def read_words(file_name: str) -> list:
        """Return the whitespace-separated words stored in a legal-words file."""
        # a context manager closes the handle, which a bare open().read() leaks
        with (configs.Dirs.legal_words / file_name).open() as f:
            return f.read().split()

    vbzs = get_legal_words(tag='VBZ', exclude=('happens', 'says', ))
    vbs = get_legal_words(tag='VB')

    nouns_s = get_legal_words(tag='NN')

    animates_ = read_words('animates.txt')
    animates = find_counterbalanced_subset(animates_, min_size=8, max_size=len(animates_))

    names_ = read_words('names.txt')
    names = find_counterbalanced_subset(names_, min_size=10, max_size=len(names_))

    determiners = ['the', 'this', 'some', 'that'] + ['your', 'his', 'her']

    auxiliaries = ['could', 'can', 'would', 'will']

    def add_argument_after_vb(v: str,
                              argument1: str,
                              ) -> str:
        """Return `v` followed by the preposition or object it requires, if any."""
        if v in {'thinks', 'reads'}:
            return f'{v} about'
        elif v in {'lives', 'falls', 'is', 'be'}:
            return f'{v} in'
        elif v in {'stands', 'turns'}:
            return f'{v} on'
        elif v in {'acts', 'looks'}:
            return f'{v} like'
        elif v in {'goes', 'comes'}:
            return f'{v} to'
        # verbs that take the pronoun object directly. This set was previously
        # split over two branches with the same result, the first of which
        # spelled {'gives', 'gives'}; the repeated entry is read as a typo for
        # the base form 'give', matching the VBZ/VB pairs listed next to it.
        elif v in {'gives', 'give', 'plays', 'play', 'shows', 'show', 'tells', 'tell'}:
            return f'{v} {argument1}'
        else:
            return v

    # the pool of possible subjects never changes, so build it once
    subjects = names + animates

    while True:

        arg1 = random.choice(['him', 'her'])
        vbz = random.choice(vbzs)
        vb = random.choice(vbs)

        # random choices
        slot2filler = {
            'nn1': random.choice(subjects),
            'nn2': random.choice(nouns_s),
            'vbz': add_argument_after_vb(vbz, arg1),
            'vb': add_argument_after_vb(vb, arg1),
            'det': random.choice(determiners),
            'aux': random.choice(auxiliaries)
        }

        # add determiner to animate noun
        if slot2filler['nn1'] in animates:
            slot2filler['nn1'] = random.choice(determiners) + ' ' + slot2filler['nn1']

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good
Exemplo n.º 16
0
def main():
    """
    Generate minimal sentence pairs endlessly, always yielding the bad
    sentence immediately before its good counterpart.

    example:
    "has sam ever worried sarah ?" vs. "jane has ever worried sarah ."

    Each iteration draws two arguments, a base-form verb (completed with the
    arguments it requires), a name, an animate noun, an auxiliary and a
    determiner, and fills the module-level template1 and template2 with them.
    Iterations that would combine "did" or "does" with "be" are skipped.
    """

    def read_words(file_name: str) -> list:
        """Return the whitespace-separated words stored in a legal-words file."""
        # a context manager closes the handle, which a bare open().read() leaks
        with (configs.Dirs.legal_words / file_name).open() as f:
            return f.read().split()

    vbs = get_legal_words(tag='VB')

    nouns_s = get_legal_words(tag='NN')

    animates_ = read_words('animates.txt')
    animates = find_counterbalanced_subset(animates_,
                                           min_size=8,
                                           max_size=len(animates_))

    names_ = read_words('names.txt')
    names = find_counterbalanced_subset(names_,
                                        min_size=10,
                                        max_size=len(names_))

    auxiliaries = ['does', 'will', 'could', 'did', 'should', 'would']

    determiners = ['a', 'the', 'this', 'some', 'that'] + ['your', 'his', 'her']

    def add_argument_after_vb(
        v: str,
        arg1: str,
        arg2: str,
    ) -> str:
        """Return `v` completed with the object(s) or preposition it requires."""
        if v in {'say'}:
            return f'{v} something'
        elif v in {'read'}:
            return f'{v} a book'
        elif v in {'play'}:
            return f'{v} with {arg1}'
        elif v in {
                'use', 'find', 'get', 'be', 'order', 'need', 'have', 'control',
                'want', 'free', 'keep'
        }:
            return f'{v} {arg1}'
        elif v in {'tell'}:
            return f'{v} me about {arg1}'
        elif v in {'plan'}:
            return f'{v} to do something with {arg1}'
        elif v in {'take'}:
            return f'{v} {arg1} away'
        elif v in {'give', 'show', 'present'}:
            return f'{v} {arg1} to {arg2}'
        elif v in {'put'}:
            return f'{v} {arg1} on {arg2}'
        elif v in {'fall'}:
            return f'{v} in {arg1}'
        elif v in {'see'}:
            # arg1 is either a pronoun or already starts with "the", so no
            # determiner is inserted here; a hard-coded "the" used to produce
            # "see how the the <noun> works".
            # NOTE(review): object pronouns such as "him" still read oddly in
            # this frame - consider restricting 'see' to noun arguments
            return f'{v} how {arg1} works'
        elif v in {'come'}:
            return f'{v} to {arg1}'
        else:
            return v

    # note: pronouns don't get determiners, but nouns do.
    # the pool of possible arguments never changes, so build it once
    arguments = ['you', 'him', 'her', 'it'] + [f'the {nn}' for nn in nouns_s]

    while True:

        # sample argument once, so that the same argument is used by both bad and good sentences.
        argument1 = random.choice(arguments)
        argument2 = random.choice(arguments)

        vb = random.choice(vbs)

        # random choices
        slot2filler = {
            'name': random.choice(names),
            'nn': random.choice(animates),
            'vb': add_argument_after_vb(vb, argument1, argument2),
            'aux': random.choice(auxiliaries),
            'det': random.choice(determiners),
        }

        # "did be" / "does be" is not a usable combination
        if slot2filler['aux'] in {'did', 'does'} and vb == 'be':
            continue

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good

        yield template2['b'].format(**slot2filler)  # bad
        yield template2['g'].format(**slot2filler)  # good
Exemplo n.º 17
0
def main():
    """
    Generate minimal sentence pairs endlessly, always yielding the bad
    sentence immediately before its good counterpart.

    example:
    "we can help him do something" vs. "we can help himself do something"

    Each iteration draws an (object pronoun, possessive pronoun) pair, an
    auxiliary, an animate noun, a base-form verb and a "the <noun>" argument,
    and fills the module-level template1 with them.

    NOTE(review): the example above contrasts an object pronoun with a
    reflexive, whereas the code below fills a possessive slot - confirm which
    of the two the example should show.
    """

    excluded_verbs_base = ('say', 'live')
    verbs_base = get_legal_words(tag='VB', exclude=excluded_verbs_base)

    nouns_s = get_legal_words(tag='NN')

    # NOTE(review): 'our' is the only dependent possessive here ('hers' and
    # 'theirs' are independent forms) - confirm that this is intended
    prps_obj_and_poss = [
        ('him', 'his'),
        ('her', 'hers'),
        ('us', 'our'),
        ('them', 'theirs'),
    ]

    # a context manager closes the handle, which a bare open().read() leaks
    with (configs.Dirs.legal_words / 'animates.txt').open() as f:
        animates_ = f.read().split()
    animates = find_counterbalanced_subset(animates_,
                                           min_size=8,
                                           max_size=len(animates_))

    auxiliaries = ['can', 'could', 'will', 'would', 'must', 'should']

    def add_misc_after_prp(
        prp: str,
        v: str,
        arg1: str,
    ) -> str:
        """Return `prp` followed by the material that verb `v` requires after it."""
        if v in {'take'}:
            return f'{prp} to {arg1}'
        elif v in {'make'}:
            return f'{prp} do {arg1}'
        elif v in {'work', 'put'}:
            return f'{prp} on {arg1}'
        elif v in {'turn'}:
            return f'{prp} around'
        elif v in {'tell'}:
            return f'{prp} about {arg1}'
        else:
            return prp

    def add_preposition_after_vb(v: str) -> str:
        """Return `v` followed by the preposition it requires, if any."""
        if v in {'work', 'study'}:
            return f'{v} with'
        elif v in {'point', 'run'}:
            return f'{v} to'
        elif v in {'be'}:
            return f'{v} like'
        else:
            return v

    # nouns get a determiner; the pool never changes, so build it once
    arguments = [f'the {nn}' for nn in nouns_s]

    while True:

        prp_obj, prp_poss = random.choice(prps_obj_and_poss)

        # random choices
        slot2filler = {
            'aux': random.choice(auxiliaries),
            'prp_poss': prp_poss,
            'prp_obj': prp_obj,
            'nn': random.choice(animates),
            'vb': random.choice(verbs_base),
        }

        # sample argument once, so that the same argument is used by both bad and good sentences.
        argument1 = random.choice(arguments)

        # first, add some miscellaneous component.
        # this must happen before the verb itself is rewritten below, because
        # the lookup is keyed on the bare verb
        slot2filler['prp_poss'] = add_misc_after_prp(prp_poss,
                                                     slot2filler['vb'],
                                                     argument1)
        slot2filler['prp_obj'] = add_misc_after_prp(prp_obj, slot2filler['vb'],
                                                    argument1)

        # lastly, add a preposition
        slot2filler['vb'] = add_preposition_after_vb(slot2filler['vb'])

        yield template1['b'].format(**slot2filler)  # bad
        yield template1['g'].format(**slot2filler)  # good
Exemplo n.º 18
0
def main():
    """
    Generate minimal sentence pairs endlessly, always yielding the bad
    sentence immediately before its good counterpart.

    example:
    "a big dog fell down the stairs ." vs. "a big dog fallen down the stairs ."

    Each iteration draws a name, a modifier, an irregular verb (past tense and
    past participle) and one of that verb's arguments, and fills the
    module-level `template` positionally with (name, verb, argument, modifier).
    Two pairs are produced per accepted draw: one in which the bare past tense
    is the correct form, and one after "had" in which the participle is.

    Raises:
        ValueError: on first iteration, if no verb has both of its forms in the
            vocabulary together with a non-empty argument. The rejection loop
            below could otherwise never yield and would spin forever.
    """

    vocab = get_vocab_words()
    modifiers = [
        'over there', 'some time ago', 'this morning', 'at home', 'last night'
    ]

    # a context manager closes the handle, which a bare open().read() leaks
    with (configs.Dirs.legal_words / 'names.txt').open() as f:
        names_ = f.read().split()
    names = find_counterbalanced_subset(names_,
                                        min_size=10,
                                        max_size=len(names_))

    # (past tense, past participle, possible arguments).
    # an entry whose only argument is '' is never used, see the loop below
    vbds_vbns_args = [
        ('arose', 'arisen', ['']),

        # optional arguments
        ('knew', 'known', ['a lot of things', 'she could do it']),
        ('saw', 'seen', ['a bird', 'a shape', 'something']),
        ('began', 'begun', ['to work']),
        ('fell', 'fallen', ['down the stairs']),
        ('flew', 'flown', ['into the sky', 'away']),
        ('drove', 'driven', [
            'out of the garage', 'down the road', 'with one wheel',
            'without looking'
        ]),
        ('grew', 'grown', [
            'quickly',
        ]),
        ('hid', 'hidden', ['from view', 'behind the bush']),
        ('rose', 'risen', ['from bed']),
        ('swore', 'sworn', ['not to do it again']),
        ('drank', 'drunk', ['some juice', 'the soup', 'your coffee']),
        ('ate', 'eaten', ['a lot', 'more than me', 'some ice cream']),
        ('drew', 'drawn', ['a picture', 'a map', 'a round circle']),
        ('wrote', 'written',
         ['a story', 'a note', 'into a book', 'with a large pen']),
        ('sang', 'sung', [
            'a nice song', 'in the theater', 'with a pretty voice',
            'my favorite song'
        ]),
        ('spoke', 'spoken',
         ['very fast', 'to me', 'about many things', 'without thinking']),
        ('came', 'come',
         ['to the store', 'just in time', 'when we needed her', 'too late']),

        # transitive
        ('was', 'been', ['here', 'alone', 'afraid']),
        ('beat', 'beaten', ['the dough', 'a little boy', 'their pet']),
        ('became', 'become', ['angry', 'very different', 'someone else']),
        ('bit', 'bitten',
         ['her own tongue', 'into the cake', 'off a big chunk']),
        ('blew', 'blown', [
            'out the candle',
            'away the dirt',
        ]),
        ('chose', 'chosen', [
            'the best option',
            'the good one',
        ]),
        # the last argument used to end in a space, which put a double space
        # into every sentence built from it
        ('did', 'done',
         ['nothing wrong', 'something bad', 'the best she could']),
        ('forgave', 'forgiven', ['her', 'the child', 'him']),
        ('gave', 'given', [
            'a book to a student', 'something sweet to the baby',
            'money to the man'
        ]),
        ('rode', 'ridden', ['a horse', 'a cart', 'in the front seat', 'away']),
        ('shook', 'shaken', ['the plate', 'the table', 'the bowl']),
        ('strode', 'stridden', ['']),
        ('took', 'taken', ['a paper', 'some food', 'the bell', 'it', 'them']),
        ('threw', 'thrown',
         ['the trash out', 'the paper ball', 'some away', 'his ball']),
    ]

    # a verb is usable only if both forms are in the vocabulary and differ.
    # this is decided once here instead of on every iteration of the loop
    usable_pairs = {
        (vbd, vbn)
        for vbd, vbn, _ in vbds_vbns_args
        if vbd in vocab and vbn in vocab and vbd != vbn
    }

    # without at least one usable verb that has a non-empty argument, the
    # rejection loop below would never reach a yield
    if not any((vbd, vbn) in usable_pairs and any(args)
               for vbd, vbn, args in vbds_vbns_args):
        raise ValueError('no verb has both forms in the vocabulary and a non-empty argument')

    while True:

        # random choices.
        # all four draws happen before any rejection, so a rejected iteration
        # consumes as many random numbers as an accepted one
        name = random.choice(names)
        mod = random.choice(modifiers)
        vbd, vbn, args = random.choice(vbds_vbns_args)
        arg = random.choice(args)

        if (vbd, vbn) not in usable_pairs:
            continue
        if arg == '':
            continue

        # vbd is correct
        yield template.format(name, vbn, arg, mod)  # bad
        yield template.format(name, vbd, arg, mod)  # good

        # vbn is correct
        yield template.format(name, 'had ' + vbd, arg, mod)  # bad
        yield template.format(name, 'had ' + vbn, arg, mod)  # good