def parse(text):
    """Count family- and work-related noun usage in ``text``.

    Returns a 3-tuple ``(family_matches, work_matches, total_nouns)``
    where the first two are the number of nouns whose stem appears in
    the family/work word lists and ``total_nouns`` is the number of
    nouns inspected.
    """
    log("Checking for coherence in '{0}'".format(text), 2)
    family_stem_words = stemmed_words(family_words, 'family_words')
    work_stem_words = stemmed_words(work_words, 'work_words')

    # Tokenize and parse each sentence ONCE, collecting every noun as
    # (POS tag, lower-cased word, stem). The previous version ran this
    # identical tokenize/parse loop twice — once per word list — which
    # doubled the expensive parser calls for no benefit.
    nouns = []
    for sentence in sentence_tokenizer.parse(text):
        tree = parsers.parse(sentence)[0]
        for a_tree in tree.subtrees(lambda x: x.node in noun_tags):
            word = a_tree[0].lower()
            nouns.append((a_tree.node, word, stemmer.stem(word)))

    family_hits = [(tag, word, stem in family_stem_words)
                   for tag, word, stem in nouns]
    log("Family hits: {0}".format(family_hits), 4)
    family_hit_values = (len([hit for hit in family_hits if hit[2]]),
                         len(family_hits))
    log("%d/%d" % family_hit_values, 3)

    work_hits = [(tag, word, stem in work_stem_words)
                 for tag, word, stem in nouns]
    log("Work hits: {0}".format(work_hits), 4)
    work_hit_values = (len([hit for hit in work_hits if hit[2]]),
                       len(work_hits))
    log("%d/%d" % work_hit_values, 3)

    return family_hit_values[0], work_hit_values[0], work_hit_values[1]
def grade_3a(text):
    """Score a text on sentence count, on a 1-5 scale.

    Six or more sentences earn the maximum of 5; shorter texts earn
    one point per sentence beyond the first, with a floor of 1.
    """
    total = len(sentence_tokenizer.parse(text))
    # Clamp (total - 1) into the [1, 5] band; >= 6 sentences saturate at 5.
    return min(5, max(total - 1, 1))
def parse(text):
    """Tally family- and work-related nouns in ``text``.

    Returns ``(family_matches, work_matches, total_nouns)``.
    """
    log("Checking for coherence in '{0}'".format(text), 2)

    def _noun_hits(stem_set):
        # Tokenize and parse every sentence, recording for each noun
        # subtree: (POS tag, lower-cased word, stem-in-stem_set flag).
        hits = []
        for a_sentence in sentence_tokenizer.parse(text):
            parsed = parsers.parse(a_sentence)[0]
            for noun in parsed.subtrees(lambda x: x.node in noun_tags):
                token = noun[0].lower()
                hits.append((noun.node, token,
                             stemmer.stem(token) in stem_set))
        return hits

    family_hits = _noun_hits(stemmed_words(family_words, 'family_words'))
    log("Family hits: {0}".format(family_hits), 4)
    family_hit_values = (len([h for h in family_hits if h[2]]),
                         len(family_hits))
    log("%d/%d" % family_hit_values, 3)

    work_hits = _noun_hits(stemmed_words(work_words, 'work_words'))
    log("Work hits: {0}".format(work_hits), 4)
    work_hit_values = (len([h for h in work_hits if h[2]]),
                       len(work_hits))
    log("%d/%d" % work_hit_values, 3)

    return family_hit_values[0], work_hit_values[0], work_hit_values[1]
def grade_1a(text):
    """Grade word-order quality on a 1-5 scale.

    Totals the word-order issues across every sentence of ``text``;
    fewer issues earn a higher grade.
    """
    sentences = sentence_tokenizer.parse(text)
    # Only the total issue count matters for the grade. (The previous
    # version also tracked a sentence counter that was never read.)
    num_problems = sum(len(word_order.issues_in_sentence(sentence))
                       for sentence in sentences)
    if num_problems <= 1:
        return 5
    elif num_problems == 2:
        return 4
    elif num_problems <= 4:
        return 3
    elif num_problems <= 6:
        return 2
    else:
        return 1
def parse(text):
    """Count sentence-formation problems for each sentence in ``text``.

    Returns a list with one integer per parsed sentence: the number of
    formation problems found (lexical generations unseen in the
    treebank, X/FRAG tags, and bad SBAR constructions summed together).
    """
    # Lexical rules evidenced in the treebank; cutoff=0 keeps even
    # rules that were seen only once.
    treebank_rules = get_treebank_rules(cutoff=0)
    sentence_probs = []
    for line in text.split("\n"):
        sentences = sentence_tokenizer.parse(line)
        for sentence in sentences:
            # Add a period to the end of the sentence, which sometimes
            # forces a better parse
            #if sentence[-1] not in ('.', '!', '?'):
            #    sentence += '.'
            parse_trees = parsers.parse(sentence)
            for tree in parse_trees:
                # Dump the tree only under verbose logging.
                if cmd_utils.cmd_log_level() > 2:
                    print tree.pprint()
                # Lexical productions used by this parse that never
                # occur in the treebank count as "bad generations".
                evindenced_lexical_rules = set(lexical_rules(tree).keys())
                differences = evindenced_lexical_rules.difference(
                    treebank_rules)
                bad_generations = len(differences)
                log(
                    "Found {0} bad generations ({1})".format(
                        bad_generations, differences), 3)
                #bad_parse_prob = 1 if prob == 0 else 0
                #log("Scored {0} for prob {1}".format(bad_parse_prob, prob), 3)
                # X / FRAG tags mark constituents the parser gave up on.
                bad_tag_problems = num_tag_problems(tree)
                log("Found {0} X or FRAG tags".format(bad_tag_problems), 3)
                bad_sbar_problems = num_sbar_problems(tree)
                log("Found {0} bad SBAR issues".format(bad_sbar_problems), 3)
                total_problems = bad_sbar_problems + bad_tag_problems + bad_generations
                log("In '{0}'".format(sentence), 2)
                log(
                    "Found {0} sentence formation problems".format(
                        total_problems), 1)
                sentence_probs.append(total_problems)
    return sentence_probs
def parse(text): treebank_rules = get_treebank_rules(cutoff=0) sentence_probs = [] for line in text.split("\n"): sentences = sentence_tokenizer.parse(line) for sentence in sentences: # Add a period to the end of the sentence, which sometimes # forces a better parse #if sentence[-1] not in ('.', '!', '?'): # sentence += '.' parse_trees = parsers.parse(sentence) for tree in parse_trees: if cmd_utils.cmd_log_level() > 2: print tree.pprint() evindenced_lexical_rules = set(lexical_rules(tree).keys()) differences = evindenced_lexical_rules.difference(treebank_rules) bad_generations = len(differences) log("Found {0} bad generations ({1})".format(bad_generations, differences), 3) #bad_parse_prob = 1 if prob == 0 else 0 #log("Scored {0} for prob {1}".format(bad_parse_prob, prob), 3) bad_tag_problems = num_tag_problems(tree) log("Found {0} X or FRAG tags".format(bad_tag_problems), 3) bad_sbar_problems = num_sbar_problems(tree) log("Found {0} bad SBAR issues".format(bad_sbar_problems), 3) total_problems = bad_sbar_problems + bad_tag_problems + bad_generations log("In '{0}'".format(sentence), 2) log("Found {0} sentence formation problems".format(total_problems), 1) sentence_probs.append(total_problems) return sentence_probs
def parse(text, use_cache=True):
    """Check subject-verb agreement for every sentence in ``text``.

    Args:
        text: raw text; split on newlines, then sentence-tokenized.
        use_cache: when True, per-sentence results are read from and
            written to the 'sub_verb_agreement' cache.

    Returns:
        A tuple ``(num_agrees, num_not_agrees, num_unsure)`` totalled
        over all sentences.
    """
    num_agrees = 0
    num_not_agrees = 0
    num_unsure = 0
    for line in text.split("\n"):
        sentences = sentence_tokenizer.parse(line, use_cache=use_cache)
        for sentence in sentences:
            line_agreements, line_non_agreements, line_unsure = 0, 0, 0
            # Possession seems to be tricky for the parser, so we fudge
            # a little here
            sentence = sentence.replace("'s", '')
            if not sentence:
                # Guard: stripping "'s" can leave an empty string, and
                # sentence[-1] below would raise IndexError.
                continue
            if sentence[-1] != ".":
                sentence += "."
            if use_cache:
                cache_rs = cache_utils.cache_get('sub_verb_agreement',
                                                 sentence)
                if cache_rs:
                    line_agreements, line_non_agreements, line_unsure = cache_rs
                    num_agrees += line_agreements
                    num_not_agrees += line_non_agreements
                    num_unsure += line_unsure
                    continue
            log("Looking for Sub-Verb agreement in '%s'" % (sentence,), 1)
            tree = parsers.parse(sentence)[0]
            dependencies = parsers.dependences(sentence)
            sub_verb_deps = [dep for dep in dependencies
                             if dep['dep_name'] == 'nsubj']
            if len(sub_verb_deps) == 0:
                log("Couldn't find Subject-Verb dependency info", 1)
                # BUG FIX: this cache write previously ran even when
                # use_cache was False, unlike the guarded write below.
                if use_cache:
                    cache_utils.cache_set('sub_verb_agreement', sentence,
                                          (0, 0, 0))
                continue
            for sub_verb in sub_verb_deps:
                first_node = node_in_tree(tree, sub_verb['first_word'])
                sec_node = node_in_tree(tree, sub_verb['second_word'])
                if first_node and sec_node:
                    log("First Dep Node: %s" % (first_node,), 2)
                    log("Sec Dep Node: %s" % (sec_node,), 2)
                    try:
                        is_agreement = check_node_agreement(first_node,
                                                            sec_node)
                        if is_agreement:
                            line_agreements += 1
                        else:
                            line_non_agreements += 1
                        log("Agreement in sentence? %s" % (is_agreement,), 1)
                    except Exception as e:
                        # No agreement decision for the pair; count it as
                        # unsure. str(e) replaces e.message, which not
                        # every exception type defines.
                        line_unsure += 1
                        log("Error looking for agreement? %s" % (str(e),), 2)
            if use_cache:
                cache_utils.cache_set(
                    'sub_verb_agreement', sentence,
                    (line_agreements, line_non_agreements, line_unsure))
            num_agrees += line_agreements
            num_not_agrees += line_non_agreements
            num_unsure += line_unsure
    return num_agrees, num_not_agrees, num_unsure
def parse(text):
    """Find pronouns per sentence and resolve 3rd-person references.

    Returns a list (one entry per sentence) of three lists:
    [first_person, second_person, third_person] pronouns found. The
    third-person list is rewritten into (pronoun, result) tuples where
    result is -1 (no antecedent found), .5 (ambiguous antecedent), or
    the antecedent word itself (unambiguous).
    """
    # Strip numbers out, since that seems to cause problems for my approach
    text = re.sub(r'\d+ ?', 'some ', text)
    sentences = sentence_tokenizer.parse(text)
    sentence_pronouns = []
    for sentence in sentences:
        log("Looking for pronouns in '{0}'".format(sentence), 2)
        # Buckets for 1st / 2nd / 3rd person pronouns in this sentence.
        pronoun_totals = [[], [], []]
        tree = parsers.parse(sentence)[0]
        pronoun_trees = tree.subtrees(lambda x: x.node in pronoun_tags)
        for pronoun_tree in pronoun_trees:
            # Classify the pronoun into whichever person bucket matches.
            for i in range(3):
                if pronoun_tree[0].lower() in pronouns[i]:
                    pronoun_totals[i].append(pronoun_tree[0])
        log("First Person '{0}'".format(pronoun_totals[0]), 3)
        log("Second Person '{0}'".format(pronoun_totals[1]), 3)
        log("Third Person '{0}'".format(pronoun_totals[2]), 3)
        sentence_pronouns.append(pronoun_totals)
    log("Pronouns found in text: %s" % (sentence_pronouns), 2)
    # If there are 3rd person pronouns in any sentence, we have to decide
    # if they are used correctly. We do this in the following, very
    # expensive, but possibly correct manner.
    #
    # Start from the top down
    # 1. Look back 2 sentences and see if we can find a refernece.
    #    IF NOT - its an error and do no more
    # 2. If so, replace the refereneced word with "RUNNING"
    #    and search again, to see if there is a previous word it could
    #    refer to.
    #    IF NOT, its correct. Replace the pronoun with the referenced word
    #    and continue
    # 3. Else, its not felicitous. Give bad credit
    for i in range(len(sentences)):
        if len(sentence_pronouns[i][2]) > 0:
            pronoun_results = []
            # The coref window is the current sentence plus up to two
            # preceding ones; window_start is its absolute offset into
            # `sentences`.
            window_start = max(0, i - 2)
            for third_pronoun in sentence_pronouns[i][2]:
                all_sentences = sentences[window_start:i + 1]
                norm_sentences = ". ".join(
                    [a_sen.strip(".") for a_sen in all_sentences]) + "."
                log("Looking for pronoun coherence for '{0}'".format(
                    norm_sentences), 4)
                pronouns_refs = parsers.parse_coref(norm_sentences)
                log("Recieved co-references {0}".format(pronouns_refs), 5)
                found_bundle = False
                for j in range(len(pronouns_refs)):
                    if third_pronoun == pronouns_refs[j]['pronoun']:
                        found_bundle = pronouns_refs[j]
                        break
                if not found_bundle:
                    log("Found NO anticedent for {0}".format(third_pronoun), 3)
                    pronoun_results.append((third_pronoun, -1))
                else:
                    log("Found anticedent for {0}".format(third_pronoun), 3)
                    # BUG FIX: coref sentence numbers are 1-based within
                    # the window, so the absolute index is
                    # window_start + (n - 1). The old code added (i - 2)
                    # even when the window started at 0, which indexed
                    # the wrong sentence for the first two sentences.
                    ref_index = (int(found_bundle['ref_sentence']) - 1
                                 + window_start)
                    sentences[ref_index] = sentences[ref_index].replace(
                        found_bundle['ref'], 'RUNNING')
                    log("Replacing '{0}' with 'RUNNING'".format(
                        found_bundle['ref']), 3)
                    altered_sentences = sentences[window_start:i + 1]
                    norm_altered_sentences = ". ".join(
                        [a_sen.strip(".") for a_sen in altered_sentences]) + "."
                    log("New test sentences are '{0}'".format(
                        norm_altered_sentences), 4)
                    altered_pronouns_refs = parsers.parse_coref(
                        norm_altered_sentences)
                    if third_pronoun not in [
                            a_ref['pronoun']
                            for a_ref in altered_pronouns_refs]:
                        # With the antecedent masked the pronoun no
                        # longer resolves, so the reference was unique.
                        log("Anticedent is unambigious!", 3)
                        pro_index = (int(found_bundle['pronoun_sentence']) - 1
                                     + window_start)
                        sentences[pro_index] = sentences[pro_index].replace(
                            found_bundle['pronoun'], found_bundle['ref'])
                        pronoun_results.append(
                            (third_pronoun, found_bundle['ref']))
                    else:
                        log("Anticedent is ambigious", 3)
                        log("New Sentences: {0}".format(
                            altered_pronouns_refs), 4)
                        pronoun_results.append((third_pronoun, .5))
            sentence_pronouns[i][2] = pronoun_results
    return sentence_pronouns
def parse(text):
    """Find pronouns per sentence and resolve 3rd-person references.

    Returns a list (one entry per sentence) of three lists:
    [first_person, second_person, third_person] pronouns. The
    third-person list is rewritten into (pronoun, result) tuples where
    result is -1 (no antecedent), .5 (ambiguous antecedent), or the
    antecedent word itself (unambiguous).
    """
    # Strip numbers out, since that seems to cause problems for my approach
    text = re.sub(r'\d+ ?', 'some ', text)
    sentences = sentence_tokenizer.parse(text)
    sentence_pronouns = []
    for sentence in sentences:
        log("Looking for pronouns in '{0}'".format(sentence), 2)
        # Buckets for 1st / 2nd / 3rd person pronouns in this sentence.
        pronoun_totals = [[], [], []]
        tree = parsers.parse(sentence)[0]
        pronoun_trees = tree.subtrees(lambda x: x.node in pronoun_tags)
        for pronoun_tree in pronoun_trees:
            # First total up all the first person pronouns
            for i in range(3):
                if pronoun_tree[0].lower() in pronouns[i]:
                    pronoun_totals[i].append(pronoun_tree[0])
        log("First Person '{0}'".format(pronoun_totals[0]), 3)
        log("Second Person '{0}'".format(pronoun_totals[1]), 3)
        log("Third Person '{0}'".format(pronoun_totals[2]), 3)
        sentence_pronouns.append(pronoun_totals)
    log("Pronouns found in text: %s" % (sentence_pronouns), 2)
    # If there are 3rd person pronouns in any sentence, we have to decide
    # if they are used correctly. We do this in the following, very
    # expensive, but possibly correct manner.
    #
    # Start from the top down
    # 1. Look back 2 sentences and see if we can find a refernece.
    # IF NOT - its an error and do no more
    # 2. If so, replace the refereneced word with "RUNNING"
    # and search again, to see if there is a previous word it could refer
    # to.
    # IF NOT, its correct. Replace the pronoun with the referenced word
    # and continue
    # 3. Else, its not felicitous. Give bad credit
    for i in range(len(sentences)):
        if len(sentence_pronouns[i][2]) > 0:
            pronoun_results = []
            for third_pronoun in sentence_pronouns[i][2]:
                # Coref window: up to two preceding sentences plus the
                # current one, re-joined with normalized periods.
                all_sentences = sentences[max(0, i - 2):i + 1]
                norm_sentences = ". ".join(
                    [a_sen.strip(".") for a_sen in all_sentences]) + "."
                log(
                    "Looking for pronoun coherence for '{0}'".format(
                        norm_sentences), 4)
                pronouns_refs = parsers.parse_coref(norm_sentences)
                log("Recieved co-references {0}".format(pronouns_refs), 5)
                # Find the coref bundle (if any) for this pronoun.
                found_bundle = False
                for j in range(len(pronouns_refs)):
                    if third_pronoun == pronouns_refs[j]['pronoun']:
                        found_bundle = pronouns_refs[j]
                        break
                if not found_bundle:
                    # No antecedent at all: score -1 for this pronoun.
                    log("Found NO anticedent for {0}".format(third_pronoun), 3)
                    pronoun_results.append((third_pronoun, -1))
                else:
                    log("Found anticedent for {0}".format(third_pronoun), 3)
                    # NOTE(review): 'ref_sentence' appears to be 1-based
                    # within the window, but the window starts at
                    # max(0, i - 2) while this offset uses (i - 2) — for
                    # i < 2 this looks like it indexes the wrong (even
                    # negative) sentence. Confirm against parse_coref.
                    ref_index = int(found_bundle['ref_sentence']) - 1 + (i - 2)
                    # Mask the antecedent and re-run coref: if the
                    # pronoun still resolves, the reference is ambiguous.
                    sentences[ref_index] = sentences[ref_index].replace(
                        found_bundle['ref'], 'RUNNING')
                    log(
                        "Replacing '{0}' with 'RUNNING'".format(
                            found_bundle['ref']), 3)
                    altered_sentences = sentences[max(0, i - 2):i + 1]
                    norm_altered_sentences = ". ".join(
                        [a_sen.strip(".") for a_sen in altered_sentences]) + "."
                    log(
                        "New test sentences are '{0}'".format(
                            norm_altered_sentences), 4)
                    altered_pronouns_refs = parsers.parse_coref(
                        norm_altered_sentences)
                    if third_pronoun not in [
                            a_ref['pronoun']
                            for a_ref in altered_pronouns_refs]:
                        # Unique antecedent: substitute it for the
                        # pronoun so later sentences can refer to it.
                        log("Anticedent is unambigious!", 3)
                        pro_index = int(
                            found_bundle['pronoun_sentence']) - 1 + (i - 2)
                        sentences[pro_index] = sentences[pro_index].replace(
                            found_bundle['pronoun'], found_bundle['ref'])
                        pronoun_results.append(
                            (third_pronoun, found_bundle['ref']))
                    else:
                        # Still resolvable elsewhere: partially correct.
                        log("Anticedent is ambigious", 3)
                        log("New Sentences: {0}".format(altered_pronouns_refs), 4)
                        pronoun_results.append((third_pronoun, .5))
            sentence_pronouns[i][2] = pronoun_results
    return sentence_pronouns
def parse(text, use_cache=True):
    """Tally subject-verb agreement over all sentences in ``text``.

    Args:
        text: raw text, split on newlines and then sentence-tokenized.
        use_cache: when True, consult and update the
            'sub_verb_agreement' cache per sentence.

    Returns:
        (num_agrees, num_not_agrees, num_unsure) summed over the text.
    """
    num_agrees = 0
    num_not_agrees = 0
    num_unsure = 0
    lines = text.split("\n")
    for line in lines:
        sentences = sentence_tokenizer.parse(line, use_cache=use_cache)
        for sentence in sentences:
            line_agreements, line_non_agreements, line_unsure = 0, 0, 0
            # Possession seems to be tricky for the parser, so we fudge
            # a little here
            sentence = sentence.replace("'s", '')
            if not sentence:
                # Stripping "'s" can empty the sentence; skip it rather
                # than let sentence[-1] raise IndexError.
                continue
            if sentence[-1] != ".":
                sentence += "."
            if use_cache:
                cache_rs = cache_utils.cache_get('sub_verb_agreement',
                                                 sentence)
                if cache_rs:
                    line_agreements, line_non_agreements, line_unsure = cache_rs
                    num_agrees += line_agreements
                    num_not_agrees += line_non_agreements
                    num_unsure += line_unsure
                    continue
            log("Looking for Sub-Verb agreement in '%s'" % (sentence,), 1)
            tree = parsers.parse(sentence)[0]
            dependencies = parsers.dependences(sentence)
            sub_verb_deps = [dep for dep in dependencies
                             if dep['dep_name'] == 'nsubj']
            if len(sub_verb_deps) == 0:
                log("Couldn't find Subject-Verb dependency info", 1)
                # BUG FIX: honour use_cache here; this write previously
                # happened unconditionally, unlike the one below.
                if use_cache:
                    cache_utils.cache_set('sub_verb_agreement', sentence,
                                          (0, 0, 0))
                continue
            for sub_verb in sub_verb_deps:
                first_node = node_in_tree(tree, sub_verb['first_word'])
                sec_node = node_in_tree(tree, sub_verb['second_word'])
                if first_node and sec_node:
                    log("First Dep Node: %s" % (first_node,), 2)
                    log("Sec Dep Node: %s" % (sec_node,), 2)
                    try:
                        is_agreement = check_node_agreement(first_node,
                                                            sec_node)
                        if is_agreement:
                            line_agreements += 1
                        else:
                            line_non_agreements += 1
                        log("Agreement in sentence? %s" % (is_agreement,), 1)
                    except Exception as e:
                        # No agreement decision for this pair; record it
                        # as unsure. str(e) is used because e.message is
                        # not defined for every exception type.
                        line_unsure += 1
                        log("Error looking for agreement? %s" % (str(e),), 2)
            if use_cache:
                cache_utils.cache_set(
                    'sub_verb_agreement', sentence,
                    (line_agreements, line_non_agreements, line_unsure))
            num_agrees += line_agreements
            num_not_agrees += line_non_agreements
            num_unsure += line_unsure
    return num_agrees, num_not_agrees, num_unsure