def parse(text):
    """Count family- and work-related noun usage in ``text``.

    Returns a 3-tuple ``(family_matches, work_matches, total_nouns)``
    where the first two are the number of nouns whose stem appears in
    the family/work word lists and ``total_nouns`` is the number of
    nouns inspected.
    """
    log("Checking for coherence in '{0}'".format(text), 2)
    family_stem_words = stemmed_words(family_words, 'family_words')
    work_stem_words = stemmed_words(work_words, 'work_words')

    # Tokenize and parse each sentence ONCE, collecting every noun as
    # (POS tag, lower-cased word, stem). The previous version ran this
    # identical tokenize/parse loop twice — once per word list — which
    # doubled the expensive parser calls for no benefit.
    nouns = []
    for sentence in sentence_tokenizer.parse(text):
        tree = parsers.parse(sentence)[0]
        for a_tree in tree.subtrees(lambda x: x.node in noun_tags):
            word = a_tree[0].lower()
            nouns.append((a_tree.node, word, stemmer.stem(word)))

    family_hits = [(tag, word, stem in family_stem_words)
                   for tag, word, stem in nouns]
    log("Family hits: {0}".format(family_hits), 4)
    family_hit_values = (len([hit for hit in family_hits if hit[2]]),
                         len(family_hits))
    log("%d/%d" % family_hit_values, 3)

    work_hits = [(tag, word, stem in work_stem_words)
                 for tag, word, stem in nouns]
    log("Work hits: {0}".format(work_hits), 4)
    work_hit_values = (len([hit for hit in work_hits if hit[2]]),
                       len(work_hits))
    log("%d/%d" % work_hit_values, 3)

    return family_hit_values[0], work_hit_values[0], work_hit_values[1]
def grade_3a(text):
    """Score a text on sentence count, on a 1-5 scale.

    Six or more sentences earn the maximum of 5; shorter texts earn
    one point per sentence beyond the first, with a floor of 1.
    """
    total = len(sentence_tokenizer.parse(text))
    # Clamp (total - 1) into the [1, 5] band; >= 6 sentences saturate at 5.
    return min(5, max(total - 1, 1))
def parse(text):
    """Tally family- and work-related nouns in ``text``.

    Returns ``(family_matches, work_matches, total_nouns)``.
    """
    log("Checking for coherence in '{0}'".format(text), 2)

    def _noun_hits(stem_set):
        # Tokenize and parse every sentence, recording for each noun
        # subtree: (POS tag, lower-cased word, stem-in-stem_set flag).
        hits = []
        for a_sentence in sentence_tokenizer.parse(text):
            parsed = parsers.parse(a_sentence)[0]
            for noun in parsed.subtrees(lambda x: x.node in noun_tags):
                token = noun[0].lower()
                hits.append((noun.node, token,
                             stemmer.stem(token) in stem_set))
        return hits

    family_hits = _noun_hits(stemmed_words(family_words, 'family_words'))
    log("Family hits: {0}".format(family_hits), 4)
    family_hit_values = (len([h for h in family_hits if h[2]]),
                         len(family_hits))
    log("%d/%d" % family_hit_values, 3)

    work_hits = _noun_hits(stemmed_words(work_words, 'work_words'))
    log("Work hits: {0}".format(work_hits), 4)
    work_hit_values = (len([h for h in work_hits if h[2]]),
                       len(work_hits))
    log("%d/%d" % work_hit_values, 3)

    return family_hit_values[0], work_hit_values[0], work_hit_values[1]
def grade_1a(text):
    """Grade word-order quality on a 1-5 scale.

    Totals the word-order issues across every sentence of ``text``;
    fewer issues earn a higher grade.
    """
    sentences = sentence_tokenizer.parse(text)
    # Only the total issue count matters for the grade. (The previous
    # version also tracked a sentence counter that was never read.)
    num_problems = sum(len(word_order.issues_in_sentence(sentence))
                       for sentence in sentences)
    if num_problems <= 1:
        return 5
    elif num_problems == 2:
        return 4
    elif num_problems <= 4:
        return 3
    elif num_problems <= 6:
        return 2
    else:
        return 1
def parse(text):
    """Count sentence-formation problems for each sentence in ``text``.

    Returns a list with one integer per parsed sentence: the number of
    formation problems found (lexical generations unseen in the
    treebank, X/FRAG tags, and bad SBAR constructions summed together).
    """
    # Lexical rules evidenced in the treebank; cutoff=0 keeps even
    # rules that were seen only once.
    treebank_rules = get_treebank_rules(cutoff=0)
    sentence_probs = []
    for line in text.split("\n"):
        sentences = sentence_tokenizer.parse(line)
        for sentence in sentences:
            # Add a period to the end of the sentence, which sometimes
            # forces a better parse
            #if sentence[-1] not in ('.', '!', '?'):
            #    sentence += '.'
            parse_trees = parsers.parse(sentence)
            for tree in parse_trees:
                # Dump the tree only under verbose logging.
                if cmd_utils.cmd_log_level() > 2:
                    print tree.pprint()
                # Lexical productions used by this parse that never
                # occur in the treebank count as "bad generations".
                evindenced_lexical_rules = set(lexical_rules(tree).keys())
                differences = evindenced_lexical_rules.difference(
                    treebank_rules)
                bad_generations = len(differences)
                log(
                    "Found {0} bad generations ({1})".format(
                        bad_generations, differences), 3)
                #bad_parse_prob = 1 if prob == 0 else 0
                #log("Scored {0} for prob {1}".format(bad_parse_prob, prob), 3)
                # X / FRAG tags mark constituents the parser gave up on.
                bad_tag_problems = num_tag_problems(tree)
                log("Found {0} X or FRAG tags".format(bad_tag_problems), 3)
                bad_sbar_problems = num_sbar_problems(tree)
                log("Found {0} bad SBAR issues".format(bad_sbar_problems), 3)
                total_problems = bad_sbar_problems + bad_tag_problems + bad_generations
                log("In '{0}'".format(sentence), 2)
                log(
                    "Found {0} sentence formation problems".format(
                        total_problems), 1)
                sentence_probs.append(total_problems)
    return sentence_probs
def parse(text): treebank_rules = get_treebank_rules(cutoff=0) sentence_probs = [] for line in text.split("\n"): sentences = sentence_tokenizer.parse(line) for sentence in sentences: # Add a period to the end of the sentence, which sometimes # forces a better parse #if sentence[-1] not in ('.', '!', '?'): # sentence += '.' parse_trees = parsers.parse(sentence) for tree in parse_trees: if cmd_utils.cmd_log_level() > 2: print tree.pprint() evindenced_lexical_rules = set(lexical_rules(tree).keys()) differences = evindenced_lexical_rules.difference(treebank_rules) bad_generations = len(differences) log("Found {0} bad generations ({1})".format(bad_generations, differences), 3) #bad_parse_prob = 1 if prob == 0 else 0 #log("Scored {0} for prob {1}".format(bad_parse_prob, prob), 3) bad_tag_problems = num_tag_problems(tree) log("Found {0} X or FRAG tags".format(bad_tag_problems), 3) bad_sbar_problems = num_sbar_problems(tree) log("Found {0} bad SBAR issues".format(bad_sbar_problems), 3) total_problems = bad_sbar_problems + bad_tag_problems + bad_generations log("In '{0}'".format(sentence), 2) log("Found {0} sentence formation problems".format(total_problems), 1) sentence_probs.append(total_problems) return sentence_probs
def parse(text, use_cache=True):
    """Check subject-verb agreement for every sentence in ``text``.

    Args:
        text: raw text; split on newlines, then sentence-tokenized.
        use_cache: when True, per-sentence results are read from and
            written to the 'sub_verb_agreement' cache.

    Returns:
        A tuple ``(num_agrees, num_not_agrees, num_unsure)`` totalled
        over all sentences.
    """
    num_agrees = 0
    num_not_agrees = 0
    num_unsure = 0
    for line in text.split("\n"):
        sentences = sentence_tokenizer.parse(line, use_cache=use_cache)
        for sentence in sentences:
            line_agreements, line_non_agreements, line_unsure = 0, 0, 0
            # Possession seems to be tricky for the parser, so we fudge
            # a little here
            sentence = sentence.replace("'s", '')
            if not sentence:
                # Guard: stripping "'s" can leave an empty string, and
                # sentence[-1] below would raise IndexError.
                continue
            if sentence[-1] != ".":
                sentence += "."
            if use_cache:
                cache_rs = cache_utils.cache_get('sub_verb_agreement',
                                                 sentence)
                if cache_rs:
                    line_agreements, line_non_agreements, line_unsure = cache_rs
                    num_agrees += line_agreements
                    num_not_agrees += line_non_agreements
                    num_unsure += line_unsure
                    continue
            log("Looking for Sub-Verb agreement in '%s'" % (sentence,), 1)
            tree = parsers.parse(sentence)[0]
            dependencies = parsers.dependences(sentence)
            sub_verb_deps = [dep for dep in dependencies
                             if dep['dep_name'] == 'nsubj']
            if len(sub_verb_deps) == 0:
                log("Couldn't find Subject-Verb dependency info", 1)
                # BUG FIX: this cache write previously ran even when
                # use_cache was False, unlike the guarded write below.
                if use_cache:
                    cache_utils.cache_set('sub_verb_agreement', sentence,
                                          (0, 0, 0))
                continue
            for sub_verb in sub_verb_deps:
                first_node = node_in_tree(tree, sub_verb['first_word'])
                sec_node = node_in_tree(tree, sub_verb['second_word'])
                if first_node and sec_node:
                    log("First Dep Node: %s" % (first_node,), 2)
                    log("Sec Dep Node: %s" % (sec_node,), 2)
                    try:
                        is_agreement = check_node_agreement(first_node,
                                                            sec_node)
                        if is_agreement:
                            line_agreements += 1
                        else:
                            line_non_agreements += 1
                        log("Agreement in sentence? %s" % (is_agreement,), 1)
                    except Exception as e:
                        # No agreement decision for the pair; count it as
                        # unsure. str(e) replaces e.message, which not
                        # every exception type defines.
                        line_unsure += 1
                        log("Error looking for agreement? %s" % (str(e),), 2)
            if use_cache:
                cache_utils.cache_set(
                    'sub_verb_agreement', sentence,
                    (line_agreements, line_non_agreements, line_unsure))
            num_agrees += line_agreements
            num_not_agrees += line_non_agreements
            num_unsure += line_unsure
    return num_agrees, num_not_agrees, num_unsure
def parse(text):
    """Find pronouns per sentence and resolve 3rd-person references.

    Returns a list (one entry per sentence) of three lists:
    [first_person, second_person, third_person] pronouns found. The
    third-person list is rewritten into (pronoun, result) tuples where
    result is -1 (no antecedent found), .5 (ambiguous antecedent), or
    the antecedent word itself (unambiguous).
    """
    # Strip numbers out, since that seems to cause problems for my approach
    text = re.sub(r'\d+ ?', 'some ', text)
    sentences = sentence_tokenizer.parse(text)
    sentence_pronouns = []
    for sentence in sentences:
        log("Looking for pronouns in '{0}'".format(sentence), 2)
        # Buckets for 1st / 2nd / 3rd person pronouns in this sentence.
        pronoun_totals = [[], [], []]
        tree = parsers.parse(sentence)[0]
        pronoun_trees = tree.subtrees(lambda x: x.node in pronoun_tags)
        for pronoun_tree in pronoun_trees:
            # Classify the pronoun into whichever person bucket matches.
            for i in range(3):
                if pronoun_tree[0].lower() in pronouns[i]:
                    pronoun_totals[i].append(pronoun_tree[0])
        log("First Person '{0}'".format(pronoun_totals[0]), 3)
        log("Second Person '{0}'".format(pronoun_totals[1]), 3)
        log("Third Person '{0}'".format(pronoun_totals[2]), 3)
        sentence_pronouns.append(pronoun_totals)
    log("Pronouns found in text: %s" % (sentence_pronouns), 2)
    # If there are 3rd person pronouns in any sentence, we have to decide
    # if they are used correctly. We do this in the following, very
    # expensive, but possibly correct manner.
    #
    # Start from the top down
    # 1. Look back 2 sentences and see if we can find a refernece.
    #    IF NOT - its an error and do no more
    # 2. If so, replace the refereneced word with "RUNNING"
    #    and search again, to see if there is a previous word it could
    #    refer to.
    #    IF NOT, its correct. Replace the pronoun with the referenced word
    #    and continue
    # 3. Else, its not felicitous. Give bad credit
    for i in range(len(sentences)):
        if len(sentence_pronouns[i][2]) > 0:
            pronoun_results = []
            # The coref window is the current sentence plus up to two
            # preceding ones; window_start is its absolute offset into
            # `sentences`.
            window_start = max(0, i - 2)
            for third_pronoun in sentence_pronouns[i][2]:
                all_sentences = sentences[window_start:i + 1]
                norm_sentences = ". ".join(
                    [a_sen.strip(".") for a_sen in all_sentences]) + "."
                log("Looking for pronoun coherence for '{0}'".format(
                    norm_sentences), 4)
                pronouns_refs = parsers.parse_coref(norm_sentences)
                log("Recieved co-references {0}".format(pronouns_refs), 5)
                found_bundle = False
                for j in range(len(pronouns_refs)):
                    if third_pronoun == pronouns_refs[j]['pronoun']:
                        found_bundle = pronouns_refs[j]
                        break
                if not found_bundle:
                    log("Found NO anticedent for {0}".format(third_pronoun), 3)
                    pronoun_results.append((third_pronoun, -1))
                else:
                    log("Found anticedent for {0}".format(third_pronoun), 3)
                    # BUG FIX: coref sentence numbers are 1-based within
                    # the window, so the absolute index is
                    # window_start + (n - 1). The old code added (i - 2)
                    # even when the window started at 0, which indexed
                    # the wrong sentence for the first two sentences.
                    ref_index = (int(found_bundle['ref_sentence']) - 1
                                 + window_start)
                    sentences[ref_index] = sentences[ref_index].replace(
                        found_bundle['ref'], 'RUNNING')
                    log("Replacing '{0}' with 'RUNNING'".format(
                        found_bundle['ref']), 3)
                    altered_sentences = sentences[window_start:i + 1]
                    norm_altered_sentences = ". ".join(
                        [a_sen.strip(".") for a_sen in altered_sentences]) + "."
                    log("New test sentences are '{0}'".format(
                        norm_altered_sentences), 4)
                    altered_pronouns_refs = parsers.parse_coref(
                        norm_altered_sentences)
                    if third_pronoun not in [
                            a_ref['pronoun']
                            for a_ref in altered_pronouns_refs]:
                        # With the antecedent masked the pronoun no
                        # longer resolves, so the reference was unique.
                        log("Anticedent is unambigious!", 3)
                        pro_index = (int(found_bundle['pronoun_sentence']) - 1
                                     + window_start)
                        sentences[pro_index] = sentences[pro_index].replace(
                            found_bundle['pronoun'], found_bundle['ref'])
                        pronoun_results.append(
                            (third_pronoun, found_bundle['ref']))
                    else:
                        log("Anticedent is ambigious", 3)
                        log("New Sentences: {0}".format(
                            altered_pronouns_refs), 4)
                        pronoun_results.append((third_pronoun, .5))
            sentence_pronouns[i][2] = pronoun_results
    return sentence_pronouns
def parse(text):
    """Find pronouns per sentence and resolve 3rd-person references.

    Returns a list (one entry per sentence) of three lists:
    [first_person, second_person, third_person] pronouns. The
    third-person list is rewritten into (pronoun, result) tuples where
    result is -1 (no antecedent), .5 (ambiguous antecedent), or the
    antecedent word itself (unambiguous).
    """
    # Strip numbers out, since that seems to cause problems for my approach
    text = re.sub(r'\d+ ?', 'some ', text)
    sentences = sentence_tokenizer.parse(text)
    sentence_pronouns = []
    for sentence in sentences:
        log("Looking for pronouns in '{0}'".format(sentence), 2)
        # Buckets for 1st / 2nd / 3rd person pronouns in this sentence.
        pronoun_totals = [[], [], []]
        tree = parsers.parse(sentence)[0]
        pronoun_trees = tree.subtrees(lambda x: x.node in pronoun_tags)
        for pronoun_tree in pronoun_trees:
            # First total up all the first person pronouns
            for i in range(3):
                if pronoun_tree[0].lower() in pronouns[i]:
                    pronoun_totals[i].append(pronoun_tree[0])
        log("First Person '{0}'".format(pronoun_totals[0]), 3)
        log("Second Person '{0}'".format(pronoun_totals[1]), 3)
        log("Third Person '{0}'".format(pronoun_totals[2]), 3)
        sentence_pronouns.append(pronoun_totals)
    log("Pronouns found in text: %s" % (sentence_pronouns), 2)
    # If there are 3rd person pronouns in any sentence, we have to decide
    # if they are used correctly. We do this in the following, very
    # expensive, but possibly correct manner.
    #
    # Start from the top down
    # 1. Look back 2 sentences and see if we can find a refernece.
    # IF NOT - its an error and do no more
    # 2. If so, replace the refereneced word with "RUNNING"
    # and search again, to see if there is a previous word it could refer
    # to.
    # IF NOT, its correct. Replace the pronoun with the referenced word
    # and continue
    # 3. Else, its not felicitous. Give bad credit
    for i in range(len(sentences)):
        if len(sentence_pronouns[i][2]) > 0:
            pronoun_results = []
            for third_pronoun in sentence_pronouns[i][2]:
                # Coref window: up to two preceding sentences plus the
                # current one, re-joined with normalized periods.
                all_sentences = sentences[max(0, i - 2):i + 1]
                norm_sentences = ". ".join(
                    [a_sen.strip(".") for a_sen in all_sentences]) + "."
                log(
                    "Looking for pronoun coherence for '{0}'".format(
                        norm_sentences), 4)
                pronouns_refs = parsers.parse_coref(norm_sentences)
                log("Recieved co-references {0}".format(pronouns_refs), 5)
                # Find the coref bundle (if any) for this pronoun.
                found_bundle = False
                for j in range(len(pronouns_refs)):
                    if third_pronoun == pronouns_refs[j]['pronoun']:
                        found_bundle = pronouns_refs[j]
                        break
                if not found_bundle:
                    # No antecedent at all: score -1 for this pronoun.
                    log("Found NO anticedent for {0}".format(third_pronoun), 3)
                    pronoun_results.append((third_pronoun, -1))
                else:
                    log("Found anticedent for {0}".format(third_pronoun), 3)
                    # NOTE(review): 'ref_sentence' appears to be 1-based
                    # within the window, but the window starts at
                    # max(0, i - 2) while this offset uses (i - 2) — for
                    # i < 2 this looks like it indexes the wrong (even
                    # negative) sentence. Confirm against parse_coref.
                    ref_index = int(found_bundle['ref_sentence']) - 1 + (i - 2)
                    # Mask the antecedent and re-run coref: if the
                    # pronoun still resolves, the reference is ambiguous.
                    sentences[ref_index] = sentences[ref_index].replace(
                        found_bundle['ref'], 'RUNNING')
                    log(
                        "Replacing '{0}' with 'RUNNING'".format(
                            found_bundle['ref']), 3)
                    altered_sentences = sentences[max(0, i - 2):i + 1]
                    norm_altered_sentences = ". ".join(
                        [a_sen.strip(".") for a_sen in altered_sentences]) + "."
                    log(
                        "New test sentences are '{0}'".format(
                            norm_altered_sentences), 4)
                    altered_pronouns_refs = parsers.parse_coref(
                        norm_altered_sentences)
                    if third_pronoun not in [
                            a_ref['pronoun']
                            for a_ref in altered_pronouns_refs]:
                        # Unique antecedent: substitute it for the
                        # pronoun so later sentences can refer to it.
                        log("Anticedent is unambigious!", 3)
                        pro_index = int(
                            found_bundle['pronoun_sentence']) - 1 + (i - 2)
                        sentences[pro_index] = sentences[pro_index].replace(
                            found_bundle['pronoun'], found_bundle['ref'])
                        pronoun_results.append(
                            (third_pronoun, found_bundle['ref']))
                    else:
                        # Still resolvable elsewhere: partially correct.
                        log("Anticedent is ambigious", 3)
                        log("New Sentences: {0}".format(altered_pronouns_refs), 4)
                        pronoun_results.append((third_pronoun, .5))
            sentence_pronouns[i][2] = pronoun_results
    return sentence_pronouns
def parse(text, use_cache=True):
    """Tally subject-verb agreement over all sentences in ``text``.

    Args:
        text: raw text, split on newlines and then sentence-tokenized.
        use_cache: when True, consult and update the
            'sub_verb_agreement' cache per sentence.

    Returns:
        (num_agrees, num_not_agrees, num_unsure) summed over the text.
    """
    num_agrees = 0
    num_not_agrees = 0
    num_unsure = 0
    lines = text.split("\n")
    for line in lines:
        sentences = sentence_tokenizer.parse(line, use_cache=use_cache)
        for sentence in sentences:
            line_agreements, line_non_agreements, line_unsure = 0, 0, 0
            # Possession seems to be tricky for the parser, so we fudge
            # a little here
            sentence = sentence.replace("'s", '')
            if not sentence:
                # Stripping "'s" can empty the sentence; skip it rather
                # than let sentence[-1] raise IndexError.
                continue
            if sentence[-1] != ".":
                sentence += "."
            if use_cache:
                cache_rs = cache_utils.cache_get('sub_verb_agreement',
                                                 sentence)
                if cache_rs:
                    line_agreements, line_non_agreements, line_unsure = cache_rs
                    num_agrees += line_agreements
                    num_not_agrees += line_non_agreements
                    num_unsure += line_unsure
                    continue
            log("Looking for Sub-Verb agreement in '%s'" % (sentence,), 1)
            tree = parsers.parse(sentence)[0]
            dependencies = parsers.dependences(sentence)
            sub_verb_deps = [dep for dep in dependencies
                             if dep['dep_name'] == 'nsubj']
            if len(sub_verb_deps) == 0:
                log("Couldn't find Subject-Verb dependency info", 1)
                # BUG FIX: honour use_cache here; this write previously
                # happened unconditionally, unlike the one below.
                if use_cache:
                    cache_utils.cache_set('sub_verb_agreement', sentence,
                                          (0, 0, 0))
                continue
            for sub_verb in sub_verb_deps:
                first_node = node_in_tree(tree, sub_verb['first_word'])
                sec_node = node_in_tree(tree, sub_verb['second_word'])
                if first_node and sec_node:
                    log("First Dep Node: %s" % (first_node,), 2)
                    log("Sec Dep Node: %s" % (sec_node,), 2)
                    try:
                        is_agreement = check_node_agreement(first_node,
                                                            sec_node)
                        if is_agreement:
                            line_agreements += 1
                        else:
                            line_non_agreements += 1
                        log("Agreement in sentence? %s" % (is_agreement,), 1)
                    except Exception as e:
                        # No agreement decision for this pair; record it
                        # as unsure. str(e) is used because e.message is
                        # not defined for every exception type.
                        line_unsure += 1
                        log("Error looking for agreement? %s" % (str(e),), 2)
            if use_cache:
                cache_utils.cache_set(
                    'sub_verb_agreement', sentence,
                    (line_agreements, line_non_agreements, line_unsure))
            num_agrees += line_agreements
            num_not_agrees += line_non_agreements
            num_unsure += line_unsure
    return num_agrees, num_not_agrees, num_unsure