def parse_sentences(line, use_cache=True, include_prob=False):
    """Score every candidate sentence-split of `line` (Python 2).

    Generates all possible sentence groupings of `line` via
    `_possible_sentences_in_line`, parses each candidate sentence, converts
    the parse tree into POS-tag transitions, and scores the grouping as the
    product of HMM transition probabilities (times a per-tree boost and a
    length penalty `weight ** (n_sentences - 1)`).

    Args:
        line: the raw input line (string) to split into sentences.
        use_cache: when True, consult/populate the "sentence_tokenizer" and
            'possible_sentences' caches.
        include_prob: on a cache hit, return the full cached (parse, prob)
            value instead of just the parse.

    NOTE(review): as visible here the function accumulates scores into
    `all_possible_sentence_probs` but ends without returning them — the
    selection/return logic presumably continues beyond this chunk; confirm
    against the full file. `counts` and `weight` are read from enclosing
    (module-level?) scope — verify.
    """
    log("Working on: %s" % (line,), 2)

    # Fast path: a previously computed correct split for this exact line.
    if use_cache:
        correct_parse = cache_get("sentence_tokenizer", line)
        if correct_parse:
            log("Cache Hit: %s" % (correct_parse[0],), 4)
            log("-------------\n", 4)
            return correct_parse if include_prob else correct_parse[0]

    all_possible_sentences = _possible_sentences_in_line(line)
    all_possible_sentence_probs = []
    # Sentences already proven unparseable/invalid — used to short-circuit
    # any later grouping that contains one of them.
    invalid_possible_sentences = []
    # Per-sentence scores already computed this call (intra-call memo,
    # separate from the persistent cache).
    stored_probs = {}

    for possible_sentences in all_possible_sentences:
        log("Examining: %s" % (possible_sentences,), 1)
        prob_for_sentences = []
        # Set as soon as any sentence in this grouping fails; remaining
        # sentences are scored 0 without being parsed.
        sent_is_impossible = False
        for possible_sentence in possible_sentences:
            if use_cache:
                possible_sentence_prob = cache_get('possible_sentences', possible_sentence)
                if possible_sentence_prob is not None:
                    log("Cache Hit: %s (from %s)" % (possible_sentence, 'possible sentences'), 4)
                    prob_for_sentences.append(possible_sentence_prob)
                    continue
            # Skip work if this grouping is already known to be doomed.
            if contains_any_invalid_setences(possible_sentences, invalid_possible_sentences) or sent_is_impossible:
                prob_for_sentences.append(0)
                continue
            elif possible_sentence in stored_probs:
                prob_for_sentences.append(stored_probs[possible_sentence])
                continue

            sentence_trees = parsers.parse(possible_sentence)
            if len(sentence_trees) == 0:
                # Parser produced no tree at all: mark invalid and poison
                # the rest of this grouping.
                log("Wasn't able to parse input %s" % (possible_sentence,), 0)
                prob_for_sentences.append(0)
                invalid_possible_sentences.append(possible_sentence)
                sent_is_impossible = True
                continue
            else:
                # Only the first (presumably best) parse is considered.
                sentence_tree = sentence_trees[0]

            if cmd_log_level() >= 4:
                print "--------"
                print "Pre Simplified Tree"
                print sentence_tree

            # Leading coordinating conjunctions are only stripped for the
            # first sentence of the grouping.
            tree_utils.simplify_tree(sentence_tree, remove_starting_cc=possible_sentences.index(possible_sentence) == 0)

            if cmd_log_level() >= 4:
                print "--------"
                print "Post Simplified Tree"
                print sentence_tree

            sentence_transitions = tree_utils.transitions_in_tree(sentence_tree)

            if not is_possible_sentence(sentence_tree):
                log("%s" % (sentence_transitions,), 2)
                log("Invalid parse", 2)
                prob_for_sentences.append(0)
                invalid_possible_sentences.append(possible_sentence)
                sent_is_impossible = True
                if use_cache:
                    # Persist the failure so future calls skip the parse.
                    cache_set('possible_sentences', possible_sentence, 0)
            else:
                log("%s" % (sentence_transitions,), 2)
                sentence_probs = []
                for transition in sentence_transitions:
                    try:
                        # Trigram transition probabilities from the HMM counts.
                        probs = hmm_utils.prob_of_all_transitions(transition, counts, gram_size=3)
                    except KeyError, e:
                        # Unseen tag sequence: treat as probability 0.
                        log("'Imposible' Tag order", 2, sep=' ** ')
                        log("%s" % (e,), 2, sep=' ** ')
                        probs = [0]
                    sentence_probs += probs
                    log("Transitions: %s" % (transition,), 3)
                    log("Probabilities: %s" % (probs,), 3)

                attempt_sentence_prob = prod(sentence_probs)
                # Tree-shape heuristic multiplier.
                sentence_prob_boost = boost_for_sentence_tree(sentence_tree)
                attempt_sentence_prob *= sentence_prob_boost

                prob_for_sentences.append(attempt_sentence_prob)
                stored_probs[possible_sentence] = attempt_sentence_prob
                if use_cache:
                    cache_set('possible_sentences', possible_sentence, attempt_sentence_prob)

        # Penalize groupings with more sentences: multiply by
        # weight^(n-1) (weight presumably < 1 — confirm).
        weighted_score = prod(prob_for_sentences) * (weight ** (len(possible_sentences) - 1))
        if weighted_score > 0:
            log("Valid Parse: %s" % (possible_sentences,), 2)
            log(weighted_score, 2)
            all_possible_sentence_probs.append(weighted_score)
def issues_in_sentence(sentence, use_cache=True): """'Brute force' check for a bunch of possible word ordering issues. Specifically, looking for the following: - VP coming before NP in standard sentence - NP coming before VP in inverted sentence - JJ coming after Nount in NP - VB before PP in VP - VB before NP in VP - VP before S in standard sentence (with embedded sentences) - NN before CD in NP - NNP before CD in NP """ if use_cache: result = cache_get('word_order_issues', sentence) if result is not None: return result tree = parsers.parse(sentence)[0] tree_utils.simplify_tree(tree, trim_adjecent_prop_nouns=True, normalize_sent_roots=True, normalize_plural=True, normalize_case=True) log("Looking for order issues in: %s" % (sentence,), 1) if cmd_log_level() >= 4: print "Simplified Parse Tree" print tree problems = [] problems += ["VP->NP in S"] * num_forbidden_orders(tree, ("S",), ('VP', 'NP')) problems += ["NP->VP in SINV"] * num_forbidden_orders(tree, ('SINV',), ('NP', 'VP')) problems += ["NN->JJ in NP"] * num_forbidden_orders(tree, ('NP',), ('NN', 'JP')) problems += ["PP->VB in VP"] * num_forbidden_orders(tree, ('VP',), ('PP', 'VB')) problems += ["NP->VP in VP"] * num_forbidden_orders(tree, ('VP',), ('NP', 'VP')) problems += ["S->VP in S"] * num_forbidden_orders(tree, ('S',), ('S', 'VP')) problems += ["S->VB in VP"] * num_forbidden_orders(tree, ('VP',), ('S', 'VB')) # problems += ["VB->VP in VP"] * num_forbidden_orders(tree, ('VP',), ('VB', 'VP')) problems += ["NP->RBR in ADVP"] * num_forbidden_orders(tree, ('ADVP',), ('NP', 'RBR')) problems += ["NN->DT in NP"] * num_forbidden_orders(tree, ('NP',), ('NN', 'DT')) problems += ["NNP->DT in NP"] * num_forbidden_orders(tree, ('NP',), ('NNP', 'DT')) problems += ["NN->CD in NP"] * num_forbidden_orders(tree, ('NP',), ('NN', 'CD')) problems += ["NNP->CD in NP"] * num_forbidden_orders(tree, ('NP',), ('NNP', 'CD')) problems += ['PP->NP in S'] * num_forbidden_orders(tree, ('S',), ('PP', 'NP')) # Toggle? 
problems += ['NP->VP in NP'] * num_forbidden_orders(tree, ('NP',), ('NP', 'VP')) # Seems like it should be VB->ADVP->PP problems += ['VB->PP->ADVP in VP'] * num_forbidden_orders(tree, ('VP',), ('VB', 'PP', 'ADVP')) problems += ['VB->PP->SBAR in VP'] * num_forbidden_orders(tree, ('VP',), ('VB', 'PP', 'SBAR')) problems += ['NP->S in NP'] * num_forbidden_orders(tree, ('NP',), ('NP', 'S')) # Seems like the ADJP should be in a NP or somewhere else, not a sibling # of a noun phrase problems += ['NP->ADJP in S'] * num_forbidden_orders(tree, ('S',), ('NP', 'ADJP')) # Last, if there is an S w/ only one child, we call it a word order problem... problems += ['Single Child S'] * len(list(tree.subtrees(lambda x: x in tree_utils.semi_tree_roots and len(x) == 1))) if tree[0].node not in tree_utils.semi_tree_roots and not hasattr(tree[0], '_has_error'): tree[0]._has_error = True problems += ['No S Root'] log("Found %d order issues" % (len(problems),), 1) log("Issues: %s", (problems,), 2) if use_cache: cache_set('word_order_issues', sentence, problems) return problems
def parse_sentences(line, use_cache=True, include_prob=False): log("Working on: %s" % (line, ), 2) if use_cache: correct_parse = cache_get("sentence_tokenizer", line) if correct_parse: log("Cache Hit: %s" % (correct_parse[0], ), 4) log("-------------\n", 4) return correct_parse if include_prob else correct_parse[0] all_possible_sentences = _possible_sentences_in_line(line) all_possible_sentence_probs = [] invalid_possible_sentences = [] stored_probs = {} for possible_sentences in all_possible_sentences: log("Examining: %s" % (possible_sentences, ), 1) prob_for_sentences = [] sent_is_impossible = False for possible_sentence in possible_sentences: if use_cache: possible_sentence_prob = cache_get('possible_sentences', possible_sentence) if possible_sentence_prob is not None: log( "Cache Hit: %s (from %s)" % (possible_sentence, 'possible sentences'), 4) prob_for_sentences.append(possible_sentence_prob) continue if contains_any_invalid_setences( possible_sentences, invalid_possible_sentences) or sent_is_impossible: prob_for_sentences.append(0) continue elif possible_sentence in stored_probs: prob_for_sentences.append(stored_probs[possible_sentence]) continue sentence_trees = parsers.parse(possible_sentence) if len(sentence_trees) == 0: log("Wasn't able to parse input %s" % (possible_sentence, ), 0) prob_for_sentences.append(0) invalid_possible_sentences.append(possible_sentence) sent_is_impossible = True continue else: sentence_tree = sentence_trees[0] if cmd_log_level() >= 4: print "--------" print "Pre Simplified Tree" print sentence_tree tree_utils.simplify_tree( sentence_tree, remove_starting_cc=possible_sentences.index( possible_sentence) == 0) if cmd_log_level() >= 4: print "--------" print "Post Simplified Tree" print sentence_tree sentence_transitions = tree_utils.transitions_in_tree( sentence_tree) if not is_possible_sentence(sentence_tree): log("%s" % (sentence_transitions, ), 2) log("Invalid parse", 2) prob_for_sentences.append(0) 
invalid_possible_sentences.append(possible_sentence) sent_is_impossible = True if use_cache: cache_set('possible_sentences', possible_sentence, 0) else: log("%s" % (sentence_transitions, ), 2) sentence_probs = [] for transition in sentence_transitions: try: probs = hmm_utils.prob_of_all_transitions(transition, counts, gram_size=3) except KeyError, e: log("'Imposible' Tag order", 2, sep=' ** ') log("%s" % (e, ), 2, sep=' ** ') probs = [0] sentence_probs += probs log("Transitions: %s" % (transition, ), 3) log("Probabilities: %s" % (probs, ), 3) attempt_sentence_prob = prod(sentence_probs) sentence_prob_boost = boost_for_sentence_tree(sentence_tree) attempt_sentence_prob *= sentence_prob_boost prob_for_sentences.append(attempt_sentence_prob) stored_probs[possible_sentence] = attempt_sentence_prob if use_cache: cache_set('possible_sentences', possible_sentence, attempt_sentence_prob) weighted_score = prod(prob_for_sentences) * (weight**( len(possible_sentences) - 1)) if weighted_score > 0: log("Valid Parse: %s" % (possible_sentences, ), 2) log(weighted_score, 2) all_possible_sentence_probs.append(weighted_score)