def check_whether_header_is_valuable(location, hits):
    """Return True if the header of the chunk at ``location`` passes every filter.

    The header comes from ``get_header_of_chunk(location, hits)`` and is
    treated here as a sequence of word strings (it is iterated, indexed and
    joined).  The filters, applied in order, are:

    1. At least one word must match the "headers we want" pattern.
    2. A header mentioning Debt/Other/Environmental/Proceeding is rejected
       unless it also mentions one of the litigation-style keywords.
    3. The header must not match any "headers we don't want" pattern.
    4. A header mentioning "Subsequent" must be a "Subsequent ... Event" one.
    5. Neither of the first two words may contain a number.
    6. The header must not contain a parenthesised single letter other
       than "A"/"a".

    Args:
        location: passed through unchanged to ``get_header_of_chunk``.
        hits: passed through unchanged to ``get_header_of_chunk``.

    Returns:
        bool: True when the header survives all filters, False otherwise.
    """
    header = get_header_of_chunk(location, hits)

    # The header *has* to contain some special keyword.  Fetch the pattern
    # once instead of once per word.
    wanted_pattern = lfp.headerpatternrepository.get_pattern_of_headers_we_want()
    if not any(re.match(wanted_pattern, word) for word in header):
        return False

    # All remaining checks run against the words joined without separators.
    compressed_header = ''.join(header)

    # Reject common unwanted topics unless a wanted topic is also present.
    # NOTE(review): "Contigencies" looks like a deliberate catch for a
    # misspelling of "Contingencies" -- confirm before changing it.
    if re.search(r"Debt|Other|Environmental|Proceeding", compressed_header, re.I) \
            and not re.search(r"Litigation|Contingenc|Commitment|"
                              r"Contigencies|Legal|Subsequent",
                              compressed_header, re.I):
        return False

    for regex in lfp.headerpatternrepository.get_patterns_of_headers_we_dont_want():
        if re.search(regex, compressed_header):
            return False

    # We only want "subsequent event" headers; nothing more.  The first
    # pattern requires a capital "S" but accepts any case for the rest.
    if re.search(r"S[uU][bB][sS][eE][qQ][uU][eE][nN][tT]", compressed_header) \
            and not re.search(r"Subsequent.*?Event", compressed_header, re.I):
        return False

    # First words are never numbers.  header[0] is safe here: an empty
    # header cannot have passed the keyword check above.
    if Utilities.contains_numbers(header[0]) \
            or (len(header) >= 2 and Utilities.contains_numbers(header[1])):
        return False

    # Does it have a letter in parentheses that is not "A"?  If so, forget it.
    if re.search(r"\([B-Zb-z]\)", compressed_header):
        return False

    return True
def was_cut_within_a_table(location, hits):
    """Guess whether the text just before ``location`` is part of a table.

    The last sentence fragment is fetched twice from
    ``lfp.wordtokencreation.get_last_sentence_fragment``: once in its default
    form (iterated word by word here) and once with ``return_as_string=True``
    (searched with regular expressions here).

    The fragment is considered table content when any of these holds:

    * it mentions millions/thousands/billions together with one of
      "total", "follows"/"following" or "balance";
    * ``Utilities.character_counter`` reports at least six ``$`` characters;
    * it contains one of those keywords and at least six of its words
      contain numbers.

    Args:
        location: passed through unchanged to ``get_last_sentence_fragment``.
        hits: passed through unchanged to ``get_last_sentence_fragment``.

    Returns:
        bool: True if the fragment looks like table content; False otherwise,
        including when no fragment is available (the helper returned None).
    """
    fragment_words = lfp.wordtokencreation.get_last_sentence_fragment(location, hits)
    if fragment_words is None:
        return False

    fragment_text = lfp.wordtokencreation.get_last_sentence_fragment(
        location, hits, return_as_string=True)

    # See whether we picked up a table.  Tables normally have units of
    # currency as well as the word "follows" somewhere.  If these hold, we
    # are probably in a table from a previous section.  That means that if
    # we are in a relevant section right now and the new hit demarcates a new
    # section, we want to stop recording; if we are not recording, we
    # probably want to start.  If we are in a relevant section and the new
    # hit does *not* have a whitelisted section header, we can keep recording.
    flags = re.I | re.M | re.S

    # This keyword test feeds two of the three heuristics; evaluate it once.
    has_table_keyword = bool(
        re.search(r"total|follow(s|ing)|balance", fragment_text, flags))

    # Heuristic 1: a unit of magnitude plus a table keyword.
    if has_table_keyword and re.search(
            r"(in)?\s*(millions|thousands|billions)", fragment_text, flags):
        return True

    # Heuristic 2: lots of dollar signs.
    dollar_counts = Utilities.character_counter(fragment_text, '$')
    if dollar_counts['$'] >= 6:
        return True

    # Heuristic 3: a table keyword plus many number-bearing words.  Without
    # the keyword the count cannot change the answer, so skip it.
    if not has_table_keyword:
        return False
    number_count = sum(
        1 for word in fragment_words if Utilities.contains_numbers(word))
    return number_count >= 6