def get_dividend_paras(doc_text):
    """Return the passage of ``doc_text`` that surrounds the dividend figures.

    The passage is anchored on the first occurrence of the phrase
    "shall be entitled to receive dividends".  Numbers between 0 and 1
    (decimals allowed) located after that phrase delimit the passage: it
    starts 10 tokens before the first such number and keeps going until two
    consecutive numbers are more than 140 tokens apart.

    Args:
        doc_text: Sequence of tokens making up the document (it is sliced and
            joined with spaces).

    Returns:
        The extracted passage as a single space-joined string, or the string
        "Failed to find dividend info text" when either the trigger phrase or
        a number following it cannot be found.
    """
    failure_message = "Failed to find dividend info text"

    start_trigger = preprocess_text(
        "shall be entitled to receive dividends", stem=False
    ).split(" ")
    loc = find_loc(doc_text, [start_trigger], allow_contains=True)
    # An empty result is treated like None: previously it raised IndexError
    # on ``loc[0]``.
    if loc is None or len(loc) == 0:
        return failure_message
    trigger_pos = loc[0]

    numbers = get_nums_from_text(doc_text, min=0, max=1, decimal=True)

    # Element [2] of every number entry is used as its token position in
    # ``doc_text``.  The passage starts at the first number after the trigger.
    start_idx = next(
        (idx for idx, num in enumerate(numbers) if num[2] > trigger_pos),
        None,
    )
    if start_idx is None:
        return failure_message

    # Clamp at 0: a negative start would make the slice count from the end
    # of the document instead of from its beginning.
    start = max(numbers[start_idx][2] - 10, 0)

    for i in range(start_idx + 1, len(numbers)):
        if numbers[i][2] - numbers[i - 1][2] > 140:
            # The current number is too far away to belong to the passage.
            # The original cut-off (two entries back) is kept, but it is never
            # allowed to fall before the starting number, which previously
            # produced an empty or wrapped-around slice.
            end_idx = max(i - 2, start_idx)
            return " ".join(doc_text[start:numbers[end_idx][2] + 20])

    # No large gap found: the passage runs up to the last number.
    return " ".join(doc_text[start:numbers[-1][2] + 20])
def get_IV_intro_text(filename, beginning_intro_triggers_filename="Beginning_IV",
                      end_intro_triggers_filename="End_IV_intro"):
    """Extract the introductory tokens of section IV from a contract file.

    The contract is read, split on single spaces, and searched for the
    "beginning" and "end" trigger phrases loaded from the given trigger
    files.  The returned slice ends at the first end-trigger match and starts
    at the beginning-trigger match selected by ``get_closest_string``.

    Args:
        filename: Contract to read with ``read_contract``.
        beginning_intro_triggers_filename: Name of the trigger set marking
            the start of the intro.
        end_intro_triggers_filename: Name of the trigger set marking the end
            of the intro.

    Returns:
        A list of tokens for the intro; the string "Failure expected" when
        ``filename`` is listed in ``Failures``; or ``None`` when the end of
        the intro cannot be located.
    """
    beginning_intro_triggers = get_proccessed_triggers(
        beginning_intro_triggers_filename)
    end_intro_triggers = get_proccessed_triggers(end_intro_triggers_filename)
    text = read_contract(filename).split(" ")

    if filename in Failures:
        return "Failure expected"

    try:
        beginning_of_IV_intro = find_loc(text, beginning_intro_triggers,
                                         allow_contains=False)
        # find_loc yields None when nothing matches (TypeError on subscript);
        # an empty result (IndexError) is handled the same way.
        end_of_IV_intro = find_loc(text, end_intro_triggers)[0]
    except (TypeError, IndexError):
        print("failed", filename)
        print(text)
        return None

    # In case the beginning triggers didn't work, search from the start of
    # the document.  The fallback must be a list: it is subscripted and
    # handed to get_closest_string below (it used to be the int 0, which
    # raised TypeError on the very next statement).
    if beginning_of_IV_intro is None or len(beginning_of_IV_intro) == 0:
        print("Beginning of IV intro is None")
        beginning_of_IV_intro = [0]

    if beginning_of_IV_intro[0] > end_of_IV_intro:
        print("End is less than beginning index of IV intro")
        beginning_of_IV_intro = [0]
        end_of_IV_intro = len(text)

    beginning_of_IV = get_closest_string(values=beginning_of_IV_intro,
                                         target=end_of_IV_intro, less=True)
    # -1 is treated as "no suitable beginning" -- presumably
    # get_closest_string's not-found sentinel; verify against its definition.
    if beginning_of_IV == -1:
        print("no beginning found")
        # Fall back to the 300 tokens before the end, clamped at 0 so a short
        # prefix does not turn into a negative (wrap-around) slice start.
        beginning_of_IV = max(end_of_IV_intro - 300, 0)

    return text[beginning_of_IV:end_of_IV_intro]
def get_board_of_directors_paras(doc_text):
    """Return a 200-token passage about the election of directors.

    The trigger phrase "director elect" is preprocessed (without stemming)
    and looked up in ``doc_text``, first with ``allow_contains=False`` and,
    if that yields ``None``, again with ``allow_contains=True``.  Matches
    located before token 200 are skipped; the first remaining match marks
    the start of the returned passage.

    Args:
        doc_text: Sequence of tokens making up the document.

    Returns:
        The space-joined passage, or a failure message when the trigger is
        not found at all or only occurs within the first 200 tokens.
    """
    # An alternative trigger that was tried before: "election of directors"
    trigger_tokens = preprocess_text("director elect", stem=False).split(" ")
    print(trigger_tokens)

    positions = find_loc(doc_text, [trigger_tokens], allow_contains=False)
    if positions is None:
        # Retry with the looser matching mode before giving up.
        positions = find_loc(doc_text, [trigger_tokens], allow_contains=True)
        if positions is None:
            return "Failed to find directors info text"

    for position in positions:
        if position >= 200:
            print(position)
            passage = doc_text[position: position + 200]
            return " ".join(passage)

    # TODO: review find_loc function given sample results
    return "Failed to find board of directors info text"
def get_securities_info_paras(doc_text, beginning_intro_triggers_filename="Beginning_IV",
                              end_intro_triggers_filename="End_IV_intro"):
    """Return the securities passage of ``doc_text`` plus the share types found.

    Beginning and end trigger phrases are loaded (unpreprocessed) from the
    given trigger files and located in ``doc_text``.  Among all
    (beginning, end) position pairs, the closest pair whose beginning comes
    before its end delimits the passage.

    Args:
        doc_text: Sequence of tokens making up the document.
        beginning_intro_triggers_filename: Name of the trigger set marking
            the start of the passage.
        end_intro_triggers_filename: Name of the trigger set marking the end
            of the passage.

    Returns:
        A string of the form "Types of shares found: <names>\\n<passage>", or
        "Failed to find securities info text" when no usable pair of trigger
        positions exists.
    """
    failure_message = "Failed to find securities info text"

    beginning_intro_triggers = get_proccessed_triggers(
        beginning_intro_triggers_filename, preproccess=False)
    end_intro_triggers = get_proccessed_triggers(
        end_intro_triggers_filename, preproccess=False)

    try:
        beginning_of_IV_intro = find_loc(doc_text, beginning_intro_triggers,
                                         allow_contains=True)
        end_of_IV_intro = find_loc(doc_text, end_intro_triggers)
    except TypeError:
        return failure_message

    # A None result (no match) does not raise inside the try block above; it
    # used to blow up in product() below with an uncaught TypeError.
    if beginning_of_IV_intro is None or end_of_IV_intro is None:
        return failure_message

    # Every (beginning, end) combination, closest pair first.
    candidate_spans = sorted(product(beginning_of_IV_intro, end_of_IV_intro),
                             key=lambda span: abs(span[0] - span[1]))

    _, names = get_names(doc_text, stem=False)
    names_used = get_names_from_text(doc_text, names)
    # Append ", " to each name, join, then trim the trailing ", ".
    names_used = " ".join([x[0] + ", " for x in names_used])[:-2]

    for begin, end in candidate_spans:
        if begin < end:
            return ("Types of shares found: " + names_used + "\n"
                    + " ".join(doc_text[begin:end]))
    return failure_message
def get_liquidation_paras(doc_text):
    """Return a 200-token passage describing liquidation terms.

    Two trigger phrases ("event of any liquidation" and
    "upon any such liquidation") are preprocessed without stemming and
    looked up in ``doc_text``.  The first match located after token 200
    marks the start of the returned passage.

    Args:
        doc_text: Sequence of tokens making up the document.

    Returns:
        The space-joined passage, or "Failed to find liquidation info text"
        when there is no match beyond token 200.
    """
    failure_message = "Failed to find liquidation info text"

    start_triggers = []
    for phrase in ("event of any liquidation", "upon any such liquidation"):
        start_triggers.append(preprocess_text(phrase, stem=False).split(" "))

    start_locs = find_loc(doc_text, start_triggers, allow_contains=True)
    print(start_locs)
    print(len(doc_text))

    if start_locs is None or start_locs == []:
        return failure_message

    # Only matches past the first 200 tokens are considered.
    first_hit = next((pos for pos in start_locs if pos > 200), None)
    if first_hit is None:
        return failure_message
    return " ".join(doc_text[first_hit: first_hit + 200])
def get_original_issue_price(text, names, buffer=30):
    """Pair share names with the original-issue-price figures that follow them.

    The text is cut at the first match of "riginal issue pric"; names and
    numbers between 0 and 5 (decimals allowed) found in the remainder are
    then matched with ``match_nums_with_targets``.

    Args:
        text: Document content to search; it is sliced from the match onward.
        names: Candidate names passed to ``get_names_from_text``.
        buffer: Unused; kept so existing callers keep working.

    Returns:
        A ``(nums_out, names_out)`` tuple of parallel lists -- numbers first,
        names second -- or ``(None, None)`` when no pairs could be matched.
    """
    loc = find_loc(text, ["riginal issue pric"], allow_contains=True)
    print("LOC", loc)

    # Elsewhere in this file find_loc's result is used as a list of positions
    # (or None).  A list is not a valid slice bound, so reduce it to its
    # first position; None / empty keeps the whole text, which is what
    # ``text[None:]`` already did.
    if loc is not None and hasattr(loc, "__len__"):
        loc = loc[0] if len(loc) > 0 else None
    text = text[loc:]

    used_names = get_names_from_text(text, names)
    numbers = get_nums_from_text(text, min=0, max=5, decimal=True)
    pairs = match_nums_with_targets(numbers, used_names)
    if pairs is None:
        print("NA")
        return None, None

    names_out = []
    nums_out = []
    for name, num in pairs:
        names_out.append(name)
        nums_out.append(num)

    print("nums_out", nums_out)
    print("names_out", names_out)
    return nums_out, names_out