def headlerlistcreation(headeritem):
    """Interleave header names with their values, padding blanks for missing values."""
    logger = Utils.add_logger()
    colist = []
    headerlist = []
    headeritem = [x for x in headeritem if x not in ['Management', 'Opposition']]
    i = 0
    while i < len(headeritem):
        if i % 2 == 0:
            headerlist.append(headeritem[i])
        elif "Date" in headeritem[i] or "Agenda" in headeritem[i]:
            # The value slot actually holds the next header, so pad a blank
            # value and shift that header to the next even position.
            headeritem.insert(i, " ")
            colist.append(" ")
        else:
            colist.append(headeritem[i])
        i += 1
    logger.debug("colList:{}".format(colist))
    alist = []
    for header, value in zip(headerlist, colist):
        alist.append(header)
        alist.append(value)
    return alist

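# Illustrative usage (hypothetical tokens): when the value for "Ticker" is
# absent and the next token is another header containing "Date", a blank is
# padded so headers and values stay paired:
#
#   headlerlistcreation(["Ticker", "Meeting Date", "01/02/2020"])
#   -> ["Ticker", " ", "Meeting Date", "01/02/2020"]
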
def preprocess(accession_no):
    logger = Utils.add_logger()
    try:
        path = os.path.join(CONFIG["Path"]["file_path"], accession_no + ".dissem")
        with open(path, 'r', encoding='utf-8') as file:
            s = file.read()
        raw = bs(s, "lxml")
        # Locate the "NAME OF REGISTRANT" line and take the text after the colon.
        registrant_name = ""
        result = raw.find_all(text=re.compile("NAME OF REGISTRANT"))
        for val in result:
            for line in val.split('\n'):
                if "NAME OF REGISTRANT" in line:
                    registrant_name = line.split(':')[1].strip()
                    logger.debug("Registrant Name for file:{} is {}".format(
                        accession_no, registrant_name))
        if registrant_name == "":
            logger.error("Registrant name not found for file-{}".format(accession_no))
            return list(), ""
        # Split the document text into blocks separated by long dashed rules.
        text = raw.get_text()
        result_list = re.split(r"-{20,}", text)
    except Exception as error:
        logger.error("Error in preprocess for file-{}-{}".format(accession_no, error))
        return list(), ""
    return result_list, registrant_name

def splitevenodd(headeritem, companyName):
    """Split alternating header/value tokens into a one-row DataFrame."""
    logger = Utils.add_logger()
    colist = []
    headerlist = []
    headeritem = [x for x in headeritem if x not in ['Management', 'Opposition']]
    i = 0
    while i < len(headeritem):
        if i % 2 == 0:
            headerlist.append(headeritem[i])
        elif "Date" in headeritem[i] or "Agenda" in headeritem[i]:
            # The value slot actually holds the next header, so pad a blank
            # value and shift that header to the next even position.
            headeritem.insert(i, " ")
            colist.append(" ")
        else:
            colist.append(headeritem[i])
        i += 1
    logger.debug("colList:{}".format(colist))
    dfs = pd.DataFrame(columns=headerlist)
    dfs = dfs.append(pd.Series(colist, index=headerlist), ignore_index=True)
    if len(companyName) >= 1:
        dfs['CompanyName'] = companyName
    return dfs

def process(self, accession_no, doc_id):
    logger = Utils.add_logger()
    try:
        path = CONFIG["Path"]["file_path"] + accession_no + ".dissem"
        content = Utils.read_file(path, accession_no)
        content = Format6parser.segparsing(content)
        content = content.replace("</td>\n<tr>", "</td></tr><tr>")
        soup = BeautifulSoup(content, "html.parser")
        tables = soup.find_all('table')
        dffa = None
        prev_table = None
        finaldf = pd.DataFrame()
        for table in tables:
            try:
                if Format6APipeline.check_table(table):
                    if prev_table is not None:
                        # A preceding table did not pass check_table and was
                        # buffered; merge its contents with the current table
                        # before parsing.
                        prev_table = prev_table + str(table.contents)
                        dffa = Format6parser.tabledetails(
                            BeautifulSoup(prev_table, "html.parser"))
                        prev_table = None
                    else:
                        dffa = Format6parser.tabledetails(table)
                else:
                    # This table did not pass check_table; buffer its contents
                    # and combine them with the next matching table.
                    prev_table = str(table.contents)
                if dffa is not None:
                    if finaldf.empty:
                        finaldf = dffa
                    else:
                        finaldf = finaldf.append(dffa, sort=True)
                    dffa = None
            except Exception as error:
                logger.error(
                    "Exception in table parsing for format6 for accession_no -{} -{}"
                    .format(accession_no, error))
        filename = os.path.join(CONFIG['Path']['output_path_format6B'],
                                accession_no + '.xlsx')
        Utils.df_to_excel(accession_no, finaldf, filename)
        return 1
    except Exception as error:
        logger.error(
            "Exception in format6 pipeline for accession_no -{}-{}".format(
                accession_no, error))
        print("Exception in format6A pipeline-{}".format(error))
        return 0

def process(self, accession_no, doc_id):
    logger = Utils.add_logger()
    try:
        path = CONFIG["Path"]["file_path"] + accession_no + ".dissem"
        content = Utils.read_file(path, accession_no)
        content = content.replace(' ', '\xa0')
        soup = BeautifulSoup(content, "html.parser")
        contents = soup.text
        contents = contents.replace('SIGNATURES', '----------------------------------')
        # Mark each "FUND:" heading so the document can be split into per-fund blocks.
        contents = contents.replace('FUND:', '############### \n FUND:')
        fundlist = contents.split('###############')
    except Exception as error:
        logger.error("Exception in parsing through bs4 for accession_no -{} -{}".format(
            accession_no, error))
        return "unsuccessfull"
    final_df = pd.DataFrame()
    # The first block precedes the first "FUND:" heading, so skip it.
    for fund_block in fundlist[1:]:
        sr = fund_block.split('ISSUER')
        fName = Format4Parser.fundName(sr[0])
        df = Format4Parser.companyfundDetails(fund_block, fName)
        try:
            final_df = final_df.append(df)
        except Exception as error:
            logger.error("Exception in pipeline for df merging for accession_no -{} -{}".format(
                accession_no, error))
            return "unsuccessfull"
    final_df = Format4Parser.concat_rowData(final_df)
    final_df = final_df.drop(columns=['ID'])
    final_df.rename(columns={"ISSUER": "CompanyName", "MEETING DATE": "MeetingDate",
                             "Proposal No": "ProposalNumber", "PROPOSAL": "Proposal",
                             "PROPOSED BY": "ProposedBy", "VOTED?": "Voted",
                             "MGMT": "ForAgainstManagement", "TICKER": "Ticker",
                             "VOTE CAST": "VoteCast"}, inplace=True)
    final_df["MeetingDate"] = final_df["MeetingDate"].apply(Utils.parsing_date)
    final_df["AccesssionNumber"] = accession_no
    final_df["DocumentId"] = doc_id
    Utils.df_to_database(accession_no, doc_id, final_df)
    filename = os.path.join(CONFIG["Path"]["output_path_format4"], accession_no + ".xlsx")
    Utils.df_to_excel(accession_no, final_df, filename)
    return 1

def process_df(accession_no, main_header_df, table_df):
    logger = Utils.add_logger()
    try:
        table_df = table_df.astype({"seq_id": int})
        main_header_df = main_header_df.reset_index(drop=True)
        main_header_df = main_header_df.astype({"seq_id": int})
        df = table_df.dropna(subset=['Proposal']).copy()
        df["ProposedBy"] = df["ProposedBy"].apply(Parser.replace_empty)
        df["ProposalNumber"] = df["ProposalNumber"].apply(Parser.replace_empty)
        main_header_df["MeetingDate"] = main_header_df["MeetingDate"].apply(Utils.parsing_date)
        df = df.reset_index(drop=True)
        # Walk the proposal rows bottom-up: rows with no ProposedBy (and which
        # are not numbered DIRECTOR entries) are wrapped continuation lines, so
        # fold their text back into the row that owns the proposal.
        r = len(df.index) - 1
        while r >= 0:
            temp_index = list()
            while (r >= 0
                   and (df.at[r, "Proposal"] != "DIRECTOR"
                        or df.at[r, "ProposalNumber"] is None)
                   and df.at[r, "ProposedBy"] is None):
                temp_index.append(r)
                r -= 1
            r -= 1
            if len(temp_index) >= 1:
                c = temp_index[-1]
                for m in reversed(temp_index):
                    df.at[c - 1, "Proposal"] += " " + df.at[m, "Proposal"]
                    df.at[m, "Proposal"] = None
        df = df.dropna(subset=["Proposal"])
        final_df = main_header_df.merge(df, on=["seq_id"], how="inner")
    except Exception as error:
        logger.error("Error in process_df for accession_no-{}-{}".format(
            accession_no, error))
        print("Error in df processing for accession_no-{}-{}".format(
            accession_no, error))
        return pd.DataFrame()
    return final_df

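# Illustrative only (hypothetical rows, assuming Parser.replace_empty maps
# blank cells to None): given the proposal rows
#   seq_id  ProposalNumber  Proposal                      ProposedBy
#   1       02              APPROVAL OF THE AMENDED AND   Management
#   1       None            RESTATED STOCK PLAN           None
# the second row has no ProposalNumber and no ProposedBy, so process_df treats
# it as a continuation and appends its text to the first row's Proposal,
# yielding "APPROVAL OF THE AMENDED AND RESTATED STOCK PLAN".
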
def process(self, accession_no, doc_id):
    logger = Utils.add_logger()
    text_list, registrant_name = Parser.preprocess(accession_no)
    if len(text_list) > 0:
        funds_list = Parser.parse_file(accession_no, text_list, doc_id)
        if len(funds_list) > 0:
            header_df, table_df = Parser.post_process(accession_no, funds_list)
            if header_df.empty:
                logger.debug("header_df is empty for accession_no:{}".format(accession_no))
            else:
                final_df = Parser.process_df(accession_no, header_df, table_df)
                if final_df.empty:
                    logger.debug("final_df is empty for accession_no:{}".format(accession_no))
                else:
                    logger.debug("final_df created successfully for accession_no:{}".format(
                        accession_no))
                    print("final_df length:{}".format(len(final_df.index)))
                    final_df = final_df.drop(columns=["seq_id"])
                    Utils.df_to_database(accession_no, doc_id, final_df)
                    filename = os.path.join(CONFIG["Path"]["output_path_format2"],
                                            accession_no + ".xlsx")
                    Utils.df_to_excel(accession_no, final_df, filename)
                    return 1
    else:
        logger.debug("No data found from the file for accession_no:{}".format(accession_no))
    return "Unsuccessfull"

def tabledetails(table, lstdf, accession_no):
    logger = Utils.add_logger()
    line = []
    temp = False
    length = 0
    head = 0
    tempf = 0
    fund = ''
    parseline = []
    checkdata = []
    table_rows = table.find_all('tr')
    for tr in table_rows:
        try:
            td = tr.find_all('td')
            for i in td:
                line.append(i.text.replace('\n', '').replace('\xa0', ' '))
                checkdata.append(i.text.strip().replace('\n', '').replace('\xa0', ' '))
            line = [l.strip() for l in line if l != ' ']
            if line == [] or len(line) == line.count(''):
                line = []
                continue
            if temp == False:
                # Still looking for the header row of this table.
                temp = Format8Parser.identify_header(line, accession_no)
                head = head + 1
                if temp == False and head == 1 and len(lstdf) != 0:
                    # No header in this table: reuse the previous table's
                    # columns and treat the current row as data.
                    temp = True
                    parseline = line
                    if len(lstdf) == len(line):
                        line = lstdf
                    else:
                        lstdf = list(lstdf)
                        abcd = set(lstdf)
                        if 'Fund Name' in abcd:
                            lstdf.remove('Fund Name')
                        if 'FundName' in abcd:
                            lstdf.remove('FundName')
                        line = lstdf
                if temp == True:
                    # Header established: build the frame and, when the first
                    # row was actually data, append it straight away.
                    line = [re.sub(' +', ' ', l) for l in line if l != ' ']
                    df = pd.DataFrame(columns=line)
                    col = line
                    length = len(line)
                    if parseline != [] and len(parseline) == len(line):
                        df = df.append(pd.Series(parseline, index=col), ignore_index=True)
            elif length == len(line):
                df = df.append(pd.Series(line, index=col), ignore_index=True)
            elif length == len(checkdata):
                df = df.append(pd.Series(checkdata, index=col), ignore_index=True)
            line = []
            checkdata = []
        except Exception as error:
            logger.error("Exception in format8 parser for accession_no:{} -{}".format(
                accession_no, error))
            print("Exception in format8 parser:{}".format(error))
    if temp == False:
        df = pd.DataFrame()
    else:
        for col in df.columns:
            if col.upper().find('FUND NAME') > -1:
                tempf = 1
        if tempf != 1 and fund != '':
            df["Fund Name"] = fund
    return df

def process(self, accession_no, doc_id):
    logger = Utils.add_logger()
    try:
        olderFundName = ''
        lstdf = []
        dfs_all = pd.DataFrame()
        path = CONFIG["Path"]["file_path"] + accession_no + ".dissem"
        content = Utils.read_file(path, accession_no)
        content = Format8Parser.segparsing(content)
        content = re.sub("=+", "", content)
        # Mark table boundaries so the document can be split into per-table chunks.
        content = content.replace("<TABLE", "@@@@@@@@@@@@@@@@@@@@@\n <TABLE")
        content = content.replace("<table", "@@@@@@@@@@@@@@@@@@@@@\n <table")
        content = content.replace("</TABLE>", "</TABLE>\n ######################")
        content = content.replace("</table>", "</table>\n ######################")
        content = content.replace("</td>\n<tr>", "</td></tr><tr>")
        content = content.replace(
            "The Fund did not vote any proxies during this reporting period",
            "The Fund did not vote any proxies during this reporting period\n ######################")
        fundVoteData = content.split('######################')
        for alist in fundVoteData:
            try:
                df_all = Format8Parser.tableparsed(alist, olderFundName, lstdf, accession_no)
                if df_all is not None:
                    dfs_batch = Format8Parser.formDataFrame(df_all)
                    if not dfs_batch.empty:
                        if dfs_all.empty:
                            dfs_all = dfs_batch
                        else:
                            dfs_all = dfs_all.append(dfs_batch, ignore_index=True, sort=False)
                        # Carry the fund name and header columns forward for
                        # tables that omit them.
                        if "FundName" in dfs_all.columns:
                            olderFundName = dfs_all["FundName"][0]
                        lstdf = df_all.columns
            except Exception as error:
                logger.error(
                    "Exception in for loop of Format8pipeline.process for accessionnumber:{}-{}"
                    .format(accession_no, error))
                print(
                    "Exception in for loop of Format8pipeline.process for accessionnumber:{}-{}"
                    .format(accession_no, error))
        if dfs_all is not None:
            dfs_all = Format8Parser.remove_spaces_from_df(dfs_all)
        filename = os.path.join(CONFIG['Path']['output_path_format8'],
                                accession_no + '.xlsx')
        Utils.df_to_excel(accession_no, dfs_all, filename)
        return 1
    except Exception as error:
        logger.error(
            "Exception in Format8pipeline.process for accessionnumber:{}-{}".format(
                accession_no, error))
        print("Exception in Format8pipeline.process for accessionnumber:{}-{}".format(
            accession_no, error))
        return 0

def parse_file(accession_no, text_list, doc_id):
    logger = Utils.add_logger()
    try:
        funds_list = list()
        seq_id = 1  # sequencing: links each header record to its table rows
        prev_fundname = ""
        for index, temp in enumerate(text_list):
            split = re.split("\n", temp)
            for x, spli in enumerate(split):
                num = x + 3
                table_list = list()
                if "Proposal Vote" in spli:
                    # Table rows start three lines below the "Proposal Vote"
                    # banner and run until the next section marker (case3 and
                    # case4 are delimiter strings presumably defined at module
                    # scope).
                    table_list.append(seq_id - 1)
                    while (num < (len(split) - 2)) and (case3 not in split[num]) and (case4 not in split[num]):
                        row_text = split[num]
                        if len(row_text) > 0:
                            # Fixed-width columns: proposal number, proposal
                            # text, proposed by, for/against management, vote cast.
                            temp_text = ""
                            temp_text += row_text[:6].strip()
                            temp_text += "\t" + row_text[6:65].strip()
                            temp_text += "\t" + row_text[65:79].strip()
                            temp_text += "\t" + row_text[79:110].strip()
                            temp_text += "\t" + row_text[110:].strip()
                            table_list.append(temp_text)
                        num += 1
                    funds_list.append(table_list)
                sep = re.split(r"\s{3,}", spli)
                header_list = list()
                for i, s in enumerate(sep):
                    if "Agenda Number:" in s:
                        # The fund name is the last token of the preceding text
                        # block; fall back to the previous fund when it is blank.
                        fund = text_list[index - 1]
                        fund_name = re.split(r'\s{2,}', fund)[-1]
                        fund_name = re.sub("\n", "", fund_name).strip()
                        if len(fund_name) > 1:
                            prev_fundname = fund_name
                        else:
                            fund_name = prev_fundname
                        agenda_number = s.split(':')[1].strip()
                        inc_name = sep[i - 1].strip()
                    elif "Security:" in s:
                        security = s.split(':')[1].strip()
                    elif "Meeting Type:" in s:
                        meeting_type = s.split(':')[1].strip()
                    elif "Meeting Date:" in s:
                        meeting_date = s.split(':')[1].strip()
                    elif "Ticker:" in s:
                        ticker = s.split(':')[1].strip()
                    elif "ISIN:" in s:
                        ISIN = s.split(':')[1].strip()
                        # ISIN is the last field on the header line, so the
                        # header record is complete here.
                        header_list.append(seq_id)
                        header_list.append(doc_id)
                        header_list.append(accession_no)
                        header_list.append(fund_name)
                        header_list.append(inc_name)
                        header_list.append(security)
                        header_list.append(meeting_date)
                        header_list.append(meeting_type)
                        header_list.append(ISIN)
                        header_list.append(ticker)
                        header_list.append(agenda_number)
                        funds_list.append(header_list)
                        seq_id += 1
    except Exception as error:
        logger.error("Error in parsing the file for accession_no-{}-{}".format(
            accession_no, error))
        print("Error in parsing file for accession_no-{}-{}".format(
            accession_no, error))
        return list()
    return funds_list

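# Shape of funds_list as produced by parse_file above and consumed by
# post_process below: even positions hold 11-field header records
#   [seq_id, DocumentId, AccesssionNumber, FundName, CompanyName, SecurityId,
#    MeetingDate, MeetingType, ISIN, Ticker, AgendaNumber]
# and odd positions hold table blocks whose first element is the matching
# seq_id (an int) followed by tab-separated proposal rows of the form
#   "ProposalNumber\tProposal\tProposedBy\tForAgainstManagement\tVoteCast".
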
def post_process(accession_no, funds_list):
    logger = Utils.add_logger()
    try:
        seq_id = 1
        header_columns = ["seq_id", "DocumentId", "AccesssionNumber", "FundName",
                          "CompanyName", "SecurityId", "MeetingDate", "MeetingType",
                          "ISIN", "Ticker", "AgendaNumber"]
        table_columns = ["seq_id", "ProposalNumber", "Proposal", "ProposedBy",
                         "ForAgainstManagement", "VoteCast"]
        main_header_df = pd.DataFrame(columns=header_columns)
        table_df = pd.DataFrame(columns=table_columns)
        for i, data in enumerate(funds_list):
            if i % 2 == 0:
                # Even entries are 11-field header records.
                temp = np.asarray(data).reshape(1, 11)
                header_df = pd.DataFrame(temp, columns=header_columns)
                main_header_df = main_header_df.append(header_df)
            else:
                # Odd entries are table blocks: an int seq_id followed by
                # tab-separated proposal rows.
                for val in data:
                    row_list = list()
                    row_list.append(seq_id)
                    if not isinstance(val, int):
                        for value in re.split(r"\t", val):
                            row_list.append(value.lstrip())
                    else:
                        seq_id = val
                    tp = np.asarray(row_list)
                    # Pad short rows with None so they still fill six columns.
                    if len(tp) < 6:
                        for x in range(len(tp), 6):
                            tp = np.append(tp, None)
                    tp = tp.reshape(1, 6)
                    temp_df = pd.DataFrame(tp, columns=table_columns)
                    table_df = table_df.append(temp_df)
    except Exception as error:
        logger.error("Error in post-processing for accession_no-{}-{}".format(
            accession_no, error))
        print("Error in post-processing for accession_no-{}-{}".format(
            accession_no, error))
        return pd.DataFrame(), pd.DataFrame()
    return main_header_df, table_df