def process(self, accession_no, doc_id):
    """Parse the tables of a format-6 filing and export them to Excel.

    Reads ``<file_path><accession_no>.dissem``, runs it through
    ``Format6parser.segparsing``, extracts every ``<table>`` with
    BeautifulSoup and converts the tables accepted by
    ``Format6APipeline.check_table`` into DataFrames via
    ``Format6parser.tabledetails``.  The contents of a rejected table are
    held back and prepended to the next accepted table before parsing.
    All resulting frames are combined and written to an ``.xlsx`` file.

    Args:
        accession_no: Accession number; used to build the input and output
            file names and included in log messages.
        doc_id: Unused here; kept for interface compatibility.

    Returns:
        ``1`` when the workbook was written, ``0`` when an exception
        escaped the pipeline (it is logged and printed, not re-raised).
    """
    logger = Utils.add_logger()
    try:
        path = CONFIG["Path"]["file_path"] + accession_no + ".dissem"
        content = Utils.read_file(path, accession_no)
        content = Format6parser.segparsing(content)
        # Insert the missing </tr> where a new row starts right after a cell.
        content = content.replace("</td>\n<tr>", "</td></tr><tr>")
        soup = BeautifulSoup(content, "html.parser")

        prev_table = None
        finaldf = pd.DataFrame()
        for table in soup.find_all('table'):
            # Reset per table so a failure below can never leave a stale
            # frame behind to be merged again on the next iteration.
            dffa = None
            try:
                if Format6APipeline.check_table(table):
                    if prev_table is not None:
                        merged = prev_table + str(table.contents)
                        # Clear the pending fragment before parsing so a
                        # parse error does not glue it onto later tables.
                        prev_table = None
                        dffa = Format6parser.tabledetails(
                            BeautifulSoup(merged, "html.parser"))
                    else:
                        dffa = Format6parser.tabledetails(table)
                else:
                    prev_table = str(table.contents)

                if dffa is None:
                    continue
                if finaldf.empty:
                    finaldf = dffa
                else:
                    # DataFrame.append was removed in pandas 2.0; pd.concat
                    # is the equivalent call.
                    finaldf = pd.concat([finaldf, dffa], sort=True)
            except Exception as error:
                # Best effort: a broken table is logged and skipped.
                logger.error(
                    "Exception in table parsing for format6 for accession_no -{} -{}"
                    .format(accession_no, error))

        # NOTE(review): the output directory key says "format6B" although the
        # tables are filtered through Format6APipeline -- confirm it is intended.
        filename = os.path.join(CONFIG['Path']['output_path_format6B'],
                                accession_no + '.xlsx')
        Utils.df_to_excel(accession_no, finaldf, filename)
        return 1
    except Exception as error:
        logger.error(
            "Exception in format6 pipeline for accession_no -{}-{}".format(
                accession_no, error))
        print("Exception in format6A pipeline-{}".format(error))
        return 0
def process(self, accession_no, doc_id):
    """Parse a format-4 filing, store the rows in the database and in Excel.

    The filing text is split into one chunk per ``FUND:`` marker; each chunk
    is turned into a DataFrame by ``Format4Parser`` and the frames are
    stacked, post-processed, renamed to the output column names and
    persisted through ``Utils.df_to_database`` and ``Utils.df_to_excel``.

    Args:
        accession_no: Accession number; used to build the input and output
            file names and stored in the ``AccesssionNumber`` column.
        doc_id: Document id stored in the ``DocumentId`` column and passed
            to ``Utils.df_to_database``.

    Returns:
        ``1`` on success, or the string ``"unsuccessfull"`` when reading /
        splitting the filing or stacking the per-fund frames fails.
        Exceptions raised by the parsers or by the post-processing after
        the loop are not caught here.
    """
    logger = Utils.add_logger()
    try:
        path = CONFIG["Path"]["file_path"] + accession_no + ".dissem"
        content = Utils.read_file(path, accession_no)
        # NOTE(review): confirm which source character is meant to be
        # replaced by U+00A0 here (plain space vs. an &nbsp; entity).
        content = content.replace(' ', '\xa0')
        soup = BeautifulSoup(content, "html.parser")
        contents = soup.text
        contents = contents.replace('SIGNATURES',
                                    '----------------------------------')
        # Plant a split marker in front of every fund header.
        contents = contents.replace('FUND:', '############### \n FUND:')
        fundlist = contents.split('###############')
    except Exception as error:
        logger.error(
            "Exception in parsing through bs4 for accession_no -{} -{}".format(
                accession_no, error))
        return "unsuccessfull"

    final_df = pd.DataFrame()
    # The first chunk is whatever precedes the first "FUND:" marker; skip it.
    for fund_text in fundlist[1:]:
        sr = fund_text.split('ISSUER')
        fName = Format4Parser.fundName(sr[0])
        df = Format4Parser.companyfundDetails(fund_text, fName)
        try:
            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # equivalent call.
            final_df = pd.concat([final_df, df])
        except Exception as error:
            logger.error(
                "Exception in pipeline for df merging for accession_no -{} -{}"
                .format(accession_no, error))
            return "unsuccessfull"

    final_df = Format4Parser.concat_rowData(final_df)
    final_df = final_df.drop(columns=['ID'])
    final_df = final_df.rename(columns={
        "ISSUER": "CompanyName",
        "MEETING DATE": "MeetingDate",
        "Proposal No": "ProposalNumber",
        "PROPOSAL": "Proposal",
        "PROPOSED BY": "ProposedBy",
        "VOTED?": "Voted",
        "MGMT": "ForAgainstManagement",
        "TICKER": "Ticker",
        "VOTE CAST": "VoteCast",
    })
    final_df["MeetingDate"] = final_df["MeetingDate"].apply(Utils.parsing_date)
    # Column name spelling is kept as-is; downstream consumers may rely on it.
    final_df["AccesssionNumber"] = accession_no
    final_df["DocumentId"] = doc_id
    Utils.df_to_database(accession_no, doc_id, final_df)
    filename = os.path.join(CONFIG["Path"]["output_path_format4"],
                            accession_no + ".xlsx")
    Utils.df_to_excel(accession_no, final_df, filename)
    return 1
def process(self, accession_no, doc_id):
    """Parse the vote tables of a format-8 filing and export them to Excel.

    Reads ``<file_path><accession_no>.dissem``, runs it through
    ``Format8Parser.segparsing``, strips runs of ``=`` and plants marker
    strings around every table (and after the "did not vote" sentence) so
    the text can be split into chunks.  Each chunk goes through
    ``Format8Parser.tableparsed`` and ``Format8Parser.formDataFrame``; the
    non-empty results are stacked, cleaned with
    ``Format8Parser.remove_spaces_from_df`` and written to an ``.xlsx`` file.

    Args:
        accession_no: Accession number; used to build the input and output
            file names and included in log messages.
        doc_id: Unused here; kept for interface compatibility.

    Returns:
        ``1`` when the workbook was written, ``0`` when an exception
        escaped the pipeline (it is logged and printed, not re-raised).
    """
    logger = Utils.add_logger()
    try:
        olderFundName = ''
        lstdf = []
        dfs_all = pd.DataFrame()
        path = CONFIG["Path"]["file_path"] + accession_no + ".dissem"
        content = Utils.read_file(path, accession_no)
        content = Format8Parser.segparsing(content)
        content = re.sub(r"=+", "", content)
        # Mark table starts with "@" runs and table ends with "#" runs; the
        # "#" run is the chunk separator used by the split below.
        content = content.replace("<TABLE", "@@@@@@@@@@@@@@@@@@@@@\n <TABLE")
        content = content.replace("<table", "@@@@@@@@@@@@@@@@@@@@@\n <table")
        content = content.replace("</TABLE>",
                                  "</TABLE>\n ######################")
        content = content.replace("</table>",
                                  "</table>\n ######################")
        # Insert the missing </tr> where a new row starts right after a cell.
        content = content.replace("</td>\n<tr>", "</td></tr><tr>")
        content = content.replace(
            "The Fund did not vote any proxies during this reporting period",
            "The Fund did not vote any proxies during this reporting period\n ######################"
        )
        fundVoteData = content.split('######################')

        for alist in fundVoteData:
            try:
                # The previous fund name and column list are handed to the
                # parser together with the chunk.
                df_all = Format8Parser.tableparsed(alist, olderFundName,
                                                   lstdf, accession_no)
                if df_all is None:
                    continue
                dfs_batch = Format8Parser.formDataFrame(df_all)
                if not dfs_batch.empty:
                    if dfs_all.empty:
                        dfs_all = dfs_batch
                    else:
                        # DataFrame.append was removed in pandas 2.0;
                        # pd.concat is the equivalent call.
                        dfs_all = pd.concat([dfs_all, dfs_batch],
                                            ignore_index=True, sort=False)
                # NOTE(review): these two updates are assumed to run for
                # every parsed chunk, not only for non-empty batches --
                # confirm the intended nesting.
                # NOTE(review): label 0 of the accumulated frame is read, so
                # this looks like it always yields the first fund name rather
                # than the latest -- confirm.
                if "FundName" in dfs_all.columns:
                    olderFundName = dfs_all["FundName"][0]
                lstdf = df_all.columns
            except Exception as error:
                # Best effort: a broken chunk is reported and skipped.
                logger.error(
                    "Exception in for loop of Format8pipeline.process for accessionnumber:{}-{}"
                    .format(accession_no, error))
                print(
                    "Exception in for loop of Format8pipeline.process for accessionnumber:{}-{}"
                    .format(accession_no, error))

        dfs_all = Format8Parser.remove_spaces_from_df(dfs_all)
        filename = os.path.join(CONFIG['Path']['output_path_format8'],
                                accession_no + '.xlsx')
        Utils.df_to_excel(accession_no, dfs_all, filename)
        return 1
    except Exception as error:
        logger.error(
            "Exception in Format8pipeline.process for accessionnumber:{}-{}"
            .format(accession_no, error))
        print(
            "Exception in Format8pipeline.process for accessionnumber:{}-{}"
            .format(accession_no, error))
        return 0