def process(self, accession_no, doc_id):
    """Parse one ``.dissem`` filing and persist its per-fund vote rows.

    The file ``<file_path><accession_no>.dissem`` is read through
    ``Utils.read_file``, reduced to plain text with BeautifulSoup, and cut
    into one chunk per ``FUND:`` marker.  Each chunk is handed to
    ``Format4Parser.companyfundDetails``; the resulting frames are stacked,
    passed through ``Format4Parser.concat_rowData``, renamed to the output
    column names, and written out with ``Utils.df_to_database`` and
    ``Utils.df_to_excel``.

    Args:
        accession_no: Accession number; used to build the input path, the
            output ``.xlsx`` name, and the ``AccesssionNumber`` column.
        doc_id: Document id stored in the ``DocumentId`` column and passed
            to ``Utils.df_to_database``.

    Returns:
        ``1`` on success, or the string ``"unsuccessfull"`` when reading or
        parsing the file fails, or when the per-fund frames cannot be merged.
    """
    logger = Utils.add_logger()

    try:
        path = CONFIG["Path"]["file_path"] + accession_no + ".dissem"
        content = Utils.read_file(path, accession_no)
        # NOTE(review): the search string is reproduced as it appears in the
        # file; confirm it is not meant to be the '&nbsp;' entity.
        content = content.replace(' ', '\xa0')
        soup = BeautifulSoup(content, "html.parser")
        contents = soup.text
        contents = contents.replace('SIGNATURES', '----------------------------------')
        # Plant a sentinel in front of every fund header so the text can be
        # split into one chunk per fund.
        contents = contents.replace('FUND:', '############### \n FUND:')
        fundlist = contents.split('###############')
    except Exception as error:
        logger.error("Exception in parsing through bs4 for accession_no -{} -{}".format(accession_no,error))
        return "unsuccessfull"

    # The first chunk is whatever precedes the first 'FUND:' marker, so it is
    # skipped.
    fund_frames = []
    for fund_text in fundlist[1:]:
        header_part = fund_text.split('ISSUER')[0]
        fund_name = Format4Parser.fundName(header_part)
        fund_frames.append(Format4Parser.companyfundDetails(fund_text, fund_name))

    try:
        # DataFrame.append was removed in pandas 2.0; a single concat also
        # avoids re-copying the accumulated frame once per fund.
        final_df = pd.concat(fund_frames) if fund_frames else pd.DataFrame()
    except Exception as error:
        logger.error("Exception in pipeline for df merging for accession_no -{} -{}".format(accession_no,error))
        return "unsuccessfull"

    final_df = Format4Parser.concat_rowData(final_df)
    final_df = final_df.drop(columns=['ID'])
    final_df = final_df.rename(columns={
        "ISSUER": "CompanyName",
        "MEETING DATE": "MeetingDate",
        "Proposal No": "ProposalNumber",
        "PROPOSAL": "Proposal",
        "PROPOSED BY": "ProposedBy",
        "VOTED?": "Voted",
        "MGMT": "ForAgainstManagement",
        "TICKER": "Ticker",
        "VOTE CAST": "VoteCast",
    })
    final_df["MeetingDate"] = final_df["MeetingDate"].apply(Utils.parsing_date)
    # NOTE(review): "AccesssionNumber" (three s) is kept as-is because it is a
    # runtime column name that downstream storage may depend on.
    final_df["AccesssionNumber"] = accession_no
    final_df["DocumentId"] = doc_id

    Utils.df_to_database(accession_no, doc_id, final_df)
    filename = os.path.join(CONFIG["Path"]["output_path_format4"], accession_no + ".xlsx")
    Utils.df_to_excel(accession_no, final_df, filename)
    return 1
def process(self, accession_no, doc_id):
    """Run the parsing pipeline for one filing and persist the result.

    The stages are ``Parser.preprocess`` -> ``Parser.parse_file`` ->
    ``Parser.post_process`` -> ``Parser.process_df``.  When every stage
    yields data, the ``seq_id`` column is dropped and the frame is written
    out with ``Utils.df_to_database`` and ``Utils.df_to_excel``.

    Args:
        accession_no: Accession number passed to every stage and used to
            name the output ``.xlsx`` file.
        doc_id: Document id passed to ``Parser.parse_file`` and
            ``Utils.df_to_database``.

    Returns:
        ``1`` when the frame was written, otherwise the string
        ``"Unsuccessfull"``.  Every empty-stage path returns this explicit
        value rather than falling off the end of the function.
    """
    logger = Utils.add_logger()
    failure = "Unsuccessfull"

    # The second value returned by preprocess is not used here.
    text_list, _registrant_name = Parser.preprocess(accession_no)
    # len() is kept instead of truthiness because the container types that
    # Parser returns are not visible from this method.
    if len(text_list) == 0:
        logger.debug(
            "No data found from the file for accssion_no:{}".format(
                accession_no))
        return failure

    funds_list = Parser.parse_file(accession_no, text_list, doc_id)
    if len(funds_list) == 0:
        logger.debug(
            "No data found from the file for accssion_no:{}".format(
                accession_no))
        return failure

    header_df, table_df = Parser.post_process(accession_no, funds_list)
    if header_df.empty:
        logger.debug(
            "header_df is empty for accession_no:{}".format(accession_no))
        return failure

    final_df = Parser.process_df(accession_no, header_df, table_df)
    if final_df.empty:
        logger.debug(
            "final_df is empty for accession_no:{}".format(accession_no))
        return failure

    logger.debug(
        "final_df created successfully for accession_no:{}".format(
            accession_no))
    print("final_df length:{}".format(len(final_df.index)))

    final_df = final_df.drop(columns=["seq_id"])
    Utils.df_to_database(accession_no, doc_id, final_df)
    filename = os.path.join(
        CONFIG["Path"]["output_path_format2"], accession_no + ".xlsx")
    Utils.df_to_excel(accession_no, final_df, filename)
    return 1