def headlerlistcreation(headeritem):
    logger = Utils.add_logger()
    colist = []
    headerlist = []
    # Drop bare "Management"/"Opposition" tokens so headers and values stay paired.
    headeritem = [x for x in headeritem if x not in ['Management', 'Opposition']]
    length = len(headeritem)
    try:
        for i in range(0, length + 1):
            if i % 2 == 0:
                headerlist.append(headeritem[i])
            else:
                if "Date" in headeritem[i] or "Agenda" in headeritem[i]:
                    # A header token landed in a value slot: pad with a blank
                    # value and shift the remaining items right by one.
                    headeritem.insert(i, " ")
                    colist.append(" ")
                    length += 1
                else:
                    colist.append(headeritem[i])
    except IndexError:
        # The loop deliberately overruns by one; the IndexError marks the end.
        logger.debug("colList:{}".format(colist))
    # Interleave headers and values back into one flat list.
    alist = []
    for i, j in zip(headerlist, colist):
        alist.append(i)
        alist.append(j)
    return alist
def preprocess(accession_no):
    logger = Utils.add_logger()
    Registrant_Name = ""
    try:
        path = os.path.join(CONFIG["Path"]["file_path"], accession_no + ".dissem")
        with open(path, 'r', encoding='utf-8') as file:
            s = file.read()
        raw = bs(s, "lxml")
        # Pull the registrant name out of the "NAME OF REGISTRANT: ..." line.
        result = raw.find_all(text=re.compile("NAME OF REGISTRANT"))
        for val in result:
            for line in val.split('\n'):
                if "NAME OF REGISTRANT" in line:
                    Registrant_Name = line.split(':')[1].strip()
                    logger.debug("Registrant Name for file:{} is {}".format(
                        accession_no, Registrant_Name))
        # Work on the plain text of the whole document and split it on the
        # long dashed separators that delimit each section.
        text = raw.get_text()
        result_list = re.split(r"-{20,}", text)
    except Exception as error:
        logger.error("Error in preprocess for file-{}-{}".format(
            accession_no, error))
        return list(), ""
    return result_list, Registrant_Name
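if __name__ == "__main__":
    # Illustrative sketch (not part of the pipeline) of the separator split
    # used in preprocess(); the sample text below is fabricated.
    import re
    sample = "FUND A header\n" + "-" * 30 + "\nproposal rows\n" + "-" * 30 + "\nFUND B header"
    print(re.split(r"-{20,}", sample))
    # -> ['FUND A header\n', '\nproposal rows\n', '\nFUND B header']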
def splitevenodd(headeritem, companyName):
    logger = Utils.add_logger()
    colist = []
    headerlist = []
    # Drop bare "Management"/"Opposition" tokens so headers and values stay paired.
    headeritem = [x for x in headeritem if x not in ['Management', 'Opposition']]
    length = len(headeritem)
    try:
        for i in range(0, length + 1):
            if i % 2 == 0:
                headerlist.append(headeritem[i])
            else:
                if "Date" in headeritem[i] or "Agenda" in headeritem[i]:
                    # A header token landed in a value slot: pad with a blank
                    # value and shift the remaining items right by one.
                    headeritem.insert(i, " ")
                    colist.append(" ")
                    length += 1
                else:
                    colist.append(headeritem[i])
    except IndexError:
        # The loop deliberately overruns by one; the IndexError marks the end.
        logger.debug("colList:{}".format(colist))
    # Build a one-row frame pairing each header with its value.
    dfs = pd.DataFrame([colist], columns=headerlist)
    if len(companyName) >= 1:
        dfs['CompanyName'] = companyName
    return dfs
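if __name__ == "__main__":
    # Toy illustration of the even/odd pairing that splitevenodd() performs;
    # the token list is fabricated, not from a real filing.
    tokens = ["CompanyName", "ACME CORP", "Ticker", "ACM", "MeetingDate", "01-May-2020"]
    headers, values = tokens[0::2], tokens[1::2]
    print(dict(zip(headers, values)))
    # -> {'CompanyName': 'ACME CORP', 'Ticker': 'ACM', 'MeetingDate': '01-May-2020'}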
def process(self, accession_no, doc_id):
    logger = Utils.add_logger()
    try:
        path = os.path.join(CONFIG["Path"]["file_path"], accession_no + ".dissem")
        content = Utils.read_file(path, accession_no)
        content = Format6parser.segparsing(content)
        # Repair rows whose closing </tr> was dropped in the source HTML.
        content = content.replace("</td>\n<tr>", "</td></tr><tr>")
        soup = BeautifulSoup(content, "html.parser")
        tables = soup.find_all('table')
        dffa = None
        prev_table = None
        finaldf = pd.DataFrame()
        for table in tables:
            try:
                if Format6APipeline.check_table(table):
                    if prev_table is not None:
                        # A data table was split across two <table> tags:
                        # glue the buffered fragment onto this one first.
                        prev_table = prev_table + str(table.contents)
                        dffa = Format6parser.tabledetails(
                            BeautifulSoup(prev_table, "html.parser"))
                        prev_table = None
                    else:
                        dffa = Format6parser.tabledetails(table)
                else:
                    # Not yet a complete table: buffer it for the next pass.
                    prev_table = str(table.contents)
                if dffa is not None:
                    if finaldf.empty:
                        finaldf = dffa
                    else:
                        finaldf = pd.concat([finaldf, dffa], sort=True)
                    dffa = None
            except Exception as error:
                logger.error(
                    "Exception in table parsing for format6 for accession_no -{} -{}"
                    .format(accession_no, error))
        filename = os.path.join(CONFIG['Path']['output_path_format6B'],
                                accession_no + '.xlsx')
        Utils.df_to_excel(accession_no, finaldf, filename)
        return 1
    except Exception as error:
        logger.error(
            "Exception in format6 pipeline for accession_no -{}-{}".format(
                accession_no, error))
        return 0
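if __name__ == "__main__":
    # Sketch of the fragment-gluing idea above on fabricated HTML: a header
    # table and a body table re-parsed as one. Assumes bs4 is installed.
    from bs4 import BeautifulSoup
    head = "<table><tr><td>Proposal</td><td>Vote</td></tr></table>"
    body = "<table><tr><td>Elect director</td><td>For</td></tr></table>"
    rows = "".join(str(tr)
                   for tr in BeautifulSoup(head, "html.parser").find_all("tr")
                   + BeautifulSoup(body, "html.parser").find_all("tr"))
    merged = BeautifulSoup("<table>" + rows + "</table>", "html.parser")
    print(len(merged.find_all("tr")))  # 2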
def __update_enabilities(self):
    # Enable or disable every settings widget except the two
    # regular/irregular radio buttons, which always stay active.
    Utils.set_enabled_recursively(self.__vbox_settings,
                                  self.__rdb_irregular.isChecked())
    self.__rdb_regular.setEnabled(True)
    self.__rdb_irregular.setEnabled(True)
    # For an irregular verb, each input follows its own checkbox.
    if self.__rdb_irregular.isChecked():
        self.__comb_stem_changing.setEnabled(
            self.__chk_stem_changing.isChecked())
        self.__le_special_past_particle.setEnabled(
            self.__chk_special_past_particle.isChecked())
        self.__le_special_yo_form.setEnabled(
            self.__chk_special_yo_form.isChecked())
        self.__le_special_preterite_stem.setEnabled(
            self.__chk_special_preterite_stem.isChecked())
        self.__le_special_future_stem.setEnabled(
            self.__chk_special_future_stem.isChecked())
        self.__le_special_present_subjunctive_stem.setEnabled(
            self.__chk_special_present_subjunctive_stem.isChecked())
        self.__le_special_tú_form_affirmative_imperative.setEnabled(
            self.__chk_special_tú_form_affirmative_imperative.isChecked())
def process(self, accession_no, doc_id):
    logger = Utils.add_logger()
    text_list, registrant_name = Parser.preprocess(accession_no)
    if len(text_list) > 0:
        funds_list = Parser.parse_file(accession_no, text_list, doc_id)
        if len(funds_list) > 0:
            header_df, table_df = Parser.post_process(accession_no, funds_list)
            if header_df.empty:
                logger.debug("header_df is empty for accession_no:{}".format(
                    accession_no))
            else:
                final_df = Parser.process_df(accession_no, header_df, table_df)
                if final_df.empty:
                    logger.debug("final_df is empty for accession_no:{}".format(
                        accession_no))
                else:
                    logger.debug(
                        "final_df created successfully for accession_no:{}, "
                        "length:{}".format(accession_no, len(final_df.index)))
                    # seq_id only links headers to table rows; drop it before export.
                    final_df = final_df.drop(columns=["seq_id"])
                    Utils.df_to_database(accession_no, doc_id, final_df)
                    filename = os.path.join(CONFIG["Path"]["output_path_format2"],
                                            accession_no + ".xlsx")
                    Utils.df_to_excel(accession_no, final_df, filename)
                    return 1
        else:
            logger.debug("No data found from the file for accession_no:{}".format(
                accession_no))
    return "Unsuccessfull"
def process_df(accession_no, main_header_df, table_df):
    logger = Utils.add_logger()
    try:
        table_df = table_df.astype({"seq_id": int})
        main_header_df = main_header_df.reset_index(drop=True)
        main_header_df = main_header_df.astype({"seq_id": int})
        df = table_df.dropna(subset=['Proposal'])
        df["ProposedBy"] = df["ProposedBy"].apply(Parser.replace_empty)
        df["ProposalNumber"] = df["ProposalNumber"].apply(Parser.replace_empty)
        main_header_df["MeetingDate"] = main_header_df["MeetingDate"].apply(
            Utils.parsing_date)
        df = df.reset_index(drop=True)
        # Walk the table bottom-up and fold multi-line proposal text back
        # into the first row of each proposal.
        r = len(df.index) - 1
        while r >= 0:
            temp_index = list()
            while r >= 0 and ((df.at[r, "Proposal"] != "DIRECTOR"
                               or df.at[r, "ProposalNumber"] is None)
                              and df.at[r, "ProposedBy"] is None):
                temp_index.append(r)
                r -= 1
            r -= 1
            if len(temp_index) >= 1:
                c = temp_index[-1]
                for m in reversed(temp_index):
                    df.at[c - 1, "Proposal"] += " " + df.at[m, "Proposal"]
                    df.at[m, "Proposal"] = None
        df = df.dropna(subset=["Proposal"])
        # Attach each proposal row to its header block via seq_id.
        final_df = main_header_df.merge(df, on=["seq_id"], how="inner")
    except Exception as error:
        logger.error("Error in process_df for accession_no-{}-{}".format(
            accession_no, error))
        return pd.DataFrame()
    return final_df
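if __name__ == "__main__":
    # Toy illustration of the seq_id join performed in process_df();
    # the header and table values are fabricated.
    import pandas as pd
    header = pd.DataFrame({"seq_id": [1], "CompanyName": ["ACME CORP"]})
    rows = pd.DataFrame({"seq_id": [1, 1],
                         "Proposal": ["Elect director", "Ratify auditor"]})
    print(header.merge(rows, on=["seq_id"], how="inner"))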
def process(self, accession_no, doc_id):
    logger = Utils.add_logger()
    try:
        path = os.path.join(CONFIG["Path"]["file_path"], accession_no + ".dissem")
        content = Utils.read_file(path, accession_no)
        content = content.replace('&nbsp;', '\xa0')
        soup = BeautifulSoup(content, "html.parser")
        contents = soup.text
        # Mark section boundaries so the text can be split per fund.
        contents = contents.replace('SIGNATURES', '----------------------------------')
        contents = contents.replace('FUND:', '############### \n FUND:')
        fundlist = contents.split('###############')
    except Exception as error:
        logger.error("Exception in parsing through bs4 for accession_no -{} -{}".format(
            accession_no, error))
        return "unsuccessfull"
    final_df = pd.DataFrame()
    # The first chunk precedes the first FUND: marker and holds no vote data.
    for section in fundlist[1:]:
        sr = section.split('ISSUER')
        fName = Format4Parser.fundName(sr[0])
        df = Format4Parser.companyfundDetails(section, fName)
        try:
            final_df = pd.concat([final_df, df])
        except Exception as error:
            logger.error("Exception in pipeline for df merging for accession_no -{} -{}".format(
                accession_no, error))
            return "unsuccessfull"
    final_df = Format4Parser.concat_rowData(final_df)
    final_df = final_df.drop(columns=['ID'])
    final_df.rename(columns={"ISSUER": "CompanyName",
                             "MEETING DATE": "MeetingDate",
                             "Proposal No": "ProposalNumber",
                             "PROPOSAL": "Proposal",
                             "PROPOSED BY": "ProposedBy",
                             "VOTED?": "Voted",
                             "MGMT": "ForAgainstManagement",
                             "TICKER": "Ticker",
                             "VOTE CAST": "VoteCast"}, inplace=True)
    final_df["MeetingDate"] = final_df["MeetingDate"].apply(Utils.parsing_date)
    final_df["AccesssionNumber"] = accession_no
    final_df["DocumentId"] = doc_id
    Utils.df_to_database(accession_no, doc_id, final_df)
    filename = os.path.join(CONFIG["Path"]["output_path_format4"],
                            accession_no + ".xlsx")
    Utils.df_to_excel(accession_no, final_df, filename)
    return 1
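if __name__ == "__main__":
    # Sketch of the FUND: marker split used above, on fabricated text.
    text = "preamble FUND: Alpha ISSUER ... FUND: Beta ISSUER ..."
    marked = text.replace("FUND:", "############### \n FUND:")
    chunks = marked.split("###############")
    print(len(chunks))  # 3: the preamble plus one chunk per fund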
def tabledetails(table, lstdf, accession_no):
    logger = Utils.add_logger()
    line = []
    temp = False        # becomes True once header columns are known
    length = 0
    head = 0
    tempf = 0
    fund = ''
    parseline = []
    checkdata = []
    df = pd.DataFrame()
    table_rows = table.find_all('tr')
    for tr in table_rows:
        try:
            td = tr.find_all('td')
            for i in td:
                line.append(i.text.replace('\n', '').replace('\xa0', ' '))
                checkdata.append(i.text.strip().replace('\n', '').replace('\xa0', ' '))
            line = [l.strip() for l in line if l != ' ']
            if line == [] or len(line) == line.count(''):
                line = []
                checkdata = []
                continue
            if not temp:
                temp = Format8Parser.identify_header(line, accession_no)
                head += 1
                if not temp and head == 1 and len(lstdf) != 0:
                    # No header row in this table: reuse the previous table's
                    # columns and keep the current row as data.
                    temp = True
                    parseline = line
                    if len(lstdf) == len(line):
                        line = lstdf
                    else:
                        lstdf = list(lstdf)
                        if 'Fund Name' in set(lstdf):
                            lstdf.remove('Fund Name')
                        if 'FundName' in set(lstdf):
                            lstdf.remove('FundName')
                        line = lstdf
                if temp:
                    line = [re.sub(' +', ' ', l) for l in line if l != ' ']
                    df = pd.DataFrame(columns=line)
                    col = line
                    length = len(line)
                    if parseline != [] and len(parseline) == len(line):
                        # The carried-over first row is data, not a header.
                        df = pd.concat([df, pd.DataFrame([parseline], columns=col)],
                                       ignore_index=True)
            elif length == len(line):
                df = pd.concat([df, pd.DataFrame([line], columns=col)],
                               ignore_index=True)
            elif length == len(checkdata):
                df = pd.concat([df, pd.DataFrame([checkdata], columns=col)],
                               ignore_index=True)
            line = []
            checkdata = []
        except Exception as error:
            logger.error("Exception in format8 parser for accession_no:{} -{}".format(
                accession_no, error))
    if not temp:
        df = pd.DataFrame()
    else:
        for column in df.columns:
            if column.upper().find('FUND NAME') > -1:
                tempf = 1
        if tempf != 1 and fund != '':
            df["Fund Name"] = fund
    return df
def process(self, accession_no, doc_id):
    logger = Utils.add_logger()
    try:
        olderFundName = ''
        lstdf = []
        dfs_all = pd.DataFrame()
        path = os.path.join(CONFIG["Path"]["file_path"], accession_no + ".dissem")
        content = Utils.read_file(path, accession_no)
        content = Format8Parser.segparsing(content)
        content = re.sub("=+", "", content)
        # Wrap every table in sentinel markers so the document can be split
        # into one chunk per table.
        content = content.replace("<TABLE", "@@@@@@@@@@@@@@@@@@@@@\n <TABLE")
        content = content.replace("<table", "@@@@@@@@@@@@@@@@@@@@@\n <table")
        content = content.replace("</TABLE>", "</TABLE>\n ######################")
        content = content.replace("</table>", "</table>\n ######################")
        # Repair rows whose closing </tr> was dropped in the source HTML.
        content = content.replace("</td>\n<tr>", "</td></tr><tr>")
        content = content.replace(
            "The Fund did not vote any proxies during this reporting period",
            "The Fund did not vote any proxies during this reporting period\n ######################")
        fundVoteData = content.split('######################')
        for alist in fundVoteData:
            try:
                df_all = Format8Parser.tableparsed(alist, olderFundName,
                                                   lstdf, accession_no)
                if df_all is not None:
                    dfs_batch = Format8Parser.formDataFrame(df_all)
                    if not dfs_batch.empty:
                        if dfs_all.empty:
                            dfs_all = dfs_batch
                        else:
                            dfs_all = pd.concat([dfs_all, dfs_batch],
                                                ignore_index=True, sort=False)
                        # Remember the fund name and columns so header-less
                        # continuation tables can reuse them.
                        if "FundName" in dfs_all.columns:
                            olderFundName = dfs_all["FundName"][0]
                        lstdf = df_all.columns
            except Exception as error:
                logger.error(
                    "Exception in for loop of Format8pipeline.process for accessionnumber:{}-{}"
                    .format(accession_no, error))
        if dfs_all is not None:
            dfs_all = Format8Parser.remove_spaces_from_df(dfs_all)
        filename = os.path.join(CONFIG['Path']['output_path_format8'],
                                accession_no + '.xlsx')
        Utils.df_to_excel(accession_no, dfs_all, filename)
        return 1
    except Exception as error:
        logger.error(
            "Exception in Format8pipeline.process for accessionnumber:{}-{}"
            .format(accession_no, error))
        return 0
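if __name__ == "__main__":
    # Sketch of the table-sentinel trick used in process(): wrap every
    # <table> in markers, then split into per-table chunks. HTML is made up.
    html = ("intro <table><tr><td>A</td></tr></table> middle "
            "<table><tr><td>B</td></tr></table> outro")
    html = html.replace("<table", "@@@@@@@@@@@@@@@@@@@@@\n <table")
    html = html.replace("</table>", "</table>\n ######################")
    chunks = html.split("######################")
    print(len(chunks))  # 3: two table chunks plus the trailing text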
class NN:
    def __init__(self):
        self.utils = Utils()

    @staticmethod
    def initialize_parameters(layer_dims):
        """
        Arguments:
        layer_dims -- python array (list) containing the dimensions of each layer in our network

        Returns:
        parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
                        Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
                        bl -- bias vector of shape (layer_dims[l], 1)
        """
        np.random.seed(1)
        parameters = {}
        L = len(layer_dims)  # number of layers in the network

        for l in range(1, L):
            # Scale by 1/sqrt(n_prev) so activations keep a sensible variance.
            parameters['W' + str(l)] = np.random.randn(
                layer_dims[l], layer_dims[l - 1]) / np.sqrt(layer_dims[l - 1])
            parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))

        return parameters

    def linear_forward(self, A, W, b):
        """
        Implement the linear part of a layer's forward propagation.

        Arguments:
        A -- activations from previous layer (or input data): (size of previous layer, number of examples)
        W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
        b -- bias vector, numpy array of shape (size of the current layer, 1)

        Returns:
        Z -- the input of the activation function, also called pre-activation parameter
        cache -- a python tuple containing "A", "W" and "b"; stored for computing the backward pass efficiently
        """
        Z = W.dot(A) + b
        cache = (A, W, b)
        return Z, cache

    def linear_activation_forward(self, A_prev, W, b, activation):
        """
        Implement the forward propagation for the LINEAR->ACTIVATION layer

        Arguments:
        A_prev -- activations from previous layer (or input data): (size of previous layer, number of examples)
        W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
        b -- bias vector, numpy array of shape (size of the current layer, 1)
        activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

        Returns:
        A -- the output of the activation function, also called the post-activation value
        cache -- a python tuple containing "linear_cache" and "activation_cache"; stored for computing the backward pass efficiently
        """
        if activation == "sigmoid":
            Z, linear_cache = self.linear_forward(A_prev, W, b)
            A, activation_cache = self.utils.sigmoid(Z)
        elif activation == "relu":
            Z, linear_cache = self.linear_forward(A_prev, W, b)
            A, activation_cache = self.utils.relu(Z)
        cache = (linear_cache, activation_cache)
        return A, cache

    def L_model_forward(self, X, parameters):
        """
        Implement forward propagation for the [LINEAR->RELU]*(L-1)->LINEAR->SIGMOID computation

        Arguments:
        X -- data, numpy array of shape (input size, number of examples)
        parameters -- output of initialize_parameters()

        Returns:
        AL -- last post-activation value
        caches -- list of caches containing:
                    every cache of linear_activation_forward() with "relu" (there are L-1 of them, indexed from 0 to L-2)
                    the cache of linear_activation_forward() with "sigmoid" (there is one, indexed L-1)
        """
        caches = []
        A = X
        L = len(parameters) // 2  # number of layers in the neural network

        # [LINEAR -> RELU] * (L-1); collect each layer's cache.
        for l in range(1, L):
            A_prev = A
            A, cache = self.linear_activation_forward(A_prev,
                                                      parameters['W' + str(l)],
                                                      parameters['b' + str(l)],
                                                      activation="relu")
            caches.append(cache)

        # Final LINEAR -> SIGMOID layer.
        AL, cache = self.linear_activation_forward(A,
                                                   parameters['W' + str(L)],
                                                   parameters['b' + str(L)],
                                                   activation="sigmoid")
        caches.append(cache)

        return AL, caches

    def compute_cost(self, AL, Y):
        """
        Implement the cross-entropy cost function.

        Arguments:
        AL -- probability vector corresponding to your label predictions, shape (1, number of examples)
        Y -- true "label" vector (for example: containing 0 if non-cat, 1 if cat), shape (1, number of examples)

        Returns:
        cost -- cross-entropy cost
        """
        m = Y.shape[1]

        # Compute loss from AL and Y.
        cost = (1. / m) * (-np.dot(Y, np.log(AL).T) - np.dot(1 - Y, np.log(1 - AL).T))
        cost = np.squeeze(cost)  # makes sure cost has the expected shape (e.g. turns [[17]] into 17)
        return cost

    def compute_cost_with_regularization(self, AL, Y, parameters, lambd):
        L = len(parameters) // 2
        m = Y.shape[1]

        # Compute loss from AL and Y.
        cost = (1. / m) * (-np.dot(Y, np.log(AL).T) - np.dot(1 - Y, np.log(1 - AL).T))

        # L2 regularization: add (lambd / 2m) * ||W_l||^2 for every layer,
        # including the output layer L.
        for l in range(1, L + 1):
            L2_regularization_cost = (1. / m) * (lambd / 2) * np.sum(
                np.square(parameters['W' + str(l)]))
            cost = cost + L2_regularization_cost

        cost = np.squeeze(cost)
        return cost

    def linear_backward(self, dZ, cache):
        """
        Implement the linear portion of backward propagation for a single layer (layer l)

        Arguments:
        dZ -- Gradient of the cost with respect to the linear output (of current layer l)
        cache -- tuple of values (A_prev, W, b) coming from the forward propagation in the current layer

        Returns:
        dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
        dW -- Gradient of the cost with respect to W (current layer l), same shape as W
        db -- Gradient of the cost with respect to b (current layer l), same shape as b
        """
        A_prev, W, b = cache
        m = A_prev.shape[1]

        dW = (1. / m) * np.dot(dZ, A_prev.T)
        db = (1. / m) * np.sum(dZ, axis=1, keepdims=True)
        dA_prev = np.dot(W.T, dZ)
        return dA_prev, dW, db

    def linear_backward_with_regularization(self, dZ, cache, parameters, lambd):
        A_prev, W, b = cache
        m = A_prev.shape[1]

        # Same as linear_backward, plus the L2 term (lambd / m) * W on dW.
        dW = (1. / m) * (np.dot(dZ, A_prev.T) + lambd * W)
        db = (1. / m) * np.sum(dZ, axis=1, keepdims=True)
        dA_prev = np.dot(W.T, dZ)
        return dA_prev, dW, db

    def linear_activation_backward(self, dA, cache, activation, lambd, parameters):
        """
        Implement the backward propagation for the LINEAR->ACTIVATION layer.
        Arguments:
        dA -- post-activation gradient for current layer l
        cache -- tuple of values (linear_cache, activation_cache) we store for computing backward propagation efficiently
        activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

        Returns:
        dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
        dW -- Gradient of the cost with respect to W (current layer l), same shape as W
        db -- Gradient of the cost with respect to b (current layer l), same shape as b
        """
        linear_cache, activation_cache = cache

        if activation == "relu":
            dZ = self.utils.relu_backward(dA, activation_cache)
        elif activation == "sigmoid":
            dZ = self.utils.sigmoid_backward(dA, activation_cache)

        if lambd == 0:
            dA_prev, dW, db = self.linear_backward(dZ, linear_cache)
        else:
            dA_prev, dW, db = self.linear_backward_with_regularization(
                dZ, linear_cache, parameters, lambd)

        return dA_prev, dW, db

    def L_model_backward(self, AL, Y, caches, lambd, parameters):
        """
        Implement the backward propagation for the [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID group

        Arguments:
        AL -- probability vector, output of the forward propagation (L_model_forward())
        Y -- true "label" vector (containing 0 if non-cat, 1 if cat)
        caches -- list of caches containing:
                    every cache of linear_activation_forward() with "relu" (there are (L-1) of them, indexed from 0 to L-2)
                    the cache of linear_activation_forward() with "sigmoid" (there is one, index L-1)

        Returns:
        grads -- A dictionary with the gradients
                 grads["dA" + str(l)] = ...
                 grads["dW" + str(l)] = ...
                 grads["db" + str(l)] = ...
        """
        grads = {}
        L = len(caches)  # the number of layers
        m = AL.shape[1]
        Y = Y.reshape(AL.shape)  # after this line, Y is the same shape as AL

        # Initialize the backpropagation with the derivative of the
        # cross-entropy cost with respect to AL.
        dAL = -(np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))

        # Lth layer (SIGMOID -> LINEAR) gradients.
        current_cache = caches[L - 1]
        grads["dA" + str(L)], grads["dW" + str(L)], grads["db" + str(L)] = \
            self.linear_activation_backward(dAL, current_cache,
                                            activation="sigmoid",
                                            lambd=lambd, parameters=parameters)

        for l in reversed(range(L - 1)):
            # lth layer: (RELU -> LINEAR) gradients.
            current_cache = caches[l]
            dA_prev_temp, dW_temp, db_temp = self.linear_activation_backward(
                grads["dA" + str(l + 2)], current_cache, activation="relu",
                lambd=lambd, parameters=parameters)
            grads["dA" + str(l + 1)] = dA_prev_temp
            grads["dW" + str(l + 1)] = dW_temp
            grads["db" + str(l + 1)] = db_temp

        return grads

    def update_parameters(self, parameters, grads, learning_rate):
        """
        Update parameters using gradient descent

        Arguments:
        parameters -- python dictionary containing your parameters
        grads -- python dictionary containing your gradients, output of L_model_backward

        Returns:
        parameters -- python dictionary containing your updated parameters
                      parameters["W" + str(l)] = ...
                      parameters["b" + str(l)] = ...
        """
        L = len(parameters) // 2  # number of layers in the neural network

        # Gradient-descent step for each layer's weights and biases.
        for l in range(L):
            parameters["W" + str(l + 1)] = parameters["W" + str(l + 1)] - learning_rate * grads["dW" + str(l + 1)]
            parameters["b" + str(l + 1)] = parameters["b" + str(l + 1)] - learning_rate * grads["db" + str(l + 1)]

        return parameters
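if __name__ == "__main__":
    # Minimal smoke-test sketch for the NN class on random data. It assumes
    # the Utils class above provides sigmoid/relu and their *_backward
    # counterparts with the (value, cache) convention used by the methods.
    np.random.seed(0)
    X = np.random.randn(4, 10)               # 4 features, 10 examples
    Y = (np.random.rand(1, 10) > 0.5) * 1    # fabricated binary labels
    nn = NN()
    params = NN.initialize_parameters([4, 5, 1])
    for i in range(100):
        AL, caches = nn.L_model_forward(X, params)
        cost = nn.compute_cost(AL, Y)
        grads = nn.L_model_backward(AL, Y, caches, lambd=0, parameters=params)
        params = nn.update_parameters(params, grads, learning_rate=0.1)
    print("final cost:", cost)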
def parse_file(accession_no, text_list, doc_id):
    logger = Utils.add_logger()
    try:
        funds_list = list()
        seq_id = 1  # pairs each header block with its proposal table
        prev_fundname = ""
        for index, temp in enumerate(text_list):
            split = re.split("\n", temp)
            for x, spli in enumerate(split):
                if "Proposal Vote" in spli:
                    # Proposal rows start three lines below the banner; slice
                    # each fixed-width row into its five columns. case3 and
                    # case4 are end-of-table markers defined elsewhere in
                    # this module.
                    num = x + 3
                    table_list = list()
                    table_list.append(seq_id - 1)
                    while (num < (len(split) - 2)) and (case3 not in split[num]) \
                            and (case4 not in split[num]):
                        row_text = split[num]
                        if len(row_text) > 0:
                            temp_text = row_text[:6].strip()
                            temp_text += "\t" + row_text[6:65].strip()
                            temp_text += "\t" + row_text[65:79].strip()
                            temp_text += "\t" + row_text[79:110].strip()
                            temp_text += "\t" + row_text[110:].strip()
                            table_list.append(temp_text)
                        num += 1
                    funds_list.append(table_list)
                sep = re.split(r"\s{3,}", spli)
                header_list = list()
                for i, s in enumerate(sep):
                    if "Agenda Number:" in s:
                        # The fund name is the last run of text in the chunk
                        # preceding this agenda block.
                        fund = text_list[index - 1]
                        fund_name = re.split(r'\s{2,}', fund)[-1]
                        fund_name = re.sub("\n", "", fund_name).strip()
                        if len(fund_name) > 1:
                            prev_fundname = fund_name
                        else:
                            fund_name = prev_fundname
                        agenda_number = s.split(':')[1].strip()
                        inc_name = sep[i - 1].strip()
                    elif "Security:" in s:
                        security = s.split(':')[1].strip()
                    elif "Meeting Type:" in s:
                        meeting_type = s.split(':')[1].strip()
                    elif "Meeting Date:" in s:
                        meeting_date = s.split(':')[1].strip()
                    elif "Ticker:" in s:
                        ticker = s.split(':')[1].strip()
                    elif "ISIN:" in s:
                        ISIN = s.split(':')[1].strip()
                        # ISIN is the last header field of a block, so the
                        # completed header can be flushed here.
                        header_list.append(seq_id)
                        header_list.append(doc_id)
                        header_list.append(accession_no)
                        header_list.append(fund_name)
                        header_list.append(inc_name)
                        header_list.append(security)
                        header_list.append(meeting_date)
                        header_list.append(meeting_type)
                        header_list.append(ISIN)
                        header_list.append(ticker)
                        header_list.append(agenda_number)
                        funds_list.append(header_list)
                        seq_id += 1
    except Exception as error:
        logger.error("Error in parsing the file for accession_no-{}-{}".format(
            accession_no, error))
        return list()
    return funds_list
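if __name__ == "__main__":
    # Sketch of the fixed-width slicing in parse_file() on a fabricated row;
    # the column boundaries (6/65/79/110) mirror the parser above.
    row = ("1.".ljust(6) + "ELECT DIRECTOR JANE DOE".ljust(59)
           + "Mgmt".ljust(14) + "For".ljust(31) + "For")
    fields = [row[:6], row[6:65], row[65:79], row[79:110], row[110:]]
    print([f.strip() for f in fields])
    # -> ['1.', 'ELECT DIRECTOR JANE DOE', 'Mgmt', 'For', 'For']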
def post_process(accession_no, funds_list):
    logger = Utils.add_logger()
    try:
        seq_id = 1
        header_cols = ["seq_id", "DocumentId", "AccesssionNumber", "FundName",
                       "CompanyName", "SecurityId", "MeetingDate", "MeetingType",
                       "ISIN", "Ticker", "AgendaNumber"]
        table_cols = ["seq_id", "ProposalNumber", "Proposal", "ProposedBy",
                      "ForAgainstManagement", "VoteCast"]
        main_header_df = pd.DataFrame(columns=header_cols)
        table_df = pd.DataFrame(columns=table_cols)
        # funds_list alternates header blocks (even index) and proposal
        # tables (odd index); see parse_file above.
        for i, data in enumerate(funds_list):
            if i % 2 == 0:
                temp = np.asarray(data).reshape(1, 11)
                header_df = pd.DataFrame(temp, columns=header_cols)
                main_header_df = pd.concat([main_header_df, header_df])
            else:
                for val in data:
                    if isinstance(val, int):
                        # The first element of each table block is the seq_id
                        # of the matching header row.
                        seq_id = val
                        continue
                    row_list = [seq_id] + [v.lstrip() for v in re.split(r"\t", val)]
                    # Pad short rows out to the full six columns.
                    row_list += [None] * (6 - len(row_list))
                    temp_df = pd.DataFrame([row_list], columns=table_cols)
                    table_df = pd.concat([table_df, temp_df])
    except Exception as error:
        logger.error("Error in post-processing for accession_no-{}-{}".format(
            accession_no, error))
        return pd.DataFrame(), pd.DataFrame()
    return main_header_df, table_df