def process_folder_multithread(folders):
    """Process a batch of folders sequentially over one shared MySQL connection.

    Opens a single connection for the whole batch (cheaper than one per
    folder) and guarantees it is closed even if process_folder raises.

    Args:
        folders: iterable of folder identifiers/paths accepted by
            process_folder.
    """
    connection = utils.create_mysql_connection()
    try:
        for folder in folders:
            process_folder(folder, connection)
    finally:
        # Close on error too, so worker threads never leak connections.
        connection.close()
def process_folder_multithread(annual_reports):
    """Process a batch of annual-report folders over one shared MySQL connection.

    The connection is opened once for the whole batch and closed even if
    process_folder raises mid-way.

    NOTE(review): this redefines process_folder_multithread — an identically
    named function appears earlier in this file. At import time only this
    later definition survives; confirm the earlier copy is dead code and
    remove or rename one of the two.

    Args:
        annual_reports: iterable of annual-report folder identifiers/paths
            accepted by process_folder.
    """
    connection = utils.create_mysql_connection()
    try:
        for annual_report in annual_reports:
            process_folder(annual_report, connection)
    finally:
        # Guarantee cleanup so worker threads never leak connections.
        connection.close()
def connect_to_mysql(**params):
    """Open a MySQL connection from keyword parameters.

    Expected keys: 'user', 'password', 'host', and 'code_review_db'
    (the database name). A missing key raises KeyError.

    NOTE(review): the original bound the result to a local named `cursor`,
    but create_mysql_connection returns a *connection* object — other call
    sites in this file call .cursor() on its result. Renamed for clarity;
    behavior is unchanged.

    Returns:
        The connection object produced by create_mysql_connection.
    """
    user = params['user']
    password = params['password']
    host = params['host']
    db = params['code_review_db']
    connection = create_mysql_connection(user, password, host, db)
    return connection
def compute_process(filenames, cik_2_oindices, sec_ind_lookup_table,
                    stocks_already_computed, ni_seqs_already_computed,
                    sec_ind_already_computed, cookie, crumb, error_stocks,
                    error_ni_seqs, error_sec_inds, pid):
    """Worker entry point: run compute_process_utils on every filename.

    Opens one MySQL connection (fully in-memory tables) and one WRDS
    connection for the whole batch, then delegates each filename to
    compute_process_utils. Both connections are closed even on error.

    NOTE(review): the original leaked the connection on exception and never
    closed the wrds.Connection at all; both are now released in finally.

    Args:
        filenames: iterable of report filenames to process.
        cik_2_oindices, sec_ind_lookup_table: lookup tables passed through
            to compute_process_utils.
        stocks_already_computed, ni_seqs_already_computed,
        sec_ind_already_computed: sets of already-finished work to skip.
        cookie, crumb: Yahoo-style API session tokens passed through.
        error_stocks, error_ni_seqs, error_sec_inds: shared collections the
            callee records failures into.
        pid: worker process id, for logging/partitioning in the callee.
    """
    connection = utils.create_mysql_connection(all_in_mem=True)
    try:
        db = wrds.Connection()
        try:
            for filename in filenames:
                compute_process_utils(filename, cik_2_oindices,
                                      sec_ind_lookup_table,
                                      stocks_already_computed,
                                      ni_seqs_already_computed,
                                      sec_ind_already_computed, connection,
                                      db, cookie, crumb, error_stocks,
                                      error_ni_seqs, error_sec_inds, pid)
        finally:
            # The WRDS session was previously never closed (leak).
            db.close()
    finally:
        connection.close()
def fetch_data():
    """Load all well-parsed 10-K rows and extract sections 1A, 7 and 7A.

    Queries the `10k` table for rows where the badly-parsed flag is False,
    then runs process_row once per section of interest, accumulating the
    results into three lists.

    Returns:
        Tuple (list_1a, list_7, list_7a) of processed section data.
    """
    connection = utils.create_mysql_connection(all_in_mem=False)
    list_1a, list_7, list_7a = [], [], []
    try:
        with connection.cursor() as cursor:
            sql = "SELECT * FROM `10k` WHERE {} = %s".format(
                config.KEY_BADLY_PARSED)
            # BUG FIX: (False) is just the scalar False — DB-API parameters
            # must be a sequence, i.e. the 1-tuple (False,).
            cursor.execute(sql, (False,))
            for row in cursor:
                # 1A risk factors
                list_1a += process_row(
                    row, config.ITEM_1A_AGG_METHOD,
                    config.ITEM_1A_DIFF_OFFSET_KEEP_BOTH,
                    config.ITEM_1A_LINES_TO_READ_IF_NO_NEXT_INDEX,
                    config.KEY_ITEM_1A_1, config.KEY_ITEM_1A_2,
                    config.KEY_ITEM_1A_3, config.KEY_ITEM_1B_1,
                    config.KEY_ITEM_2_1, config.KEY_ITEM_2_2
                )  # 2_2 used because section 7 is too far away
                # 7 Management's discussion
                list_7 += process_row(
                    row, config.ITEM_7_AGG_METHOD,
                    config.ITEM_7_DIFF_OFFSET_KEEP_BOTH,
                    config.ITEM_7_LINES_TO_READ_IF_NO_NEXT_INDEX,
                    config.KEY_ITEM_7_1, config.KEY_ITEM_7_2,
                    config.KEY_ITEM_7_3, config.KEY_ITEM_7A_1,
                    config.KEY_ITEM_8_1, config.KEY_ITEM_9_1)
                # 7A Quantitative and Qualitative analysis
                list_7a += process_row(
                    row, config.ITEM_7A_AGG_METHOD,
                    config.ITEM_7A_DIFF_OFFSET_KEEP_BOTH,
                    config.ITEM_7A_LINES_TO_READ_IF_NO_NEXT_INDEX,
                    config.KEY_ITEM_7A_1, config.KEY_ITEM_7A_2,
                    config.KEY_ITEM_7A_3, config.KEY_ITEM_8_1,
                    config.KEY_ITEM_9_1, config.KEY_ITEM_9_2
                )  # 9_2 used because there is no third section afterwards
    finally:
        # Close even if process_row raises part-way through the cursor.
        connection.close()
    return list_1a, list_7, list_7a
# ASCII histogram of the (already computed) normalized frequencies; stop
# printing once three consecutive bins fall below a frequency of 1.
# NOTE(review): norm_freqs and bins are defined outside this chunk — confirm
# this code really sits at module level and not inside an earlier function.
consecutive_low_freq = 0
for f, b1, b2 in zip(norm_freqs, bins[:-1], bins[1:]):
    print('{}-{}:'.format(b1, b2) + '#' * (int(f)))
    if f < 1:
        consecutive_low_freq += 1
    # Equivalent whether checked here or inside the f < 1 branch: the
    # counter only changes when f < 1, so the threshold is first reached
    # immediately after an increment.
    if consecutive_low_freq >= 3:
        break


if __name__ == "__main__":
    random.seed(config.SEED)
    connection = utils.create_mysql_connection()

    # Total number of 10-K reports in the database
    with connection.cursor() as cursor:
        sql = "SELECT COUNT(*) AS total FROM `10k`"
        cursor.execute(sql)
        results = cursor.fetchall()
        total = results[0]['total']
        print('{} 10k reports in total'.format(total))
        print()

    # If there are some badly parsed files
    print('BADLY PARSED FILES')
    with connection.cursor() as cursor:
        sql = "SELECT * FROM `10k` WHERE {} = %s".format(
            config.KEY_BADLY_PARSED)
        # BUG FIX: (True) is just the scalar True — DB-API parameters must
        # be a sequence, i.e. the 1-tuple (True,).
        cursor.execute(sql, (True,))
        results = cursor.fetchall()
# Per-section driver loop. For each SEC filing section, figure out what has
# already been computed on disk, build the CIK lookup table from WRDS/MySQL,
# then shuffle the work list for the multiprocessing step.
# NOTE(review): the final `for i in ...` body shows only one statement here —
# it very likely continues beyond this chunk; do not restructure it.
for section in sections_to_analyze:
    print(section)
    # Pickle stems already produced (filename without its 4-char ".pkl"
    # suffix), one set per output folder, so finished work can be skipped.
    stocks_already_computed = {
        f.split('/')[-1][:-4]
        for f in glob.glob("{}/*.pkl".format(config.DATA_STOCKS_FOLDER))
    }
    ni_seqs_already_computed = {
        f.split('/')[-1][:-4]
        for f in glob.glob("{}/*.pkl".format(config.DATA_NI_SEQ_FOLDER))
    }
    sec_ind_already_computed = {
        f.split('/')[-1][:-4]
        for f in glob.glob("{}/*.pkl".format(config.DATA_SEC_IND_FOLDER))
    }
    # Short-lived MySQL + WRDS connections just to build the CIK lookup.
    connection = utils.create_mysql_connection(all_in_mem=True)
    db = wrds.Connection()
    cik_2_oindices, already_computed = get_cik_lookup_table(db, connection)
    connection.close()
    # Try to fill database with new TICKERs from CIKs
    only_cik = 0
    # Work items: bare filename stems (no directory, no extension).
    filenames = [
        x[0].split('/')[-1].split('.')[0]
        for x in analyze_topics_static.load_and_clean_data(section)
    ]
    random.shuffle(filenames)  # Better balanced work for multiprocessing
    for i in range(0, len(filenames)):
        # CIK is the prefix before the separator, zero-padded to 10 digits
        # (SEC's canonical CIK width).
        company_cik = filenames[i].split(
            config.CIK_COMPANY_NAME_SEPARATOR)[0].rjust(10, '0')