示例#1
0
def process_folder_multithread(folders):
    """Process every item of ``folders`` over one shared MySQL connection.

    The connection is opened once, handed to ``process_folder`` together
    with each folder, and is always closed afterwards, even when
    processing one of the folders raises.

    Args:
        folders: Iterable of folders; each element is passed unchanged to
            ``process_folder``.
    """
    connection = utils.create_mysql_connection()
    try:
        for folder in folders:
            process_folder(folder, connection)
    finally:
        # Release the connection even if a folder fails to process.
        connection.close()
示例#2
0
def process_folder_multithread(annual_reports):
    """Process every annual report over one shared MySQL connection.

    The connection is opened once, handed to ``process_folder`` together
    with each annual report, and is always closed afterwards, even when
    processing one of the reports raises.

    Args:
        annual_reports: Iterable of annual reports; each element is passed
            unchanged to ``process_folder``.
    """
    connection = utils.create_mysql_connection()
    try:
        for annual_report in annual_reports:
            process_folder(annual_report, connection)
    finally:
        # Release the connection even if a report fails to process.
        connection.close()
def connect_to_mysql(**params):
    """Build a MySQL connection from keyword settings.

    ``params`` must contain the keys ``user``, ``password``, ``host`` and
    ``code_review_db``; a missing key raises ``KeyError``. The four values
    are forwarded positionally, in that order, to
    ``create_mysql_connection`` and its result is returned unchanged.
    """
    # Looked up in this fixed order so the first missing key is the one
    # reported.
    required_keys = ('user', 'password', 'host', 'code_review_db')
    settings = [params[key] for key in required_keys]
    return create_mysql_connection(*settings)
示例#4
0
def compute_process(filenames, cik_2_oindices, sec_ind_lookup_table,
                    stocks_already_computed, ni_seqs_already_computed,
                    sec_ind_already_computed, cookie, crumb, error_stocks,
                    error_ni_seqs, error_sec_inds, pid):
    """Run ``compute_process_utils`` on each filename with shared connections.

    One MySQL connection and one WRDS connection are opened for the whole
    batch and passed to ``compute_process_utils`` for every filename. Both
    are closed when the batch ends, including when a filename raises.

    Args:
        filenames: Iterable of filenames to process, one call each.
        cik_2_oindices, sec_ind_lookup_table, stocks_already_computed,
        ni_seqs_already_computed, sec_ind_already_computed, cookie, crumb,
        error_stocks, error_ni_seqs, error_sec_inds, pid: Forwarded
            unchanged to ``compute_process_utils``.
    """
    connection = utils.create_mysql_connection(all_in_mem=True)
    try:
        db = wrds.Connection()
        try:
            for filename in filenames:
                compute_process_utils(
                    filename, cik_2_oindices, sec_ind_lookup_table,
                    stocks_already_computed, ni_seqs_already_computed,
                    sec_ind_already_computed, connection, db, cookie,
                    crumb, error_stocks, error_ni_seqs, error_sec_inds, pid)
        finally:
            # The WRDS connection is local to this call; release it too.
            db.close()
    finally:
        # Always release the MySQL connection, even on failure.
        connection.close()
def fetch_data():
    """Extract items 1A, 7 and 7A from the rows of the `10k` table.

    Selects every row whose ``config.KEY_BADLY_PARSED`` column equals
    ``False`` and runs ``process_row`` three times per row, once per item,
    accumulating the results.

    Returns:
        A tuple ``(list_1a, list_7, list_7a)`` holding the accumulated
        ``process_row`` results for items 1A, 7 and 7A respectively.
    """
    connection = utils.create_mysql_connection(all_in_mem=False)

    list_1a, list_7, list_7a = [], [], []
    try:
        with connection.cursor() as cursor:
            # The column name comes from project config, not user input;
            # the value is bound as a parameter.
            sql = "SELECT * FROM `10k` WHERE {} = %s".format(
                config.KEY_BADLY_PARSED)
            # Parameters must be a sequence: `(False)` is just `False`.
            cursor.execute(sql, (False,))

            for row in cursor:
                # 1A risk factors
                list_1a += process_row(
                    row, config.ITEM_1A_AGG_METHOD,
                    config.ITEM_1A_DIFF_OFFSET_KEEP_BOTH,
                    config.ITEM_1A_LINES_TO_READ_IF_NO_NEXT_INDEX,
                    config.KEY_ITEM_1A_1, config.KEY_ITEM_1A_2,
                    config.KEY_ITEM_1A_3, config.KEY_ITEM_1B_1,
                    config.KEY_ITEM_2_1, config.KEY_ITEM_2_2
                )  # 2_2 because section 7 is too far
                # 7 Management's discussion
                list_7 += process_row(
                    row, config.ITEM_7_AGG_METHOD,
                    config.ITEM_7_DIFF_OFFSET_KEEP_BOTH,
                    config.ITEM_7_LINES_TO_READ_IF_NO_NEXT_INDEX,
                    config.KEY_ITEM_7_1, config.KEY_ITEM_7_2,
                    config.KEY_ITEM_7_3, config.KEY_ITEM_7A_1,
                    config.KEY_ITEM_8_1, config.KEY_ITEM_9_1)
                # 7A Quantitative and Qualitative analysis
                list_7a += process_row(
                    row, config.ITEM_7A_AGG_METHOD,
                    config.ITEM_7A_DIFF_OFFSET_KEEP_BOTH,
                    config.ITEM_7A_LINES_TO_READ_IF_NO_NEXT_INDEX,
                    config.KEY_ITEM_7A_1, config.KEY_ITEM_7A_2,
                    config.KEY_ITEM_7A_3, config.KEY_ITEM_8_1,
                    config.KEY_ITEM_9_1, config.KEY_ITEM_9_2
                )  # 9_2 because there is no third section afterwards
    finally:
        # Close the connection even if the query or a row fails.
        connection.close()

    return list_1a, list_7, list_7a
示例#6
0
    consecutive_low_freq = 0
    for f, b1, b2 in zip(norm_freqs, bins[:-1], bins[1:]):
        print('{}-{}:'.format(b1, b2) + '#'*(int(f)))

        if f < 1:
            consecutive_low_freq += 1

        if consecutive_low_freq >= 3:
            break


if __name__ == "__main__":
    # Seed the RNG from project config.
    random.seed(config.SEED)

    connection = utils.create_mysql_connection()
    # Total number of rows in the `10k` table
    with connection.cursor() as cursor:
        sql = "SELECT COUNT(*) AS total FROM `10k`"
        cursor.execute(sql)
        results = cursor.fetchall()
    # Rows are accessed by column name, so the single result row exposes
    # the count under the `total` alias.
    total = results[0]['total']
    print('{} 10k reports in total'.format(total))
    print()

    # If there are some badly parsed files
    print('BADLY PARSED FILES')
    with connection.cursor() as cursor:
        sql = "SELECT * FROM `10k` WHERE {} = %s".format(config.KEY_BADLY_PARSED)
        # Parameters must be a sequence: `(True)` is just `True`.
        cursor.execute(sql, (True,))
        results = cursor.fetchall()
示例#7
0
    for section in sections_to_analyze:
        print(section)
        stocks_already_computed = {
            f.split('/')[-1][:-4]
            for f in glob.glob("{}/*.pkl".format(config.DATA_STOCKS_FOLDER))
        }
        ni_seqs_already_computed = {
            f.split('/')[-1][:-4]
            for f in glob.glob("{}/*.pkl".format(config.DATA_NI_SEQ_FOLDER))
        }
        sec_ind_already_computed = {
            f.split('/')[-1][:-4]
            for f in glob.glob("{}/*.pkl".format(config.DATA_SEC_IND_FOLDER))
        }

        connection = utils.create_mysql_connection(all_in_mem=True)
        db = wrds.Connection()
        cik_2_oindices, already_computed = get_cik_lookup_table(db, connection)
        connection.close()

        # Try to fill database with new TICKERs from CIKs
        only_cik = 0
        filenames = [
            x[0].split('/')[-1].split('.')[0]
            for x in analyze_topics_static.load_and_clean_data(section)
        ]
        random.shuffle(filenames)  # Better balanced work for multiprocessing

        for i in range(0, len(filenames)):
            company_cik = filenames[i].split(
                config.CIK_COMPANY_NAME_SEPARATOR)[0].rjust(10, '0')