Example #1
def test_get_files_from_tar_limits_rows(mocked_crunchbase_tar,
                                        crunchbase_tarfile):
    mocked_crunchbase_tar.return_value = tarfile.open(crunchbase_tarfile.name)

    expected_result = pd.DataFrame({'id': [111], 'data': ['aaa']})
    dfs = get_files_from_tar(['test_0'], nrows=1)  # only return 1 row
    assert_frame_equal(dfs[0], expected_result, check_like=True)
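
For context, a minimal sketch of what a get_files_from_tar helper compatible with this test might look like. The crunchbase_tar opener (the function the test mocks), the export path, and the one-CSV-per-table member layout are all assumptions for illustration, not the library's actual implementation:

import tarfile

import pandas as pd


def crunchbase_tar():
    # Assumed opener (this is what mocked_crunchbase_tar patches in the tests)
    return tarfile.open('crunchbase_export.tar')  # hypothetical path


def get_files_from_tar(table_names, nrows=None):
    # Read one CSV per requested table out of the tarball; nrows caps the
    # number of rows read from each file, which is what the test above checks
    with crunchbase_tar() as tar:
        return [pd.read_csv(tar.extractfile(f'{name}.csv'), nrows=nrows)
                for name in table_names]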
Example #2
def test_get_files_from_tar(mocked_crunchbase_tar, crunchbase_tarfile):
    mocked_crunchbase_tar.return_value = tarfile.open(crunchbase_tarfile.name)

    expected_result = pd.DataFrame({'id': [111, 222], 'data': ['aaa', 'bbb']})
    dfs = get_files_from_tar(['test_0'])
    assert isinstance(dfs, list)
    assert_frame_equal(dfs[0], expected_result, check_like=True)
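
Both tests lean on a pair of pytest fixtures. A plausible sketch, assuming the fixtures build a throwaway tarball holding a test_0.csv with exactly the two rows the assertions expect; the mock.patch target is hypothetical and would need to match wherever crunchbase_tar actually lives:

import tarfile
from io import BytesIO
from tempfile import NamedTemporaryFile
from unittest import mock

import pytest


@pytest.fixture
def crunchbase_tarfile():
    # Temporary tarball containing test_0.csv with the rows the tests expect
    tmp = NamedTemporaryFile(suffix='.tar')
    csv_data = b'id,data\n111,aaa\n222,bbb\n'
    with tarfile.open(tmp.name, 'w') as tar:
        info = tarfile.TarInfo(name='test_0.csv')
        info.size = len(csv_data)
        tar.addfile(info, BytesIO(csv_data))
    yield tmp


@pytest.fixture
def mocked_crunchbase_tar():
    # Patch the tar opener so get_files_from_tar reads the fixture tar instead
    with mock.patch('crunchbase_collect.crunchbase_tar') as mocked:  # assumed target
        yield mocked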
Example #3
    def run(self):
        # database setup
        database = 'dev' if self.test else 'production'
        logging.warning(f"Using {database} database")
        self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)

        # collect file
        logging.info("Collecting org_parents from crunchbase tar")
        org_parents = get_files_from_tar(['org_parents'])[0]
        logging.info(f"{len(org_parents)} parent ids in crunchbase export")

        # collect previously processed orgs
        logging.info("Extracting previously processed organisations")
        with db_session(self.engine) as session:
            processed_orgs = session.query(Organization.id,
                                           Organization.parent_id).all()
        all_orgs = {org for (org, _) in processed_orgs}
        logging.info(f"{len(all_orgs)} total orgs in database")
        processed_orgs = {
            org
            for (org, parent_id) in processed_orgs if parent_id is not None
        }
        logging.info(f"{len(processed_orgs)} previously processed orgs")

        # reformat into a list of dicts, removing orgs that already have a parent_id
        # or are missing from the database
        org_parents = org_parents[['uuid', 'parent_uuid']]
        org_parents.columns = ['id', 'parent_id']
        org_parents = org_parents[org_parents['id'].isin(all_orgs)]
        org_parents = org_parents[~org_parents['id'].isin(processed_orgs)]
        org_parents = org_parents.to_dict(orient='records')
        logging.info(f"{len(org_parents)} organisations to update in MySQL")

        # insert parent_ids into db in batches
        for count, batch in enumerate(
                split_batches(org_parents, self.insert_batch_size), 1):
            with db_session(self.engine) as session:
                session.bulk_update_mappings(Organization, batch)
            logging.info(
                f"{count} batch{'es' if count > 1 else ''} written to db")
            if self.test and count > 1:
                logging.info("Breaking after 2 batches while in test mode")
                break

        # mark as done
        logging.warning("Task complete")
        self.output().touch()
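
split_batches, used here and in the remaining examples, is presumably a plain chunking generator. A minimal sketch compatible with how it is called (the project's real helper may differ):

def split_batches(iterable, batch_size):
    # Yield successive lists of at most batch_size items
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:  # remainder smaller than batch_size
        yield batch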
Example #4
def run():
    test = literal_eval(os.environ["BATCHPAR_test"])
    db_name = os.environ["BATCHPAR_db_name"]
    table = os.environ["BATCHPAR_table"]
    batch_size = int(os.environ["BATCHPAR_batch_size"])
    s3_path = os.environ["BATCHPAR_outinfo"]

    logging.warning(f"Processing {table} file")

    # database setup
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    try_until_allowed(Base.metadata.create_all, engine)
    table_name = f"crunchbase_{table}"
    table_class = get_class_by_tablename(Base, table_name)

    # collect file
    nrows = 1000 if test else None
    df = get_files_from_tar([table], nrows=nrows)[0]
    logging.warning(f"{len(df)} rows in file")

    # get primary key fields and set of all those already existing in the db
    pk_cols = list(table_class.__table__.primary_key.columns)
    pk_names = [pk.name for pk in pk_cols]
    with db_session(engine) as session:
        existing_rows = set(session.query(*pk_cols).all())

    # process and insert data
    processed_rows = process_non_orgs(df, existing_rows, pk_names)
    for batch in split_batches(processed_rows, batch_size):
        insert_data("BATCHPAR_config",
                    'mysqldb',
                    db_name,
                    Base,
                    table_class,
                    batch,  # insert each batch, not the full result set
                    low_memory=True)

    logging.warning(f"Marking task as done to {s3_path}")
    s3 = boto3.resource('s3')
    s3_obj = s3.Object(*parse_s3_path(s3_path))
    s3_obj.put(Body="")

    logging.warning("Batch job complete.")
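
The s3.Object(*parse_s3_path(s3_path)) call implies that parse_s3_path splits an s3://bucket/key URL into a (bucket, key) pair. A minimal sketch under that assumption:

from urllib.parse import urlparse


def parse_s3_path(s3_path):
    # 's3://my-bucket/output/done' -> ('my-bucket', 'output/done')
    parsed = urlparse(s3_path)
    return parsed.netloc, parsed.path.lstrip('/')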
Example #5
    def run(self):
        """Collect and process organizations, categories and long descriptions."""

        # database setup
        database = 'dev' if self.test else 'production'
        logging.warning(f"Using {database} database")
        self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
        try_until_allowed(Base.metadata.create_all, self.engine)

        # collect files
        nrows = 200 if self.test else None
        cat_groups, orgs, org_descriptions = get_files_from_tar(
            ['category_groups', 'organizations', 'organization_descriptions'],
            nrows=nrows)
        # process category_groups
        cat_groups = rename_uuid_columns(cat_groups)
        insert_data(self.db_config_env,
                    'mysqldb',
                    database,
                    Base,
                    CategoryGroup,
                    cat_groups.to_dict(orient='records'),
                    low_memory=True)

        # process organizations and categories
        with db_session(self.engine) as session:
            existing_orgs = session.query(Organization.id).all()
        existing_orgs = {org[0] for org in existing_orgs}

        logging.info("Summary of organisation data:")
        logging.info(f"Total number of organisations:\t {len(orgs)}")
        logging.info(
            f"Number of organisations already in the database:\t {len(existing_orgs)}"
        )
        logging.info(f"Number of category groups and text descriptions:\t"
                     f"{len(cat_groups)}, {len(org_descriptions)}")

        processed_orgs, org_cats, missing_cat_groups = process_orgs(
            orgs, existing_orgs, cat_groups, org_descriptions)
        # Insert CatGroups
        insert_data(self.db_config_env, 'mysqldb', database, Base,
                    CategoryGroup, missing_cat_groups)
        # Insert orgs in batches
        n_batches = round(len(processed_orgs) / self.insert_batch_size)
        logging.info(
            f"Inserting {n_batches} batches of size {self.insert_batch_size}")
        for i, batch in enumerate(
                split_batches(processed_orgs, self.insert_batch_size)):
            if i % 100 == 0:
                logging.info(f"Inserting batch {i} of {n_batches}")
            insert_data(self.db_config_env,
                        'mysqldb',
                        database,
                        Base,
                        Organization,
                        batch,
                        low_memory=True)

        # link table needs to be inserted via non-bulk method to enforce relationship
        logging.info("Filtering duplicates...")
        org_cats, existing_org_cats, failed_org_cats = filter_out_duplicates(
            self.db_config_env,
            'mysqldb',
            database,
            Base,
            OrganizationCategory,
            org_cats,
            low_memory=True)
        logging.info(
            f"Inserting {len(org_cats)} org categories "
            f"({len(existing_org_cats)} already existed and {len(failed_org_cats)} failed)"
        )
        with db_session(self.engine) as session:
            session.add_all(org_cats)

        # mark as done
        self.output().touch()
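
Finally, db_session, used throughout these examples, behaves like a commit-on-success context manager around a SQLAlchemy session. A hedged sketch of that common pattern (the project's real helper may add retries or session reuse):

from contextlib import contextmanager

from sqlalchemy.orm import sessionmaker


@contextmanager
def db_session(engine):
    # Yield a session; commit on success, roll back on error, always close
    session = sessionmaker(bind=engine)()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()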