Example #1
def main(config, client, check_collection, dump_collection, dupe_collection):
    scraped_update_date = scrape_updated_date()
    try:
        stored_update_date = client['data-sources'].find_one(
            {"name": "irs_exempt_organizations"})['last_updated']
        stored_update_date = datetime.strptime(str(stored_update_date),
                                               '%Y-%m-%d %H:%M:%S').date()
        if check_site_for_new_date(stored_update_date):
            logging.info('No new update detected. Exiting script...')
            return
    except (KeyError, TypeError):
        # no stored record (find_one returned None) or missing 'last_updated'
        # field; treat as a first run and proceed with the scrape
        pass
    logging.info('updating scraped update date in data-sources collection')
    client['data-sources'].update_one(
        {"name": "irs_exempt_organizations"},
        {'$set': {
            'last_updated': str(scraped_update_date)
        }})
    code_dict = config['NTEE_codes']
    df = grab_data(config, code_dict)
    logging.info('purging EIN duplicates')
    if client[dump_collection].estimated_document_count() > 0:
        df = purge_EIN_duplicates(df, client, dump_collection, dupe_collection)
    if client[check_collection].estimated_document_count() == 0:
        # No need to check for duplicates in an empty collection
        insert_services(df.to_dict('records'), client, dump_collection)
    else:
        logging.info('refreshing ngrams')
        refresh_ngrams(client, check_collection)
        found_duplicates = []
        logging.info('checking for duplicates in the services collection')
        for i in tqdm(range(len(df))):
            dc = locate_potential_duplicate(df.loc[i, 'name'],
                                            df.loc[i, 'zip'], client,
                                            check_collection)
            if dc is not False:
                if check_similarity(df.loc[i, 'name'], dc):
                    found_duplicates.append(i)
        duplicate_df = df.loc[found_duplicates].reset_index(drop=True)
        logging.info(
            f'inserting {duplicate_df.shape[0]} services dupes into the dupe collection'
        )
        if len(duplicate_df) > 0:
            insert_services(duplicate_df.to_dict('records'), client,
                            dupe_collection)
        df = df.drop(found_duplicates).reset_index(drop=True)
        logging.info(f'final df shape: {df.shape}')
        if len(df) > 0:
            insert_services(df.to_dict('records'), client, dump_collection)
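
The helpers above (`locate_potential_duplicate`, `refresh_ngrams`) are project-specific and not shown. A minimal sketch of how such a lookup could work, assuming the check collection stores a precomputed `ngrams` array field per document that `refresh_ngrams` keeps current; the field layout and trigram scheme are illustrative assumptions, not the project's actual code:

def locate_potential_duplicate(name, zip_code, client, collection):
    """Return the name of a same-ZIP document sharing character trigrams
    with `name`, or False when nothing plausible is found (sketch only)."""
    text = name.lower()
    grams = [text[i:i + 3] for i in range(len(text) - 2)]  # character trigrams
    doc = client[collection].find_one(
        {'zip': zip_code, 'ngrams': {'$in': grams}})
    return doc['name'] if doc is not None else False

A candidate found this way is only a coarse match; that is why the caller still passes it through `check_similarity` before flagging it as a duplicate.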
Example #2
    def main_scraper(self, client: MongoClient) -> None:
        """Base function for ingesting raw data, preparing it and depositing it in MongoDB

        Args:
            client (MongoClient): connection to the MongoDB instance
            scraper_config (ScraperConfig): instance of the ScraperConfig class
        """
        df = self.grab_data()
        if client[self.dump_collection].estimated_document_count() > 0:
            logging.info(
                f'purging duplicates from existing {self.source} collection')
            df = self.purge_collection_duplicates(df, client)
        if client[self.check_collection].estimated_document_count() == 0:
            # No need to check for duplicates in an empty collection
            insert_services(df.to_dict('records'), client,
                            self.dump_collection)
        else:
            logging.info('refreshing ngrams')
            refresh_ngrams(client, self.check_collection)
            found_duplicates = []
            logging.info('checking for duplicates in the services collection')
            for i in tqdm(range(len(df))):
                dc = locate_potential_duplicate(df.loc[i, 'name'],
                                                df.loc[i, 'zip'], client,
                                                self.check_collection)
                if dc is not False:
                    if check_similarity(df.loc[i, 'name'], dc):
                        found_duplicates.append(i)
            duplicate_df = df.loc[found_duplicates].reset_index(drop=True)
            if len(duplicate_df) > 0:
                logging.info(
                    f'inserting services dupes into the {self.source} dupe collection'
                )
                insert_services(duplicate_df.to_dict('records'), client,
                                self.dupe_collection)
            df = df.drop(found_duplicates).reset_index(drop=True)
            logging.info(f'final df shape: {df.shape}')
            if len(df) > 0:
                insert_services(df.to_dict('records'), client,
                                self.dump_collection)
                logging.info(
                    'updating scraped update date in data-sources collection')
                client['data_sources'].update_one(
                    {"name": self.data_source_collection_name},
                    {'$set': {
                        'last_updated':
                            datetime.strftime(datetime.now(), '%m/%d/%Y')
                    }})
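
For orientation, a hedged usage sketch of the class-based flow above; the subclass name, its constructor keywords, and the connection string are illustrative assumptions, not part of the source:

from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')  # assumed local instance
scraper = IRSScraper(  # hypothetical subclass of the base scraper class
    source='irs_exempt_organizations',
    dump_collection='services',
    check_collection='services',
    dupe_collection='services_dupes',
    data_source_collection_name='irs_exempt_organizations')
scraper.main_scraper(client)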
Example #3
def main(config, client, check_collection, dump_collection, dupe_collection):
    scraped_update_date = scrape_updated_date()
    try:
        stored_update_date = retrieve_last_scraped_date(client)
        if stored_update_date and scraped_update_date <= stored_update_date:
            logger.info('No new update detected. Exiting script...')
            return
    except KeyError:
        pass
    logger.info('updating last scraped date in data-sources collection')
    client['data-sources'].update_one({"name": "irs"}, {
        '$set': {
            'last_scraped':
            datetime.now(timezone('UTC')).replace(microsecond=0).isoformat()
        }
    },
                                      upsert=True)
    code_dict = config['NTEE_codes']
    df = grab_data(config, code_dict)
    logger.info('purging EIN duplicates')
    if client[dump_collection].estimated_document_count() > 0:
        df = purge_EIN_duplicates(df, client, dump_collection, dupe_collection)
    if client[check_collection].estimated_document_count() == 0:
        # No need to check for duplicates in an empty collection
        insert_services(df.to_dict('records'), client, dump_collection)
    else:
        logger.info('refreshing ngrams')
        refresh_ngrams(client, check_collection)
        found_duplicates = []
        logger.info('checking for duplicates in the services collection')
        for i in tqdm(range(len(df))):
            dc = locate_potential_duplicate(df.loc[i, 'name'],
                                            df.loc[i, 'zip'], client,
                                            check_collection)
            if dc is not False:
                if check_similarity(df.loc[i, 'name'], dc):
                    found_duplicates.append(i)
        duplicate_df = df.loc[found_duplicates].reset_index(drop=True)
        logger.info(
            f'inserting {duplicate_df.shape[0]} services dupes into the dupe collection'
        )
        if len(duplicate_df) > 0:
            insert_services(duplicate_df.to_dict('records'), client,
                            dupe_collection)
        df = df.drop(found_duplicates).reset_index(drop=True)
        logger.info(f'final df shape: {df.shape}')
        if len(df) > 0:
            insert_services(df.to_dict('records'), client, dump_collection)
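
The `retrieve_last_scraped_date` helper is not shown. A minimal sketch consistent with the ISO-format UTC timestamp written above; the query shape is an assumption:

from datetime import datetime

def retrieve_last_scraped_date(client):
    """Read the stored 'last_scraped' ISO timestamp back as a date.
    Raises KeyError when the document exists but lacks the field,
    which the caller's try/except handles as a first run (sketch only)."""
    doc = client['data-sources'].find_one({"name": "irs"})
    if doc is None:
        return None  # first run: nothing stored yet
    return datetime.fromisoformat(doc['last_scraped']).date()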
Example #4
def test_check_similarity_st_vs_saint():
    mock_new_service = 'st dominics legal defense fund'
    mock_existing_service = 'saint dominics legal defense fund'
    assert check_similarity(mock_new_service, mock_existing_service)
Example #5
def test_check_similarity():
    mock_new_service = 'Example new service'
    mock_existing_service = 'Example existing service'
    assert not check_similarity(mock_new_service, mock_existing_service)
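
These two tests pin down the expected behavior of `check_similarity`: abbreviation-insensitive matches ('st' vs 'saint') must pass, while genuinely different names must not. A minimal sketch that would satisfy both tests, assuming fuzzy string matching via the `thefuzz` package; the threshold and the normalization table are assumptions, not the project's implementation:

from thefuzz import fuzz  # pip install thefuzz

ABBREVIATIONS = {'st': 'saint'}  # assumed normalization table

def check_similarity(new_name, existing_name, threshold=90):
    """Return True when two service names are near-identical after
    expanding common abbreviations (sketch, not the project's code)."""
    def normalize(name):
        tokens = [ABBREVIATIONS.get(t, t) for t in name.lower().split()]
        return ' '.join(tokens)
    return fuzz.ratio(normalize(new_name), normalize(existing_name)) >= threshold

Under this sketch, Example #4 normalizes both names to 'saint dominics legal defense fund' (ratio 100), while Example #5's names score well below the threshold.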
Example #6
    def main_scraper(self, client: MongoClient) -> None:
        """Base function for ingesting raw data, preparing it and depositing it in MongoDB

        Args:
            client (MongoClient): connection to the MongoDB instance
            scraper_config (ScraperConfig): instance of the ScraperConfig class
        """
        if not self.is_new_data_available(client):
            logger.info('No new data. Goodbye...')
            return

        df = self.grab_data()

        if client[self.dump_collection].estimated_document_count() > 0:
            logger.info(
                f'purging duplicates from existing {self.source} collection')
            df = self.purge_collection_duplicates(df, client)

        if self.groupby_columns is not None:
            df = self.aggregate_service_summary(df)

        if client[self.check_collection].estimated_document_count() == 0:
            # No need to check for duplicates in an empty collection
            insert_services(df.to_dict('records'), client,
                            self.dump_collection)
        else:
            logger.info('refreshing ngrams')
            refresh_ngrams(client, self.check_collection)
            found_duplicates = []
            logger.info('checking for duplicates in the services collection')
            for i in tqdm(range(len(df))):
                dc = locate_potential_duplicate(df.loc[i, 'name'],
                                                df.loc[i, 'zip'], client,
                                                self.check_collection)
                if dc is not False:
                    if check_similarity(df.loc[i, 'name'], dc):
                        found_duplicates.append(i)
            duplicate_df = df.loc[found_duplicates].reset_index(drop=True)
            if len(duplicate_df) > 0:
                logger.info(
                    f'inserting services dupes into the {self.source} dupe collection'
                )
                insert_services(duplicate_df.to_dict('records'), client,
                                self.dupe_collection)
            df = df.drop(found_duplicates).reset_index(drop=True)
            logger.info(f'final df shape: {df.shape}')
            self.add_required_fields(df)
            if len(df) > 0:
                insert_services(df.to_dict('records'), client,
                                self.dump_collection)
                logger.info(
                    'updating last scraped date in data-sources collection')
                client['data-sources'].update_one(
                    {"name": self.data_source_collection_name},
                    {'$set': {
                        'last_scraped':
                            datetime.now(timezone('UTC')).replace(
                                microsecond=0).isoformat()
                    }},
                    upsert=True)
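
Example #6 gates the whole run on `self.is_new_data_available(client)`, which is not shown. A plausible sketch consistent with the 'last_scraped' timestamp written at the end of the run; the `scrape_updated_date` per-source hook is an assumption for illustration:

from datetime import datetime

def is_new_data_available(self, client) -> bool:
    """Sketch: True when the source publishes a date newer than our last scrape."""
    doc = client['data-sources'].find_one(
        {"name": self.data_source_collection_name})
    if doc is None or 'last_scraped' not in doc:
        return True  # never scraped before, so any data counts as new
    last_scraped = datetime.fromisoformat(doc['last_scraped']).date()
    return self.scrape_updated_date() > last_scraped  # assumed per-source hook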