def main(config, client, check_collection, dump_collection, dupe_collection):
    """Ingest IRS exempt-organization data, dedupe it and store it in MongoDB.

    Args:
        config: parsed configuration; must contain an 'NTEE_codes' mapping.
        client: MongoDB client/database handle, indexable by collection name.
        check_collection: collection scanned for near-duplicate services.
        dump_collection: collection receiving new, non-duplicate services.
        dupe_collection: collection receiving detected duplicates.
    """
    scraped_update_date = scrape_updated_date()
    try:
        stored_update_date = client['data-sources'].find_one(
            {"name": "irs_exempt_organizations"})['last_updated']
        stored_update_date = datetime.strptime(
            str(stored_update_date), '%Y-%m-%d %H:%M:%S').date()
        if check_site_for_new_date(stored_update_date):
            logging.info('No new update detected. Exiting script...')
            return
    except (KeyError, TypeError):
        # KeyError: the stored document lacks a 'last_updated' field.
        # TypeError: find_one() returned None (no stored record) and
        # subscripting None raises TypeError, which the original bare
        # `except KeyError` silently missed.
        pass
    logging.info('updating scraped update date in data-sources collection')
    # BUG FIX: the original wrote to 'data_sources' (underscore) while the
    # read above uses 'data-sources' (hyphen), so the stored date was never
    # refreshed in the collection it is read from. Use the hyphenated name
    # consistently with the find_one() call above.
    client['data-sources'].update_one(
        {"name": "irs_exempt_organizations"},
        {'$set': {
            'last_updated': str(scraped_update_date)
        }})
    code_dict = config['NTEE_codes']
    df = grab_data(config, code_dict)
    logging.info('purging EIN duplicates')
    if client[dump_collection].estimated_document_count() > 0:
        df = purge_EIN_duplicates(df, client, dump_collection, dupe_collection)
    if client[check_collection].estimated_document_count() == 0:
        # No need to check for duplicates in an empty collection
        insert_services(df.to_dict('records'), client, dump_collection)
    else:
        logging.info('refreshing ngrams')
        refresh_ngrams(client, check_collection)
        found_duplicates = []
        logging.info('checking for duplicates in the services collection')
        for i in tqdm(range(len(df))):
            dc = locate_potential_duplicate(df.loc[i, 'name'],
                                            df.loc[i, 'zip'], client,
                                            check_collection)
            if dc is not False:
                if check_similarity(df.loc[i, 'name'], dc):
                    found_duplicates.append(i)
        duplicate_df = df.loc[found_duplicates].reset_index(drop=True)
        logging.info(
            f'inserting {duplicate_df.shape[0]} services dupes into the dupe collection'
        )
        if len(duplicate_df) > 0:
            insert_services(duplicate_df.to_dict('records'), client,
                            dupe_collection)
        df = df.drop(found_duplicates).reset_index(drop=True)
        logging.info(f'final df shape: {df.shape}')
        if len(df) > 0:
            insert_services(df.to_dict('records'), client, dump_collection)
def main_scraper(self, client: MongoClient) -> None:
    """Ingest this scraper's raw data, dedupe it and deposit it in MongoDB.

    Args:
        client (MongoClient): connection to the MongoDB instance
    """
    df = self.grab_data()
    # Only bother purging when the dump collection already holds documents.
    if client[self.dump_collection].estimated_document_count() > 0:
        logging.info(
            f'purging duplicates from existing {self.source} collection')
        df = self.purge_collection_duplicates(df, client)
    if client[self.check_collection].estimated_document_count() == 0:
        # No need to check for duplicates in an empty collection
        insert_services(df.to_dict('records'), client, self.dump_collection)
    else:
        logging.info('refreshing ngrams')
        refresh_ngrams(client, self.check_collection)
        logging.info('checking for duplicates in the services collection')
        dupe_rows = []
        for idx in tqdm(range(len(df))):
            candidate = locate_potential_duplicate(
                df.loc[idx, 'name'], df.loc[idx, 'zip'], client,
                self.check_collection)
            # locate_potential_duplicate signals "none found" with False.
            if candidate is not False and check_similarity(
                    df.loc[idx, 'name'], candidate):
                dupe_rows.append(idx)
        dupes = df.loc[dupe_rows].reset_index(drop=True)
        if len(dupes) > 0:
            logging.info(
                f'inserting services dupes into the {self.source} dupe collection'
            )
            insert_services(dupes.to_dict('records'), client,
                            self.dupe_collection)
        df = df.drop(dupe_rows).reset_index(drop=True)
        logging.info(f'final df shape: {df.shape}')
        if len(df) > 0:
            insert_services(df.to_dict('records'), client,
                            self.dump_collection)
    # Record when this source was last ingested.
    logging.info('updating scraped update date in data-sources collection')
    client['data_sources'].update_one(
        {"name": self.data_source_collection_name}, {
            '$set': {
                'last_updated': datetime.strftime(datetime.now(), '%m/%d/%Y')
            }
        })
def main(config, client, check_collection, dump_collection, dupe_collection):
    """Ingest IRS exempt-organization data, dedupe it and store it in MongoDB.

    Args:
        config: parsed configuration; must contain an 'NTEE_codes' mapping.
        client: MongoDB client/database handle, indexable by collection name.
        check_collection: collection scanned for near-duplicate services.
        dump_collection: collection receiving new, non-duplicate services.
        dupe_collection: collection receiving detected duplicates.
    """
    scraped_update_date = scrape_updated_date()
    try:
        # BUG FIX: the original called retrieve_last_scraped_date(date) where
        # `date` is undefined in this scope, raising NameError at runtime
        # (NameError is NOT caught by the `except KeyError` below). The stored
        # date lives in the data-sources collection, so the client is the
        # plausible argument — TODO(review): confirm the helper's signature.
        stored_update_date = retrieve_last_scraped_date(client)
        if stored_update_date and scraped_update_date <= stored_update_date:
            logger.info('No new update detected. Exiting script...')
            return
    except KeyError:
        pass
    logger.info('updating last scraped date in data-sources collection')
    client['data-sources'].update_one({"name": "irs"}, {
        '$set': {
            'last_scraped':
            datetime.now(timezone('UTC')).replace(microsecond=0).isoformat()
        }
    },
                                      upsert=True)
    code_dict = config['NTEE_codes']
    df = grab_data(config, code_dict)
    logger.info('purging EIN duplicates')
    if client[dump_collection].estimated_document_count() > 0:
        df = purge_EIN_duplicates(df, client, dump_collection, dupe_collection)
    if client[check_collection].estimated_document_count() == 0:
        # No need to check for duplicates in an empty collection
        insert_services(df.to_dict('records'), client, dump_collection)
    else:
        logger.info('refreshing ngrams')
        refresh_ngrams(client, check_collection)
        found_duplicates = []
        logger.info('checking for duplicates in the services collection')
        for i in tqdm(range(len(df))):
            dc = locate_potential_duplicate(df.loc[i, 'name'],
                                            df.loc[i, 'zip'], client,
                                            check_collection)
            if dc is not False:
                if check_similarity(df.loc[i, 'name'], dc):
                    found_duplicates.append(i)
        duplicate_df = df.loc[found_duplicates].reset_index(drop=True)
        logger.info(
            f'inserting {duplicate_df.shape[0]} services dupes into the dupe collection'
        )
        if len(duplicate_df) > 0:
            insert_services(duplicate_df.to_dict('records'), client,
                            dupe_collection)
        df = df.drop(found_duplicates).reset_index(drop=True)
        logger.info(f'final df shape: {df.shape}')
        if len(df) > 0:
            insert_services(df.to_dict('records'), client, dump_collection)
def test_check_similarity_st_vs_saint():
    """Names differing only by the 'st'/'saint' prefix count as similar."""
    abbreviated = 'st dominics legal defense fund'
    spelled_out = 'saint dominics legal defense fund'
    assert check_similarity(abbreviated, spelled_out)
def test_check_similarity():
    """Clearly distinct service names must not be flagged as similar."""
    incoming = 'Example new service'
    existing = 'Example existing service'
    assert not check_similarity(incoming, existing)
def main_scraper(self, client: MongoClient) -> None:
    """Ingest this scraper's raw data, dedupe it and deposit it in MongoDB.

    Args:
        client (MongoClient): connection to the MongoDB instance
    """
    # Skip the whole run when the source has published nothing new.
    if not self.is_new_data_available(client):
        logger.info('No new data. Goodbye...')
        return
    df = self.grab_data()
    # Only bother purging when the dump collection already holds documents.
    if client[self.dump_collection].estimated_document_count() > 0:
        logger.info(
            f'purging duplicates from existing {self.source} collection')
        df = self.purge_collection_duplicates(df, client)
    # Optional roll-up of rows into per-group service summaries.
    if self.groupby_columns is not None:
        df = self.aggregate_service_summary(df)
    if client[self.check_collection].estimated_document_count() == 0:
        # No need to check for duplicates in an empty collection
        insert_services(df.to_dict('records'), client, self.dump_collection)
    else:
        logger.info('refreshing ngrams')
        refresh_ngrams(client, self.check_collection)
        logger.info('checking for duplicates in the services collection')
        dupe_rows = []
        for idx in tqdm(range(len(df))):
            candidate = locate_potential_duplicate(
                df.loc[idx, 'name'], df.loc[idx, 'zip'], client,
                self.check_collection)
            # locate_potential_duplicate signals "none found" with False.
            if candidate is not False and check_similarity(
                    df.loc[idx, 'name'], candidate):
                dupe_rows.append(idx)
        dupes = df.loc[dupe_rows].reset_index(drop=True)
        if len(dupes) > 0:
            logger.info(
                f'inserting services dupes into the {self.source} dupe collection'
            )
            insert_services(dupes.to_dict('records'), client,
                            self.dupe_collection)
        df = df.drop(dupe_rows).reset_index(drop=True)
        logger.info(f'final df shape: {df.shape}')
        self.add_required_fields(df)
        if len(df) > 0:
            insert_services(df.to_dict('records'), client,
                            self.dump_collection)
    # Record when this source was last scraped, creating the doc if absent.
    logger.info('updating last scraped date in data-sources collection')
    client['data-sources'].update_one(
        {"name": self.data_source_collection_name}, {
            '$set': {
                'last_scraped':
                datetime.now(timezone('UTC')).replace(
                    microsecond=0).isoformat()
            }
        },
        upsert=True)