def strap(application):
    """
    First, this function runs the data importers and adds them
    to the database. Then, it runs the scrapers and does the same.
    """
    with application.app_context():
        # Create all the different categories for the providers.
        # We do this separately for clarity, but we don't have to -
        # they would automatically be created when importing
        # the rest of the data.
        categories = seconds(open_csv(data_dir('rad_resource.csv')))

        # We commit on every record because they have to be unique.
        for category in categories:
            add_get_or_create(db, Category, name=category)
            db.session.commit()

        # Load all the resources' data, but drop the id column
        # because our database will assign ids on its own.
        raw_resources = map(lambda row: minus_key(row, 'id'),
                            open_dict_csv(data_dir('rad_resource.csv')))

        # Then save every record.
        for row in raw_resources:
            get_or_create_resource(db, rad_record(**row))

        db.session.commit()

        # Run all the scrapers.
        run_scrapers(application)
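# The importer above leans on a few small dictionary helpers - minus_key,
# rename_key (used in get_radrecord() and the later strap() variant), and
# filter_keys - whose implementations aren't shown in this section. A
# minimal sketch of what they might look like, assuming they return new
# dicts rather than mutating their arguments; the real helpers may differ:

def minus_key(d, key):
    """Return a copy of d without the given key, if present."""
    return {k: v for k, v in d.items() if k != key}

def rename_key(d, old_key, new_key):
    """Return a copy of d with old_key renamed to new_key, if present."""
    result = dict(d)
    if old_key in result:
        result[new_key] = result.pop(old_key)
    return result

def filter_keys(d, allowed_keys):
    """Return a copy of d containing only the allowed keys."""
    return {k: v for k, v in d.items() if k in allowed_keys}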
def get_radrecord(d, resource_fields):
    """
    Gets the equivalent RadRecord from the provided dictionary.
    Will perform category and population normalization.

    Args:
        d: The source dictionary.
        resource_fields: The list of fields that are recognized
            by the RadRecord.

    Returns:
        The equivalent RadRecord.
    """
    # Perform normalization - map "category" to "category_name"
    # and filter out unrecognized fields afterwards.
    filtered_dict = rename_key(d, 'category', 'category_name')
    filtered_dict = filter_keys(filtered_dict, resource_fields)

    # Now create a RadRecord from the dict and normalize
    # the categories/populations.
    return (rad_record(**filtered_dict)
            .convert_category_name()
            .convert_population_names())
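# For illustration, a CSV row dict might pass through get_radrecord() like
# this. The row values and field list below are hypothetical:

row = {'name': 'Example Clinic', 'category': 'Medical', 'id': '42'}
fields = ('name', 'category_name', 'url', 'source')
record = get_radrecord(row, fields)
# 'category' has been renamed to 'category_name' and the unrecognized
# 'id' key has been filtered out before the RadRecord is constructed.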
def process_record(self, link, text_following, url):
    category = self.get_category_from_link(link)

    if link.text and '://' in link['href']:
        # BeautifulSoup will not parse if the fragment starts
        # with an end tag, so strip any leading end tag first.
        txt_following = starts_with_end_tag.sub('', text_following)
        soup_following = BeautifulSoup(txt_following)

        return rad_record(name=a_cleanse(link.text),
                          url=link['href'],
                          description=a_cleanse("\n".join(soup_following.stripped_strings)),
                          source=self.source,
                          category_name=category)
    else:
        # TODO: Why aren't we scraping these? Do we want to?
        print("Not scraping: %s" % link)
        return None
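# process_record() assumes a module-level starts_with_end_tag pattern that
# strips a leading end tag before the fragment is handed to BeautifulSoup.
# A plausible definition - an assumption, not necessarily the scraper's
# actual pattern:

import re

# Matches a closing tag (e.g. "</p>") at the start of a fragment,
# optionally preceded by whitespace, so it can be stripped before parsing.
starts_with_end_tag = re.compile(r'^\s*</[a-zA-Z][^>]*>')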
def get_radrecords(file_path):
    """
    Opens a CSV file and returns the equivalent RadRecords.

    Args:
        file_path: The path to the CSV file.

    Returns:
        The RadRecords in the file.
    """
    # Create a new RadRecord so we can get the field names.
    dummy_record = rad_record(name='Ministry of Silly Walks')
    resource_fields = dummy_record._fields

    # Now get resources from each row.
    return map(lambda row: get_radrecord(row, resource_fields),
               open_dict_csv(file_path))
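# A short usage sketch for get_radrecords(); the path is hypothetical.
# Note that map() is lazy in Python 3, so the result has to be iterated
# (or wrapped in list()) before the records are actually built:

records = get_radrecords('data/rad_resource.csv')
for record in records:
    print(record.name)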
def scrape(self):
    resp = requests.get(self.PROVIDERS_RAW)

    if resp.status_code == requests.codes.ok:
        data = resp.json()

        return map(lambda r: rad_record(name=r['providername'],
                                        street=r['streetaddress'],
                                        city=r['city'],
                                        country='U.S.A',
                                        zipcode=r['zip'],
                                        email=r['email'],
                                        phone=r['phone'],
                                        url=r['website'],
                                        source=self.source),
                   data)
    else:
        print('Failed to scrape {0}'.format(self.source))
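# The scraper above expects the endpoint to return a JSON array whose
# elements carry the provider fields keyed as used in the mapping. A
# sample element that the mapping would accept - the values here are
# hypothetical:

sample_provider = {
    'providername': 'Example Provider',
    'streetaddress': '123 Main St',
    'city': 'Portland',
    'zip': '97201',
    'email': 'info@example.org',
    'phone': '555-0100',
    'website': 'http://example.org',
}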
def get_radrecord(d, resource_fields):
    """
    Gets the equivalent RadRecord from the provided dictionary.
    Will perform category normalization.

    Args:
        d: The source dictionary.
        resource_fields: The list of fields that are recognized
            by the RadRecord.

    Returns:
        The equivalent RadRecord.
    """
    # Perform normalization - map "category" to "category_name"
    # and filter out unrecognized fields afterwards.
    filtered_dict = rename_key(d, 'category', 'category_name')
    filtered_dict = filter_keys(filtered_dict, resource_fields)

    # Now create a RadRecord from the dict and normalize
    # the categories.
    return rad_record(**filtered_dict).convert_category_name()
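# convert_category_name() itself isn't shown in this section. Based on the
# comment in the second strap() variant below, which mentions supporting
# multiple delimited categories in the category_name field, the idea might
# be sketched as follows. This is an assumption: the field names, the
# semicolon delimiter, and the namedtuple base are all hypothetical.

from collections import namedtuple

class RadRecord(namedtuple('RadRecord',
                           ['name', 'category_name', 'category_names'])):
    def convert_category_name(self):
        """Split a delimited category_name into a list of category names."""
        if self.category_name and ';' in self.category_name:
            names = [n.strip() for n in self.category_name.split(';')]
            return self._replace(category_name=None, category_names=names)
        return self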
def scrape(self):
    resp = requests.get(self.PROVIDERS_RAW)

    if resp.status_code == requests.codes.ok:
        data = resp.json()

        return map(lambda r: rad_record(name=r['providername'],
                                        organization=r['agencyname'],
                                        street=r['streetaddress'],
                                        city=r['city'],
                                        country='U.S.A',
                                        zipcode=r['zip'],
                                        email=r['email'],
                                        phone=r['phone'],
                                        url=r['website'],
                                        source=self.source,
                                        category_names=r['type']),
                   data)
    else:
        print('Failed to scrape {0}'.format(self.source))
def strap(application):
    """
    First, this function runs the data importers and adds them
    to the database. Then, it runs the scrapers and does the same.
    """
    with application.app_context():
        # Try a sample join to make sure that our data_dir is good,
        # and add a helpful error message when it's not.
        try:
            data_dir('rad_resource.csv')
        except:
            sys.exit('The source data directory is missing or invalid. '
                     'This is typically due to a missing RAD_DATA_BASE '
                     'environment variable.')

        # Load all the resources' data, but drop the id column because
        # our database will assign ids on its own. We also attempt to
        # rename the "category" column, if provided, to "category_name",
        # as that's consistent with the RadRecord format.
        raw_resources = map(lambda row: rename_key(minus_key(row, 'id'),
                                                   'category', 'category_name'),
                            open_dict_csv(data_dir('rad_resource.csv')))

        # Now save every record. To support multiple delimited categories
        # in the category_name field, we invoke convert_category_name()
        # on the raw rad_record generated from the data row.
        for row in raw_resources:
            get_or_create_resource(db,
                                   rad_record(**row).convert_category_name())

        db.session.commit()

        # Run all the scrapers.
        run_scrapers(application)
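# strap() ends by handing off to run_scrapers(application), whose body
# isn't shown here. A minimal sketch of the likely wiring, assuming each
# scraper exposes the scrape() method seen above (which returns records on
# success and None on failure) and that results are saved with the same
# get_or_create_resource helper; the get_scrapers() registry is hypothetical:

def run_scrapers(application):
    """Run each registered scraper and save any records it returns."""
    with application.app_context():
        for scraper in get_scrapers():
            records = scraper.scrape()
            if records:
                for record in records:
                    get_or_create_resource(db, record)
                db.session.commit()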