def strap(application):
    """
    First, this function runs the data importers and adds them to the database.
    Then, it runs the scrapers and does the same. 
    """

    with application.app_context():

        # create all the different categories for the providers
        # we do this separately for clarity but we don't have to
        # they would automatically be created when importing
        # the rest of the data
        categories = seconds(open_csv(data_dir('rad_resource.csv')))

        # we commit on every record because they have to be unique
        for c in categories:
            add_get_or_create(db, Category, name=c)
            db.session.commit()

        # load all the resources' data, but we drop the id
        # column because our database will assign them on its own
        raw_resources = (minus_key(row, 'id')
                         for row in open_dict_csv(data_dir('rad_resource.csv')))

        # then we save every record
        for row in raw_resources:
            get_or_create_resource(db, rad_record(**row))

        db.session.commit()

        # run all the scrapers
        run_scrapers(application)
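
For context, a minimal sketch of how strap() might be invoked. The
create_app factory and the module paths here are assumptions for
illustration, not part of the project:

# Hypothetical bootstrap script; create_app and the import paths
# are placeholders, not the project's actual layout.
from app import create_app
from bootstrap import strap

application = create_app()  # build the app with its db bound
strap(application)          # import the CSV data, then run the scrapers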
Example #2
def get_radrecord(d, resource_fields):
    """
    Gets the equivalent RadRecord from the
    provided dictionary. Will perform category
    and population normalization.

    Args:
        d: The source dictionary.
        resource_fields: The list of fields that are recognized
            by the RadRecord.

    Returns:
        The equivalent RadRecord.
    """
    # Perform normalization - map "category" to
    # "category_name" and filter out unrecognized fields
    # afterwards
    filtered_dict = rename_key(d, 'category', 'category_name')
    filtered_dict = filter_keys(filtered_dict, resource_fields)

    # Now create a RadRecord from the dict and normalize
    # the categories/populations.
    return (rad_record(**filtered_dict)
            .convert_category_name()
            .convert_population_names())
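
The dictionary helpers used here (rename_key and filter_keys, plus
minus_key in the other examples) are defined outside these snippets.
Plausible minimal implementations, assuming plain dicts, might look
like this; the real ones may differ:

def rename_key(d, old_key, new_key):
    # Return a copy of d with old_key renamed to new_key, if present.
    result = dict(d)
    if old_key in result:
        result[new_key] = result.pop(old_key)
    return result

def filter_keys(d, allowed_keys):
    # Return a copy of d restricted to the allowed keys.
    return {k: v for k, v in d.items() if k in allowed_keys}

def minus_key(d, key):
    # Return a copy of d without the given key.
    return {k: v for k, v in d.items() if k != key}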
Example #3
    def process_record(self, link, text_following, url):
        category = self.get_category_from_link(link)

        if link.text and '://' in link['href']:
            # BeautifulSoup will not parse if the text starts with an end tag
            txt_following = starts_with_end_tag.sub('', text_following)
            soup_following = BeautifulSoup(txt_following, 'html.parser')

            return rad_record(name=a_cleanse(link.text),
                              url=link['href'],
                              description=a_cleanse("\n".join(soup_following.stripped_strings)),
                              source=self.source,
                              category_name=category)
        else:
            # TODO: Why aren't we scraping these? Do we want to?
            print("Not scraping: %s" % link)
            return None
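
starts_with_end_tag is a module-level compiled pattern that is not
shown in this snippet. A plausible definition, assuming it strips a
single leading end tag, would be:

import re

# Matches a closing tag (with optional leading whitespace) at the
# start of the string, e.g. '</b> rest of the text'. The project's
# actual pattern may differ.
starts_with_end_tag = re.compile(r'^\s*</[^>]+>')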
Example #4
def get_radrecords(file_path):
    """
    Opens a CSV file and returns the equivalent
    RadRecords.

    Args:
        file_path: The path to the CSV file.

    Returns:
        The RadRecords in the file.
    """
    # Create a new RadRecord so we can get the field names
    dummy_record = rad_record(name="Ministry of Silly Walks")
    resource_fields = dummy_record._fields

    # Now get resources from each row
    return map(lambda row: get_radrecord(row, resource_fields), open_dict_csv(file_path))
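
Because map() is lazy in Python 3, callers that need to iterate more
than once should materialize the result. A small usage sketch (the
file path is illustrative):

records = list(get_radrecords('rad_resource.csv'))
for record in records:
    print(record.name, record.category_name)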
Example #6
    def scrape(self):

        resp = requests.get(self.PROVIDERS_RAW)

        if resp.status_code == requests.codes.ok:
            data = resp.json()

            return map(lambda r: rad_record(name=r['providername'],
                                     street=r['streetaddress'],
                                     city=r['city'],
                                     country='U.S.A',
                                     zipcode=r['zip'],
                                     email=r['email'],
                                     phone=r['phone'],
                                     url=r['website'],
                                     source=self.source),
                       data)

        else:
            print('Failed to scrape {0}'.format(self.source))
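
The shape of the JSON payload can be inferred from the keys the
lambda reads. One illustrative element (all values are placeholders,
not real data) might be:

# One illustrative element of resp.json(); the field names come from
# the lambda above, the values are made up.
sample_provider = {
    'providername': 'Example Provider',
    'streetaddress': '123 Main St',
    'city': 'Springfield',
    'zip': '00000',
    'email': 'info@example.org',
    'phone': '555-0100',
    'website': 'https://example.org',
}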
Example #7
def get_radrecord(d, resource_fields):
    """
    Gets the equivalent RadRecord from the
    provided dictionary. Will perform category normalization.

    Args:
        d: The source dictionary.
        resource_fields: The list of fields that are recognized
            by the RadRecord.

    Returns:
        The equivalent RadRecord.
    """
    # Perform normalization - map "category" to
    # "category_name" and filter out unrecognized fields
    # afterwards
    filtered_dict = rename_key(d, 'category', 'category_name')
    filtered_dict = filter_keys(filtered_dict, resource_fields)

    # Now create a RadRecord from the dict and normalize
    # the categories.
    return rad_record(**filtered_dict).convert_category_name()
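
convert_category_name() belongs to the RadRecord type and is not
shown in these examples. Since rad_record is a namedtuple (see the
_fields access in Example #4) and Example #8 also populates a
category_names field, a rough sketch of the normalization, assuming a
comma delimiter, could be:

# Rough sketch of RadRecord.convert_category_name(); the delimiter
# and field handling are assumptions, not the project's actual logic.
def convert_category_name(self):
    if self.category_name and ',' in self.category_name:
        names = [n.strip() for n in self.category_name.split(',')]
        return self._replace(category_name=None, category_names=names)
    return self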
Example #8
    def scrape(self):

        resp = requests.get(self.PROVIDERS_RAW)

        if resp.status_code == requests.codes.ok:
            data = resp.json()

            return map(
                lambda r: rad_record(name=r['providername'],
                                     organization=r['agencyname'],
                                     street=r['streetaddress'],
                                     city=r['city'],
                                     country='U.S.A',
                                     zipcode=r['zip'],
                                     email=r['email'],
                                     phone=r['phone'],
                                     url=r['website'],
                                     source=self.source,
                                     category_names=r['type']), data)

        else:
            print('Failed to scrape {0}'.format(self.source))
Example #9
def strap(application):
    """
    First, this function runs the data importers and adds them to the database.
    Then, it runs the scrapers and does the same. 
    """

    with application.app_context():

        # Try a sample join to make sure that our
        # data_dir is good and add a helpful error message
        # when it's not.
        try:
            data_dir('rad_resource.csv')
        except Exception:
            sys.exit('The source data directory is missing or invalid. '
                     'This is typically due to a missing RAD_DATA_BASE '
                     'environment variable.')

        # Load all the resources' data, but we drop the id
        # column because our database will assign them on its own.
        # We also want to attempt to rename the "category" row, if provided,
        # to "category_name", as that's consistent with the RadRecord format.
        raw_resources = (rename_key(minus_key(row, 'id'), 'category', 'category_name')
                         for row in open_dict_csv(data_dir('rad_resource.csv')))

        # Now save every record. To support multiple delimited
        # categories in the category_name field, we invoke
        # convert_category_name() on the raw rad_record
        # generated from the data row.
        for row in raw_resources:
            get_or_create_resource(db,
                                   rad_record(**row).convert_category_name())

        db.session.commit()

        # run all the scrapers
        run_scrapers(application)
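
data_dir() is the helper being probed by the try block above. A
hypothetical version that resolves paths against the RAD_DATA_BASE
environment variable (the actual implementation may differ) would be:

import os

# Hypothetical sketch of data_dir(): resolve filename against the
# RAD_DATA_BASE directory, raising if the variable is unset or the
# file is missing.
def data_dir(filename):
    base = os.environ['RAD_DATA_BASE']  # KeyError when unset
    path = os.path.join(base, filename)
    if not os.path.isfile(path):
        raise IOError('missing data file: ' + path)
    return path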