Пример #1
0
def main():
    """Store Indeed job-posting counts for zip codes not yet done this month.

    Reads every zip code from the ``ZipCode`` table, skips those that already
    have an ``Indeed`` row whose ``date_created`` month equals the current
    month, and merges one new ``Indeed`` row per remaining zip code.  Each row
    is committed as soon as it is fetched, so an interrupted run can be
    restarted and will pick up only the zip codes that are still missing.

    The module-level ``session`` is always closed on exit, including when a
    request or commit raises.
    """
    try:
        # NOTE: earlier revisions left-padded zip codes to 5 characters here
        # because leading zeros had been lost in Excel; the values are now
        # used exactly as stored in the database.
        zip_codes = [row.zip_code for row in session.query(ZipCode).all()]

        current_month = datetime.date.today().month
        # NOTE(review): only the month is compared, so rows created in the
        # same month of an earlier year also count as "already extracted" --
        # confirm this is intended.
        current_rows = session.query(Indeed).filter(
            extract('month', Indeed.date_created) == current_month).all()

        # A set keeps the membership test below O(1) per zip code instead of
        # scanning a list for every candidate.
        existing_zip_codes = set(
            row.as_dict()['zip_code'] for row in current_rows)
        remaining_zip_codes = [
            zip_code for zip_code in zip_codes
            if zip_code not in existing_zip_codes
        ]
        remaining_total = len(remaining_zip_codes)

        LOGGER.info(
            'Found {} rows for current month: {}. Extracting {} remaining zip codes'
            .format(len(current_rows), current_month, remaining_total))

        # Count from 1 so the progress message reads "1 of N" ... "N of N",
        # and log before the (slow) request so the message names the zip code
        # currently being worked on.
        for position, zip_code in enumerate(remaining_zip_codes, 1):
            LOGGER.info("Extracting zip code {} ({} of {})".format(
                zip_code, position, remaining_total))

            job_count = get_num_job_postings(zip_code)
            row = Indeed(zip_code=zip_code,
                         job_count=job_count,
                         date_created=datetime.date.today())
            session.merge(row)
            # Commit per row so finished work survives a later failure.
            session.commit()
    finally:
        session.close()
Пример #2
0
def main():
    """Fetch and persist Indeed job counts for zip codes missing this month.

    Every zip code in the ``ZipCode`` table is considered.  Zip codes that
    already have an ``Indeed`` row with a ``date_created`` in the current
    month are skipped; for each of the others the job count is fetched with
    ``get_num_job_postings`` and merged into the ``Indeed`` table, one commit
    per zip code so that a restarted run resumes where the last one stopped.

    The module-level ``session`` is closed in all cases, even when a fetch or
    commit raises.
    """
    try:
        # NOTE: an older revision padded zip codes with leading zeros (lost
        # in Excel) up to 5 characters; values are now taken as stored.
        zip_codes = [row.zip_code for row in session.query(ZipCode).all()]

        current_month = datetime.date.today().month
        # NOTE(review): the year is not part of this filter, so last year's
        # rows for the same month are treated as current -- confirm intent.
        month_filter = extract('month', Indeed.date_created) == current_month
        current_rows = session.query(Indeed).filter(month_filter).all()

        # Set lookup instead of list lookup: avoids a quadratic scan when
        # both tables are large.
        done_zip_codes = set(row.as_dict()['zip_code'] for row in current_rows)
        remaining_zip_codes = [z for z in zip_codes if z not in done_zip_codes]
        todo_count = len(remaining_zip_codes)

        LOGGER.info('Found {} rows for current month: {}. Extracting {} remaining zip codes'.format(
            len(current_rows), current_month, todo_count))

        # 1-based counter so the final message reads "N of N"; logged before
        # the request so it describes the work in progress.
        for number, zip_code in enumerate(remaining_zip_codes, 1):
            LOGGER.info("Extracting zip code {} ({} of {})".format(zip_code, number, todo_count))

            job_count = get_num_job_postings(zip_code)
            session.merge(Indeed(zip_code=zip_code, job_count=job_count, date_created=datetime.date.today()))
            # Per-row commit keeps completed zip codes if a later one fails.
            session.commit()
    finally:
        session.close()
Пример #3
0
def main():
    """Fetch building-permit responses concurrently and store the parsed rows.

    Request payloads come from ``generate_post_data((1, 12), (2005, 2015))``
    (presumably a month range and a year range -- confirm against that
    helper).  The payloads are sent through ``get_census_reponse`` on a pool
    of five worker threads, the responses are parsed with ``parse_results``,
    and every parsed record is merged as a ``BuildingPermit`` row before a
    single commit.
    """
    LOGGER.info('Extracting building permit data...')
    post_data_list = generate_post_data((1, 12), (2005, 2015))

    pool = ThreadPool(5)
    try:
        results = pool.map(get_census_reponse, post_data_list)
    finally:
        # Always shut the workers down; without this the pool's threads are
        # left running after the requests finish or fail.
        pool.close()
        pool.join()

    results_array = parse_results(results)

    for result in results_array:
        session.merge(BuildingPermit(**result))

    # One commit for the whole batch, as before.
    session.commit()
Пример #4
0
def main():
    """Download building-permit data in parallel and merge it into the database.

    ``generate_post_data((1, 12), (2005, 2015))`` supplies the request
    payloads (the two tuples look like month and year ranges -- verify in
    that helper).  Five worker threads call ``get_census_reponse`` on the
    payloads, ``parse_results`` turns the responses into keyword dictionaries,
    and each dictionary becomes a ``BuildingPermit`` row that is merged into
    the session.  All rows are committed together at the end.
    """
    LOGGER.info('Extracting building permit data...')
    post_data_list = generate_post_data((1, 12), (2005, 2015))

    worker_pool = ThreadPool(5)
    try:
        results = worker_pool.map(get_census_reponse, post_data_list)
    finally:
        # Release the worker threads whether or not a request raised;
        # the original never closed the pool.
        worker_pool.close()
        worker_pool.join()

    for record in parse_results(results):
        session.merge(BuildingPermit(**record))

    # Single commit covering every merged row.
    session.commit()
Пример #5
0
def _fetch_category_businesses(yelp_api, zip_code, category, page_size=20):
    """Collect rating data for the valid businesses of one category.

    Pages through ``yelp_api.search_query`` for ``zip_code`` / ``category``
    in steps of ``page_size`` until as many businesses have been seen as the
    response's ``total`` field reports, a page comes back empty, or the API
    raises ``YelpAPI.YelpAPIError``.

    Returns a list of dicts with the keys ``zip_code``, ``rating`` and
    ``review_count``, one per business accepted by ``is_business_valid``.
    Results gathered before an API error are still returned.
    """
    records = []
    seen = 0  # businesses examined so far, valid or not
    offset = 0
    while True:
        try:
            search_results = yelp_api.search_query(
                location=zip_code,
                category_filter=category,
                sort=0,
                limit=page_size,
                offset=offset)
        except YelpAPI.YelpAPIError as e:
            # Best effort, as before: give up on this category and keep what
            # was collected.  NOTE(review): a failure here is later
            # indistinguishable from "no businesses found".
            LOGGER.error(
                "Yelp API error for {} in zip code {}: {}".format(
                    category, zip_code, e))
            break

        total_count = search_results['total']
        businesses = search_results['businesses']
        # An empty page with a non-zero total would otherwise never advance
        # the counter and loop until the API errors out.
        if total_count == 0 or not businesses:
            break

        for business in businesses:
            seen += 1
            if is_business_valid(business, zip_code):
                LOGGER.info("{} out of {} businesses".format(
                    seen, total_count))
                records.append({
                    "zip_code": zip_code,
                    "rating": business['rating'],
                    "review_count": business["review_count"],
                })

        # Stop only once every reported business has been seen.  The old
        # counter started at 1 and therefore stopped one business early
        # (e.g. with total == 21 the 21st was never fetched).
        if seen >= total_count:
            break
        offset += page_size
    return records


def _weighted_avg_rating(records):
    """Return the review-count-weighted mean rating, or None if undefined.

    ``None`` is returned when ``records`` is empty or when the review counts
    sum to zero, which would otherwise raise ``ZeroDivisionError``.
    """
    total_review_count = sum(r['review_count'] for r in records)
    if not total_review_count:
        return None
    weighted_sum = sum(r['rating'] * r['review_count'] for r in records)
    # float() keeps true division on Python 2 even if ratings are integers.
    return weighted_sum / float(total_review_count)


def main():
    """Store one Yelp rating summary per zip code not yet done this month.

    Zip codes come from the ``ZipCode`` table; those that already have a
    ``YelpAPIDb`` row whose ``date_created`` month equals the current month
    are skipped.  For every remaining zip code the six restaurant categories
    are queried, and a single ``YelpAPIDb`` row holding the review-weighted
    average rating and the number of valid businesses is merged and
    committed.  Zip codes with no valid businesses get ``avg_rating=None``
    and ``business_count=0``.

    The row is written only after all categories have been queried; the
    per-category zero-count rows written previously were redundant with this
    final write and could mark a zip code as done when a run died midway.
    The module-level ``session`` is closed even if an error escapes.
    """
    yelp_api = YelpAPI(login_data['yelp_consumer_key'],
                       login_data['yelp_consumer_secret'],
                       login_data['yelp_token'],
                       login_data['yelp_token_secret'])

    category_list = [
        "cafes", "newamerican", "indpak", "italian", "japanese", "thai"
    ]

    try:
        zip_codes = [row.zip_code for row in session.query(ZipCode).all()]

        current_month = datetime.date.today().month
        # NOTE(review): month-only filter; rows from the same month of an
        # earlier year also count as already extracted -- confirm intent.
        current_rows = session.query(YelpAPIDb).filter(
            extract('month', YelpAPIDb.date_created) == current_month).all()
        # Set membership avoids rescanning a list for every zip code.
        existing_zip_codes = set(
            row.as_dict()['zip_code'] for row in current_rows)
        remaining_zip_codes = [
            zip_code for zip_code in zip_codes
            if zip_code not in existing_zip_codes
        ]
        remaining_total = len(remaining_zip_codes)

        # 1-based position so progress reads "1 out of N" ... "N out of N".
        for position, zip_code in enumerate(remaining_zip_codes, 1):
            zip_code_results = []
            for category in category_list:
                LOGGER.info(
                    "Extracting {} restaurants from zip code {} ({} out of {})".
                    format(category, zip_code, position, remaining_total))
                zip_code_results.extend(
                    _fetch_category_businesses(yelp_api, zip_code, category))

            row = YelpAPIDb(
                zip_code=zip_code,
                date_created=datetime.date.today(),
                avg_rating=_weighted_avg_rating(zip_code_results),
                business_count=len(zip_code_results))
            session.merge(row)
            # Commit per zip code so finished work survives a later failure.
            session.commit()
    finally:
        session.close()
Пример #6
0
def _fetch_category_businesses(yelp_api, zip_code, category, page_size=20):
    """Return rating records for the valid businesses of a single category.

    Calls ``yelp_api.search_query`` repeatedly for ``zip_code`` and
    ``category``, advancing ``offset`` by ``page_size`` each time.  Paging
    stops when the number of businesses seen reaches the ``total`` reported
    by the response, when a page has no businesses, or when
    ``YelpAPI.YelpAPIError`` is raised.

    Each returned dict has the keys ``zip_code``, ``rating`` and
    ``review_count`` and corresponds to a business for which
    ``is_business_valid`` returned a truthy value.  Records collected before
    an API error are kept.
    """
    records = []
    seen = 0  # every business examined counts, not only the valid ones
    offset = 0
    while True:
        try:
            search_results = yelp_api.search_query(location=zip_code,
                                                   category_filter=category, sort=0, limit=page_size,
                                                   offset=offset)
        except YelpAPI.YelpAPIError as e:
            # Same best-effort policy as before: abandon this category.
            # NOTE(review): the caller cannot tell this apart from an empty
            # result set.
            LOGGER.error("Yelp API error for {} in zip code {}: {}".format(category, zip_code, e))
            break

        total_count = search_results['total']
        businesses = search_results['businesses']
        # Guard against an empty page with a non-zero total, which would
        # leave the counter unchanged and keep the loop spinning.
        if total_count == 0 or not businesses:
            break

        for business in businesses:
            seen += 1
            if is_business_valid(business, zip_code):
                LOGGER.info("{} out of {} businesses".format(seen, total_count))
                records.append({"zip_code": zip_code,
                                "rating": business['rating'],
                                "review_count": business["review_count"]})

        # The previous counter started at 1, so the loop ended while one
        # business was still outstanding (total == 21 skipped the last one).
        if seen >= total_count:
            break
        offset += page_size
    return records


def _weighted_avg_rating(records):
    """Return the mean rating weighted by review count, or None.

    ``None`` is returned for an empty list and when all review counts are
    zero, instead of raising ``ZeroDivisionError``.
    """
    total_review_count = sum(r['review_count'] for r in records)
    if not total_review_count:
        return None
    weighted_sum = sum(r['rating'] * r['review_count'] for r in records)
    # Force float division so Python 2 does not truncate integer ratings.
    return weighted_sum / float(total_review_count)


def main():
    """Write a Yelp rating summary for each zip code still missing this month.

    All zip codes from the ``ZipCode`` table are candidates; any that already
    have a ``YelpAPIDb`` row with ``date_created`` in the current month are
    skipped.  For each remaining zip code the six restaurant categories are
    searched and one ``YelpAPIDb`` row is merged and committed, carrying the
    review-weighted average rating and the count of valid businesses
    (``avg_rating=None`` and ``business_count=0`` when none were found).

    Nothing is written for a zip code until all of its categories have been
    searched.  The earlier per-category zero-count writes duplicated this
    final write and could flag a zip code as processed after a crash.
    The module-level ``session`` is closed on every exit path.
    """
    yelp_api = YelpAPI(login_data['yelp_consumer_key'],
                       login_data['yelp_consumer_secret'],
                       login_data['yelp_token'],
                       login_data['yelp_token_secret'])

    category_list = ["cafes",
                     "newamerican",
                     "indpak",
                     "italian",
                     "japanese",
                     "thai"]

    try:
        zip_codes = [row.zip_code for row in session.query(ZipCode).all()]

        current_month = datetime.date.today().month
        # NOTE(review): the filter ignores the year, so a row from the same
        # month of a previous year also suppresses extraction -- confirm.
        month_filter = extract('month', YelpAPIDb.date_created) == current_month
        current_rows = session.query(YelpAPIDb).filter(month_filter).all()
        # Set lookup keeps the filtering below linear.
        done_zip_codes = set(row.as_dict()['zip_code'] for row in current_rows)
        remaining_zip_codes = [z for z in zip_codes if z not in done_zip_codes]
        todo_count = len(remaining_zip_codes)

        # Start counting at 1 so the last message reads "N out of N".
        for number, zip_code in enumerate(remaining_zip_codes, 1):
            zip_code_results = []
            for category in category_list:
                LOGGER.info("Extracting {} restaurants from zip code {} ({} out of {})".format(
                    category, zip_code, number, todo_count))
                zip_code_results.extend(_fetch_category_businesses(yelp_api, zip_code, category))

            session.merge(YelpAPIDb(zip_code=zip_code, date_created=datetime.date.today(),
                                    avg_rating=_weighted_avg_rating(zip_code_results),
                                    business_count=len(zip_code_results)))
            # One commit per zip code preserves progress across failures.
            session.commit()
    finally:
        session.close()