示例#1
0
def main():
    """Fetch Indeed job counts for every zip code not yet recorded this month.

    Loads all zip codes from ``ZipCode``, skips those that already have an
    ``Indeed`` row dated in the current month *and year*, and stores the
    result of ``get_num_job_postings`` for each remaining zip code. Each row
    is committed on its own, so an interrupted run picks up where it stopped.
    The session is closed even if extraction fails part-way.
    """
    zip_codes = [row.zip_code for row in session.query(ZipCode).all()]

    # NOTE: zip codes are used exactly as stored. An earlier revision
    # left-padded them with zeros to 5 characters to undo Excel stripping.

    today = datetime.date.today()
    current_month = today.month
    # Match on year as well as month; a month-only filter would treat rows
    # from the same month of a previous year as already extracted.
    current_rows = session.query(Indeed).filter(
        extract('month', Indeed.date_created) == current_month).filter(
            extract('year', Indeed.date_created) == today.year).all()

    # A set keeps the membership test below O(1) per zip code.
    existing_zip_codes = set(
        row.as_dict()['zip_code'] for row in current_rows)
    remaining_zip_codes = [
        zip_code for zip_code in zip_codes
        if zip_code not in existing_zip_codes
    ]

    LOGGER.info(
        'Found {} rows for current month: {}. Extracting {} remaining zip codes'
        .format(len(current_rows), current_month, len(remaining_zip_codes)))

    try:
        # Count from 1 so the progress reads "1 of N" ... "N of N".
        for i, zip_code in enumerate(remaining_zip_codes, 1):
            # Log before the fetch so a failure shows which zip code was
            # in progress.
            LOGGER.info("Extracting zip code {} ({} of {})".format(
                zip_code, i, len(remaining_zip_codes)))

            job_count = get_num_job_postings(zip_code)
            row = Indeed(zip_code=zip_code,
                         job_count=job_count,
                         date_created=datetime.date.today())
            session.merge(row)
            session.commit()
    finally:
        session.close()
示例#2
0
def main():
    """Fetch Indeed job counts for every zip code not yet recorded this month.

    Loads all zip codes from ``ZipCode``, skips those that already have an
    ``Indeed`` row dated in the current month *and year*, and stores the
    result of ``get_num_job_postings`` for each remaining zip code. Each row
    is committed on its own, so an interrupted run picks up where it stopped.
    The session is closed even if extraction fails part-way.
    """
    zip_codes = [row.zip_code for row in session.query(ZipCode).all()]

    # NOTE: zip codes are used exactly as stored. An earlier revision
    # left-padded them with zeros to 5 characters to undo Excel stripping.

    today = datetime.date.today()
    current_month = today.month
    # Match on year as well as month; a month-only filter would treat rows
    # from the same month of a previous year as already extracted.
    current_rows = session.query(Indeed).filter(
        extract('month', Indeed.date_created) == current_month).filter(
            extract('year', Indeed.date_created) == today.year).all()

    # A set keeps the membership test below O(1) per zip code.
    existing_zip_codes = set(row.as_dict()['zip_code'] for row in current_rows)
    remaining_zip_codes = [zip_code for zip_code in zip_codes if zip_code not in existing_zip_codes]
    remaining_total = len(remaining_zip_codes)

    LOGGER.info('Found {} rows for current month: {}. Extracting {} remaining zip codes'.format(
        len(current_rows), current_month, remaining_total))

    try:
        # Count from 1 so the progress reads "1 of N" ... "N of N".
        for i, zip_code in enumerate(remaining_zip_codes, 1):
            # Log before the fetch so a failure shows which zip code was
            # in progress.
            LOGGER.info("Extracting zip code {} ({} of {})".format(zip_code, i, remaining_total))

            job_count = get_num_job_postings(zip_code)
            row = Indeed(zip_code=zip_code, job_count=job_count, date_created=datetime.date.today())
            session.merge(row)
            session.commit()
    finally:
        session.close()
示例#3
0
def persist_zip_code_data(df):
    """Replace the contents of the ``ZipCode`` table with the labels in ``df``.

    Args:
        df: DataFrame containing at least the columns ``zip_code``, ``city``,
            ``metro``, ``state`` and ``county``.
    """
    label_columns = ['zip_code', 'city', 'metro', 'state', 'county']
    unique_labels = df[label_columns].drop_duplicates()

    # TODO: should append to existing data in case zillow changes something
    session.query(ZipCode).delete()

    # One ZipCode object per distinct label row; the delete and the inserts
    # are committed together.
    new_rows = []
    for record in unique_labels.to_dict('records'):
        new_rows.append(ZipCode(**record))
    session.add_all(new_rows)
    session.commit()
def main():
    """Download building permit data and upsert it into ``BuildingPermit``.

    Builds the request payloads with ``generate_post_data``, fetches them
    concurrently on 5 worker threads, parses the responses with
    ``parse_results`` and merges one ``BuildingPermit`` row per parsed
    result, committing once at the end.
    """
    LOGGER.info('Extracting building permit data...')
    # presumably (first month, last month) and (first year, last year) -- confirm
    # against generate_post_data
    post_data_list = generate_post_data((1, 12), (2005, 2015))

    pool = ThreadPool(5)
    try:
        results = pool.map(get_census_reponse, post_data_list)
    finally:
        # Always release the worker threads, including when a request raises;
        # previously the pool was never closed.
        pool.close()
        pool.join()

    results_array = parse_results(results)

    for result in results_array:
        session.merge(BuildingPermit(**result))

    session.commit()
def main():
    """Download building permit data and upsert it into ``BuildingPermit``.

    Builds the request payloads with ``generate_post_data``, fetches them
    concurrently on 5 worker threads, parses the responses with
    ``parse_results`` and merges one ``BuildingPermit`` row per parsed
    result, committing once at the end.
    """
    LOGGER.info('Extracting building permit data...')
    # presumably (first month, last month) and (first year, last year) -- confirm
    # against generate_post_data
    post_data_list = generate_post_data((1, 12), (2005, 2015))

    pool = ThreadPool(5)
    try:
        results = pool.map(get_census_reponse, post_data_list)
    finally:
        # Always release the worker threads, including when a request raises;
        # previously the pool was never closed.
        pool.close()
        pool.join()

    results_array = parse_results(results)

    for result in results_array:
        session.merge(BuildingPermit(**result))

    session.commit()
def persist_zillow_metrics(df):
    """Replace all ``ZillowMetrics`` rows with the metric columns of ``df``.

    The label columns (``city``, ``metro``, ``state``, ``county``) are
    dropped, the existing table contents are deleted and committed, and the
    remaining rows are bulk-inserted in chunks of 100000 through ``engine``.

    Args:
        df: DataFrame holding the label columns above plus the metric columns
            to store.
    """
    metrics_df = df.drop(['city', 'metro', 'state', 'county'], axis=1)
    # TODO: should append to existing data in case zillow changes something
    session.query(ZillowMetrics).delete()
    session.commit()

    insert_chunk = 100000
    total_rows = len(metrics_df)
    for index_start in range(0, total_rows, insert_chunk):
        # Clamp the end so the last chunk's progress never exceeds the total.
        index_end = min(index_start + insert_chunk, total_rows)
        LOGGER.info('Persisting Zillow Metrics rows: {} of {}'.format(index_end, total_rows))
        engine.execute(
            ZillowMetrics.__table__.insert(metrics_df[index_start:index_end].to_dict('records')))
示例#7
0
def persist_zillow_metrics(df):
    """Replace all ``ZillowMetrics`` rows with the metric columns of ``df``.

    The label columns (``city``, ``metro``, ``state``, ``county``) are
    dropped, the existing table contents are deleted and committed, and the
    remaining rows are bulk-inserted in chunks of 100000 through ``engine``.

    Args:
        df: DataFrame holding the label columns above plus the metric columns
            to store.
    """
    metrics_df = df.drop(['city', 'metro', 'state', 'county'], axis=1)
    # TODO: should append to existing data in case zillow changes something
    session.query(ZillowMetrics).delete()
    session.commit()

    insert_chunk = 100000
    total_rows = len(metrics_df)
    for index_start in range(0, total_rows, insert_chunk):
        # Clamp the end so the last chunk's progress never exceeds the total.
        index_end = min(index_start + insert_chunk, total_rows)
        LOGGER.info('Persisting Zillow Metrics rows: {} of {}'.format(
            index_end, total_rows))
        engine.execute(
            ZillowMetrics.__table__.insert(
                metrics_df[index_start:index_end].to_dict('records')))
示例#8
0
def _fetch_valid_businesses(yelp_api, zip_code, category, page_size=20):
    """Page through one Yelp category search and collect the valid businesses.

    Args:
        yelp_api: client exposing ``search_query`` (the ``YelpAPI`` instance
            built in ``main``).
        zip_code: zip code passed to Yelp as the search location.
        category: value passed as ``category_filter``.
        page_size: number of results requested per query.

    Returns:
        A list of ``{"zip_code", "rating", "review_count"}`` dicts, one for
        each business accepted by ``is_business_valid``. Results gathered
        before an API error are kept.
    """
    collected = []
    seen_count = 0
    offset = 0
    while True:
        try:
            search_results = yelp_api.search_query(
                location=zip_code,
                category_filter=category,
                sort=0,
                limit=page_size,
                offset=offset)
        except YelpAPI.YelpAPIError as e:
            LOGGER.error(
                "Yelp API error for zip code {}, category {}: {}".format(
                    zip_code, category, e))
            break

        total_count = search_results['total']
        if total_count == 0:
            break

        businesses = search_results['businesses']
        if not businesses:
            # The API reported more results than it will page through;
            # stop rather than requesting empty pages forever.
            break

        for business in businesses:
            seen_count += 1
            if is_business_valid(business, zip_code):
                print("{} out of {} businesses".format(
                    seen_count, total_count))
                collected.append({
                    "zip_code": zip_code,
                    "rating": business['rating'],
                    "review_count": business["review_count"],
                })

        # Stop only once every reported business has been seen; the old
        # counter started at 1 and could skip the final business.
        if seen_count >= total_count:
            break
        offset += page_size
    return collected


def _weighted_avg_rating(businesses):
    """Return the review-count-weighted mean rating, or None with no reviews."""
    total_review_count = sum(
        business['review_count'] for business in businesses)
    if not total_review_count:
        # No businesses, or none of them has any review: no meaningful mean.
        return None
    weighted_sum = sum(business['rating'] * business['review_count']
                       for business in businesses)
    # float() forces true division under Python 2 as well.
    return weighted_sum / float(total_review_count)


def main():
    """Store this month's Yelp rating summary for each unprocessed zip code.

    For every zip code in ``ZipCode`` that has no ``YelpAPIDb`` row dated in
    the current month and year, queries Yelp for each restaurant category,
    and merges a single row holding the review-weighted average rating and
    the number of valid businesses (``avg_rating=None`` and
    ``business_count=0`` when nothing was found). The row is written only
    after all categories have been queried, so a zip code is never marked as
    processed half-way. The session is closed even if extraction fails.
    """
    yelp_api = YelpAPI(login_data['yelp_consumer_key'],
                       login_data['yelp_consumer_secret'],
                       login_data['yelp_token'],
                       login_data['yelp_token_secret'])

    zip_codes = [row.zip_code for row in session.query(ZipCode).all()]

    today = datetime.date.today()
    # Match on year as well as month; a month-only filter would treat rows
    # from the same month of a previous year as already extracted.
    current_rows = session.query(YelpAPIDb).filter(
        extract('month', YelpAPIDb.date_created) == today.month).filter(
            extract('year', YelpAPIDb.date_created) == today.year).all()

    # A set keeps the membership test below O(1) per zip code.
    existing_zip_codes = set(
        row.as_dict()['zip_code'] for row in current_rows)
    remaining_zip_codes = [
        zip_code for zip_code in zip_codes
        if zip_code not in existing_zip_codes
    ]

    category_list = [
        "cafes", "newamerican", "indpak", "italian", "japanese", "thai"
    ]
    results_per_query_limit = 20

    try:
        # Count from 1 so the progress reads "1 out of N" ... "N out of N".
        for i, zip_code in enumerate(remaining_zip_codes, 1):
            zip_code_results = []
            for category in category_list:
                LOGGER.info(
                    "Extracting {} restaurants from zip code {} ({} out of {})".
                    format(category, zip_code, i, len(remaining_zip_codes)))
                zip_code_results.extend(
                    _fetch_valid_businesses(yelp_api, zip_code, category,
                                            results_per_query_limit))

            row = YelpAPIDb(zip_code=zip_code,
                            date_created=datetime.date.today(),
                            avg_rating=_weighted_avg_rating(zip_code_results),
                            business_count=len(zip_code_results))
            session.merge(row)
            session.commit()
    finally:
        session.close()
示例#9
0
# Source of the migration: the SQLite file yelp.db.
sqlite_db = create_engine('sqlite:///yelp.db')

# Read every row of the three source tables over a single connection.
with sqlite_db.connect() as conn:
    yelp_api_results = conn.execute('SELECT * FROM yelp_api').fetchall()
    yelp_results = conn.execute('SELECT * FROM yelp').fetchall()
    indeed_results = conn.execute('SELECT * FROM indeed').fetchall()

# Copy the yelp table column-for-column into Yelp objects.
yelp_list = [
    dict(zip_code=row.zip_code,
         date_published=row.date_published,
         num_reviews=row.num_reviews,
         review_rating=row.review_rating)
    for row in yelp_results
]

session.add_all([Yelp(**row) for row in yelp_list])
session.commit()

# Copy the yelp_api table; the source column date_extracted is stored as
# date_created on YelpAPIDb.
yelp_api_list = [
    dict(zip_code=row.zip_code,
         date_created=row.date_extracted,
         business_count=row.business_count,
         avg_rating=row.avg_rating)
    for row in yelp_api_results
]

session.add_all([YelpAPIDb(**row) for row in yelp_api_list])
session.commit()

indeed_list = [{
    'zip_code': row.zip_code,
    'date_created': row.date_published,
    'job_count': row.job_count
from models.db_models import session, Yelp, YelpAPIDb, Indeed

# Source of the migration: the SQLite file yelp.db.
sqlite_db = create_engine('sqlite:///yelp.db')

# Read every row of the three source tables over a single connection.
with sqlite_db.connect() as conn:
    yelp_api_results = conn.execute('SELECT * FROM yelp_api').fetchall()
    yelp_results = conn.execute('SELECT * FROM yelp').fetchall()
    indeed_results = conn.execute('SELECT * FROM indeed').fetchall()

# Copy the yelp table column-for-column into Yelp objects.
yelp_list = [
    dict(zip_code=row.zip_code,
         date_published=row.date_published,
         num_reviews=row.num_reviews,
         review_rating=row.review_rating)
    for row in yelp_results
]

session.add_all([Yelp(**row) for row in yelp_list])
session.commit()

# Copy the yelp_api table; the source column date_extracted is stored as
# date_created on YelpAPIDb.
yelp_api_list = [
    dict(zip_code=row.zip_code,
         date_created=row.date_extracted,
         business_count=row.business_count,
         avg_rating=row.avg_rating)
    for row in yelp_api_results
]

session.add_all([YelpAPIDb(**row) for row in yelp_api_list])
session.commit()

# Copy the indeed table; the source column date_published is stored as
# date_created on Indeed.
indeed_list = [
    dict(zip_code=row.zip_code,
         date_created=row.date_published,
         job_count=row.job_count)
    for row in indeed_results
]

session.add_all([Indeed(**row) for row in indeed_list])
session.commit()
示例#11
0
def _fetch_valid_businesses(yelp_api, zip_code, category, page_size=20):
    """Page through one Yelp category search and collect the valid businesses.

    Args:
        yelp_api: client exposing ``search_query`` (the ``YelpAPI`` instance
            built in ``main``).
        zip_code: zip code passed to Yelp as the search location.
        category: value passed as ``category_filter``.
        page_size: number of results requested per query.

    Returns:
        A list of ``{"zip_code", "rating", "review_count"}`` dicts, one for
        each business accepted by ``is_business_valid``. Results gathered
        before an API error are kept.
    """
    collected = []
    seen_count = 0
    offset = 0
    while True:
        try:
            search_results = yelp_api.search_query(location=zip_code,
                                                   category_filter=category, sort=0, limit=page_size,
                                                   offset=offset)
        except YelpAPI.YelpAPIError as e:
            LOGGER.error("Yelp API error for zip code {}, category {}: {}".format(zip_code, category, e))
            break

        total_count = search_results['total']
        if total_count == 0:
            break

        businesses = search_results['businesses']
        if not businesses:
            # The API reported more results than it will page through;
            # stop rather than requesting empty pages forever.
            break

        for business in businesses:
            seen_count += 1
            if is_business_valid(business, zip_code):
                print("{} out of {} businesses".format(seen_count, total_count))
                collected.append({"zip_code": zip_code,
                                  "rating": business['rating'],
                                  "review_count": business["review_count"]})

        # Stop only once every reported business has been seen; the old
        # counter started at 1 and could skip the final business.
        if seen_count >= total_count:
            break
        offset += page_size
    return collected


def _weighted_avg_rating(businesses):
    """Return the review-count-weighted mean rating, or None with no reviews."""
    total_review_count = sum(business['review_count'] for business in businesses)
    if not total_review_count:
        # No businesses, or none of them has any review: no meaningful mean.
        return None
    weighted_sum = sum(business['rating'] * business['review_count'] for business in businesses)
    # float() forces true division under Python 2 as well.
    return weighted_sum / float(total_review_count)


def main():
    """Store this month's Yelp rating summary for each unprocessed zip code.

    For every zip code in ``ZipCode`` that has no ``YelpAPIDb`` row dated in
    the current month and year, queries Yelp for each restaurant category,
    and merges a single row holding the review-weighted average rating and
    the number of valid businesses (``avg_rating=None`` and
    ``business_count=0`` when nothing was found). The row is written only
    after all categories have been queried, so a zip code is never marked as
    processed half-way. The session is closed even if extraction fails.
    """
    yelp_api = YelpAPI(login_data['yelp_consumer_key'],
                       login_data['yelp_consumer_secret'],
                       login_data['yelp_token'],
                       login_data['yelp_token_secret'])

    zip_codes = [row.zip_code for row in session.query(ZipCode).all()]

    today = datetime.date.today()
    # Match on year as well as month; a month-only filter would treat rows
    # from the same month of a previous year as already extracted.
    current_rows = session.query(YelpAPIDb).filter(
        extract('month', YelpAPIDb.date_created) == today.month).filter(
            extract('year', YelpAPIDb.date_created) == today.year).all()

    # A set keeps the membership test below O(1) per zip code.
    existing_zip_codes = set(row.as_dict()['zip_code'] for row in current_rows)
    remaining_zip_codes = [zip_code for zip_code in zip_codes if zip_code not in existing_zip_codes]

    category_list = ["cafes",
                     "newamerican",
                     "indpak",
                     "italian",
                     "japanese",
                     "thai"]
    results_per_query_limit = 20

    try:
        # Count from 1 so the progress reads "1 out of N" ... "N out of N".
        for i, zip_code in enumerate(remaining_zip_codes, 1):
            zip_code_results = []
            for category in category_list:
                LOGGER.info("Extracting {} restaurants from zip code {} ({} out of {})".format(
                    category, zip_code, i, len(remaining_zip_codes)))
                zip_code_results.extend(
                    _fetch_valid_businesses(yelp_api, zip_code, category, results_per_query_limit))

            row = YelpAPIDb(zip_code=zip_code, date_created=datetime.date.today(),
                            avg_rating=_weighted_avg_rating(zip_code_results),
                            business_count=len(zip_code_results))
            session.merge(row)
            session.commit()
    finally:
        session.close()
def persist_zip_code_data(df):
    """Replace the contents of the ``ZipCode`` table with the labels in ``df``.

    Args:
        df: DataFrame containing at least the columns ``zip_code``, ``city``,
            ``metro``, ``state`` and ``county``.
    """
    label_columns = ['zip_code', 'city', 'metro', 'state', 'county']
    unique_labels = df[label_columns].drop_duplicates()

    # TODO: should append to existing data in case zillow changes something
    session.query(ZipCode).delete()

    # One ZipCode object per distinct label row; the delete and the inserts
    # are committed together.
    new_rows = []
    for record in unique_labels.to_dict('records'):
        new_rows.append(ZipCode(**record))
    session.add_all(new_rows)
    session.commit()