import datetime

from sqlalchemy import extract

# session, Indeed and (presumably) ZipCode come from models.db_models, as in the
# migration script below; LOGGER and get_num_job_postings are defined elsewhere in
# this module.


def main():
    # get zip codes
    zip_codes = [row.zip_code for row in session.query(ZipCode).all()]
    # # add leading 0's to zip codes due to excel's stupidness
    # zip_codes_df['zip_code'] = zip_codes_df['zip_code'].astype(str)
    # zip_codes_df['zip_code'] = zip_codes_df['zip_code'].apply(lambda x: '0' * (5 - len(x)) + x)

    # only scrape zip codes that don't already have a row for the current month
    current_month = datetime.date.today().month
    current_rows = session.query(Indeed).filter(
        extract('month', Indeed.date_created) == current_month).all()
    current_rows = [row.as_dict() for row in current_rows]
    existing_zip_codes = [row['zip_code'] for row in current_rows]
    remaining_zip_codes = [
        zip_code for zip_code in zip_codes
        if zip_code not in existing_zip_codes
    ]
    LOGGER.info(
        'Found {} rows for current month: {}. Extracting {} remaining zip codes'
        .format(len(current_rows), current_month, len(remaining_zip_codes)))

    # upsert one Indeed row (zip code, job count, date) per remaining zip code
    for i, zip_code in enumerate(remaining_zip_codes):
        job_count = get_num_job_postings(zip_code)
        row = Indeed(zip_code=zip_code,
                     job_count=job_count,
                     date_created=datetime.date.today())
        session.merge(row)
        session.commit()
        LOGGER.info("Extracting zip code {} ({} of {})".format(
            zip_code, i + 1, len(remaining_zip_codes)))
    session.close()
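# get_num_job_postings is not defined in this section. A minimal sketch of what it
# could look like is below; the Indeed search URL and the "of 1,234 jobs" count text
# are assumptions about the page markup, not the repo's actual implementation.
import re

import requests


def get_num_job_postings(zip_code):
    # hypothetical: request Indeed's search results page for this zip code
    response = requests.get('https://www.indeed.com/jobs',
                            params={'q': '', 'l': zip_code})
    # hypothetical: pull the total out of text like "Jobs 1 to 10 of 1,234 jobs"
    match = re.search(r'of ([\d,]+) jobs', response.text)
    if match is None:
        return 0
    return int(match.group(1).replace(',', ''))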
def persist_zip_code_data(df):
    # one label row (city/metro/state/county) per zip code
    zip_code_labels_df = df[['zip_code', 'city', 'metro', 'state',
                             'county']].drop_duplicates()
    # TODO: should append to existing data in case zillow changes something
    session.query(ZipCode).delete()
    session.add_all(
        [ZipCode(**row) for row in zip_code_labels_df.to_dict('records')])
    session.commit()
from multiprocessing.pool import ThreadPool  # assumed import; the original may use multiprocessing.dummy

# generate_post_data, get_census_reponse, parse_results, session, LOGGER and
# BuildingPermit are defined elsewhere in the repo.


def main():
    LOGGER.info('Extracting building permit data...')
    # one request payload per month/year combination for 2005-2015
    post_data_list = generate_post_data((1, 12), (2005, 2015))
    # fetch the census responses with 5 worker threads
    pool = ThreadPool(5)
    results = pool.map(get_census_reponse, post_data_list)
    results_array = parse_results(results)
    for result in results_array:
        session.merge(BuildingPermit(**result))
    session.commit()
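# As an illustration of the fan-out above, generate_post_data presumably expands the
# (month, month) and (year, year) ranges into one payload per month/year pair. The
# sketch below is a guess; the payload keys are made up and will not match the real
# Census form fields that get_census_reponse posts.
def generate_post_data(month_range, year_range):
    post_data_list = []
    for year in range(year_range[0], year_range[1] + 1):
        for month in range(month_range[0], month_range[1] + 1):
            # hypothetical payload shape
            post_data_list.append({'month': month, 'year': year})
    return post_data_list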
def persist_zillow_metrics(df):
    metrics_df = df.drop(['city', 'metro', 'state', 'county'], axis=1)
    # TODO: should append to existing data in case zillow changes something
    session.query(ZillowMetrics).delete()
    session.commit()

    # bulk-insert in chunks so a single INSERT doesn't have to hold every row
    insert_chunk = 100000
    index_start = 0
    while index_start < len(metrics_df):
        LOGGER.info('Persisting Zillow Metrics rows: {} of {}'.format(
            min(index_start + insert_chunk, len(metrics_df)), len(metrics_df)))
        engine.execute(
            ZillowMetrics.__table__.insert(
                metrics_df[index_start:index_start +
                           insert_chunk].to_dict('records')))
        index_start += insert_chunk
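# Both persistence helpers take the same wide Zillow dataframe. A minimal driver
# might look like this sketch; the CSV file name is an assumption, since the
# download/reshape code is not part of this section.
import pandas as pd


def main():
    # read zip_code as a string so leading zeros are preserved
    df = pd.read_csv('zillow_data.csv', dtype={'zip_code': str})
    persist_zip_code_data(df)
    persist_zillow_metrics(df)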
import datetime

from sqlalchemy import extract
from yelpapi import YelpAPI

# login_data, session, ZipCode, YelpAPIDb, LOGGER and is_business_valid are assumed
# to be defined/imported elsewhere in this module.


def main():
    yelp_api = YelpAPI(login_data['yelp_consumer_key'],
                       login_data['yelp_consumer_secret'],
                       login_data['yelp_token'],
                       login_data['yelp_token_secret'])
    zip_codes = [row.zip_code for row in session.query(ZipCode).all()]

    # only query zip codes that don't already have a row for the current month
    current_month = datetime.date.today().month
    current_rows = session.query(YelpAPIDb).filter(
        extract('month', YelpAPIDb.date_created) == current_month).all()
    current_rows = [row.as_dict() for row in current_rows]
    existing_zip_codes = [row['zip_code'] for row in current_rows]
    remaining_zip_codes = [
        zip_code for zip_code in zip_codes
        if zip_code not in existing_zip_codes
    ]
    category_list = [
        "cafes", "newamerican", "indpak", "italian", "japanese", "thai"
    ]

    for i, zip_code in enumerate(remaining_zip_codes):
        zip_code_results = []
        for category in category_list:
            # page through the search results 20 businesses at a time
            offset = 0
            total_count = 21
            results_per_query_limit = 20
            business_counter = 1
            remaining_count = 1
            LOGGER.info(
                "Extracting {} restaurants from zip code {} ({} out of {})"
                .format(category, zip_code, i + 1, len(remaining_zip_codes)))
            while remaining_count > 0:
                try:
                    search_results = yelp_api.search_query(
                        location=zip_code,
                        category_filter=category,
                        sort=0,
                        limit=20,
                        offset=offset)
                    total_count = search_results['total']
                except YelpAPI.YelpAPIError as e:
                    print(e)
                    break
                if search_results['total'] == 0:
                    # nothing in this category/zip code: record an empty row
                    session.merge(
                        YelpAPIDb(zip_code=zip_code,
                                  date_created=datetime.date.today(),
                                  avg_rating=None,
                                  business_count=0))
                    session.commit()
                    break
                for business in search_results['businesses']:
                    if is_business_valid(business, zip_code):
                        print("{} out of {} businesses".format(
                            business_counter, total_count))
                        zip_code_results.append({
                            "zip_code": zip_code,
                            "rating": business['rating'],
                            "review_count": business["review_count"]
                        })
                    business_counter += 1
                remaining_count = total_count - business_counter
                offset += results_per_query_limit

        if zip_code_results:
            # review-count-weighted average rating across all categories
            total_review_count = sum(
                [business['review_count'] for business in zip_code_results])
            zip_code_avg_rating = sum([
                business['rating'] * business['review_count']
                for business in zip_code_results
            ]) / total_review_count
            row = YelpAPIDb(zip_code=zip_code,
                            date_created=datetime.date.today(),
                            avg_rating=zip_code_avg_rating,
                            business_count=len(zip_code_results))
            session.merge(row)
            session.commit()
        else:
            session.merge(
                YelpAPIDb(zip_code=zip_code,
                          date_created=datetime.date.today(),
                          avg_rating=None,
                          business_count=0))
            session.commit()
    session.close()
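# is_business_valid is not shown in this section. A plausible sketch is below; it
# assumes the Yelp response nests the postal code under business['location'] and
# simply drops results the API returned for neighbouring zip codes, which is a guess
# at the repo's actual rule.
def is_business_valid(business, zip_code):
    location = business.get('location', {})
    return location.get('postal_code') == str(zip_code)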
# one-off migration: copy rows from the old sqlite database into the main database
from sqlalchemy import create_engine

from models.db_models import session, Yelp, YelpAPIDb, Indeed

sqlite_db = create_engine('sqlite:///yelp.db')
with sqlite_db.connect() as conn:
    yelp_api_results = conn.execute('SELECT * FROM yelp_api').fetchall()
    yelp_results = conn.execute('SELECT * FROM yelp').fetchall()
    indeed_results = conn.execute('SELECT * FROM indeed').fetchall()

yelp_list = [{
    'zip_code': row.zip_code,
    'date_published': row.date_published,
    'num_reviews': row.num_reviews,
    'review_rating': row.review_rating
} for row in yelp_results]
session.add_all([Yelp(**row) for row in yelp_list])
session.commit()

yelp_api_list = [{
    'zip_code': row.zip_code,
    'date_created': row.date_extracted,
    'business_count': row.business_count,
    'avg_rating': row.avg_rating
} for row in yelp_api_results]
session.add_all([YelpAPIDb(**row) for row in yelp_api_list])
session.commit()

indeed_list = [{
    'zip_code': row.zip_code,
    'date_created': row.date_published,
    'job_count': row.job_count
} for row in indeed_results]
session.add_all([Indeed(**row) for row in indeed_list])
session.commit()
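# Not in the original script: a quick sanity check that the row counts in the new
# tables match what was read from sqlite (assumes the tables start out empty).
assert session.query(Yelp).count() == len(yelp_list)
assert session.query(YelpAPIDb).count() == len(yelp_api_list)
assert session.query(Indeed).count() == len(indeed_list)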