def main():
    # Get the full list of zip codes to scrape.
    zip_codes = [row.zip_code for row in session.query(ZipCode).all()]

    # # Pad zip codes to five digits with leading zeros (Excel strips them).
    # zip_codes_df['zip_code'] = zip_codes_df['zip_code'].astype(str)
    # zip_codes_df['zip_code'] = zip_codes_df['zip_code'].apply(lambda x: '0' * (5 - len(x)) + x)

    # Skip zip codes that already have a row for the current month.
    current_month = datetime.date.today().month
    current_rows = session.query(Indeed).filter(
        extract('month', Indeed.date_created) == current_month).all()
    current_rows = [row.as_dict() for row in current_rows]
    existing_zip_codes = [row['zip_code'] for row in current_rows]
    remaining_zip_codes = [
        zip_code for zip_code in zip_codes
        if zip_code not in existing_zip_codes
    ]
    LOGGER.info(
        'Found {} rows for current month {}. Extracting {} remaining zip codes'.format(
            len(current_rows), current_month, len(remaining_zip_codes)))

    for i, zip_code in enumerate(remaining_zip_codes):
        LOGGER.info('Extracting zip code {} ({} of {})'.format(
            zip_code, i + 1, len(remaining_zip_codes)))
        job_count = get_num_job_postings(zip_code)
        row = Indeed(zip_code=zip_code,
                     job_count=job_count,
                     date_created=datetime.date.today())
        session.merge(row)
        session.commit()

    session.close()
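# NOTE: get_num_job_postings() is called above but not defined in this file.
# The sketch below is only an assumption of what it might look like -- scraping
# the job count from Indeed's public search results page. The URL, query
# parameter, and the "of N jobs" text pattern are guesses, not the project's
# confirmed implementation.
import re

import requests


def get_num_job_postings(zip_code):
    """Return the number of job postings Indeed reports near a zip code (sketch)."""
    response = requests.get('https://www.indeed.com/jobs', params={'l': zip_code})
    # Look for a count such as "Page 1 of 1,234 jobs" in the returned HTML.
    match = re.search(r'of ([\d,]+) jobs', response.text)
    if match is None:
        return 0
    return int(match.group(1).replace(',', ''))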
def persist_zip_code_data(df):
    zip_code_labels_df = df[['zip_code', 'city', 'metro', 'state',
                             'county']].drop_duplicates()
    # TODO: should append to existing data in case Zillow changes something
    session.query(ZipCode).delete()
    session.add_all(
        [ZipCode(**row) for row in zip_code_labels_df.to_dict('records')])
    session.commit()
def persist_zillow_metrics(df):
    # Fact table: drop the label columns and keep only the metric values.
    metrics_df = df.drop(['city', 'metro', 'state', 'county'], axis=1)
    # TODO: should append to existing data in case Zillow changes something
    session.query(ZillowMetrics).delete()
    session.commit()

    # Bulk-insert in chunks so a single INSERT statement doesn't get too large.
    insert_chunk = 100000
    index_start = 0
    while index_start < len(metrics_df):
        LOGGER.info('Persisting Zillow Metrics rows: {} of {}'.format(
            min(index_start + insert_chunk, len(metrics_df)), len(metrics_df)))
        engine.execute(ZillowMetrics.__table__.insert(
            metrics_df[index_start:index_start + insert_chunk].to_dict('records')))
        index_start += insert_chunk
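# Hypothetical usage sketch (not in the original file): both persist helpers
# expect the same tidy dataframe of Zillow metrics keyed by zip code, with the
# label columns ('city', 'metro', 'state', 'county') still attached. The
# load_and_persist() helper name and the CSV source are assumptions.
import pandas as pd


def load_and_persist(csv_path):
    # dtype=str keeps leading zeros on zip codes intact.
    df = pd.read_csv(csv_path, dtype={'zip_code': str})
    persist_zip_code_data(df)   # dimension table: one row per zip code
    persist_zillow_metrics(df)  # fact table: metric values per zip code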
def main():
    yelp_api = YelpAPI(login_data['yelp_consumer_key'],
                       login_data['yelp_consumer_secret'],
                       login_data['yelp_token'],
                       login_data['yelp_token_secret'])

    zip_codes = [row.zip_code for row in session.query(ZipCode).all()]

    # Skip zip codes that already have a row for the current month.
    current_month = datetime.date.today().month
    current_rows = session.query(YelpAPIDb).filter(
        extract('month', YelpAPIDb.date_created) == current_month).all()
    current_rows = [row.as_dict() for row in current_rows]
    existing_zip_codes = [row['zip_code'] for row in current_rows]
    remaining_zip_codes = [
        zip_code for zip_code in zip_codes
        if zip_code not in existing_zip_codes
    ]

    category_list = [
        "cafes", "newamerican", "indpak", "italian", "japanese", "thai"
    ]

    for i, zip_code in enumerate(remaining_zip_codes):
        zip_code_results = []
        for category in category_list:
            offset = 0
            total_count = 21  # sentinel; replaced by the first query's total
            results_per_query_limit = 20
            business_counter = 1
            remaining_count = 1
            LOGGER.info(
                "Extracting {} restaurants from zip code {} ({} out of {})".format(
                    category, zip_code, i + 1, len(remaining_zip_codes)))

            # Page through the search results until every business has been seen.
            while remaining_count > 0:
                try:
                    search_results = yelp_api.search_query(
                        location=zip_code,
                        category_filter=category,
                        sort=0,
                        limit=results_per_query_limit,
                        offset=offset)
                    total_count = search_results['total']
                except YelpAPI.YelpAPIError as e:
                    print e
                    break

                if search_results['total'] == 0:
                    session.merge(
                        YelpAPIDb(zip_code=zip_code,
                                  date_created=datetime.date.today(),
                                  avg_rating=None,
                                  business_count=0))
                    session.commit()
                    break

                for business in search_results['businesses']:
                    if is_business_valid(business, zip_code):
                        print "{} out of {} businesses".format(
                            business_counter, total_count)
                        zip_code_results.append({
                            "zip_code": zip_code,
                            "rating": business['rating'],
                            "review_count": business["review_count"]
                        })
                    business_counter += 1

                remaining_count = total_count - business_counter
                offset += results_per_query_limit

        if zip_code_results:
            # Review-count-weighted average rating across all valid businesses.
            total_review_count = sum(
                [business['review_count'] for business in zip_code_results])
            zip_code_avg_rating = sum([
                business['rating'] * business['review_count']
                for business in zip_code_results
            ]) / total_review_count
            row = YelpAPIDb(zip_code=zip_code,
                            date_created=datetime.date.today(),
                            avg_rating=zip_code_avg_rating,
                            business_count=len(zip_code_results))
            session.merge(row)
            session.commit()
        else:
            session.merge(
                YelpAPIDb(zip_code=zip_code,
                          date_created=datetime.date.today(),
                          avg_rating=None,
                          business_count=0))
            session.commit()

    session.close()
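# NOTE: is_business_valid() is called above but not defined in this file.
# A minimal sketch is below, assuming "valid" means the business is still open
# and actually located in the queried zip code (Yelp's location-based search
# can return results from neighboring areas). The exact rules of the original
# helper are assumptions.
def is_business_valid(business, zip_code):
    if business.get('is_closed'):
        return False
    postal_code = business.get('location', {}).get('postal_code')
    return postal_code == zip_code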
    model_df = df.copy()
    for column in column_shift:
        model_df[column[1]] = model_df[column[1]].shift(column[0])
    model.add_data(model_df, y_col=[y_col])
    model.model_data()
    results.append({'shift': column_shift,
                    'RMSE': model.RMSE,
                    'Score': model.score,
                    'Coefs': model.coefficients})
    return max(results, key=lambda x: x['Score'])


# yelp_df = pd.DataFrame([row.as_dict() for row in session.query(Yelp).all()])
# yelp_df = yelp_df.rename(columns={'date_published': 'month'})

zillow_results = session.query(ZillowMetrics).filter(
    ZillowMetrics.ZRI != None, ZillowMetrics.ZHVI != None).all()
zillow_metrics_df = pd.DataFrame([row.as_dict() for row in zillow_results])
# merged_df = yelp_df.merge(zillow_metrics_df, on=['month', 'zip_code'])

lin_model = Model(linear_model.LinearRegression())
min_periods = 8
lag_columns = ['ZRI']

zip_code_sample = zillow_metrics_df.zip_code.unique()
random.shuffle(zip_code_sample)
zip_code_sample = zip_code_sample[:10]

best_models = []
for zip_code in zip_code_sample:
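# NOTE: the Model wrapper used above is not defined in this snippet. The sketch
# below only illustrates the interface implied by the calls (add_data /
# model_data and the RMSE, score, and coefficients attributes); the train/test
# split, the numeric-column filtering, and the attribute details are
# assumptions, not the original implementation.
import numpy as np
from sklearn.model_selection import train_test_split


class Model(object):
    def __init__(self, estimator):
        self.estimator = estimator

    def add_data(self, df, y_col):
        # Drop rows made incomplete by shifting, then split the target from
        # the (numeric) feature columns.
        df = df.dropna()
        self.y = df[y_col[0]]
        self.X = df.drop(y_col, axis=1).select_dtypes(include=[np.number])

    def model_data(self):
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=0.25, random_state=0)
        self.estimator.fit(X_train, y_train)
        predictions = self.estimator.predict(X_test)
        self.RMSE = float(np.sqrt(np.mean((predictions - y_test) ** 2)))
        self.score = self.estimator.score(X_test, y_test)
        self.coefficients = self.estimator.coef_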