def crawl_history(direction: str, date=None):
    """Download the full migration-history curve for every known city.

    For each city returned by ``get_city_codes()``, fetches the Baidu Huiyan
    ``historycurve.jsonp`` endpoint and writes the raw response text to the
    path given by ``FilepathMapper.history(city_id, direction)``.

    Parameters
    ----------
    direction : str
        Either ``"in"`` or ``"out"`` — selects ``move_in`` / ``move_out`` data.
    date : str, optional
        End date in ``YYYYMMDD`` form. Defaults to yesterday, computed at
        call time. (The original default ``date=yesterday()`` was evaluated
        once at import, so a long-running process would silently use a stale
        date; it also relied on ``assert`` for validation, which is stripped
        under ``python -O``.)

    Raises
    ------
    ValueError
        If ``direction`` is not ``"in"`` or ``"out"``.
    """
    if direction not in ("in", "out"):
        raise ValueError(f"direction must be 'in' or 'out', got {direction!r}")
    if date is None:
        date = yesterday()

    city_code = get_city_codes()
    total = len(city_code)
    for i, city_record in enumerate(city_code):
        # Throttle requests so we don't hammer the upstream service.
        time.sleep(SLEEP_SEC)
        city_id = city_record["code"]
        logger.info(f"[{i+1}/{total}]: {city_record['city']} ({city_id})")
        query = (
            "https://huiyan.baidu.com/migration/historycurve.jsonp"
            + f"?dt=city&id={city_id}&type=move_{direction}&startDate=20200101&endDate={date}"
        )
        logger.info(f"Getting {query}")
        res = requests.get(query)
        if res.status_code == 200:
            logger.info("Success.")
            with open(FilepathMapper.history(city_id, direction), "w", encoding="utf-8") as f:
                f.write(res.text)
        else:
            # Best-effort: log and continue with the remaining cities.
            logger.warning(
                f"Bad response code {res.status_code} for {city_record['city']}"
            )
def update_history_if_outdated(direction, city_id):
    """Re-crawl all history data if the stored file for *city_id* is stale.

    Reads the cached history file for ``city_id`` and, when yesterday's date
    does not appear in it, triggers a full :func:`crawl_history` run for the
    given ``direction``.

    NOTE(review): the original code ignored the ``city_id`` parameter and
    always checked the hard-coded file for "110000" (Beijing); using the
    parameter matches the signature's evident intent — confirm no caller
    relied on the Beijing file as a freshness sentinel.
    """
    path = FilepathMapper.history(city_id, direction)
    with open(path, "r", encoding="utf-8") as f:
        cached = f.read()
    if yesterday() not in cached:
        logger.info("Obtaining the latest history data.")
        crawl_history(direction)
def _strip_admin_suffix(name, suffixes=("省", "市")):
    """Drop a single trailing administrative suffix (e.g. 市/省) if present."""
    if name and name[-1] in suffixes:
        return name[:-1]
    return name


def get_p2p_overall_dataframe(dates=None):
    """Build a long-format DataFrame of city-to-city migration records.

    For each date and each city, loads the history curve and the
    point-to-point movement records, and emits one row per
    (date, from_city, to_city) with the movement percentage and the
    origin city's migration index for that date.

    Parameters
    ----------
    dates : list of str, optional
        Dates (``YYYYMMDD``) to include. Defaults to ``[yesterday()]``,
        computed at call time. (The original default ``[yesterday()]`` was a
        mutable argument evaluated once at import — both stale and shared
        across calls.)

    Returns
    -------
    pandas.DataFrame
        Columns: m_date, from_city, to_city, to_province, percentage,
        migration_index.
    """
    if dates is None:
        dates = [yesterday()]

    rows = []
    for date in dates:
        print(date)
        for _, row in get_city_code_table().iterrows():
            history_curve = load_history(date, row.adcode)
            if history_curve is None:
                # No cached history for this city/date — skip it.
                continue
            for record in load_p2p(date, row.adcode):
                rows.append({
                    "m_date": pd.to_datetime(date),
                    "from_city": _strip_admin_suffix(row["name"], ("市",)),
                    "to_city": _strip_admin_suffix(record["city_name"], ("市",)),
                    "to_province": _strip_admin_suffix(record["province_name"]),
                    "percentage": record["value"],
                    "migration_index": history_curve[date],
                })
        # Original called time.sleep() with no argument — a guaranteed
        # TypeError. SLEEP_SEC matches the throttle used in crawl_history;
        # presumably the intended delay — confirm.
        time.sleep(SLEEP_SEC)
    return pd.DataFrame(rows)
def get_index_overall_dataframe(date=None):
    """Build a long-format DataFrame of per-city migration-index time series.

    For each city in the code table, loads the history curve saved for
    *date* and emits one row per (city, curve date) with the migration
    index for that day.

    Parameters
    ----------
    date : str, optional
        Snapshot date (``YYYYMMDD``) whose cached history files are read.
        Defaults to yesterday, computed at call time. (The original default
        ``date=yesterday()`` was evaluated once at import and went stale in
        long-running processes.)

    Returns
    -------
    pandas.DataFrame
        Columns: m_date, city, migration_index.
    """
    if date is None:
        date = yesterday()

    rows = []
    for _, row in get_city_code_table().iterrows():
        history_curve = load_history(date, row.adcode)
        if history_curve is None:
            # No cached history for this city — skip it.
            continue
        city = row["name"]
        # Normalize names like "北京市"/"河北省" by dropping the suffix.
        if city and city[-1] in ("省", "市"):
            city = city[:-1]
        for this_date, index_value in history_curve.items():
            rows.append({
                "m_date": pd.to_datetime(this_date),
                "city": city,
                "migration_index": index_value,
            })
    return pd.DataFrame(rows)
def insert_latest():
    """Append migration rows newer than what is already in ``p2p_migration``.

    Queries the database for the most recent ``m_date`` in the
    ``p2p_migration`` table, builds the DataFrame of rows from that date up
    to yesterday, and appends it to the table.

    Fixes over the original: the session is now always closed (it leaked
    before), and an empty table no longer crashes with ``AttributeError``
    on ``None.strftime`` — we fall back to 20200101, the start date used by
    ``crawl_history``.
    """
    engine = create_engine(
        f"postgresql://{config['db_username']}:{config['db_password']}"
        f"@{config['db_host']}:{config['db_port']}/{config['db_name']}"
    )
    session = sessionmaker(bind=engine)()
    try:
        last_date = session.execute(
            "SELECT MAX(m_date) from p2p_migration"
        ).fetchall()[0][0]
    finally:
        # Always release the connection, even if the query fails.
        session.close()

    if last_date is None:
        # Empty table: start from the beginning of the crawled range.
        last_sql_date = "20200101"
    else:
        last_sql_date = last_date.strftime("%Y%m%d")
    print(last_sql_date)

    df = get_p2p_overall_dataframe(
        from_date_to_date(last_sql_date, yesterday()))
    print(df)
    df.to_sql("p2p_migration", engine, if_exists="append",
              index=False, method="multi")