예제 #1
0
def crawl_history(direction: str, date=None, timeout=30):
    """Download the migration history curve of every known city.

    For each record returned by ``get_city_codes()`` the
    ``historycurve.jsonp`` endpoint is queried. On HTTP 200 the raw response
    text is written to ``FilepathMapper.history(city_id, direction)``.
    Cities whose request fails (bad status code or transport error) are
    logged and skipped so that one failure does not abort the whole crawl.

    Args:
        direction: ``"in"`` or ``"out"``; sent as ``type=move_<direction>``.
        date: Value of the ``endDate`` query parameter. When omitted,
            ``yesterday()`` is evaluated at call time (the previous default
            was computed once, at function definition, and went stale in
            long-running processes).
        timeout: Seconds to wait for the server before giving up on one
            city; without it a stalled connection blocks forever.

    Raises:
        AssertionError: If ``direction`` is neither ``"in"`` nor ``"out"``.
    """
    assert direction in ["in", "out"]
    if date is None:
        date = yesterday()

    city_code = get_city_codes()
    total = len(city_code)
    for i, city_record in enumerate(city_code):
        # Throttle before every request, including the first one.
        time.sleep(SLEEP_SEC)
        city_id = city_record["code"]
        logger.info(f"[{i+1}/{total}]: {city_record['city']} ({city_id})")

        # The start date is fixed; only the end date varies between runs.
        query = (
            "https://huiyan.baidu.com/migration/historycurve.jsonp" +
            f"?dt=city&id={city_id}&type=move_{direction}&startDate=20200101&endDate={date}"
        )
        logger.info(f"Getting {query}")
        try:
            res = requests.get(query, timeout=timeout)
        except requests.RequestException as err:
            # Treat transport failures like a bad status code: warn and move on.
            logger.warning(
                f"Request failed for {city_record['city']}: {err}"
            )
            continue

        if res.status_code != 200:
            logger.warning(
                f"Bad response code {res.status_code} for {city_record['city']}"
            )
            continue

        logger.info("Success.")
        with open(FilepathMapper.history(city_id, direction),
                  "w",
                  encoding="utf-8") as f:
            f.write(res.text)
예제 #2
0
def update_history_if_outdated(direction, city_id):
    """Re-crawl the history curves when the cached data is not up to date.

    The cached history file of city ``"110000"`` serves as the freshness
    probe: if the string returned by ``yesterday()`` does not occur in it,
    or the file does not exist yet, ``crawl_history(direction)`` is run.

    Args:
        direction: Forwarded to ``FilepathMapper.history`` and
            ``crawl_history``.
        city_id: Currently unused; kept for interface compatibility.
            NOTE(review): the probe always reads city "110000" rather than
            ``city_id`` -- confirm whether that is intentional.
    """
    path = FilepathMapper.history("110000", direction)
    try:
        with open(path, "r", encoding="utf-8") as f:
            cached = f.read()
    except FileNotFoundError:
        # Nothing has been crawled yet, so the cache is outdated by definition.
        cached = None

    if cached is None or yesterday() not in cached:
        logger.info("Obtaining the latest history data.")
        crawl_history(direction)
예제 #3
0
def _strip_trailing(name, suffixes):
    """Return *name* without its last character if it ends with one of *suffixes*."""
    # str.endswith is safe on empty strings, unlike indexing with [-1].
    return name[:-1] if name.endswith(suffixes) else name


def get_p2p_overall_dataframe(dates=None):
    """Build one DataFrame of city-to-city migration records for *dates*.

    For every date and every row of ``get_city_code_table()`` the history
    curve and the point-to-point data are loaded; cities without a history
    curve are skipped. Each point-to-point record becomes one output row.

    Args:
        dates: Iterable of date values accepted by ``load_history``,
            ``load_p2p`` and ``pd.to_datetime``. When omitted, a one-element
            list holding ``yesterday()`` is built at call time (the previous
            default was a shared list computed once at definition time).

    Returns:
        A ``pandas.DataFrame`` with the columns ``m_date``, ``from_city``,
        ``to_city``, ``to_province``, ``percentage`` and ``migration_index``;
        empty when no record was collected.
    """
    if dates is None:
        dates = [yesterday()]

    res = []
    for date in dates:
        print(date)
        # Identical for every record of this date, so convert only once.
        m_date = pd.to_datetime(date)
        for _, row in get_city_code_table().iterrows():
            history_curve = load_history(date, row.adcode)
            if history_curve is None:
                continue
            # The origin city does not depend on the individual record.
            from_city = _strip_trailing(row["name"], ("市",))
            for record in load_p2p(date, row.adcode):
                res.append({
                    "m_date": m_date,
                    "from_city": from_city,
                    "to_city": _strip_trailing(record["city_name"], ("市",)),
                    "to_province": _strip_trailing(record["province_name"],
                                                   ("省", "市")),
                    "percentage": record["value"],
                    "migration_index": history_curve[date],
                })
        # The original called time.sleep() without an argument, which raises
        # TypeError; pause for the module-wide SLEEP_SEC interval instead.
        time.sleep(SLEEP_SEC)
    return pd.DataFrame(res)
예제 #4
0
def get_index_overall_dataframe(date=None):
    """Build a DataFrame holding the full migration-index curve of every city.

    For each row of ``get_city_code_table()`` the history curve is loaded via
    ``load_history(date, row.adcode)``; rows for which it returns ``None``
    are skipped. Every (date, index) pair of a curve becomes one output row.

    Args:
        date: Passed to ``load_history``. When omitted, ``yesterday()`` is
            evaluated at call time (the previous default was computed once,
            at function definition).

    Returns:
        A ``pandas.DataFrame`` with the columns ``m_date``, ``city`` and
        ``migration_index``; empty when no curve was available.
    """
    if date is None:
        date = yesterday()

    res = []
    for _, row in get_city_code_table().iterrows():
        history_curve = load_history(date, row.adcode)
        if history_curve is None:
            continue

        city = row["name"]
        # Drop a trailing province/city suffix; endswith tolerates "".
        if city.endswith(("省", "市")):
            city = city[:-1]

        res.extend(
            {
                "m_date": pd.to_datetime(this_date),
                "city": city,
                "migration_index": index,
            }
            for this_date, index in history_curve.items()
        )

    return pd.DataFrame(res)
예제 #5
0
def insert_latest():
    """Append point-to-point migration data newer than the database to it.

    Reads ``MAX(m_date)`` from the ``p2p_migration`` table, builds the
    DataFrame for the dates returned by
    ``from_date_to_date(<that date>, yesterday())`` and appends it to the
    same table.

    NOTE(review): if ``from_date_to_date`` includes its start date, the rows
    of the latest stored date are inserted again -- confirm against that
    helper.

    Raises:
        AttributeError: If the query yields ``None`` (presumably when the
            table is empty), because ``strftime`` is called on the result.
    """
    # Local import: text() wraps the raw SQL, which SQLAlchemy 2.x requires
    # and 1.x already accepts.
    from sqlalchemy import text

    engine = create_engine(
        f"postgresql://{config['db_username']}:{config['db_password']}@{config['db_host']}:{config['db_port']}/{config['db_name']}"
    )
    session = sessionmaker(bind=engine)()
    try:
        last_sql_date = session.execute(
            text("SELECT MAX(m_date) from p2p_migration")
        ).scalar().strftime('%Y%m%d')
    finally:
        # Release the connection before the data collection below.
        session.close()
    print(last_sql_date)

    df = get_p2p_overall_dataframe(
        from_date_to_date(last_sql_date, yesterday()))
    print(df)
    df.to_sql("p2p_migration",
              engine,
              if_exists="append",
              index=False,
              method="multi")