示例#1
0
def main(output_directory: str):
    """Export each instrument's popularity history through ``write_csv_file``.

    The work is done by a single aggregation over the ``popularity``
    collection: updates are sorted by timestamp, grouped per instrument id,
    and joined against the ``index`` collection to resolve a symbol. Groups
    for which no symbol could be resolved are skipped.

    Args:
        output_directory: Directory handed to ``write_csv_file``; it is
            created first when it does not exist yet.
    """
    if not os.path.exists(output_directory):
        os.makedirs(output_directory, exist_ok=True)

    db = get_db()

    # Sorting before grouping keeps each pushed history in timestamp order.
    sort_by_time = {"$sort": {"timestamp": 1}}
    group_by_instrument = {
        "$group": {
            "_id": "$instrument_id",
            "popularity_history": {
                "$push": {
                    "popularity": "$popularity",
                    "timestamp": "$timestamp",
                },
            },
        },
    }
    join_index = {
        "$lookup": {
            "from": "index",
            "localField": "_id",
            "foreignField": "instrument_id",
            "as": "indexes",
        },
    }
    # Take the symbol of the first joined index document; when nothing was
    # joined the field is absent and the group is skipped below.
    pick_symbol = {
        "$addFields": {"symbol": {"$arrayElemAt": ["$indexes.symbol", 0]}},
    }
    keep_fields = {
        "$project": {"popularity_history": True, "symbol": True, "_id": True},
    }

    histories = db["popularity"].aggregate(
        [sort_by_time, group_by_instrument, join_index, pick_symbol,
         keep_fields],
        allowDiskUse=True,
    )

    written_count = 0
    for record in histories:
        resolved_symbol = record.get("symbol")
        if resolved_symbol is not None:
            write_csv_file(output_directory, resolved_symbol,
                           record["popularity_history"])
            written_count += 1

    print(f"Finished writing {written_count} CSV files to {output_directory}!")
示例#2
0
def main(output_directory: str):
    """Pass the last 25 hours of symbol-tagged updates to ``write_csv_file``.

    Every popularity update newer than ``datetime.now() - 25h`` is copied
    with an added ``"symbol"`` key looked up from the ``index`` collection.
    Updates whose instrument has no symbol are dropped. The survivors are
    ordered by the string formed from the symbol followed by the timestamp.

    Args:
        output_directory: Directory handed to ``write_csv_file``; it is
            created first when it does not exist yet.
    """
    if not os.path.exists(output_directory):
        os.makedirs(output_directory, exist_ok=True)

    db = get_db()

    symbol_by_instrument_id = {
        entry["instrument_id"]: entry["symbol"] for entry in db["index"].find()
    }

    # NOTE(review): the cutoff is a naive local-time datetime; confirm that
    # matches how `timestamp` values are stored.
    cutoff = datetime.now() - timedelta(hours=25)

    tagged_updates = []
    for update in db["popularity"].find({"timestamp": {"$gt": cutoff}}):
        symbol = symbol_by_instrument_id.get(update["instrument_id"])
        if symbol is not None:
            tagged_updates.append({**update, "symbol": symbol})

    def symbol_then_timestamp(doc):
        # Same concatenated string key as before, so ordering is unchanged.
        return f"{doc['symbol']}{doc['timestamp']}"

    tagged_updates.sort(key=symbol_then_timestamp)

    write_csv_file(output_directory, tagged_updates)
示例#3
0
def main(output_directory: str):
    """Stream all popularity updates and flush one batch per symbol.

    Updates are read sorted by ``(instrument_id, timestamp)``. Consecutive
    rows that resolve to the same symbol are buffered, and the buffer is
    handed to ``write_csv_file`` each time the symbol changes. Rows whose
    instrument has no known symbol are ignored without closing the buffer.

    Args:
        output_directory: Directory handed to ``write_csv_file``; it is
            created first when it does not exist yet.
    """
    if not os.path.exists(output_directory):
        os.makedirs(output_directory, exist_ok=True)

    db = get_db()

    # instrument id -> symbol, skipping index entries without a symbol.
    symbols_by_instrument_id = {}
    index_projection = {"instrument_id": True, "symbol": True, "_id": False}
    for entry in db["index"].find({}, projection=index_projection):
        entry_symbol = entry.get("symbol")
        if entry_symbol is not None:
            symbols_by_instrument_id[entry["instrument_id"]] = entry_symbol

    updates = db["popularity"].find(
        {},
        projection={
            "_id": 0,
            "timestamp": 1,
            "popularity": 1,
            "instrument_id": 1,
        },
        sort=[
            ("instrument_id", pymongo.ASCENDING),
            ("timestamp", pymongo.ASCENDING),
        ],
    )

    files_written = 0
    active_symbol = None
    pending_rows = []
    for row in updates:
        row_symbol = symbols_by_instrument_id.get(row.get("instrument_id"))
        if row_symbol is None:
            continue

        # A different symbol means the previous run of rows is complete.
        if active_symbol is not None and row_symbol != active_symbol:
            write_csv_file(output_directory, active_symbol, pending_rows)
            files_written += 1
            pending_rows = []

        active_symbol = row_symbol
        pending_rows.append(row)

    # Flush whatever was buffered for the final symbol.
    if active_symbol is not None:
        write_csv_file(output_directory, active_symbol, pending_rows)
        files_written += 1

    print(f"Finished writing {files_written} CSV files to {output_directory}!")
def get_all_stock_fundamentals():
    """
    Load every document of the "fundamentals" collection into a dict keyed
    by the document's "instrument_id". When several documents share an
    instrument id, the one returned last by the query wins.
    """

    fundamentals = get_db()["fundamentals"]
    return {entry["instrument_id"]: entry for entry in fundamentals.find()}
示例#5
0
def main(output_directory: str):
    """Export each instrument's popularity history through ``write_csv_file``.

    Popularity updates are sorted by timestamp and grouped per instrument id
    by an aggregation. The symbol for each group is resolved in Python from
    a map built out of the ``index`` collection. Groups whose instrument has
    no symbol are skipped.

    Args:
        output_directory: Directory handed to ``write_csv_file``; it is
            created first when it does not exist yet.
    """
    os.makedirs(output_directory, exist_ok=True)

    db = get_db()

    # Request the two fields we need explicitly: an empty projection makes
    # pymongo return only `_id`, which left `symbol` unavailable.
    index_items = db["index"].find(
        {},
        projection={"instrument_id": True, "symbol": True, "_id": False},
    )
    symbols_by_instrument_id = {}
    for item in index_items:
        instrument_id = item.get("instrument_id")
        symbol = item.get("symbol")
        if instrument_id is None or symbol is None:
            continue
        # Key by instrument id (not the index document's own `_id`) so the
        # map lines up with the `$group` key used below.
        symbols_by_instrument_id[instrument_id] = symbol

    aggregation_res = db["popularity"].aggregate(
        [
            # Sort first so each pushed history is in timestamp order.
            {"$sort": {"timestamp": 1}},
            {
                "$group": {
                    "_id": "$instrument_id",
                    "popularity_history": {
                        "$push": {
                            "popularity": "$popularity",
                            "timestamp": "$timestamp",
                        }
                    },
                }
            },
        ],
        allowDiskUse=True,
    )

    written_count = 0
    for datum in aggregation_res:
        # `$group` output carries the grouping key in `_id`; there is no
        # `instrument_id` field on these documents.
        symbol = symbols_by_instrument_id.get(datum["_id"])
        if symbol is None:
            continue

        write_csv_file(output_directory, symbol, datum["popularity_history"])
        written_count += 1

    print(f"Finished writing {written_count} CSV files to {output_directory}!")
def _record_first_update_of_day(updates_by_group, group, instrument_id,
                                day_id, popularity):
    """Store `popularity` for (group, instrument, day) unless one is set."""
    pop_by_day_id = updates_by_group.setdefault(group, {}).setdefault(
        instrument_id, {})
    if pop_by_day_id.get(day_id) is None:
        pop_by_day_id[day_id] = popularity


def _sum_popularity_by_day(updates_by_group, day_ids):
    """Return group -> day_id -> popularity summed over the instruments."""
    totals = {}
    for group, updates_by_instrument_id in updates_by_group.items():
        pop_by_day_id = totals[group] = {}
        for day_id in day_ids:
            day_total = 0
            for instrument_pops in updates_by_instrument_id.values():
                pop = instrument_pops.get(day_id)
                if pop is not None:
                    day_total += pop
            pop_by_day_id[day_id] = day_total
    return totals


def _write_group_csvs(prefix, pop_by_group, out_dir="/tmp/out"):
    """Write one `<prefix>_<normalized group>.csv` of `day_id,pop` lines."""
    for group, pop_by_day_id in pop_by_group.items():
        # Adapted from https://stackoverflow.com/a/295152/3833068
        normalized = "".join(
            ch for ch in group.replace(" ", "-").replace("&", "and")
            if ch.isalnum() or ch == "-")
        with open(f"{out_dir}/{prefix}_{normalized}.csv", "w") as f:
            for day_id in sorted(pop_by_day_id):
                f.write(f"{day_id},{pop_by_day_id[day_id]}\n")


def main():
    """Aggregate daily popularity per sector and per industry into CSV files.

    All popularity updates are scanned in timestamp order. For each
    instrument, the first update seen on a given day id is taken as that
    day's popularity. Those values are summed over the instruments of each
    sector and of each industry. One CSV per sector and one per industry is
    written to ``/tmp/out``, with ``day_id,popularity`` lines sorted by day.
    """
    fundamentals_by_instrument_id = get_all_stock_fundamentals()

    # sector -> instrument_id -> dayId -> popularity
    updates_by_sector = {}
    # industry -> instrument_id -> dayId -> popularity
    updates_by_industry = {}
    all_day_ids = set()

    db = get_db()
    for update in db["popularity"].find().sort("timestamp"):
        instrument_id = update["instrument_id"]
        fundamentals = fundamentals_by_instrument_id.get(instrument_id)
        if fundamentals is None:
            continue

        day_id = get_day_id(update["timestamp"])

        sector = fundamentals.get("sector")
        # Previously this read the "sector" field a second time, which made
        # every industry CSV a copy of the sector data.
        industry = fundamentals.get("industry")

        if sector is not None:
            _record_first_update_of_day(updates_by_sector, sector,
                                        instrument_id, day_id,
                                        update["popularity"])
        if industry is not None:
            _record_first_update_of_day(updates_by_industry, industry,
                                        instrument_id, day_id,
                                        update["popularity"])

        # Track the day as soon as the update contributed to either
        # grouping, so no recorded value is left out of the sums below.
        if sector is None and industry is None:
            continue
        if day_id not in all_day_ids:
            print(day_id)
            all_day_ids.add(day_id)

    print(
        "Finished scrolling all updates; starting to aggregate by popularity")

    # Sum it all up
    day_ids = sorted(all_day_ids)
    # sector -> day_id -> popularity
    pop_by_sector = _sum_popularity_by_day(updates_by_sector, day_ids)
    # industry -> day_id -> popularity
    pop_by_industry = _sum_popularity_by_day(updates_by_industry, day_ids)

    # Dump it all to CSV, one for each sector and one for each industry
    os.makedirs("/tmp/out", exist_ok=True)
    _write_group_csvs("sector", pop_by_sector)
    _write_group_csvs("industry", pop_by_industry)
示例#7
0
def _first_and_last_by_instrument(docs, value_field):
    """Map instrument id -> (first, last) `value_field` seen in `docs`."""
    bounds = {}
    for doc in docs:
        instrument_id = doc["instrument_id"]
        if instrument_id is None:
            continue
        value = doc[value_field]
        first = bounds[instrument_id][0] if instrument_id in bounds else value
        bounds[instrument_id] = (first, value)
    return bounds


def populate_day(day: str):
    """Collect each instrument's start/end popularity and price for one day.

    The day window is ``[day, day + 1 day)`` with a fixed UTC-4 offset. The
    first and last popularity values and the first and last
    ``last_trade_price`` values inside that window are gathered for every
    instrument. The summed absolute popularity change is then printed.

    Args:
        day: Calendar day formatted as ``YYYY-MM-DD``.

    Raises:
        ValueError: If ``day`` does not match ``%Y-%m-%d``.
    """
    print(day)
    day = datetime.datetime.strptime(day, "%Y-%m-%d")
    # NOTE(review): fixed -4h offset, no DST handling; confirm intended.
    day = day.replace(tzinfo=datetime.timezone(datetime.timedelta(hours=-4)))
    next_day = day + datetime.timedelta(days=1)
    print(f"Getting changes for day {day}")

    db = get_db()
    day_range = {"$gte": day, "$lt": next_day}

    # Both queries are sorted by time within each instrument, so the first
    # and last document per instrument are the day's start and end values.
    popularity_docs = db["popularity"].find({"timestamp": day_range}).sort(
        [("instrument_id", 1), ("timestamp", 1)])
    diffs_by_instrument_id = {}
    for instrument_id, (start, end) in _first_and_last_by_instrument(
            popularity_docs, "popularity").items():
        if start is None or end is None:
            continue
        # Every instrument, including the last one, gets the same int cast,
        # and nothing is recorded when the query returned no documents.
        diffs_by_instrument_id[instrument_id] = {
            "start_popularity": int(start),
            "end_popularity": int(end),
        }

    print("Got popularities, getting quotes...")
    quote_docs = db["quotes"].find({"updated_at": day_range}).sort(
        [("instrument_id", 1), ("updated_at", 1)])
    for instrument_id, (start, end) in _first_and_last_by_instrument(
            quote_docs, "last_trade_price").items():
        entry = diffs_by_instrument_id.get(instrument_id)
        # Prices are attached only to instruments that had popularity data.
        if entry is None or start is None or end is None:
            continue
        entry["start_price"] = float(start)
        entry["end_price"] = float(end)

    total_pop_diff = sum(
        abs(entry["end_popularity"] - entry["start_popularity"])
        for entry in diffs_by_instrument_id.values())

    print(f"{day} TOTAL POP DIFF: {total_pop_diff}")
示例#8
0
def backfill():
    """Build per-day, per-instrument start/end values and save them.

    Every ``popularity`` and ``quotes`` document is scanned once. Documents
    missing a required field are skipped. So are documents stamped on a
    Saturday or Sunday, and documents on a day listed in ``invalid_dayids``.
    For each (day id, instrument id) pair the earliest and latest popularity
    and trade price are kept, together with their timestamps. The nested
    mapping is then passed to ``save_backfill``.
    """
    db = get_db()

    disabled_day_ids = {
        entry["day_id"] for entry in db["invalid_dayids"].find()
    }
    # day id -> instrument id -> accumulated start/end values
    buckets_by_dayid = {}

    for update in db["popularity"].find():
        popularity = update.get("popularity")
        instrument_id = update.get("instrument_id")
        stamp = update.get("timestamp")
        if popularity is None or instrument_id is None or stamp is None:
            continue
        # isoweekday() is 6 for Saturday and 7 for Sunday.
        if stamp.isoweekday() >= 6:
            continue

        day_id = stamp.strftime("%Y-%m-%d")
        if day_id in disabled_day_ids:
            continue

        entry = buckets_by_dayid.setdefault(day_id, {}).setdefault(
            instrument_id, {})

        earliest = entry.get("start_pop_timestamp")
        if earliest is None or earliest > stamp:
            entry["start_pop_timestamp"] = stamp
            entry["start_pop"] = int(popularity)

        latest = entry.get("end_pop_timestamp")
        if latest is None or latest < stamp:
            entry["end_pop_timestamp"] = stamp
            entry["end_pop"] = int(popularity)

    for quote in db["quotes"].find():
        instrument_id = quote.get("instrument_id")
        stamp = quote.get("updated_at")
        price = quote.get("last_trade_price")
        if instrument_id is None or stamp is None or price is None:
            continue
        if stamp.isoweekday() >= 6:
            continue

        day_id = stamp.strftime("%Y-%m-%d")
        if day_id in disabled_day_ids:
            continue

        entry = buckets_by_dayid.setdefault(day_id, {}).setdefault(
            instrument_id, {})

        earliest = entry.get("start_price_timestamp")
        if earliest is None or earliest > stamp:
            entry["start_price_timestamp"] = stamp
            entry["start_price"] = float(price)

        latest = entry.get("end_price_timestamp")
        if latest is None or latest < stamp:
            entry["end_price_timestamp"] = stamp
            entry["end_price"] = float(price)

    save_backfill(db, buckets_by_dayid)