def main(output_directory: str):
    if not os.path.exists(output_directory):
        os.makedirs(output_directory, exist_ok=True)

    db = get_db()
    aggregation_res = db["popularity"].aggregate(
        [
            {"$sort": {"timestamp": 1}},
            {
                "$group": {
                    "_id": "$instrument_id",
                    "popularity_history": {
                        "$push": {"popularity": "$popularity", "timestamp": "$timestamp"}
                    },
                }
            },
            {
                "$lookup": {
                    "from": "index",
                    "localField": "_id",
                    "foreignField": "instrument_id",
                    "as": "indexes",
                }
            },
            {"$addFields": {"symbol": {"$arrayElemAt": ["$indexes.symbol", 0]}}},
            {"$project": {"popularity_history": True, "symbol": True, "_id": True}},
        ],
        allowDiskUse=True,
    )

    written_count = 0
    for datum in aggregation_res:
        symbol = datum.get("symbol")
        if symbol is None:
            continue

        write_csv_file(output_directory, symbol, datum["popularity_history"])
        written_count += 1

    print(f"Finished writing {written_count} CSV files to {output_directory}!")
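# `get_db()` is used throughout these scripts but defined elsewhere in the repo. A
# minimal sketch of what it might look like, assuming a local MongoDB instance and a
# database named "robinhood" (both the connection string and the database name are
# assumptions, not the actual configuration):
import pymongo


def get_db():
    client = pymongo.MongoClient("mongodb://localhost:27017")
    return client["robinhood"]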
def main(output_directory: str):
    if not os.path.exists(output_directory):
        os.makedirs(output_directory, exist_ok=True)

    db = get_db()
    all_index_docs = list(db["index"].find())
    symbol_by_instrument_id = dict()
    for doc in all_index_docs:
        symbol_by_instrument_id[doc["instrument_id"]] = doc["symbol"]

    # Join the last ~day of popularity updates with their symbols, dropping any updates
    # for instruments that have no symbol in the index collection
    updates = list(
        filter(
            lambda doc: doc["symbol"] is not None,
            map(
                lambda doc: {
                    **doc,
                    "symbol": symbol_by_instrument_id.get(doc["instrument_id"]),
                },
                db["popularity"].find(
                    {"timestamp": {"$gt": datetime.now() - timedelta(hours=25)}}
                ),
            ),
        )
    )

    # Sort by symbol first, then by timestamp within each symbol
    sorted_updates = sorted(updates, key=lambda doc: (doc["symbol"], doc["timestamp"]))

    write_csv_file(output_directory, sorted_updates)
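# The script above calls a two-argument `write_csv_file(output_directory, sorted_updates)`
# whose definition is not included here. A minimal sketch, assuming it writes every
# update into one combined CSV; the filename and column order are assumptions:
import os


def write_csv_file(output_directory: str, sorted_updates: list):
    with open(os.path.join(output_directory, "updates.csv"), "w") as f:
        for doc in sorted_updates:
            f.write(f"{doc['symbol']},{doc['timestamp']},{doc['popularity']}\n")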
def main(output_directory: str):
    if not os.path.exists(output_directory):
        os.makedirs(output_directory, exist_ok=True)

    db = get_db()
    index_items = db["index"].find(
        {}, projection={"instrument_id": True, "symbol": True, "_id": False}
    )
    symbols_by_instrument_id = {}
    for item in index_items:
        symbol = item.get("symbol")
        if symbol is None:
            continue

        symbols_by_instrument_id[item["instrument_id"]] = symbol

    cursor = db["popularity"].find(
        {},
        projection={"_id": 0, "timestamp": 1, "popularity": 1, "instrument_id": 1},
        sort=[("instrument_id", pymongo.ASCENDING), ("timestamp", pymongo.ASCENDING)],
    )

    written_count = 0
    cur_symbol = None
    acc = []
    for datum in cursor:
        instrument_id = datum.get("instrument_id")
        symbol = symbols_by_instrument_id.get(instrument_id)
        if symbol is None:
            continue

        if symbol != cur_symbol and cur_symbol is not None:
            write_csv_file(output_directory, cur_symbol, acc)
            written_count += 1
            acc = []

        cur_symbol = symbol
        acc.append(datum)

    if cur_symbol is not None:
        write_csv_file(output_directory, cur_symbol, acc)
        written_count += 1

    print(f"Finished writing {written_count} CSV files to {output_directory}!")
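# The per-symbol exporters above call a three-argument
# `write_csv_file(output_directory, symbol, popularity_history)` that is also defined
# elsewhere. A minimal sketch, assuming one CSV per symbol with timestamp/popularity
# columns (the filename pattern and column layout are assumptions):
import os


def write_csv_file(output_directory: str, symbol: str, popularity_history: list):
    with open(os.path.join(output_directory, f"{symbol}.csv"), "w") as f:
        for entry in popularity_history:
            f.write(f"{entry['timestamp']},{entry['popularity']}\n")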
def get_all_stock_fundamentals():
    """ Returns a dict mapping instrument id to stock fundamentals """

    db = get_db()
    fundamentals_by_instrument_id = dict()
    all_fundamentals = list(db["fundamentals"].find())
    for f in all_fundamentals:
        fundamentals_by_instrument_id[f["instrument_id"]] = f

    return fundamentals_by_instrument_id
def main(output_directory: str):
    if not os.path.exists(output_directory):
        os.makedirs(output_directory, exist_ok=True)

    db = get_db()
    index_items = db["index"].find(
        {}, projection={"instrument_id": True, "symbol": True, "_id": False}
    )
    symbols_by_instrument_id = {}
    for item in index_items:
        symbols_by_instrument_id[item["instrument_id"]] = item["symbol"]

    aggregation_res = db["popularity"].aggregate(
        [
            {"$sort": {"timestamp": 1}},
            {
                "$group": {
                    "_id": "$instrument_id",
                    "popularity_history": {
                        "$push": {"popularity": "$popularity", "timestamp": "$timestamp"}
                    },
                }
            },
        ],
        allowDiskUse=True,
    )

    written_count = 0
    for datum in aggregation_res:
        # The `$group` stage puts the instrument id into the `_id` field
        instrument_id = datum.get("_id")
        symbol = symbols_by_instrument_id.get(instrument_id)
        if symbol is None:
            continue

        write_csv_file(output_directory, symbol, datum["popularity_history"])
        written_count += 1

    print(f"Finished writing {written_count} CSV files to {output_directory}!")
def main():
    fundamentals_by_instrument_id = get_all_stock_fundamentals()

    # sector -> instrument_id -> day_id -> popularity
    updates_by_sector = dict()
    # industry -> instrument_id -> day_id -> popularity
    updates_by_industry = dict()
    all_day_ids = set()

    db = get_db()
    for update in db["popularity"].find().sort("timestamp"):
        fundamentals = fundamentals_by_instrument_id.get(update["instrument_id"])
        if fundamentals is None:
            continue

        day_id = get_day_id(update["timestamp"])
        instrument_id = update["instrument_id"]

        # Updates are scanned in timestamp order, so only the first popularity seen for
        # each (sector/industry, instrument, day) triple is kept
        sector = fundamentals["sector"]
        if sector is not None:
            updates_by_sector \
                .setdefault(sector, dict()) \
                .setdefault(instrument_id, dict()) \
                .setdefault(day_id, update["popularity"])

        industry = fundamentals["industry"]
        if industry is not None:
            updates_by_industry \
                .setdefault(industry, dict()) \
                .setdefault(instrument_id, dict()) \
                .setdefault(day_id, update["popularity"])

        if industry is not None and sector is not None:
            if day_id not in all_day_ids:
                print(day_id)
            all_day_ids.add(day_id)

    print("Finished scrolling all updates; starting to aggregate by popularity")

    # sector -> day_id -> popularity
    pop_by_sector = dict()
    # industry -> day_id -> popularity
    pop_by_industry = dict()

    # Sum it all up
    for day_id in sorted(all_day_ids):
        for (sector, updates_by_instrument_id) in updates_by_sector.items():
            if pop_by_sector.get(sector) is None:
                pop_by_sector[sector] = dict()
            pop_by_sector[sector][day_id] = 0

            for (instrument_id, pop_by_day_id) in updates_by_instrument_id.items():
                pop = pop_by_day_id.get(day_id)
                if pop is None:
                    continue
                pop_by_sector[sector][day_id] += pop

        for (industry, updates_by_instrument_id) in updates_by_industry.items():
            if pop_by_industry.get(industry) is None:
                pop_by_industry[industry] = dict()
            pop_by_industry[industry][day_id] = 0

            for (instrument_id, pop_by_day_id) in updates_by_instrument_id.items():
                pop = pop_by_day_id.get(day_id)
                if pop is None:
                    continue
                pop_by_industry[industry][day_id] += pop

    # Dump it all to CSV, one file for each sector and each industry
    if not os.path.isdir("/tmp/out"):
        os.mkdir("/tmp/out")

    for (sector, vals) in pop_by_sector.items():
        # Adapted from https://stackoverflow.com/a/295152/3833068
        normalized_sector = "".join(
            x for x in sector.replace(" ", "-").replace("&", "and")
            if x.isalnum() or x == "-")
        with open(f"/tmp/out/sector_{normalized_sector}.csv", "w") as f:
            for (day_id, pop) in sorted(vals.items(), key=lambda x: x[0]):
                f.write(f"{day_id},{pop}\n")

    for (industry, vals) in pop_by_industry.items():
        # Adapted from https://stackoverflow.com/a/295152/3833068
        normalized_industry = "".join(
            x for x in industry.replace(" ", "-").replace("&", "and")
            if x.isalnum() or x == "-")
        with open(f"/tmp/out/industry_{normalized_industry}.csv", "w") as f:
            for (day_id, pop) in sorted(vals.items(), key=lambda x: x[0]):
                f.write(f"{day_id},{pop}\n")
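# `get_day_id(timestamp)` is defined elsewhere in the repo. Based on how day ids are
# built in the backfill script below ("%Y-%m-%d" date strings), a minimal sketch might be:
def get_day_id(timestamp):
    # Assumption: a day id is simply the calendar date of the update
    return timestamp.strftime("%Y-%m-%d")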
def populate_day(day: str):
    print(day)
    day = datetime.datetime.strptime(day, "%Y-%m-%d")
    day = day.replace(tzinfo=datetime.timezone(datetime.timedelta(hours=-4)))
    next_day = day + datetime.timedelta(days=1)
    print(f"Getting changes for day {day}")

    db = get_db()
    diffs_by_instrument_id = {}

    # Iterate through all popularity and quote updates for that day and get the diff for
    # both for each unique instrument id
    cur_instrument_id = None
    cur_start_val = None
    last_val = None
    for doc in (db["popularity"]
                .find({"timestamp": {"$gte": day, "$lt": next_day}})
                .sort([("instrument_id", 1), ("timestamp", 1)])):
        instrument_id = doc["instrument_id"]
        if instrument_id != cur_instrument_id:
            # Flush the accumulated values for the previous instrument
            if cur_start_val is not None and last_val is not None:
                diffs_by_instrument_id[cur_instrument_id] = {
                    "start_popularity": int(cur_start_val),
                    "end_popularity": int(last_val),
                }

            cur_instrument_id = instrument_id
            cur_start_val = doc["popularity"]

        last_val = doc["popularity"]

    # Flush the final instrument
    if cur_instrument_id is not None and cur_start_val is not None and last_val is not None:
        diffs_by_instrument_id[cur_instrument_id] = {
            "start_popularity": int(cur_start_val),
            "end_popularity": int(last_val),
        }

    print("Got popularities, getting quotes...")

    cur_instrument_id = None
    cur_start_val = None
    last_val = None
    for doc in (db["quotes"]
                .find({"updated_at": {"$gte": day, "$lt": next_day}})
                .sort([("instrument_id", 1), ("updated_at", 1)])):
        instrument_id = doc["instrument_id"]
        if instrument_id != cur_instrument_id:
            # Flush the accumulated prices for the previous instrument
            if (diffs_by_instrument_id.get(cur_instrument_id) is not None
                    and cur_start_val is not None and last_val is not None):
                diffs_by_instrument_id[cur_instrument_id]["start_price"] = float(cur_start_val)
                diffs_by_instrument_id[cur_instrument_id]["end_price"] = float(last_val)

            cur_instrument_id = instrument_id
            cur_start_val = doc["last_trade_price"]

        last_val = doc["last_trade_price"]

    # Flush the final instrument
    if (diffs_by_instrument_id.get(cur_instrument_id) is not None
            and cur_start_val is not None and last_val is not None):
        diffs_by_instrument_id[cur_instrument_id]["start_price"] = float(cur_start_val)
        diffs_by_instrument_id[cur_instrument_id]["end_price"] = float(last_val)

    total_pop_diff = 0
    for val in diffs_by_instrument_id.values():
        if val.get("start_popularity") is None or val.get("end_popularity") is None:
            continue

        pop_diff = abs(val["end_popularity"] - val["start_popularity"])
        total_pop_diff += pop_diff

    print(f"{day} TOTAL POP DIFF: {total_pop_diff}")
def backfill():
    db = get_db()

    buckets_by_dayid = {}
    disabled_day_ids = set([datum["day_id"] for datum in db["invalid_dayids"].find()])

    for doc in db["popularity"].find():
        # Skip incomplete documents and weekend updates
        if (doc.get("popularity") is None or doc.get("instrument_id") is None
                or doc.get("timestamp") is None or doc["timestamp"].isoweekday() >= 6):
            continue

        day_id = doc["timestamp"].strftime("%Y-%m-%d")
        if day_id in disabled_day_ids:
            continue

        bucket = buckets_by_dayid.get(day_id)
        if bucket is None:
            buckets_by_dayid[day_id] = {}
            bucket = buckets_by_dayid[day_id]

        instrument_id = doc["instrument_id"]
        acc = bucket.get(instrument_id)
        if acc is None:
            bucket[instrument_id] = {}
            acc = bucket[instrument_id]

        # Track the earliest and latest popularity seen for each (day, instrument) pair
        if acc.get("start_pop_timestamp") is None or acc["start_pop_timestamp"] > doc["timestamp"]:
            acc["start_pop_timestamp"] = doc["timestamp"]
            acc["start_pop"] = int(doc["popularity"])

        if acc.get("end_pop_timestamp") is None or acc["end_pop_timestamp"] < doc["timestamp"]:
            acc["end_pop_timestamp"] = doc["timestamp"]
            acc["end_pop"] = int(doc["popularity"])

    for doc in db["quotes"].find():
        # Skip incomplete documents and weekend updates
        if (doc.get("instrument_id") is None or doc.get("updated_at") is None
                or doc.get("last_trade_price") is None or doc["updated_at"].isoweekday() >= 6):
            continue

        day_id = doc["updated_at"].strftime("%Y-%m-%d")
        if day_id in disabled_day_ids:
            continue

        bucket = buckets_by_dayid.get(day_id)
        if bucket is None:
            buckets_by_dayid[day_id] = {}
            bucket = buckets_by_dayid[day_id]

        instrument_id = doc["instrument_id"]
        acc = bucket.get(instrument_id)
        if acc is None:
            bucket[instrument_id] = {}
            acc = bucket[instrument_id]

        # Track the earliest and latest trade price seen for each (day, instrument) pair
        if (acc.get("start_price_timestamp") is None
                or acc["start_price_timestamp"] > doc["updated_at"]):
            acc["start_price_timestamp"] = doc["updated_at"]
            acc["start_price"] = float(doc["last_trade_price"])

        if acc.get("end_price_timestamp") is None or acc["end_price_timestamp"] < doc["updated_at"]:
            acc["end_price_timestamp"] = doc["updated_at"]
            acc["end_price"] = float(doc["last_trade_price"])

    save_backfill(db, buckets_by_dayid)
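# `save_backfill(db, buckets_by_dayid)` is not defined in this file. A minimal sketch,
# under the assumption that each (day_id, instrument_id) accumulator is upserted into a
# dedicated collection; the collection name "daily_diffs" and the document shape are
# assumptions, not the actual implementation:
def save_backfill(db, buckets_by_dayid: dict):
    for day_id, bucket in buckets_by_dayid.items():
        for instrument_id, acc in bucket.items():
            db["daily_diffs"].update_one(
                {"day_id": day_id, "instrument_id": instrument_id},
                {"$set": acc},
                upsert=True,
            )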