# Requires (module level): import json, time; from datetime import date, timedelta;
# import redis; from hearthstone.enums import CardClass, FormatType; plus the
# project's redshift helpers, REDSHIFT_QUERY and deck_prediction_tree.
def handle(self, *args, **options):
    conn = redshift.get_new_redshift_connection()

    lookback_val = options["look_back"]
    lookback = int(lookback_val[0]) if lookback_val else 14

    end_date = date.today() - timedelta(days=1)
    start_date = end_date - timedelta(days=lookback)

    redis_host = options["redis_host"]
    if redis_host:
        redis_client = redis.StrictRedis(host=redis_host[0])
        pipeline = redis_client.pipeline(transaction=False)
    else:
        pipeline = None

    params = {"start_date": start_date, "end_date": end_date}
    compiled_statement = REDSHIFT_QUERY.params(params).compile(bind=conn)

    start_ts = time.time()
    for row in conn.execute(compiled_statement):
        as_of = row["match_start"]
        deck_id = row["deck_id"]
        dbf_map = {dbf_id: count for dbf_id, count in json.loads(row["deck_list"])}
        player_class = CardClass(row["player_class"])
        # game_type 2 is ranked Standard; everything else is treated as Wild
        format = FormatType.FT_STANDARD if row["game_type"] == 2 else FormatType.FT_WILD
        played_cards = json.loads(row["played_cards"])

        tree = deck_prediction_tree(player_class, format, redis_client=pipeline)
        min_played_cards = tree.max_depth - 1
        played_card_dbfs = played_cards[:min_played_cards]

        # Only observe complete (30-card) decks
        deck_size = sum(dbf_map.values())
        if deck_size == 30:
            tree.observe(deck_id, dbf_map, played_card_dbfs, as_of=as_of)
            # Flush the pipeline in batches to bound client-side memory
            if pipeline is not None and len(pipeline) >= 8000:
                pipeline.execute()

    # Flush whatever remains buffered
    if pipeline is not None and len(pipeline):
        pipeline.execute()

    duration_seconds = round(time.time() - start_ts)
    print("Took: %i Seconds" % duration_seconds)
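# --- A minimal sketch of the batched-pipeline pattern used above. Commands are
# buffered client-side and sent in one round trip per batch, which is what keeps
# a long import fast. The host, key names and batch size here are assumptions
# for illustration, not values from this codebase; a reachable Redis server is
# assumed.
import redis

def batched_increments(counts, batch_size=8000, host="localhost"):
    client = redis.StrictRedis(host=host)
    pipeline = client.pipeline(transaction=False)
    for key, amount in counts.items():
        pipeline.incrby(key, amount)  # buffered client-side, not yet sent
        if len(pipeline) >= batch_size:
            pipeline.execute()  # flush one full batch in a single round trip
    if len(pipeline):
        pipeline.execute()  # flush the remainder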
# Requires (module level): import json, math; from datetime import date, timedelta;
# from hearthstone.enums import CardClass, FormatType; plus the project's redshift
# helpers, REDSHIFT_QUERY, Archetype, ClusterSnapshot and classify_deck.
def handle(self, *args, **options):
    conn = redshift.get_new_redshift_connection()
    is_dry_run = options["dry_run"]
    verbosity = options["verbosity"]

    end_ts = date.today()
    start_ts = end_ts - timedelta(days=options["lookback"])
    params = {"start_date": start_ts, "end_date": end_ts}
    compiled_statement = REDSHIFT_QUERY.params(params).compile(bind=conn)

    for card_class in CardClass:
        # 2..10 are the playable classes (DRUID through WARRIOR)
        if 2 <= card_class <= 10:
            for a in Archetype.objects.live().filter(player_class=card_class):
                self.archetype_map[a.id] = a

            # Standard signature weights
            standard_weight_values = ClusterSnapshot.objects.get_signature_weights(
                FormatType.FT_STANDARD, card_class
            )
            if len(standard_weight_values):
                self.signature_weights[FormatType.FT_STANDARD][card_class] = (
                    standard_weight_values
                )

            # Wild signature weights
            wild_weight_values = ClusterSnapshot.objects.get_signature_weights(
                FormatType.FT_WILD, card_class
            )
            if len(wild_weight_values):
                self.signature_weights[FormatType.FT_WILD][card_class] = (
                    wild_weight_values
                )

    result_set = list(conn.execute(compiled_statement))
    total_rows = len(result_set)
    self.stdout.write("%i decks to update" % total_rows)
    if is_dry_run:
        self.stdout.write("Dry run, will not flush to databases")

    for counter, row in enumerate(result_set):
        deck_id = row["deck_id"]

        # Flush buffers periodically so they do not grow unbounded
        if not is_dry_run and counter % 100000 == 0:
            self.flush_db_buffer()
            self.flush_firehose_buffer()

        if deck_id is None:
            self.stderr.write("Got deck_id %r ... skipping" % deck_id)
            continue

        current_archetype_id = row["archetype_id"]
        player_class = CardClass(row["player_class"])
        if player_class == CardClass.NEUTRAL:
            # Most likely noise
            self.stderr.write("Found and skipping NEUTRAL data: %r" % row)
            continue

        # game_type 2 is ranked Standard; everything else is treated as Wild
        format = FormatType.FT_STANDARD if row["game_type"] == 2 else FormatType.FT_WILD
        dbf_map = {dbf_id: count for dbf_id, count in json.loads(row["deck_list"])}

        if player_class not in self.signature_weights[format]:
            raise RuntimeError(
                "%r not found for %r. Are signatures present?" % (player_class, format)
            )

        if self.signature_weights[format][player_class]:
            new_archetype_id = classify_deck(
                dbf_map, self.signature_weights[format][player_class]
            )
            if new_archetype_id == current_archetype_id:
                if verbosity > 1:
                    self.stdout.write("Deck %r - Nothing to do." % deck_id)
                continue

            current_name = self.get_archetype_name(current_archetype_id)
            new_name = self.get_archetype_name(new_archetype_id)
            pct_complete = str(math.floor(100.0 * counter / total_rows))

            self.stdout.write(
                "\t[%s%%] Reclassifying deck %r: %s => %s\n" % (
                    pct_complete, deck_id, current_name, new_name
                )
            )
            if not is_dry_run:
                self.buffer_archetype_update(deck_id, new_archetype_id)

    if not is_dry_run:
        self.flush_db_buffer()
        self.flush_firehose_buffer()
    else:
        self.stdout.write("Dry run complete")
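# --- An illustrative sketch (not the project's classify_deck implementation) of
# how signature-weight classification can work: score each archetype's signature
# against the deck's card counts and pick the best match above a cutoff. The
# weight layout {archetype_id: {dbf_id: weight}} and the threshold value are
# assumptions for the example.
def classify_deck_sketch(dbf_map, signature_weights, threshold=0.25):
    best_id, best_score = None, 0.0
    for archetype_id, weights in signature_weights.items():
        # Normalize by the signature's total weight so archetypes with larger
        # signatures are not favored automatically
        total_weight = sum(weights.values()) or 1.0
        score = sum(
            weights.get(dbf_id, 0.0) * count for dbf_id, count in dbf_map.items()
        ) / total_weight
        if score > best_score:
            best_id, best_score = archetype_id, score
    # Below the cutoff the deck stays unclassified rather than force-matched
    return best_id if best_score >= threshold else None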
# Requires (module level): import csv, os; import pytz; from datetime import timedelta;
# from dateutil.parser import parse; plus the project's get_new_redshift_connection,
# REDSHIFT_QUERY, ClusterSetSnapshot, Archetype and _get_deck_ids_from_snapshot.
def handle(self, *args, **options):
    if options["resume"]:
        if not os.path.isfile(options["out"]):
            self.stdout.write("File does not exist, unable to resume. Aborting.")
            return
    else:
        if os.path.isfile(options["out"]) and not options["noinput"]:
            msg = "File already exists. Overwrite?"
            if input("%s [y/N] " % msg).lower() != "y":
                self.stdout.write("Aborting.")
                return

    one_day = timedelta(days=1)
    start_date = parse(options["from"]).replace(
        hour=0, minute=0, second=0, microsecond=0, tzinfo=pytz.utc
    )
    end_date = parse(options["to"]).replace(
        hour=0, minute=0, second=0, microsecond=0, tzinfo=pytz.utc
    )
    # Swap the bounds if they were given in reverse order; compare the parsed
    # dates, not the raw strings, so non-ISO formats sort correctly
    if start_date > end_date:
        start_date, end_date = end_date, start_date

    self.stdout.write("Gathering data from %s to %s..." % (
        start_date.date(), end_date.date(),
    ))

    conn = get_new_redshift_connection()
    fieldnames = [
        "game_date", "format", "player_class", "decklist",
        "observed_decklist", "play_sequence", "label",
    ]

    rows = 0
    # Append when resuming so existing rows are preserved; otherwise truncate
    mode = "a" if options["resume"] else "w"
    with open(options["out"], mode, newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if not options["resume"]:
            writer.writeheader()

        current_date = start_date
        archetype_ids = set()
        while current_date <= end_date:
            self.stdout.write("Gathering decks from %s..." % current_date.date())

            # For each day, grab the most recent snapshot promoted on or before
            # that day. While this ignores any earlier snapshots on the same day,
            # we don't expect the deck composition to have changed significantly
            # between snapshots.
            snapshot = ClusterSetSnapshot.objects.prefetch_related(
                "classclustersnapshot_set"
            ).filter(
                promoted_on__lte=current_date
            ).order_by("-promoted_on").first()

            if snapshot:
                deck_ids, archetype_by_deck_id = _get_deck_ids_from_snapshot(snapshot)
                self.stdout.write("Got %d decks" % len(deck_ids))

                # Grab the instance rows from Redshift, based on the cluster deck ids
                self.stdout.write("Gathering games from %s..." % current_date.date())
                params = {
                    "min_date": current_date,
                    "max_date": current_date,
                    "deck_ids": deck_ids,
                }
                compiled_statement = REDSHIFT_QUERY.params(params).compile(bind=conn)

                for row in conn.execute(compiled_statement):
                    archetype_id = archetype_by_deck_id[row.deck_id]
                    if archetype_id:
                        archetype_ids.add(archetype_id)
                    vals = {
                        "game_date": str(row.game_date),
                        "format": row.format,
                        "player_class": row.player_class,
                        "decklist": row.decklist,
                        "observed_decklist": row.observed_decklist,
                        "play_sequence": row.play_sequence,
                        "label": archetype_id,
                    }
                    rows += 1
                    writer.writerow(vals)
            else:
                self.stdout.write("No snapshot live on %s" % current_date.date())

            current_date += one_day

    if options["write_labels"]:
        with open(options["write_labels"], "wt", newline="") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=["label", "name"])
            writer.writeheader()
            for archetype_id in sorted(archetype_ids):
                if archetype_id == -1:
                    archetype_name = "Experimental"
                else:
                    archetype = Archetype.objects.filter(id=archetype_id).first()
                    # Guard against ids with no matching Archetype row
                    archetype_name = archetype.name if archetype else "Unknown"
                writer.writerow({"label": archetype_id, "name": archetype_name})

    self.stdout.write("Done. Wrote %d rows." % rows)
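# --- A small sketch of consuming the exported training CSV, e.g. to check the
# label distribution before training on it. The path "training_data.csv" is a
# hypothetical output file; the field names match the writer's header above.
import csv
from collections import Counter

def label_distribution(path="training_data.csv"):
    counts = Counter()
    with open(path, newline="") as csvfile:
        for row in csv.DictReader(csvfile):
            counts[row["label"]] += 1  # archetype id is used as the label
    return counts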