def extract(self, settings, force, restart, start, merge):
    """
    Incrementally copy `job` records (and their snowflake of related rows)
    from the Treeherder MySQL source into a BigQuery table.

    Progress is tracked as a (last_modified, job_id) pair stored in Redis at
    `settings.extractor.key`, so repeated runs resume where the last one
    stopped.

    :param settings: config with `extractor`, `source`, and `destination` sections
    :param force: continue (warn only) when the source schema changed since last run
    :param restart: discard saved Redis state and start from (0, 0)
    :param start: explicit last_modified value to resume from (overrides saved state)
    :param merge: merge destination shards before extraction begins
    """
    if not settings.extractor.app_name:
        # NOTE(review): mo-logs Log.error raises, halting here — confirm
        Log.error("Expecting an extractor.app_name in config file")

    # SETUP DESTINATION
    destination = bigquery.Dataset(
        dataset=settings.extractor.app_name, kwargs=settings.destination
    ).get_or_create_table(settings.destination)

    # FIX: pre-declare so `finally` can close them even when extraction
    # fails partway (originally the close() calls were inside the `try`
    # and were skipped on any exception, leaking the connection)
    source = None
    extractor = None
    try:
        if merge:
            with Timer("merge shards"):
                destination.merge_shards()

        # RECOVER LAST SQL STATE
        redis = Redis.from_url(REDIS_URL)
        state = redis.get(settings.extractor.key)

        if start:
            # explicit starting point; not persisted until first chunk lands
            state = start, 0
        elif restart or not state:
            state = (0, 0)
            redis.set(settings.extractor.key, value2json(state).encode("utf8"))
        else:
            state = json2value(state.decode("utf8"))

        last_modified, job_id = state

        # SCAN SCHEMA, GENERATE EXTRACTION SQL
        extractor = MySqlSnowflakeExtractor(settings.source)
        canonical_sql = extractor.get_sql(SQL("SELECT 0"))

        # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
        old_sql = redis.get(settings.extractor.sql)
        if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
            if force:
                Log.warning("Schema has changed")
            else:
                Log.error("Schema has changed")
        redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

        # SETUP SOURCE
        source = MySQL(settings.source.database)

        while True:
            Log.note(
                "Extracting jobs for last_modified={{last_modified|datetime|quote}}, job.id={{job_id}}",
                last_modified=last_modified,
                job_id=job_id,
            )

            # Example: job.id ==283890114
            # get_ids = ConcatSQL(
            #     (SQL_SELECT, sql_alias(quote_value(283890114), "id"))
            # )
            # paginate on (last_modified, id) so ties on last_modified are
            # not skipped or re-emitted between chunks
            get_ids = sql_query({
                "from": "job",
                "select": ["id"],
                "where": {
                    "or": [
                        {"gt": {"last_modified": Date(last_modified)}},
                        {
                            "and": [
                                {"eq": {"last_modified": Date(last_modified)}},
                                {"gt": {"id": job_id}},
                            ]
                        },
                    ]
                },
                "sort": ["last_modified", "id"],
                "limit": settings.extractor.chunk_size,
            })
            sql = extractor.get_sql(get_ids)

            # PULL FROM source, AND PUSH TO destination
            acc = []
            with source.transaction():
                cursor = source.query(sql, stream=True, row_tuples=True)
                extractor.construct_docs(cursor, acc.append, False)
            if not acc:
                break

            # SOME LIMITS PLACES ON STRING SIZE
            for fl in jx.drill(acc, "job_log.failure_line"):
                fl.message = strings.limit(fl.message, 10000)

            # LIMIT NUMBER OF text_log_error: de-duplicate by line, keep first 100
            for job in acc:
                if len(job.text_log_error) > 100:
                    shorter = []
                    known = set()
                    for e in job.text_log_error:
                        if e.line in known:
                            continue
                        known.add(e.line)
                        shorter.append(e)
                    job.text_log_error = shorter[:100]

            # ASSIGN TIMESTAMP
            for job in acc:
                job.etl.timestamp = Date.now()
            destination.extend(acc)

            # RECORD THE STATE (only after the chunk is safely in BigQuery)
            last_doc = acc[-1]
            last_modified, job_id = last_doc.last_modified, last_doc.id
            redis.set(
                settings.extractor.key,
                value2json((last_modified, job_id)).encode("utf8"),
            )

            if len(acc) < settings.extractor.chunk_size:
                break
    except Exception as e:
        # best-effort ETL: log and fall through to the merge step
        Log.warning("problem with extraction", cause=e)
    finally:
        # release connections even when extraction failed
        try:
            if source is not None:
                source.close()
            if extractor is not None:
                extractor.close()
        except Exception as e:
            Log.warning("problem closing connections", cause=e)

    Log.note("done job extraction")

    try:
        with Timer("merge shards"):
            destination.merge_shards()
    except Exception as e:
        Log.warning("problem with merge", cause=e)

    Log.note("done job merge")
def extract(self, settings, force, restart, merge):
    """
    Incrementally copy performance-alert summaries (and their snowflake of
    related rows) from the Treeherder MySQL source into a BigQuery table.

    Progress is tracked as a (last_modified, alert_id) pair stored in Redis
    at `settings.extractor.key`, so repeated runs resume where the last one
    stopped. Only records created within roughly the last year are pulled.

    :param settings: config with `extractor`, `source`, and `destination` sections
    :param force: continue (warn only) when the source schema changed since last run
    :param restart: discard saved Redis state and start from (0, 0)
    :param merge: merge destination shards before extraction begins
    """
    if not settings.extractor.app_name:
        # NOTE(review): mo-logs Log.error raises, halting here — confirm
        Log.error("Expecting an extractor.app_name in config file")

    # SETUP DESTINATION
    destination = bigquery.Dataset(
        dataset=settings.extractor.app_name, kwargs=settings.destination
    ).get_or_create_table(settings.destination)

    # FIX: pre-declare so `finally` can close them even when extraction
    # fails partway (originally the close() calls were inside the `try`
    # and were skipped on any exception, leaking the connection)
    source = None
    extractor = None
    try:
        if merge:
            with Timer("merge shards"):
                destination.merge_shards()

        # RECOVER LAST SQL STATE
        redis = Redis.from_url(REDIS_URL)
        state = redis.get(settings.extractor.key)

        if restart or not state:
            state = (0, 0)
            redis.set(settings.extractor.key, value2json(state).encode("utf8"))
        else:
            state = json2value(state.decode("utf8"))

        last_modified, alert_id = state
        last_modified = Date(last_modified)

        # SCAN SCHEMA, GENERATE EXTRACTION SQL
        extractor = MySqlSnowflakeExtractor(settings.source)
        canonical_sql = extractor.get_sql(SQL("SELECT 0"))

        # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
        old_sql = redis.get(settings.extractor.sql)
        if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
            if force:
                Log.warning("Schema has changed")
            else:
                Log.error("Schema has changed")
        redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

        # SETUP SOURCE
        source = MySQL(settings.source.database)

        while True:
            Log.note(
                "Extracting alerts for last_modified={{last_modified|datetime|quote}}, alert.id={{alert_id}}",
                last_modified=last_modified,
                alert_id=alert_id,
            )

            last_year = (
                Date.today() - YEAR + DAY
            )  # ONLY YOUNG RECORDS CAN GO INTO BIGQUERY

            # a summary is re-extracted when it, or any of its alerts,
            # changed since last_modified
            get_ids = SQL(
                f"""
                SELECT s.id
                FROM treeherder.performance_alert_summary s
                LEFT JOIN treeherder.performance_alert a ON s.id=a.summary_id
                WHERE s.created>{quote_value(last_year)} AND (s.last_updated > {quote_value(last_modified)} OR a.last_updated > {quote_value(last_modified)})
                GROUP BY s.id
                ORDER BY s.id
                LIMIT {quote_value(settings.extractor.chunk_size)}
                """
            )
            sql = extractor.get_sql(get_ids)

            # PULL FROM source, AND PUSH TO destination
            acc = []
            with source.transaction():
                cursor = source.query(sql, stream=True, row_tuples=True)
                extractor.construct_docs(cursor, acc.append, False)
            if not acc:
                break

            # ASSIGN TIMESTAMP
            for r in acc:
                r.etl.timestamp = Date.now()
            destination.extend(acc)

            # RECORD THE STATE (only after the chunk is safely in BigQuery)
            last_doc = acc[-1]
            last_modified, alert_id = last_doc.created, last_doc.id
            redis.set(
                settings.extractor.key,
                value2json((last_modified, alert_id)).encode("utf8"),
            )

            if len(acc) < settings.extractor.chunk_size:
                break
    except Exception as e:
        # best-effort ETL: log and fall through to the merge step
        Log.warning("problem with extraction", cause=e)
    finally:
        # release connections even when extraction failed
        try:
            if source is not None:
                source.close()
            if extractor is not None:
                extractor.close()
        except Exception as e:
            Log.warning("problem closing connections", cause=e)

    Log.note("done alert extraction")

    try:
        with Timer("merge shards"):
            destination.merge_shards()
    except Exception as e:
        Log.warning("problem with merge", cause=e)

    Log.note("done alert merge")
    Log.stop()
def extract(self, settings, force, restart, merge):
    """
    Incrementally copy `performance_datum` records (and their snowflake of
    related rows) from the Treeherder MySQL source into a BigQuery table.

    Progress is tracked as a single perf id stored in Redis at
    `settings.extractor.key`, so repeated runs resume where the last one
    stopped.

    :param settings: config with `extractor`, `source`, and `destination` sections
    :param force: continue (warn only) when the source schema changed since last run
    :param restart: discard saved Redis state and start from the baseline id
    :param merge: merge destination shards before extraction begins
    """
    if not settings.extractor.app_name:
        # NOTE(review): mo-logs Log.error raises, halting here — confirm
        Log.error("Expecting an extractor.app_name in config file")

    # SETUP DESTINATION
    destination = bigquery.Dataset(
        dataset=settings.extractor.app_name, kwargs=settings.destination
    ).get_or_create_table(settings.destination)

    # FIX: pre-declare so `finally` can close them even when extraction
    # fails partway (originally the close() calls were inside the `try`
    # and were skipped on any exception, leaking the connection)
    source = None
    extractor = None
    try:
        if merge:
            with Timer("merge shards"):
                destination.merge_shards()

        # RECOVER LAST SQL STATE
        redis = Redis.from_url(REDIS_URL)
        state = redis.get(settings.extractor.key)

        if restart or not state:
            # baseline perf id to start extraction from
            state = 916850000
            redis.set(settings.extractor.key, value2json(state).encode("utf8"))
        else:
            state = json2value(state.decode("utf8"))

        perf_id = state

        # SCAN SCHEMA, GENERATE EXTRACTION SQL
        extractor = MySqlSnowflakeExtractor(settings.source)
        canonical_sql = extractor.get_sql(SQL("SELECT 0"))

        # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
        old_sql = redis.get(settings.extractor.sql)
        if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
            if force:
                Log.warning("Schema has changed")
            else:
                Log.error("Schema has changed")
        redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

        # SETUP SOURCE
        source = MySQL(settings.source.database)

        while True:
            Log.note("Extracting perfs for perf.id={{perf_id}}", perf_id=perf_id)

            # get_ids = sql_query(
            #     {
            #         "from": "performance_datum",
            #         "select": ["id"],
            #         "where": {"gt": {"id": perf_id}},
            #         "sort": ["id"],
            #         "limit": settings.extractor.chunk_size,
            #     }
            # )
            # use the Django ORM to render the id-selection SQL
            get_ids = SQL(
                str(
                    (
                        PerformanceDatum.objects.filter(id__gt=perf_id)
                        .values("id")
                        .order_by("id")[: settings.extractor.chunk_size]
                    ).query
                )
            )
            sql = extractor.get_sql(get_ids)

            # PULL FROM source, AND PUSH TO destination
            acc = []
            with source.transaction():
                cursor = source.query(sql, stream=True, row_tuples=True)
                extractor.construct_docs(cursor, acc.append, False)
            if not acc:
                break

            # TODO: Remove me July 2021
            # OLD PERF RECORDS HAVE NO CORRESPONDING JOB
            # ADD job.submit_time FOR PARTITIONING
            for a in acc:
                if not a.job.submit_time:
                    a.job.submit_time = a.push_timestamp
                a.etl.timestamp = Date.now()
            destination.extend(acc)

            # RECORD THE STATE (only after the chunk is safely in BigQuery)
            last_doc = acc[-1]
            perf_id = last_doc.id
            redis.set(settings.extractor.key, value2json(perf_id).encode("utf8"))

            if len(acc) < settings.extractor.chunk_size:
                break
    except Exception as e:
        # best-effort ETL: log and fall through to the merge step
        Log.warning("problem with extraction", cause=e)
    finally:
        # release connections even when extraction failed
        try:
            if source is not None:
                source.close()
            if extractor is not None:
                extractor.close()
        except Exception as e:
            Log.warning("problem closing connections", cause=e)

    Log.note("done perf extraction")

    try:
        with Timer("merge shards"):
            destination.merge_shards()
    except Exception as e:
        Log.warning("problem with merge", cause=e)

    Log.note("done perf merge")