def _ingest(file_path):
    raw_path = AIRFLOW_RAW / f"{file_path.name[:22]}.tsv"
    ti = preprocess(raw_path)
    file_config = ti.xcom_pull("config", "init")
    extract_table_name = file_config["extract_table"]
    load_table_name = file_config["load_table"]
    date = pendulum.from_format(file_path.stem[23:], "YYYY_MM_DD").naive()
    table_name = f"fact.session_{date.format('YYYY_MM_DD')}"
    new_tables.update([table_name, extract_table_name, load_table_name])
    Fact.etl(date, file_path.name, extract_table_name, load_table_name)
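# A minimal sketch of how the _ingest helper above might be exposed to the
# tests further down as the `ingest` fixture; pytest is an assumption, and any
# cleanup of the tables collected in `new_tables` is deliberately left out.
import pytest


@pytest.fixture
def ingest():
    yield _ingest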
def log_hours():
    logging.basicConfig(level=logging.INFO)
    categories = settings.HAMSTER_TO_DP.keys()
    tag_logged = Tag.get_or_create(name='_logged_in_dp_')
    already_tagged = [ft.fact.id for ft in FactTag.filter(tag=tag_logged).distinct()]

    # Get all facts that belong to exportable categories, are finished,
    # and were not previously posted.
    facts = Fact.filter(
        Q(activity__category__name__in=categories) &
        ~Q(end_time__is=None) &
        # ~Q(fact_tags_set__tag=tag_logged)  # * see note
        ~Q(id__in=already_tagged)
    )
    # NOTE
    # I want to exclude Facts tagged with ``tag_logged``, but that ~Q() condition
    # only excludes the facts that are ONLY tagged with ``tag_logged``.
    # The last Q is a workaround, but it implies a nested select.
    # How should I write this query?

    if not facts.exists():
        logging.info("You're up to date! There are no unsynced tasks.")
        return

    br = DotProjectBot(settings.DP_BASE_URL)
    br.login(settings.DP_USERNAME, settings.DP_PASSWORD)

    for f in facts:
        # Build the description from the activity name, tags and free text.
        tags = ', '.join([ft.tag.name for ft in f.fact_tags_set])
        if tags and f.description:
            description = '%s %s: %s' % (f.activity.name, tags, f.description)
        elif tags:
            description = '%s %s' % (f.activity.name, tags)
        elif f.description:
            description = '%s %s' % (f.activity.name, f.description)
        else:
            description = f.activity.name
        dp_task_id = settings.HAMSTER_TO_DP[f.category.name]
        # ...and post the fact into dotproject!
        br.log_task(dp_task_id, f.start_time, f.duration, description)
        # Then mark the fact as logged.
        ft = FactTag(fact=f, tag=tag_logged)
        ft.save()
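# A hedged sketch for the NOTE above: if this ORM mirrors Django's queryset
# semantics, an exclude() call on the related tag pushes the anti-join into
# the database instead of materialising `already_tagged` in Python first
# (it still compiles to a subquery). The exclude() method and the
# fact_tags_set__tag lookup are assumptions about this ORM's API, not
# something confirmed by the code above.
facts = Fact.filter(
    Q(activity__category__name__in=categories) & ~Q(end_time__is=None)
).exclude(fact_tags_set__tag=tag_logged)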
def consolidate_callable(**kwargs):
    """Consolidate the session table."""
    task_instance = kwargs["ti"]
    table_config = task_instance.xcom_pull(key="config", task_ids="init")
    date = table_config["date"]
    date = pendulum.from_format(date, "YYYY-MM-DD[T]HH:mm:ss").naive()
    table_name = table_config["table_name"]
    session = Session()
    try:
        Fact.consolidate(date)
        ETL.set_status("consolidation", table_name, date, "completed", session)
        session.close()
    except Exception as e:
        ETL.set_status("consolidation", table_name, date, "quarantine", session)
        session.close()
        raise e
def ingest_callable(**kwargs):
    """Ingest preprocessed wifi log files into the database."""
    task_instance = kwargs["ti"]
    file_config = task_instance.xcom_pull(key="config", task_ids="init")
    file_stem = file_config["file_stem"]
    extract_table_name = file_config["extract_table"]
    load_table_name = file_config["load_table"]

    logging.info(f"Looping through '{file_stem}*.csv'")
    ingest_errors = []
    for file_path in AIRFLOW_IMPORT.glob(f"{file_stem}*.csv"):
        logging.info(f"Ingesting {file_path}.")
        date = pendulum.from_format(file_path.stem[23:], "YYYY_MM_DD").naive()
        session = Session()
        if ETL.can_process("session_file", file_path, date, session):
            try:
                ETL.commit_new("session_file", file_path, date, session)
                Fact.etl(date, file_path.name, extract_table_name, load_table_name)
                ETL.set_status("session_file", file_path, date, "completed", session)
                session.close()
            except Exception:
                ingest_errors.append(file_path)
                ETL.set_status("session_file", file_path, date, "quarantine", session)
                session.close()

    if len(ingest_errors) > 0:
        logging.info(
            f"The following files could not be ingested: {ingest_errors}.")
        raise Exception(
            f"A total of {len(ingest_errors)} files could not be ingested. Failing DAG run."
        )
def test_consolidate(ingest, clean_etl):
    date = pendulum.from_format("2020_03_27", "YYYY_MM_DD").naive()
    for file_stem in ["2020_04_01_00_00_00-v2", "2020_03_27_00_00_00-v2"]:
        file_path = Path(f"tmp/raw/{file_stem}_2020_03_27.csv")
        ingest(file_path)
    table = Fact.child_or_load_table(date)
    task_instance = TaskInstanceMock("init")
    task_instance.xcom_push("config", {
        "date": str(date),
        "table_name": table.fullname
    })
    clean_etl("consolidation", table.fullname, date)
    consolidate_callable(ti=task_instance)

    with engine.begin() as conn:
        count = conn.execute(table.select()).rowcount
        assert count == 198
        count = conn.execute(
            table.select(table.c.session_end == None)).rowcount
        assert count == 0
        count = conn.execute(table.select(table.c.pulltime_last)).rowcount
        assert count == 101
        count = conn.execute(
            table.select(
                and_(table.c.pulltime_last,
                     table.c.session_end != table.c.pulltime))).rowcount
        assert count == 1
        count1 = conn.execute(table.select(table.c.pulltime_last)).rowcount
        count2 = conn.execute(
            select(
                [
                    table.c.userid_key,
                    table.c.mac_key,
                    table.c.ap_key,
                    table.c.ssid_key,
                    table.c.protocol_key,
                    table.c.session_start,
                ],
                distinct=True,
            )).rowcount
        assert count1 == count2
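# A possible shape for the TaskInstanceMock used in the test above: a tiny
# stand-in for Airflow's TaskInstance that records xcom_push values and serves
# them back through xcom_pull with either positional or keyword arguments.
# The class name matches the tests; this body is a sketch, not the project's
# actual implementation.
class TaskInstanceMock:
    def __init__(self, task_id):
        self.task_id = task_id
        self._xcom = {}

    def xcom_push(self, key, value):
        self._xcom[key] = value

    def xcom_pull(self, key=None, task_ids=None):
        # The callables pull with task_ids="init"; this mock only ever holds
        # its own pushes, so the task_ids filter is ignored.
        return self._xcom.get(key)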
def clean_callable(**kwargs):
    """Remove generated preprocessed files and fail the DAG if any previous task failed."""
    task_instance = kwargs["ti"]
    file_config = task_instance.xcom_pull(key="config", task_ids="init")
    file_stem = file_config["file_stem"]
    for file_path in AIRFLOW_IMPORT.glob(f"{file_stem}{RAW_GLOB}"):
        if file_path.exists():
            logging.info(f"Removing {file_path}.")
            file_path.unlink()

    extract_table_name = file_config["extract_table"]
    load_table_name = file_config["load_table"]
    Fact.remove_tables(extract_table_name, load_table_name)

    for task_instance in kwargs["dag_run"].get_task_instances():
        if (task_instance.current_state() == State.FAILED
                and task_instance.task_id != kwargs["task_instance"].task_id):
            raise Exception(
                f"Failing this DAG run because upstream task {task_instance.task_id} failed."
            )
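# A sketch of how these callables might be wired into a DAG. The init_callable
# below is hypothetical and only illustrates that some upstream "init" task
# must push the "config" XCom the other callables pull; the config values are
# illustrative. Imports and the `schedule` argument follow Airflow 2.x.
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.utils.trigger_rule import TriggerRule
import pendulum


def init_callable(**kwargs):
    # Hypothetical: push the shared config consumed by the downstream tasks.
    kwargs["ti"].xcom_push(key="config", value={
        "file_stem": "2020_04_01_00_00_00-v2",
        "extract_table": "extract.session",
        "load_table": "load.session",
        "date": "2020-03-27T00:00:00",
        "table_name": "fact.session_2020_03_27",
    })


with DAG(
    dag_id="session_etl",
    start_date=pendulum.datetime(2020, 3, 27),
    schedule=None,
) as dag:
    init = PythonOperator(task_id="init", python_callable=init_callable)
    ingest = PythonOperator(task_id="ingest", python_callable=ingest_callable)
    consolidate = PythonOperator(task_id="consolidate",
                                 python_callable=consolidate_callable)
    # ALL_DONE so cleanup runs even when ingestion or consolidation fails;
    # clean_callable then re-raises to fail the DAG run.
    clean = PythonOperator(task_id="clean", python_callable=clean_callable,
                           trigger_rule=TriggerRule.ALL_DONE)

    init >> ingest >> consolidate >> clean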
def test_ingest_preprocessed(ingest):
    file_stem = "2020_04_01_00_00_00-v2"
    file_path = Path(f"tmp/raw/{file_stem}_2020_03_27.csv")
    ingest(file_path)
    date = pendulum.from_format(file_path.stem[23:], "YYYY_MM_DD").naive()
    child_fact = Fact.child_or_load_table(date)
    with engine.begin() as conn:
        count = conn.execute(child_fact.select()).rowcount
        assert count == 104
        count = conn.execute(
            child_fact.select(child_fact.c.session_end == None)).rowcount
        assert count == 104
from mongoengine import *
from variables import Variables
from models import Fact, Rumor
import ast
import random

connect(host=Variables.DB_CONNECTION)

if Variables.seed:
    with open('/home/talalnks/RumorBusterBackend/RumorsFacts.json') as dataFile:
        data = ast.literal_eval(dataFile.read())
    for rumor in data:
        existingRumor = Rumor.objects(body=rumor['Rumors']).first()
        if not existingRumor:
            r = Rumor(body=rumor['Rumors'],
                      report_counter=random.randint(25, 150)).save()
            if len(rumor['Facts']) > 0:
                f = Fact(body=rumor['Facts'], rumor=r).save()
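# A sketch of the document models the seed script above assumes. Field types
# are inferred from usage (body strings, an integer report counter, and a
# reference from Fact to Rumor); the real models.py may differ.
from mongoengine import Document, StringField, IntField, ReferenceField


class Rumor(Document):
    body = StringField(required=True)
    report_counter = IntField(default=0)


class Fact(Document):
    body = StringField(required=True)
    rumor = ReferenceField(Rumor)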
def submit(request):
    if request.method == 'POST':
        submit = request.POST
    elif request.method == 'GET':
        submit = request.GET
    else:
        raise Http404

    mac = submit.get('mac')
    machine = None
    if mac:
        try:
            machine = Machine.objects.get(mac=mac)
        except Machine.DoesNotExist:
            machine = Machine(mac=mac)

    if machine:
        if submit.get('hostname'):
            machine.hostname = submit.get('hostname')
        machine.last_checkin = timezone.now()
        machine.save()

        response = ''
        facts = submit.get('Facts')
        if facts is not None:
            facts = json.loads(facts)
            for key in facts:
                try:
                    fact = Fact.objects.get(machine=machine, name=key)
                except Fact.DoesNotExist:
                    fact = Fact(machine=machine, name=key)
                fact.last_update = timezone.now()
                fact.value = facts[key]
                fact.save()

        facts = submit.get('HistoricalFacts')
        if facts is not None:
            facts = json.loads(facts)
            for key in facts:
                fact = HistoricalFact(machine=machine, name=key)
                fact.timestamp = timezone.now()
                fact.value = facts[key]
                fact.save()

        return HttpResponse('Report submitted.\n')
    return HttpResponse('Report not submitted.\n')
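# A hedged example of what a client check-in to the submit view above might
# look like, assuming the view is routed at /submit and using the requests
# library. The URL, mac address and fact values are illustrative only.
import json
import requests

payload = {
    "mac": "00:11:22:33:44:55",
    "hostname": "lab-imac-01",
    "Facts": json.dumps({"os_version": "12.6", "memory_gb": "16"}),
    "HistoricalFacts": json.dumps({"uptime_days": "4"}),
}
resp = requests.post("https://example.com/submit", data=payload)
print(resp.text)  # "Report submitted." on success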