Example #1
def _ingest(file_path):
    raw_path = AIRFLOW_RAW / f"{file_path.name[:22]}.tsv"
    ti = preprocess(raw_path)
    file_config = ti.xcom_pull(key="config", task_ids="init")
    extract_table_name = file_config["extract_table"]
    load_table_name = file_config["load_table"]
    date = pendulum.from_format(file_path.stem[23:], "YYYY_MM_DD").naive()
    table_name = f"fact.session_{date.format('YYYY_MM_DD')}"
    new_tables.update([table_name, extract_table_name, load_table_name])
    Fact.etl(date, file_path.name, extract_table_name, load_table_name)
Example #2
File: loghours.py Project: mgaitan/h2dp
def log_hours():
    logging.basicConfig(level=logging.INFO)

    categories = settings.HAMSTER_TO_DP.keys()
    tag_logged = Tag.get_or_create(name='_logged_in_dp_')

    already_tagged = [ft.fact.id for ft in FactTag.filter(tag=tag_logged).distinct()]

    # get all facts that belong to exportable categories, are finished, and
    # were not previously posted
    facts = Fact.filter(
                Q(activity__category__name__in=categories) & 
                ~Q(end_time__is=None) &
              # ~Q(fact_tags_set__tag=tag_logged)     # * see note
                ~Q(id__in=already_tagged)
            )

    # NOTE
    # I want to exclude Facts tagged with ``tag_logged``, but that ~Q() condition
    # only excludes facts that are ONLY tagged with ``tag_logged``.

    # The last Q is a workaround, but it implies a nested select.
    # How should I write this query?

    if not facts.exists():
        logging.info("You're up to date! There are no unsynced tasks.")
        return

    br = DotProjectBot(settings.DP_BASE_URL)
    br.login(settings.DP_USERNAME, settings.DP_PASSWORD)


    for f in facts:
        # process data
        tags = ', '.join([ft.tag.name for ft in f.fact_tags_set])

        if tags and f.description:
            description = '%s %s: %s' % (f.activity.name, tags, f.description)
        elif tags:
            description = '%s %s' % (f.activity.name, tags)
        elif f.description:
            description = '%s %s' % (f.activity.name, f.description)
        else:
            description = f.activity.name

        dp_task_id = settings.HAMSTER_TO_DP[f.activity.category.name]

        # and post the fact into dotproject!
        br.log_task(dp_task_id, f.start_time, f.duration, description)

        # then mark the fact as logged.
        ft = FactTag(fact=f, tag=tag_logged)
        ft.save()
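
A note on the question in the snippet above: if this ORM follows Django's join semantics for multi-valued relations, the usual answer is to move the tag condition out of filter() and into exclude(); unlike ~Q() inside filter(), exclude() removes every Fact that has at least one matching FactTag. A sketch, assuming this Fact/FactTag API supports a Django-style exclude():

facts = Fact.filter(
    Q(activity__category__name__in=categories) &
    ~Q(end_time__is=None)
).exclude(fact_tags_set__tag=tag_logged)  # assumes Django-style exclude() semantics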
Example #3
def consolidate_callable(**kwargs):
    """ consolidate session table. """

    task_instance = kwargs["ti"]
    table_config = task_instance.xcom_pull(key="config", task_ids="init")

    date = table_config["date"]
    date = pendulum.from_format(date, "YYYY-MM-DD[T]HH:mm:ss").naive()

    table_name = table_config["table_name"]

    session = Session()
    try:
        Fact.consolidate(date)
        ETL.set_status("consolidation", table_name, date, "completed", session)
    except Exception:
        ETL.set_status("consolidation", table_name, date, "quarantine",
                       session)
        raise
    finally:
        session.close()
Example #4
File: dag_etl.py Project: NUS-IDS/cofi
def ingest_callable(**kwargs):
    """ ingest preprocessed wifi log files to database. """

    task_instance = kwargs["ti"]
    file_config = task_instance.xcom_pull(key="config", task_ids="init")

    file_stem = file_config["file_stem"]
    extract_table_name = file_config["extract_table"]
    load_table_name = file_config["load_table"]

    logging.info(f"Looping through '{file_stem}*.csv'")

    ingest_errors = []

    for file_path in AIRFLOW_IMPORT.glob(f"{file_stem}*.csv"):
        logging.info(f"Ingesting {file_path}.")
        date = pendulum.from_format(file_path.stem[23:], "YYYY_MM_DD").naive()
        session = Session()
        if ETL.can_process("session_file", file_path, date, session):
            try:
                ETL.commit_new("session_file", file_path, date, session)
                Fact.etl(date, file_path.name, extract_table_name,
                         load_table_name)
                ETL.set_status("session_file", file_path, date, "completed",
                               session)
            except Exception:
                ingest_errors.append(file_path)
                ETL.set_status("session_file", file_path, date, "quarantine",
                               session)
        # close the session on every path, including when can_process is False
        session.close()

    if len(ingest_errors) > 0:
        logging.info(
            f"The following files could not be ingested: {ingest_errors}.")
        raise Exception(
            f"A total of {len(ingest_errors)} files could not be ingested. Failing DAG run."
        )
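
For context, a callable like this is presumably wired into the DAG with a PythonOperator, which in Airflow 2.x passes the task context (including ti) to the callable as keyword arguments. A minimal sketch; the DAG id and init_callable are assumptions, only ingest_callable comes from this file:

from airflow import DAG
from airflow.operators.python import PythonOperator

with DAG(dag_id="wifi_etl", schedule_interval=None) as dag:  # dag id assumed
    init = PythonOperator(task_id="init", python_callable=init_callable)
    ingest = PythonOperator(task_id="ingest", python_callable=ingest_callable)
    init >> ingest  # ingest_callable pulls its file_config from init's XCom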
Example #5
def test_consolidate(ingest, clean_etl):

    date = pendulum.from_format("2020_03_27", "YYYY_MM_DD").naive()

    for file_stem in ["2020_04_01_00_00_00-v2", "2020_03_27_00_00_00-v2"]:
        file_path = Path(f"tmp/raw/{file_stem}_2020_03_27.csv")
        ingest(file_path)

    table = Fact.child_or_load_table(date)
    task_instance = TaskInstanceMock("init")
    task_instance.xcom_push("config", {
        "date": str(date),
        "table_name": table.fullname
    })

    clean_etl("consolidation", table.fullname, date)
    consolidate_callable(ti=task_instance)

    with engine.begin() as conn:
        count = conn.execute(table.select()).rowcount
        assert count == 198

        count = conn.execute(
            table.select(table.c.session_end == None)).rowcount
        assert count == 0

        count = conn.execute(table.select(table.c.pulltime_last)).rowcount
        assert count == 101

        count = conn.execute(
            table.select(
                and_(table.c.pulltime_last,
                     table.c.session_end != table.c.pulltime))).rowcount
        assert count == 1

        count1 = conn.execute(table.select(table.c.pulltime_last)).rowcount
        count2 = conn.execute(
            select(
                [
                    table.c.userid_key,
                    table.c.mac_key,
                    table.c.ap_key,
                    table.c.ssid_key,
                    table.c.protocol_key,
                    table.c.session_start,
                ],
                distinct=True,
            )).rowcount
        assert count1 == count2
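
TaskInstanceMock itself is not shown in these snippets. Judging from its use above (constructed with a task id, then xcom_push(key, value) and xcom_pull(key=..., task_ids=...)), a minimal stand-in could look like this; an inference, not the project's actual class:

class TaskInstanceMock:
    """Hypothetical minimal XCom stand-in, inferred from usage above."""

    def __init__(self, task_id):
        self.task_id = task_id
        self._xcom = {}

    def xcom_push(self, key, value):
        # store under (task_id, key), mirroring Airflow's per-task XComs
        self._xcom[(self.task_id, key)] = value

    def xcom_pull(self, key, task_ids):
        return self._xcom[(task_ids, key)]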
Example #6
File: dag_etl.py Project: NUS-IDS/cofi
def clean_callable(**kwargs):
    """remove generated preprocessed files
    and fails the DAG if any previous task failed."""

    task_instance = kwargs["ti"]
    file_config = task_instance.xcom_pull(key="config", task_ids="init")

    file_stem = file_config["file_stem"]

    for file_path in AIRFLOW_IMPORT.glob(f"{file_stem}{RAW_GLOB}"):
        if file_path.exists():
            logging.info(f"Removing {file_path}.")
            file_path.unlink()

    extract_table_name = file_config["extract_table"]
    load_table_name = file_config["load_table"]
    Fact.remove_tables(extract_table_name, load_table_name)

    for upstream_ti in kwargs["dag_run"].get_task_instances():
        if (upstream_ti.current_state() == State.FAILED
                and upstream_ti.task_id != kwargs["task_instance"].task_id):
            raise Exception(
                f"Failing this DAG run because upstream task {upstream_ti.task_id} failed."
            )
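
clean_callable inspects the other task instances for failures and re-raises, which only makes sense if it runs even when an upstream task has already failed. In Airflow that behaviour comes from the task's trigger rule; a sketch, with the task id assumed:

from airflow.operators.python import PythonOperator

clean = PythonOperator(
    task_id="clean",                 # task id is an assumption
    python_callable=clean_callable,
    trigger_rule="all_done",         # run even if upstream tasks failed
    dag=dag,
)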
Example #7
def test_ingest_preprocessed(ingest):

    file_stem = "2020_04_01_00_00_00-v2"
    file_path = Path(f"tmp/raw/{file_stem}_2020_03_27.csv")
    ingest(file_path)

    date = pendulum.from_format(file_path.stem[23:], "YYYY_MM_DD").naive()
    child_fact = Fact.child_or_load_table(date)

    with engine.begin() as conn:
        count = conn.execute(child_fact.select()).rowcount
        assert count == 104

        count = conn.execute(
            child_fact.select(child_fact.c.session_end == None)).rowcount
        assert count == 104
Example #8
from mongoengine import connect
from variables import Variables
from models import Fact, Rumor
import ast
import random

connect(host=Variables.DB_CONNECTION)

if Variables.seed:
    with open(
            '/home/talalnks/RumorBusterBackend/RumorsFacts.json') as data_file:
        data = ast.literal_eval(data_file.read())
        for rumor in data:
            existing_rumor = Rumor.objects(body=rumor['Rumors']).first()
            if not existing_rumor:
                r = Rumor(body=rumor['Rumors'],
                          report_counter=random.randint(25, 150)).save()
                if len(rumor['Facts']) > 0:
                    Fact(body=rumor['Facts'], rumor=r).save()
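
Inferred from the keys accessed above, RumorsFacts.json is presumably a list of records with 'Rumors' and 'Facts' entries; since it is read with ast.literal_eval, Python literal syntax works. Illustrative values only, not real data:

[
    {'Rumors': 'Drinking hot water cures the flu.',
     'Facts': 'No evidence supports this claim.'},
    {'Rumors': 'A rumor with no known correction yet.',
     'Facts': ''},
]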
Example #9
def submit(request):
    if request.method == 'POST':
        submit = request.POST
    elif request.method == 'GET':
        submit = request.GET
    else:
        raise Http404

    mac = submit.get('mac')
    machine = None
    if mac:
        try:
            machine = Machine.objects.get(mac=mac)
        except Machine.DoesNotExist:
            machine = Machine(mac=mac)

    if machine:
        if submit.get('hostname'):
            machine.hostname = submit.get('hostname')
        machine.last_checkin = timezone.now()
        machine.save()

        facts = submit.get('Facts')
        if facts is not None:
            facts = json.loads(facts)
            for key in facts:
                try:
                    fact = Fact.objects.get(machine=machine, name=key)
                except Fact.DoesNotExist:
                    fact = Fact(machine=machine, name=key)
                fact.last_update = timezone.now()
                fact.value = facts[key]

                fact.save()

        facts = submit.get('HistoricalFacts')
        if facts is not None:
            facts = json.loads(facts)
            for key in facts:
                fact = HistoricalFact(machine=machine, name=key)
                fact.timestamp = timezone.now()
                fact.value = facts[key]

                fact.save()

        return HttpResponse('Report submitted.\n')

    return HttpResponse('Report not submitted.\n')
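
For reference, a client report to this view could look like the following; the endpoint URL and the fact values are assumptions, only the parameter names ('mac', 'hostname', 'Facts', 'HistoricalFacts') come from the view above:

import json
import requests

requests.post(
    "https://example.com/submit",  # endpoint path is an assumption
    data={
        "mac": "aa:bb:cc:dd:ee:ff",
        "hostname": "lab-machine-01",
        # the view json.loads() these fields, so they are sent as JSON strings
        "Facts": json.dumps({"os_version": "12.4"}),
        "HistoricalFacts": json.dumps({"free_disk_gb": 118}),
    },
)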