Example #1
    def multi_insert_mysql_data(self, engine, data_dict, logger):
        # Insert rows into several tables inside a single transaction.
        # data_dict is expected to be an iterable of {table_name: row_dict} mappings.
        try:
            # Bind the engine so target tables can be reflected from the database
            metadata = MetaData(engine)
            # Reflect each target table and prepare its INSERT statement
            new_dict = {}
            for data_list in data_dict:
                for table, data in data_list.items():
                    tb = Table(table, metadata, autoload=True)
                    new_dict[tb.insert()] = data
            result = None
            # engine.begin() opens a transaction that is committed on success
            # and rolled back if any statement raises
            with engine.begin() as conn:
                for ins, data in new_dict.items():
                    result = conn.execute(ins, **data)
            return result.lastrowid if result is not None else False
        except Exception as e:
            file_path = os.path.join(
                sys.path[0], "logs", "%s.log" % datetime.datetime.strftime(
                    datetime.datetime.now(), "%Y_%m_%d"))
            log = logger(filename=file_path)
            log.info(str(e))
            # Remove the handlers so repeated calls do not attach duplicates
            for handler in list(log.handlers):
                log.removeHandler(handler)
            traceback.print_exc()
            return False
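A minimal, hypothetical call sketch for the method above; MySQLClient stands in for whatever class defines it, and the connection URL, table names and row fields are placeholders rather than values from the original code:

from sqlalchemy import create_engine
import logging

def make_logger(filename):
    # Stand-in for the logger factory the method expects (it is called with filename=...)
    log = logging.getLogger(filename)
    if not log.handlers:
        log.addHandler(logging.FileHandler(filename))
    log.setLevel(logging.INFO)
    return log

engine = create_engine("mysql+pymysql://user:password@localhost/testdb")  # placeholder URL
rows = [
    {"employee": {"name": "Alice", "dept": "IT"}},           # one {table_name: row_dict} per entry
    {"department": {"dept_name": "IT", "location": "HQ"}},
]
last_id = MySQLClient().multi_insert_mysql_data(engine, rows, make_logger)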
Example #2
def calculate_topics(application_id: str) -> None:
    """Uses the latest topic model to assign a topic for each completely fetched account in the database."""
    with engine.begin() as connection:
        topic_model = models.topic_model.select_latest(application_id,
                                                       SOURCES['TWITTER'],
                                                       connection)

        if not topic_model:
            return

        accounts = list(
            models.account.select_multiple_complete(application_id,
                                                    SOURCES['TWITTER'],
                                                    connection))
        topic_model_path = get_topic_model_path(application_id)
        lda_model = LdaModel.load(os.path.join(topic_model_path, 'ldamodel'))
        dictionary = Dictionary.load(
            os.path.join(topic_model_path, 'dictionary'))
        documents = load_documents(accounts, connection)
        topic_iteration_id = models.topic_iteration.insert_one(
            topic_model['id'], connection)
        for account, document in zip(accounts, documents):
            bow = dictionary.doc2bow(document)
            weights = get_document_topic_weights(lda_model, bow)
            models.topic.insert_one(account['id'], weights, topic_iteration_id,
                                    connection)
        cluster_accounts(topic_iteration_id, connection)
Example #3
def db_upgrade_to_head():
    """ Upgrades the database to `HEAD` """
    from models import engine

    with engine.begin() as connection:
        alembic_cfg.attributes['connection'] = connection
        command.upgrade(alembic_cfg, "head")
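Example #3 hands its open connection to Alembic through alembic_cfg.attributes; for that to take effect, the project's env.py has to pick the connection back up. A minimal sketch of the matching run_migrations_online logic, assuming a standard Alembic env.py layout (config, context and target_metadata defined as usual):

from alembic import context
from sqlalchemy import engine_from_config, pool

config = context.config
target_metadata = None  # or the project's MetaData

def run_migrations_online():
    # Reuse the Connection passed in via alembic_cfg.attributes["connection"]
    # (as done in Example #3); fall back to alembic.ini when run standalone.
    connection = config.attributes.get("connection", None)
    if connection is not None:
        context.configure(connection=connection, target_metadata=target_metadata)
        with context.begin_transaction():
            context.run_migrations()
    else:
        connectable = engine_from_config(
            config.get_section(config.config_ini_section),
            prefix="sqlalchemy.",
            poolclass=pool.NullPool,
        )
        with connectable.connect() as connection:
            context.configure(connection=connection, target_metadata=target_metadata)
            with context.begin_transaction():
                context.run_migrations()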
Example #4
def create_topic_model(application_id: str) -> None:
    """Creates the topic model from all completely fetched accounts"""
    logging.info('Starting to create topic model for application id: %s' %
                 application_id)
    with engine.begin() as connection:
        logging.info('Requesting complete accounts')
        accounts = list(
            models.account.select_multiple_complete(application_id,
                                                    SOURCES['TWITTER'],
                                                    connection))

        logging.info('Loading documents')
        documents = load_documents(accounts, connection)

        topic_model_path = get_topic_model_path(application_id)
        create_folder_if_not_exists(topic_model_path)

        logging.info('Creating dictionary')
        dictionary = create_dictionary(documents)
        dictionary.save(os.path.join(topic_model_path, 'dictionary'))

        logging.info('Creating corpus')
        MmCorpus.serialize(os.path.join(topic_model_path, 'corpus.mm'),
                           MyCorpus(dictionary, documents))
        corpus = MmCorpus(os.path.join(topic_model_path, 'corpus.mm'))

        logging.info('Creating LDA Model')
        lda_model = create_lda_model(corpus, dictionary)
        lda_model.save(os.path.join(topic_model_path, 'ldamodel'))
        topics_words = get_topic_words(lda_model, NUM_TOPIC_WORDS)
        models.topic_model.insert_one(application_id, SOURCES['TWITTER'],
                                      topics_words, connection)
Example #5
    def destroy(cls, task_instances):

        for (file_path, (now, ti)) in task_instances.items():

            with engine.begin() as conn:
                conn.execute(
                    delete(ETL).where(
                        and_(ETL.task_name == f"{file_path}",
                             ETL.triggered >= now)))
Example #6
async def async_main():
    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.drop_all)
        await conn.run_sync(Base.metadata.create_all)
    async with aiohttp.ClientSession() as session:
        async with Session(future=True) as session_psql:
            for user in await get_users(session):
                await User(user).to_base(session_psql)
                for post in await get_posts(session, user['id']):
                    await Post(post).to_base(session_psql)
            await session_psql.commit()
Example #7
def test_consolidate(ingest, clean_etl):

    date = pendulum.from_format("2020_03_27", "YYYY_MM_DD").naive()

    for file_stem in ["2020_04_01_00_00_00-v2", "2020_03_27_00_00_00-v2"]:
        file_path = Path(f"tmp/raw/{file_stem}_2020_03_27.csv")
        ingest(file_path)

    table = Fact.child_or_load_table(date)
    task_instance = TaskInstanceMock("init")
    task_instance.xcom_push("config", {
        "date": str(date),
        "table_name": table.fullname
    })

    clean_etl("consolidation", table.fullname, date)
    consolidate_callable(ti=task_instance)

    with engine.begin() as conn:
        count = conn.execute(table.select()).rowcount
        assert count == 198

        count = conn.execute(
            table.select(table.c.session_end == None)).rowcount
        assert count == 0

        count = conn.execute(table.select(table.c.pulltime_last)).rowcount
        assert count == 101

        count = conn.execute(
            table.select(
                and_(table.c.pulltime_last,
                     table.c.session_end != table.c.pulltime))).rowcount
        assert count == 1

        count1 = conn.execute(table.select(table.c.pulltime_last)).rowcount
        count2 = conn.execute(
            select(
                [
                    table.c.userid_key,
                    table.c.mac_key,
                    table.c.ap_key,
                    table.c.ssid_key,
                    table.c.protocol_key,
                    table.c.session_start,
                ],
                distinct=True,
            )).rowcount
        assert count1 == count2
Example #8
def test_ingest_preprocessed(ingest):

    file_stem = "2020_04_01_00_00_00-v2"
    file_path = Path(f"tmp/raw/{file_stem}_2020_03_27.csv")
    ingest(file_path)

    date = pendulum.from_format(file_path.stem[23:], "YYYY_MM_DD").naive()
    child_fact = Fact.child_or_load_table(date)

    with engine.begin() as conn:
        count = conn.execute(child_fact.select()).rowcount
        assert count == 104

        count = conn.execute(
            child_fact.select(child_fact.c.session_end == None)).rowcount
        assert count == 104
Example #9
        def _ti(file_path):

            if file_path not in task_instances:
                now = pendulum.now("utc")

                with engine.begin() as conn:
                    conn.execute(
                        delete(ETL).where(ETL.task_name == f"{file_path}"))

                file_task_dict = mock_sensor.xcom_pull(f"{file_path}", "sense")
                ti = cls("init")
                ti.xcom_push("config", file_task_dict["config"])

                task_instances[file_path] = (now, ti)
            else:
                ti = task_instances[file_path][1]

            return ti
Example #10
def clean_etl():

    etl_entries = []
    now = pendulum.now("utc")

    def _clean_etl(task_type, task_name, task_timestamp):
        etl_entries.append((task_type, task_name, task_timestamp))

    yield _clean_etl

    for (task_type, task_name, task_timestamp) in etl_entries:

        with engine.begin() as conn:
            conn.execute(
                delete(ETL).where(
                    and_(
                        ETL.task_type == task_type,
                        ETL.task_name == task_name,
                        ETL.task_timestamp == task_timestamp,
                        ETL.triggered > now,
                    )))
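Examples #10 and #11 read like pytest yield-fixtures whose @pytest.fixture decorator was dropped when the snippet was captured; under that assumption, registration and a consuming test would look roughly like this (fixture teardown and the test body are placeholders):

import pendulum
import pytest

@pytest.fixture
def clean_etl():
    etl_entries = []

    def _clean_etl(task_type, task_name, task_timestamp):
        etl_entries.append((task_type, task_name, task_timestamp))

    yield _clean_etl  # the test body runs at this point
    # teardown: delete the recorded ETL rows, as in Example #10 above

def test_registers_entry(clean_etl):
    clean_etl("consolidation", "fact_2020_03", pendulum.now("utc"))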
Example #11
def mock_etl(session):

    etl_entries = []

    def _mock_etl(task_type, task_name, task_timestamp, status):
        etl_entries.append((task_type, task_name, task_timestamp, status))
        ETL.set_status(task_type, task_name, task_timestamp, status, session)

    yield _mock_etl

    for (task_type, task_name, task_timestamp, status) in etl_entries:

        with engine.begin() as conn:
            conn.execute(
                delete(ETL).where(
                    and_(
                        ETL.task_type == task_type,
                        ETL.task_name == task_name,
                        ETL.task_timestamp == task_timestamp,
                        ETL.status == status,
                    )))
Example #12
async def create_tables():
    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.drop_all)
        await conn.run_sync(Base.metadata.create_all)
Example #13
async def create_tables():
    logger.info("Starting to create tables")
    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.drop_all)
        await conn.run_sync(Base.metadata.create_all)
    logger.info("Finishing to create tables")