def multi_insert_mysql_data(self, engine, data_dict, logger):
    # Pass the parameters and execute the statements
    try:
        # Bind the engine
        metadata = MetaData(engine)
        # Reflect each target table and build one insert statement per table
        new_dict = {}
        for data_list in data_dict:
            for table, data in data_list.items():
                tb_bdm_employee = Table(table, metadata, autoload=True)
                ins = tb_bdm_employee.insert()
                new_dict[ins] = data
        print('new_dict', new_dict)
        # engine.begin() commits on success and rolls back on any exception
        with engine.begin() as conn:
            for ins, data in new_dict.items():
                result = conn.execute(ins, **data)
        return result.lastrowid
    except Exception as e:
        file_path = os.path.join(
            sys.path[0], "logs",
            "%s.log" % datetime.datetime.strftime(
                datetime.datetime.now(), "%Y_%m_%d"))
        log = logger(filename=file_path)
        # removeHandler() takes a single handler, not the handler list
        for handler in list(log.handlers):
            log.removeHandler(handler)
        log.info(e.__str__())
        traceback.print_exc()
        return False
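# A minimal usage sketch for the helper above, assuming a reachable MySQL
# database; the connection URL, table name, `helper` instance, and logger
# factory are all hypothetical placeholders.
from sqlalchemy import create_engine

engine = create_engine("mysql+pymysql://user:password@localhost/mydb")
rows = [{"bdm_employee": {"name": "Alice", "department": "Sales"}}]
helper.multi_insert_mysql_data(engine, rows, my_logger_factory)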
def calculate_topics(application_id: str) -> None:
    """Uses the latest topic model to assign a topic for each completely
    fetched account in the database."""
    with engine.begin() as connection:
        topic_model = models.topic_model.select_latest(application_id,
                                                       SOURCES['TWITTER'],
                                                       connection)
        if not topic_model:
            return
        accounts = list(
            models.account.select_multiple_complete(application_id,
                                                    SOURCES['TWITTER'],
                                                    connection))
        topic_model_path = get_topic_model_path(application_id)
        lda_model = LdaModel.load(os.path.join(topic_model_path, 'ldamodel'))
        dictionary = Dictionary.load(
            os.path.join(topic_model_path, 'dictionary'))
        documents = load_documents(accounts, connection)
        topic_iteration_id = models.topic_iteration.insert_one(
            topic_model['id'], connection)
        for account, document in zip(accounts, documents):
            bow = dictionary.doc2bow(document)
            weights = get_document_topic_weights(lda_model, bow)
            models.topic.insert_one(account['id'], weights,
                                    topic_iteration_id, connection)
        cluster_accounts(topic_iteration_id, connection)
def db_upgrade_to_head():
    """Upgrades the database to `HEAD`"""
    from models import engine
    with engine.begin() as connection:
        alembic_cfg.attributes['connection'] = connection
        command.upgrade(alembic_cfg, "head")
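# A sketch of the alembic_cfg object the helper above assumes, following
# Alembic's documented pattern of handing a live connection to env.py via
# config.attributes; the ini path is a placeholder.
from alembic import command
from alembic.config import Config

alembic_cfg = Config("alembic.ini")
# For the shared connection to take effect, the project's env.py must check
# config.attributes.get('connection') before opening its own connection.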
def create_topic_model(application_id: str) -> None:
    """Creates the topic model from all completely fetched accounts"""
    logging.info('Starting to create topic model for application id: %s' %
                 application_id)
    with engine.begin() as connection:
        logging.info('Requesting complete accounts')
        accounts = list(
            models.account.select_multiple_complete(application_id,
                                                    SOURCES['TWITTER'],
                                                    connection))
        logging.info('Loading documents')
        documents = load_documents(accounts, connection)
        topic_model_path = get_topic_model_path(application_id)
        create_folder_if_not_exists(topic_model_path)
        logging.info('Creating dictionary')
        dictionary = create_dictionary(documents)
        dictionary.save(os.path.join(topic_model_path, 'dictionary'))
        logging.info('Creating corpus')
        MmCorpus.serialize(os.path.join(topic_model_path, 'corpus.mm'),
                           MyCorpus(dictionary, documents))
        corpus = MmCorpus(os.path.join(topic_model_path, 'corpus.mm'))
        logging.info('Creating LDA Model')
        lda_model = create_lda_model(corpus, dictionary)
        lda_model.save(os.path.join(topic_model_path, 'ldamodel'))
        topics_words = get_topic_words(lda_model, NUM_TOPIC_WORDS)
        models.topic_model.insert_one(application_id, SOURCES['TWITTER'],
                                      topics_words, connection)
def destroy(cls, task_instances):
    for (file_path, (now, ti)) in task_instances.items():
        with engine.begin() as conn:
            conn.execute(
                delete(ETL).where(
                    and_(ETL.task_name == f"{file_path}",
                         ETL.triggered >= now)))
async def async_main():
    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.drop_all)
        await conn.run_sync(Base.metadata.create_all)
    async with aiohttp.ClientSession() as session:
        async with Session(future=True) as session_psql:
            for user in await get_users(session):
                await User(user).to_base(session_psql)
                for post in await get_posts(session, user['id']):
                    await Post(post).to_base(session_psql)
            await session_psql.commit()
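# A minimal entry point for the coroutine above, assuming it lives in a
# script that is run directly; asyncio.run() is the standard way to drive it.
import asyncio

if __name__ == "__main__":
    asyncio.run(async_main())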
def test_consolidate(ingest, clean_etl):
    date = pendulum.from_format("2020_03_27", "YYYY_MM_DD").naive()
    for file_stem in ["2020_04_01_00_00_00-v2", "2020_03_27_00_00_00-v2"]:
        file_path = Path(f"tmp/raw/{file_stem}_2020_03_27.csv")
        ingest(file_path)
    table = Fact.child_or_load_table(date)
    task_instance = TaskInstanceMock("init")
    task_instance.xcom_push("config", {
        "date": str(date),
        "table_name": table.fullname
    })
    clean_etl("consolidation", table.fullname, date)
    consolidate_callable(ti=task_instance)
    with engine.begin() as conn:
        count = conn.execute(table.select()).rowcount
        assert count == 198
        count = conn.execute(
            table.select(table.c.session_end == None)).rowcount
        assert count == 0
        count = conn.execute(table.select(table.c.pulltime_last)).rowcount
        assert count == 101
        count = conn.execute(
            table.select(
                and_(table.c.pulltime_last,
                     table.c.session_end != table.c.pulltime))).rowcount
        assert count == 1
        count1 = conn.execute(table.select(table.c.pulltime_last)).rowcount
        count2 = conn.execute(
            select(
                [
                    table.c.userid_key,
                    table.c.mac_key,
                    table.c.ap_key,
                    table.c.ssid_key,
                    table.c.protocol_key,
                    table.c.session_start,
                ],
                distinct=True,
            )).rowcount
        assert count1 == count2
def test_ingest_preprocessed(ingest):
    file_stem = "2020_04_01_00_00_00-v2"
    file_path = Path(f"tmp/raw/{file_stem}_2020_03_27.csv")
    ingest(file_path)
    date = pendulum.from_format(file_path.stem[23:], "YYYY_MM_DD").naive()
    child_fact = Fact.child_or_load_table(date)
    with engine.begin() as conn:
        count = conn.execute(child_fact.select()).rowcount
        assert count == 104
        count = conn.execute(
            child_fact.select(child_fact.c.session_end == None)).rowcount
        assert count == 104
def _ti(file_path):
    if file_path not in task_instances:
        now = pendulum.now("utc")
        with engine.begin() as conn:
            conn.execute(
                delete(ETL).where(ETL.task_name == f"{file_path}"))
        file_task_dict = mock_sensor.xcom_pull(f"{file_path}", "sense")
        ti = cls("init")
        ti.xcom_push("config", file_task_dict["config"])
        task_instances[file_path] = (now, ti)
    else:
        ti = task_instances[file_path][1]
    return ti
def clean_etl():
    etl_entries = []
    now = pendulum.now("utc")

    def _clean_etl(task_type, task_name, task_timestamp):
        etl_entries.append((task_type, task_name, task_timestamp))

    # Yield the recorder, then delete every ETL row it registered (teardown)
    yield _clean_etl
    for (task_type, task_name, task_timestamp) in etl_entries:
        with engine.begin() as conn:
            conn.execute(
                delete(ETL).where(
                    and_(
                        ETL.task_type == task_type,
                        ETL.task_name == task_name,
                        ETL.task_timestamp == task_timestamp,
                        ETL.triggered > now,
                    )))
def mock_etl(session):
    etl_entries = []

    def _mock_etl(task_type, task_name, task_timestamp, status):
        etl_entries.append((task_type, task_name, task_timestamp, status))
        ETL.set_status(task_type, task_name, task_timestamp, status, session)

    # Yield the recorder, then delete every ETL row it created (teardown)
    yield _mock_etl
    for (task_type, task_name, task_timestamp, status) in etl_entries:
        with engine.begin() as conn:
            conn.execute(
                delete(ETL).where(
                    and_(
                        ETL.task_type == task_type,
                        ETL.task_name == task_name,
                        ETL.task_timestamp == task_timestamp,
                        ETL.status == status,
                    )))
async def create_tables():
    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.drop_all)
        await conn.run_sync(Base.metadata.create_all)
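# A sketch of the setup create_tables() relies on, assuming SQLAlchemy 1.4+
# async support; the connection URL is a placeholder.
from sqlalchemy.ext.asyncio import create_async_engine
from sqlalchemy.orm import declarative_base

engine = create_async_engine("postgresql+asyncpg://user:password@localhost/db")
Base = declarative_base()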
async def create_tables():
    logger.info("Starting to create tables")
    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.drop_all)
        await conn.run_sync(Base.metadata.create_all)
    logger.info("Finished creating tables")