def fill_dags_small(actualizable_bigartms, comboable_bigartms):
    """Build the ``NLPmonitor_BigARTMs_small`` DAG.

    Issues one ``gen_bigartm_operator`` call per topic count in
    10, 20, ..., 300. Every call uses the "main" corpus between
    2017-11-01 and 2020-04-01 and the same four regularizer weights.

    :param actualizable_bigartms: passed through to ``gen_bigartm_operator``.
    :param comboable_bigartms: passed through to ``gen_bigartm_operator``.
    :return: the populated DAG object.
    """
    small_dag = DAG(
        'NLPmonitor_BigARTMs_small',
        catchup=False,
        max_active_runs=1,
        concurrency=10,
        default_args=default_args,
        schedule_interval=None,
    )

    with small_dag:
        # No-op task handed to every generated operator as ``wait_for_basic_tms``.
        gate_task = PythonOperator(
            task_id="wait_for_basic_tms",
            python_callable=lambda: 0,
        )

        for topic_count in range(10, 301, 10):
            # Both dicts are rebuilt on each pass so that no two calls
            # share a mutable object.
            period_filters = {
                "corpus": "main",
                "source": None,
                "datetime_from": date(2017, 11, 1),
                "datetime_to": date(2020, 4, 1),
            }
            regularizer_weights = {
                "SmoothSparseThetaRegularizer": 0.15,
                "SmoothSparsePhiRegularizer": 0.15,
                "DecorrelatorPhiRegularizer": 0.15,
                "ImproveCoherencePhiRegularizer": 0.15,
            }
            gen_bigartm_operator(
                actualizable_bigartms,
                comboable_bigartms,
                name=f"bigartm_two_years_{topic_count}",
                # NOTE(review): "lyears" looks like a typo; kept verbatim
                # because the description is runtime data.
                description="Two lyears",
                number_of_topics=topic_count,
                filters=period_filters,
                regularization_params=regularizer_weights,
                wait_for_basic_tms=gate_task,
                is_actualizable=False,
            )

    return small_dag
def fill_dags_full(actualizable_bigartms, comboable_bigartms):
    """Build the ``NLPmonitor_BigARTMs_full`` DAG.

    Registers a single 250-topic model named ``bigartm_test`` over the
    "main" corpus with no date bounds (both datetime filters are ``None``).

    :param actualizable_bigartms: passed through to ``gen_bigartm_operator``.
    :param comboable_bigartms: passed through to ``gen_bigartm_operator``.
    :return: the populated DAG object.
    """
    dag_settings = dict(
        catchup=False,
        max_active_runs=1,
        concurrency=7,
        default_args=default_args,
        schedule_interval=None,
    )
    full_dag = DAG('NLPmonitor_BigARTMs_full', **dag_settings)

    with full_dag:
        # No-op task handed to the generated operator as ``wait_for_basic_tms``.
        gate_task = PythonOperator(
            task_id="wait_for_basic_tms",
            python_callable=lambda: 0,
        )

        unbounded_filters = {
            "corpus": "main",
            "source": None,
            "datetime_from": None,
            "datetime_to": None,
        }
        # All four regularizers share the same weight.
        regularizer_weights = dict.fromkeys(
            (
                "SmoothSparseThetaRegularizer",
                "SmoothSparsePhiRegularizer",
                "DecorrelatorPhiRegularizer",
                "ImproveCoherencePhiRegularizer",
            ),
            0.15,
        )

        gen_bigartm_operator(
            actualizable_bigartms,
            comboable_bigartms,
            name="bigartm_test",
            description="All news",
            number_of_topics=250,
            filters=unbounded_filters,
            regularization_params=regularizer_weights,
            wait_for_basic_tms=gate_task,
            is_actualizable=False,
        )

    return full_dag
def fill_dags_two_years(actualizable_bigartms, comboable_bigartms):
    """Build the ``NLPmonitor_BigARTMs_two_years_`` DAG.

    Registers eight fixed models over the "main" corpus, then one model
    per entry of the ``topic_groups`` Airflow Variable (a JSON list,
    empty by default) that matches one of three selections:

    * ``topic_modelling_name == "bigartm_two_years"`` (100 topics),
    * ``id == 85`` (50 topics),
    * ``id == 86`` (25 topics).

    :param actualizable_bigartms: passed through to ``gen_bigartm_operator``.
    :param comboable_bigartms: passed through to ``gen_bigartm_operator``.
    :return: the populated DAG object.
    """
    two_years_dag = DAG(
        'NLPmonitor_BigARTMs_two_years_',
        catchup=False,
        max_active_runs=1,
        concurrency=7,
        default_args=default_args,
        schedule_interval=None,
    )
    topic_groups = json.loads(Variable.get('topic_groups', default_var="[]"))

    with two_years_dag:
        # No-op task handed to every generated operator as ``wait_for_basic_tms``.
        gate_task = PythonOperator(
            task_id="wait_for_basic_tms",
            python_callable=lambda: 0,
        )

        def _register(name, description, number_of_topics, filters, **extra):
            # Single funnel to gen_bigartm_operator; a new regularizer
            # dict is created for every model.
            gen_bigartm_operator(
                actualizable_bigartms,
                comboable_bigartms,
                name=name,
                description=description,
                number_of_topics=number_of_topics,
                filters=filters,
                regularization_params={
                    "SmoothSparseThetaRegularizer": 0.15,
                    "SmoothSparsePhiRegularizer": 0.15,
                    "DecorrelatorPhiRegularizer": 0.15,
                    "ImproveCoherencePhiRegularizer": 0.15,
                },
                wait_for_basic_tms=gate_task,
                is_actualizable=False,
                **extra
            )

        def _register_for_group(group, suffix, description_prefix, topics):
            # Group-derived model: names come from the group record and the
            # date window is fixed to 2010-05-01 .. 2020-01-01.
            _register(
                f"bigartm_{group['name']}{suffix}",
                f"{description_prefix} {group['name']}",
                topics,
                {
                    "corpus": "main",
                    "source": None,
                    "datetime_from": date(2010, 5, 1),
                    "datetime_to": date(2020, 1, 1),
                    "group_id": group['id'],
                    "topic_weight_threshold": 0.05,
                },
                name_translit=f"bigartm_{group['name_translit']}{suffix}",
                topic_modelling_translit=group['topic_modelling_name_translit'],
            )

        # (name, description, topics, date_from, date_to, group_id or None)
        fixed_models = (
            ("bigartm_two_years", "Two last years", 200,
             date(2017, 11, 1), date(2019, 12, 1), None),
            ("bigartm_two_years_old_parse", "Two last years old parse", 200,
             date(2017, 6, 1), date(2019, 6, 1), None),
            ("bigartm_education_two_years", "Two last years education", 150,
             date(2017, 11, 1), date(2019, 12, 1), 7),
            ("bigartm_education_one_year", "One last year education", 100,
             date(2018, 11, 1), date(2019, 12, 1), 7),
            ("bigartm_education_half_year", "One half year education", 100,
             date(2019, 5, 1), date(2019, 12, 1), 7),
            ("bigartm_science_two_years", "Two last years science", 150,
             date(2017, 11, 1), date(2019, 12, 1), 8),
            ("bigartm_science_one_year", "One last year science", 100,
             date(2018, 11, 1), date(2019, 12, 1), 8),
            ("bigartm_science_half_year", "One half year science", 100,
             date(2019, 5, 1), date(2019, 12, 1), 8),
        )
        for name, description, topics, start, end, group_id in fixed_models:
            model_filters = {
                "corpus": "main",
                "source": None,
                "datetime_from": start,
                "datetime_to": end,
            }
            if group_id is not None:
                # Group-restricted models also carry the weight threshold.
                model_filters["group_id"] = group_id
                model_filters["topic_weight_threshold"] = 0.05
            _register(name, description, topics, model_filters)

        # BigARTMs for two_year Zhazira's folders
        for group in topic_groups:
            if group['topic_modelling_name'] == "bigartm_two_years":
                _register_for_group(group, "_two_years", "Two years", 100)

        # Group 85: IT models.
        for group in topic_groups:
            if group['id'] == 85:
                _register_for_group(group, "_it_two_years", "IT two years", 50)

        # Group 86: second-level IT models.
        for group in topic_groups:
            if group['id'] == 86:
                _register_for_group(
                    group, "_2_level_it_two_years", "IT two 2 level years", 25)

    return two_years_dag
def fill_dags_news_and_gos(actualizable_bigartms, comboable_bigartms):
    """Build the ``NLPmonitor_BigARTMs_news_and_gos`` DAG.

    Registers three models that combine the "main" corpus with a second
    corpus ("gos" or "gos2"). The second corpus is also listed under
    ``corpus_datetime_ignore`` in the filters.

    Changes from the previous version: the model names were f-strings
    without placeholders and are now plain literals with identical values,
    and the three copy-pasted calls are driven from one table.

    :param actualizable_bigartms: passed through to ``gen_bigartm_operator``.
    :param comboable_bigartms: passed through to ``gen_bigartm_operator``.
    :return: the populated DAG object.
    """
    dag = DAG(
        'NLPmonitor_BigARTMs_news_and_gos',
        catchup=False,
        max_active_runs=1,
        concurrency=7,
        default_args=default_args,
        schedule_interval=None,
    )

    # (name, description, topics, second corpus, datetime_to)
    # NOTE(review): "yearts" in two descriptions looks like a typo; left
    # untouched because descriptions are runtime data.
    models = (
        ("bigartm_two_years_main_and_gos", "Main and gos 2 yearts",
         200, "gos", date(2020, 4, 1)),
        ("bigartm_two_years_main_and_gos2", "Main and gos2 2 yearts",
         200, "gos2", date(2020, 4, 1)),
        ("bigartm_two_years_1000_main_and_gos2",
         "Main and gos2 2 years, 1000 topics",
         1000, "gos2", date(2020, 6, 10)),
    )

    with dag:
        # No-op task handed to every generated operator as ``wait_for_basic_tms``.
        wait_for_basic_tms = PythonOperator(
            task_id="wait_for_basic_tms",
            python_callable=lambda: 0,
        )

        for name, description, topics, second_corpus, date_to in models:
            gen_bigartm_operator(
                actualizable_bigartms,
                comboable_bigartms,
                name=name,
                description=description,
                number_of_topics=topics,
                # New dict and lists for every model: nothing mutable is shared.
                filters={
                    "corpus": ["main", second_corpus],
                    "corpus_datetime_ignore": [second_corpus],
                    "source": None,
                    "datetime_from": date(2018, 1, 1),
                    "datetime_to": date_to,
                },
                regularization_params={
                    "SmoothSparseThetaRegularizer": 0.15,
                    "SmoothSparsePhiRegularizer": 0.15,
                    "DecorrelatorPhiRegularizer": 0.15,
                    "ImproveCoherencePhiRegularizer": 0.15,
                },
                wait_for_basic_tms=wait_for_basic_tms,
                is_actualizable=False,
            )

    return dag
def fill_dags_scientometrics(actualizable_bigartms, comboable_bigartms):
    """Build the ``NLPmonitor_BigARTMs_scientometrics`` DAG.

    Registers one model over the "scientometrics" corpus
    (2004-01-01 .. 2020-04-01) for each topic count in 15, 25, 50, 75.

    Changes from the previous version: four copy-pasted calls that
    differed only in the topic count are replaced by one loop. The
    resulting names and descriptions are the same strings as before, and
    the f-strings now actually interpolate a value.

    :param actualizable_bigartms: passed through to ``gen_bigartm_operator``.
    :param comboable_bigartms: passed through to ``gen_bigartm_operator``.
    :return: the populated DAG object.
    """
    dag = DAG(
        'NLPmonitor_BigARTMs_scientometrics',
        catchup=False,
        max_active_runs=1,
        concurrency=7,
        default_args=default_args,
        schedule_interval=None,
    )

    with dag:
        # No-op task handed to every generated operator as ``wait_for_basic_tms``.
        wait_for_basic_tms = PythonOperator(
            task_id="wait_for_basic_tms",
            python_callable=lambda: 0,
        )

        for topic_count in (15, 25, 50, 75):
            gen_bigartm_operator(
                actualizable_bigartms,
                comboable_bigartms,
                name=f"bigartm_two_years_scientometrics_{topic_count}",
                description=f"scientometrics 17k {topic_count} topics",
                number_of_topics=topic_count,
                # New dicts for every model: nothing mutable is shared.
                filters={
                    "corpus": "scientometrics",
                    "source": None,
                    "datetime_from": date(2004, 1, 1),
                    "datetime_to": date(2020, 4, 1),
                },
                regularization_params={
                    "SmoothSparseThetaRegularizer": 0.15,
                    "SmoothSparsePhiRegularizer": 0.15,
                    "DecorrelatorPhiRegularizer": 0.15,
                    "ImproveCoherencePhiRegularizer": 0.15,
                },
                wait_for_basic_tms=wait_for_basic_tms,
                is_actualizable=False,
            )

    return dag
def fill_dags_scopus(actualizable_bigartms, comboable_bigartms):
    """Build the two Scopus DAGs and return the hierarchy one.

    1. ``NLPmonitor_BigARTMs_Scopus``: one model per topic count in
       100, 500 over the "scopus_real_real" corpus.
    2. ``NLPmonitor_BigARTMs_Scopus_hierarchy``: one 50-topic model per
       entry of the ``topic_groups`` Airflow Variable whose
       ``topic_modelling_name`` is ``"bigartm__scopus_100"``. This DAG
       uses a copy of ``default_args`` with ``retries`` set to 4.

    Changes from the previous version: the unused function-scope
    ``import datetime`` is removed, and the duplicated filter and
    regularizer literals are produced by small local helpers.

    NOTE(review): only the second (hierarchy) DAG is returned, as before.
    The first DAG object is not handed back to the caller; confirm that
    this is intended.

    :param actualizable_bigartms: passed through to ``gen_bigartm_operator``.
    :param comboable_bigartms: passed through to ``gen_bigartm_operator``.
    :return: the ``NLPmonitor_BigARTMs_Scopus_hierarchy`` DAG object.
    """
    import json
    from datetime import date

    from airflow import DAG
    from airflow.models import Variable
    from airflow.operators.python_operator import PythonOperator

    from dags.bigartm.fill_dags.utils import gen_bigartm_operator, default_args

    scopus_text_field = "text_ngramized_en_scopus_extend"

    def _scopus_filters(**extra):
        # New dict per model; ``extra`` entries are appended after the
        # common keys in the order given.
        filters = {
            "corpus": "scopus_real_real",
            "corpus_datetime_ignore": ["scopus_real_real"],
            "source": None,
            "datetime_from": date(1900, 1, 1),
            "datetime_to": date(2050, 1, 1),
        }
        filters.update(extra)
        return filters

    def _regularizers():
        # New dict per model so operators never share a mutable object.
        return {
            "SmoothSparseThetaRegularizer": 0.15,
            "SmoothSparsePhiRegularizer": 0.15,
            "DecorrelatorPhiRegularizer": 0.15,
            "ImproveCoherencePhiRegularizer": 0.15,
        }

    dag = DAG(
        'NLPmonitor_BigARTMs_Scopus',
        catchup=False,
        max_active_runs=1,
        concurrency=1,
        default_args=default_args,
        schedule_interval=None,
    )
    with dag:
        wait_for_basic_tms = PythonOperator(
            task_id="wait_for_basic_tms",
            python_callable=lambda: 0,
        )
        for num_topics in [100, 500]:
            gen_bigartm_operator(
                actualizable_bigartms,
                comboable_bigartms,
                name=f"bigartm__scopus_{num_topics}",
                description=f"scopus {num_topics} topics",
                number_of_topics=num_topics,
                filters=_scopus_filters(),
                regularization_params=_regularizers(),
                wait_for_basic_tms=wait_for_basic_tms,
                is_actualizable=False,
                text_field=scopus_text_field,
            )

    groups = json.loads(Variable.get('topic_groups', default_var="[]"))

    # Copy so the shared default_args mapping is not modified.
    default_args_retries = default_args.copy()
    default_args_retries['retries'] = 4

    dag = DAG(
        'NLPmonitor_BigARTMs_Scopus_hierarchy',
        catchup=False,
        max_active_runs=1,
        concurrency=3,
        default_args=default_args_retries,
        schedule_interval=None,
    )
    with dag:
        wait_for_basic_tms = PythonOperator(
            task_id="wait_for_basic_tms",
            python_callable=lambda: 0,
        )
        for group in groups:
            if group['topic_modelling_name'] != "bigartm__scopus_100":
                continue
            gen_bigartm_operator(
                actualizable_bigartms,
                comboable_bigartms,
                name=f"bigartm__scopus_100_{group['name_translit']}",
                description=(
                    f"scopus 100 topics hierarchy, {group['name_translit']}"
                ),
                number_of_topics=50,
                filters=_scopus_filters(
                    group_id=group['id'],
                    topic_weight_threshold=0.05,
                ),
                regularization_params=_regularizers(),
                wait_for_basic_tms=wait_for_basic_tms,
                is_actualizable=False,
                text_field=scopus_text_field,
            )

    return dag
def fill_dags_rus_corpora(actualizable_bigartms, comboable_bigartms):
    """Build the ``NLPmonitor_BigARTMs_rus`` DAG.

    Registers twelve models: two over the "rus" corpus alone, then models
    over each pair of "rus", "rus_propaganda" and "main".

    Changes from the previous version: every model name was an f-string
    without placeholders and is now a plain literal with the same value,
    and the twelve copy-pasted calls are driven from one table, in the
    original order.

    :param actualizable_bigartms: passed through to ``gen_bigartm_operator``.
    :param comboable_bigartms: passed through to ``gen_bigartm_operator``.
    :return: the populated DAG object.
    """
    dag = DAG(
        'NLPmonitor_BigARTMs_rus',
        catchup=False,
        max_active_runs=1,
        concurrency=7,
        default_args=default_args,
        schedule_interval=None,
    )

    # Corpus selections. Tuples are converted to new lists per model.
    rus_and_propaganda = ("rus", "rus_propaganda")
    rus_and_main = ("rus", "main")
    propaganda_and_main = ("rus_propaganda", "main")

    # (name, description, topics, corpus, datetime_from, datetime_to)
    models = (
        ("bigartm_full_lenta", "Lenta full", 250, "rus",
         date(2000, 1, 1), date(2020, 5, 1)),
        # NOTE(review): same description as the full model; possibly a
        # copy-paste leftover. Kept as-is because it is runtime data.
        ("bigartm_two_years_lenta", "Lenta full", 200, "rus",
         date(2018, 1, 1), date(2020, 5, 1)),
        # ############### rus vs rus_propaganda #######################
        ("bigartm_full_rus_and_rus_propaganda", "", 250,
         rus_and_propaganda, date(2000, 1, 1), date(2020, 4, 1)),
        ("bigartm_two_years_rus_and_rus_propaganda", "", 200,
         rus_and_propaganda, date(2018, 1, 1), date(2020, 4, 1)),
        ("bigartm_two_years_1000_rus_and_rus_propaganda", "", 1000,
         rus_and_propaganda, date(2018, 2, 1), date(2020, 4, 1)),
        ("bigartm_2020_rus_and_rus_propaganda", "", 150,
         rus_and_propaganda, date(2020, 1, 1), date(2020, 4, 1)),
        # ############### rus vs kz #######################
        ("bigartm_full_rus_and_main", "", 250,
         rus_and_main, date(2000, 1, 1), date(2020, 4, 1)),
        ("bigartm_two_years_rus_and_main", "", 200,
         rus_and_main, date(2018, 1, 1), date(2020, 4, 1)),
        ("bigartm_2020_rus_and_main", "", 150,
         rus_and_main, date(2020, 1, 1), date(2020, 4, 1)),
        # ############### rus_propaganda vs kz #######################
        ("bigartm_full_rus_propaganda_and_main", "", 250,
         propaganda_and_main, date(2000, 1, 1), date(2020, 4, 1)),
        ("bigartm_two_years_rus_propaganda_and_main", "", 200,
         propaganda_and_main, date(2018, 1, 1), date(2020, 4, 1)),
        ("bigartm_2020_rus_propaganda_and_main", "", 150,
         propaganda_and_main, date(2020, 1, 1), date(2020, 4, 1)),
    )

    with dag:
        # No-op task handed to every generated operator as ``wait_for_basic_tms``.
        wait_for_basic_tms = PythonOperator(
            task_id="wait_for_basic_tms",
            python_callable=lambda: 0,
        )

        for name, description, topics, corpus, date_from, date_to in models:
            # A single corpus stays a string; a combination becomes a new list.
            corpus_filter = list(corpus) if isinstance(corpus, tuple) else corpus
            gen_bigartm_operator(
                actualizable_bigartms,
                comboable_bigartms,
                name=name,
                description=description,
                number_of_topics=topics,
                filters={
                    "corpus": corpus_filter,
                    "source": None,
                    "datetime_from": date_from,
                    "datetime_to": date_to,
                },
                regularization_params={
                    "SmoothSparseThetaRegularizer": 0.15,
                    "SmoothSparsePhiRegularizer": 0.15,
                    "DecorrelatorPhiRegularizer": 0.15,
                    "ImproveCoherencePhiRegularizer": 0.15,
                },
                wait_for_basic_tms=wait_for_basic_tms,
                is_actualizable=False,
            )

    return dag
def fill_dags_kz(actualizable_bigartms, comboable_bigartms):
    """Build the ``NLPmonitor_BigARTMs_kz`` DAG.

    Registers three 200-topic models over the "main" corpus that read the
    ``text_lemmatized_kz_apertium`` text field: one without date bounds
    and two with fixed date windows.

    :param actualizable_bigartms: passed through to ``gen_bigartm_operator``.
    :param comboable_bigartms: passed through to ``gen_bigartm_operator``.
    :return: the populated DAG object.
    """
    kz_dag = DAG(
        'NLPmonitor_BigARTMs_kz',
        catchup=False,
        max_active_runs=1,
        concurrency=7,
        default_args=default_args,
        schedule_interval=None,
    )

    # (name, datetime_from, datetime_to); all three share one description.
    date_windows = (
        ("bigartm_test_kz", None, None),
        ("bigartm_two_years_kz", date(2018, 6, 1), date(2020, 6, 1)),
        ("bigartm_2020_kz", date(2019, 9, 1), date(2020, 6, 1)),
    )

    with kz_dag:
        # No-op task handed to every generated operator as ``wait_for_basic_tms``.
        gate_task = PythonOperator(
            task_id="wait_for_basic_tms",
            python_callable=lambda: 0,
        )

        for model_name, window_start, window_end in date_windows:
            window_filters = {
                "corpus": "main",
                "source": None,
                "datetime_from": window_start,
                "datetime_to": window_end,
            }
            regularizer_weights = dict.fromkeys(
                (
                    "SmoothSparseThetaRegularizer",
                    "SmoothSparsePhiRegularizer",
                    "DecorrelatorPhiRegularizer",
                    "ImproveCoherencePhiRegularizer",
                ),
                0.15,
            )
            gen_bigartm_operator(
                actualizable_bigartms,
                comboable_bigartms,
                name=model_name,
                description="Two last years",
                number_of_topics=200,
                filters=window_filters,
                regularization_params=regularizer_weights,
                wait_for_basic_tms=gate_task,
                is_actualizable=False,
                text_field="text_lemmatized_kz_apertium",
            )

    return kz_dag
def fill_dags_2020(actualizable_bigartms, comboable_bigartms):
    """Build the ``NLPmonitor_BigARTMs_2020`` DAG.

    Registers four models over the "main" corpus for
    2020-01-01 .. 2020-12-31: one unrestricted model and three that are
    restricted to a ``group_id`` (95, 96, 98). The last two also set a
    ``topic_weight_threshold``.

    :param actualizable_bigartms: passed through to ``gen_bigartm_operator``.
    :param comboable_bigartms: passed through to ``gen_bigartm_operator``.
    :return: the populated DAG object.
    """
    dag_2020 = DAG(
        'NLPmonitor_BigARTMs_2020',
        catchup=False,
        max_active_runs=1,
        concurrency=7,
        default_args=default_args,
        schedule_interval=None,
    )

    def _year_filters(**group_part):
        # Common 2020 window first, then any group restriction in the
        # order supplied by the caller.
        year_filters = {
            "corpus": "main",
            "source": None,
            "datetime_from": date(2020, 1, 1),
            "datetime_to": date(2020, 12, 31),
        }
        year_filters.update(group_part)
        return year_filters

    # (name, description, topics, filters)
    # NOTE(review): "bigartm_education_2019" uses the 2020 window; kept
    # exactly as in the original.
    model_specs = (
        ("bigartm_2020", "2020", 175, _year_filters()),
        ("bigartm_education_2019", "2019 education", 75,
         _year_filters(group_id=95)),
        ("bigartm_education_2_2020", "2020 education 2 distilled", 60,
         _year_filters(group_id=96, topic_weight_threshold=0.025)),
        ("bigartm_education_3_2020", "2020 education 3 distilled", 60,
         _year_filters(group_id=98, topic_weight_threshold=0.025)),
    )

    with dag_2020:
        # No-op task handed to every generated operator as ``wait_for_basic_tms``.
        gate_task = PythonOperator(
            task_id="wait_for_basic_tms",
            python_callable=lambda: 0,
        )

        for model_name, model_description, topics, model_filters in model_specs:
            gen_bigartm_operator(
                actualizable_bigartms,
                comboable_bigartms,
                name=model_name,
                description=model_description,
                number_of_topics=topics,
                filters=model_filters,
                regularization_params={
                    "SmoothSparseThetaRegularizer": 0.15,
                    "SmoothSparsePhiRegularizer": 0.15,
                    "DecorrelatorPhiRegularizer": 0.15,
                    "ImproveCoherencePhiRegularizer": 0.15,
                },
                wait_for_basic_tms=gate_task,
                is_actualizable=False,
            )

    return dag_2020
def fill_dags_healthcare(actualizable_bigartms, comboable_bigartms):
    """Build the ``NLPmonitor_BigARTMs_Healthcare`` DAG.

    Registers twelve models in four groups of three (Kazakh "main"
    corpus, Russian corpora, and both combined). All models start at
    2020-01-01, share the description "2020-2021" and read the
    ``text_ngramized_kz_rus_yandex_ngrams_dict`` text field. Topic count,
    end date, ``group_id`` and ``topic_weight_threshold`` vary per row.

    :param actualizable_bigartms: passed through to ``gen_bigartm_operator``.
    :param comboable_bigartms: passed through to ``gen_bigartm_operator``.
    :return: the populated DAG object.
    """
    healthcare_dag = DAG(
        'NLPmonitor_BigARTMs_Healthcare',
        catchup=False,
        max_active_runs=1,
        concurrency=7,
        default_args=default_args,
        schedule_interval=None,
    )

    # Corpus selections. Tuples are turned into new lists per model.
    kaz = "main"
    rus = ("rus", "rus_propaganda")
    rus_kaz = ("main", "rus", "rus_propaganda")

    mid_april = date(2021, 4, 15)
    late_april = date(2021, 4, 20)

    # (name, topics, corpus, datetime_to, extra filter items in order)
    model_rows = (
        # ZERO LEVEL
        ("bigartm_2020_2021_kaz", 200, kaz, mid_april, ()),
        ("bigartm_2020_2021_rus", 200, rus, mid_april, ()),
        ("bigartm_2020_2021_rus_kaz", 200, rus_kaz, mid_april, ()),
        # FIRST LEVEL
        ("bigartm_2020_2021_rus_health_1", 150, rus, mid_april,
         (("group_id", 101),)),
        ("bigartm_2020_2021_kaz_health_1", 150, kaz, mid_april,
         (("group_id", 102),)),
        ("bigartm_2020_2021_rus_kaz_health_1", 150, rus_kaz, mid_april,
         (("group_id", 104),)),
        # SECOND LEVEL
        ("bigartm_2020_2021_rus_health_2", 100, rus, mid_april,
         (("group_id", 105), ("topic_weight_threshold", 0.04))),
        ("bigartm_2020_2021_kaz_health_2", 100, kaz, mid_april,
         (("group_id", 106), ("topic_weight_threshold", 0.04))),
        ("bigartm_2020_2021_rus_kaz_health_2", 100, rus_kaz, mid_april,
         (("group_id", 107), ("topic_weight_threshold", 0.04))),
        # THIRD LEVEL (the "_health_3" models)
        ("bigartm_2020_2021_rus_health_3", 50, rus, late_april,
         (("group_id", 108), ("topic_weight_threshold", 0.1))),
        ("bigartm_2020_2021_kaz_health_3", 50, kaz, late_april,
         (("group_id", 109), ("topic_weight_threshold", 0.1))),
        ("bigartm_2020_2021_rus_kaz_health_3", 50, rus_kaz, late_april,
         (("group_id", 110), ("topic_weight_threshold", 0.1))),
    )

    with healthcare_dag:
        # No-op task handed to every generated operator as ``wait_for_basic_tms``.
        gate_task = PythonOperator(
            task_id="wait_for_basic_tms",
            python_callable=lambda: 0,
        )

        for model_name, topics, corpus, end_date, extra_items in model_rows:
            model_filters = {
                "corpus": list(corpus) if isinstance(corpus, tuple) else corpus,
                "source": None,
                "datetime_from": date(2020, 1, 1),
                "datetime_to": end_date,
            }
            # Appends group_id / topic_weight_threshold after the common keys.
            model_filters.update(extra_items)

            gen_bigartm_operator(
                actualizable_bigartms,
                comboable_bigartms,
                name=model_name,
                description="2020-2021",
                number_of_topics=topics,
                filters=model_filters,
                regularization_params={
                    "SmoothSparseThetaRegularizer": 0.15,
                    "SmoothSparsePhiRegularizer": 0.15,
                    "DecorrelatorPhiRegularizer": 0.15,
                    "ImproveCoherencePhiRegularizer": 0.15,
                },
                wait_for_basic_tms=gate_task,
                is_actualizable=False,
                text_field="text_ngramized_kz_rus_yandex_ngrams_dict",
            )

    return healthcare_dag
def fill_dags_ngramized(actualizable_bigartms, comboable_bigartms):
    """Declare the two n-gram BigARTM DAGs and return the first of them.

    Two DAG objects are created. Each one gets a no-op
    ``wait_for_basic_tms`` task (its callable just returns 0) followed by a
    series of ``gen_bigartm_operator`` calls:

    * ``NLPmonitor_BigARTMs_ngramized`` -- three "main" corpus models on the
      pymorphy n-gram text field, plus the ``bigartm_hate`` model on the
      ``text_ngramized_en_lemminflect`` field.
    * ``NLPmonitor_BigARTMs_ngramized_yandex`` -- the same three "main"
      corpus models on the yandex n-gram text field.

    Args:
        actualizable_bigartms: Forwarded as-is to ``gen_bigartm_operator``.
        comboable_bigartms: Forwarded as-is to ``gen_bigartm_operator``.

    Returns:
        The ``NLPmonitor_BigARTMs_ngramized`` DAG.

    NOTE(review): the ``..._yandex`` DAG is built here but is neither
    returned nor stored by this function -- confirm that callers do not
    need a reference to it.
    """
    pymorphy_text_field = (
        "text_ngramized_kz_rus_ngrams_dict_pymorphy_2_4_393442_3710985"
    )
    yandex_text_field = "text_ngramized_kz_rus_yandex_ngrams_dict"

    # Settings shared by both DAG objects declared below.
    dag_options = dict(
        catchup=False,
        max_active_runs=1,
        concurrency=7,
        default_args=default_args,
        schedule_interval=None,
    )

    def uniform_regularizers():
        # A new dict on every call, so no two models share one object.
        return {
            "SmoothSparseThetaRegularizer": 0.15,
            "SmoothSparsePhiRegularizer": 0.15,
            "DecorrelatorPhiRegularizer": 0.15,
            "ImproveCoherencePhiRegularizer": 0.15
        }

    def whole_main_corpus():
        # "main" corpus with no date bounds.
        return {
            "corpus": "main",
            "source": None,
            "datetime_from": None,
            "datetime_to": None,
        }

    def main_corpus_two_years():
        # "main" corpus limited to 2017-11-01 .. 2019-12-01.
        return {
            "corpus": "main",
            "source": None,
            "datetime_from": date(2017, 11, 1),
            "datetime_to": date(2019, 12, 1),
        }

    def education_two_years():
        # Same date window, additionally filtered by group 7 / threshold 0.05.
        return {
            "corpus": "main",
            "source": None,
            "datetime_from": date(2017, 11, 1),
            "datetime_to": date(2019, 12, 1),
            "group_id": 7,
            "topic_weight_threshold": 0.05,
        }

    def add_model(barrier, name, description, topics, filters, text_field,
                  regularizers=None):
        # Single place that spells out the gen_bigartm_operator call; must be
        # invoked inside the ``with`` block of the DAG being populated.
        if regularizers is None:
            regularizers = uniform_regularizers()
        gen_bigartm_operator(
            actualizable_bigartms,
            comboable_bigartms,
            name=name,
            description=description,
            number_of_topics=topics,
            filters=filters,
            regularization_params=regularizers,
            wait_for_basic_tms=barrier,
            is_actualizable=False,
            text_field=text_field,
        )

    dag = DAG('NLPmonitor_BigARTMs_ngramized', **dag_options)
    with dag:
        barrier = PythonOperator(
            task_id="wait_for_basic_tms",
            python_callable=lambda: 0,
        )
        add_model(
            barrier,
            "bigartm_test_ngram",
            "Two last years",
            200,
            whole_main_corpus(),
            pymorphy_text_field,
        )
        add_model(
            barrier,
            "bigartm_two_years_ngram",
            "Two last years",
            200,
            main_corpus_two_years(),
            pymorphy_text_field,
        )
        add_model(
            barrier,
            "bigartm_education_two_years_ngram",
            "Two last years education",
            150,
            education_two_years(),
            pymorphy_text_field,
        )
        # The hate-speech model is the only one with its own corpora,
        # regularizer weights and (English) text field.
        add_model(
            barrier,
            "bigartm_hate",
            "Hate speech dataset",
            100,
            {
                "corpus": ["hate_hate", "hate_offensive", "hate_neither", "hate_test"],
                "source": None,
                "datetime_from": None,
                "datetime_to": None,
            },
            "text_ngramized_en_lemminflect",
            regularizers={
                "SmoothSparseThetaRegularizer": 0,
                "SmoothSparsePhiRegularizer": 0.5,
                "DecorrelatorPhiRegularizer": 10,
                "ImproveCoherencePhiRegularizer": 0
            },
        )

    yandex_dag = DAG('NLPmonitor_BigARTMs_ngramized_yandex', **dag_options)
    with yandex_dag:
        barrier = PythonOperator(
            task_id="wait_for_basic_tms",
            python_callable=lambda: 0,
        )
        add_model(
            barrier,
            "bigartm_test_ngram_yandex",
            "Two last years",
            200,
            whole_main_corpus(),
            yandex_text_field,
        )
        add_model(
            barrier,
            "bigartm_two_years_ngram_yandex",
            "Two last years",
            200,
            main_corpus_two_years(),
            yandex_text_field,
        )
        add_model(
            barrier,
            "bigartm_education_two_years_ngram_yandex",
            "Two last years education",
            150,
            education_two_years(),
            yandex_text_field,
        )

    return dag