Exemplo n.º 1
0
def fill_dags_small(actualizable_bigartms, comboable_bigartms):
    """Build the 'NLPmonitor_BigARTMs_small' DAG.

    Inside the DAG context a ``wait_for_basic_tms`` PythonOperator (whose
    callable just returns 0) is created, and ``gen_bigartm_operator`` is then
    called once for every topic count 10, 20, ..., 300. Each call uses the
    "main" corpus, the window 2017-11-01 .. 2020-04-01 and a value of 0.15
    for each of the four regularizers.

    Args:
        actualizable_bigartms: Passed through to ``gen_bigartm_operator``.
        comboable_bigartms: Passed through to ``gen_bigartm_operator``.

    Returns:
        The DAG object created here.
    """
    regularizers = (
        "SmoothSparseThetaRegularizer",
        "SmoothSparsePhiRegularizer",
        "DecorrelatorPhiRegularizer",
        "ImproveCoherencePhiRegularizer",
    )
    small_dag = DAG(
        'NLPmonitor_BigARTMs_small',
        catchup=False,
        max_active_runs=1,
        concurrency=10,
        default_args=default_args,
        schedule_interval=None,
    )
    with small_dag:
        barrier = PythonOperator(
            python_callable=lambda: 0,
            task_id="wait_for_basic_tms",
        )

        # step 1..30 -> topic counts 10..300 in increments of 10.
        for step in range(1, 31):
            topics = 10 * step
            window = {
                "corpus": "main",
                "source": None,
                "datetime_from": date(2017, 11, 1),
                "datetime_to": date(2020, 4, 1),
            }
            gen_bigartm_operator(
                actualizable_bigartms,
                comboable_bigartms,
                name=f"bigartm_two_years_{topics}",
                # The spelling below is runtime data and is kept verbatim.
                description="Two lyears",
                number_of_topics=topics,
                filters=window,
                # A fresh dict per call, so calls never share a mutable object.
                regularization_params=dict.fromkeys(regularizers, 0.15),
                wait_for_basic_tms=barrier,
                is_actualizable=False,
            )
    return small_dag
Exemplo n.º 2
0
def fill_dags_full(actualizable_bigartms, comboable_bigartms):
    """Build the 'NLPmonitor_BigARTMs_full' DAG.

    The DAG gets a ``wait_for_basic_tms`` PythonOperator (callable returns 0)
    and a single ``gen_bigartm_operator`` call named "bigartm_test": 250
    topics over the "main" corpus with no source or date bounds set
    (all three filter values are ``None``).

    Args:
        actualizable_bigartms: Passed through to ``gen_bigartm_operator``.
        comboable_bigartms: Passed through to ``gen_bigartm_operator``.

    Returns:
        The DAG object created here.
    """
    full_dag = DAG(
        'NLPmonitor_BigARTMs_full',
        catchup=False,
        max_active_runs=1,
        concurrency=7,
        default_args=default_args,
        schedule_interval=None,
    )
    with full_dag:
        barrier = PythonOperator(
            python_callable=lambda: 0,
            task_id="wait_for_basic_tms",
        )

        unbounded_main = {
            "corpus": "main",
            "source": None,
            "datetime_from": None,
            "datetime_to": None,
        }
        regularization = {
            "SmoothSparseThetaRegularizer": 0.15,
            "SmoothSparsePhiRegularizer": 0.15,
            "DecorrelatorPhiRegularizer": 0.15,
            "ImproveCoherencePhiRegularizer": 0.15,
        }
        # Keyword order matches the order the arguments were written in
        # the original call.
        model_settings = dict(
            name="bigartm_test",
            description="All news",
            number_of_topics=250,
            filters=unbounded_main,
            regularization_params=regularization,
            wait_for_basic_tms=barrier,
            is_actualizable=False,
        )
        gen_bigartm_operator(actualizable_bigartms,
                             comboable_bigartms,
                             **model_settings)
    return full_dag
def fill_dags_two_years(actualizable_bigartms, comboable_bigartms):
    """Build the 'NLPmonitor_BigARTMs_two_years_' DAG.

    After creating the DAG, topic groups are loaded from the ``topic_groups``
    Variable (a JSON string, ``"[]"`` when unset). Inside the DAG context a
    ``wait_for_basic_tms`` PythonOperator (callable returns 0) is created and
    ``gen_bigartm_operator`` is called, in this order, for:

    1. eight fixed models on the "main" corpus (two general ones, three for
       group 7 and three for group 8),
    2. every group whose ``topic_modelling_name`` is "bigartm_two_years",
    3. every group whose ``id`` is 85,
    4. every group whose ``id`` is 86.

    All calls use 0.15 for each of the four regularizers, and group-specific
    calls add ``group_id`` plus ``topic_weight_threshold`` = 0.05 to the
    filters.

    Args:
        actualizable_bigartms: Passed through to ``gen_bigartm_operator``.
        comboable_bigartms: Passed through to ``gen_bigartm_operator``.

    Returns:
        The DAG object created here.
    """
    regularizers = (
        "SmoothSparseThetaRegularizer",
        "SmoothSparsePhiRegularizer",
        "DecorrelatorPhiRegularizer",
        "ImproveCoherencePhiRegularizer",
    )
    two_years_dag = DAG(
        'NLPmonitor_BigARTMs_two_years_',
        catchup=False,
        max_active_runs=1,
        concurrency=7,
        default_args=default_args,
        schedule_interval=None,
    )
    topic_groups = json.loads(Variable.get('topic_groups', default_var="[]"))

    with two_years_dag:
        barrier = PythonOperator(
            python_callable=lambda: 0,
            task_id="wait_for_basic_tms",
        )

        def by_group(group_id):
            # Extra filter entries used by every group-restricted model.
            return {"group_id": group_id, "topic_weight_threshold": 0.05}

        def add_model(name, description, topics, since, until,
                      group_filters=None, **translit):
            # Assemble the filters in the original key order and hand
            # everything to gen_bigartm_operator. Dicts are built fresh on
            # every call, so no two models share a mutable object.
            filters = {
                "corpus": "main",
                "source": None,
                "datetime_from": since,
                "datetime_to": until,
            }
            if group_filters is not None:
                filters.update(group_filters)
            gen_bigartm_operator(
                actualizable_bigartms,
                comboable_bigartms,
                name=name,
                description=description,
                number_of_topics=topics,
                filters=filters,
                regularization_params=dict.fromkeys(regularizers, 0.15),
                wait_for_basic_tms=barrier,
                is_actualizable=False,
                **translit
            )

        # (name, description, number_of_topics, datetime_from, datetime_to,
        #  group id or None when the model is not restricted to a group)
        fixed_models = (
            ("bigartm_two_years", "Two last years", 200,
             date(2017, 11, 1), date(2019, 12, 1), None),
            ("bigartm_two_years_old_parse", "Two last years old parse", 200,
             date(2017, 6, 1), date(2019, 6, 1), None),
            ("bigartm_education_two_years", "Two last years education", 150,
             date(2017, 11, 1), date(2019, 12, 1), 7),
            ("bigartm_education_one_year", "One last year education", 100,
             date(2018, 11, 1), date(2019, 12, 1), 7),
            ("bigartm_education_half_year", "One half year education", 100,
             date(2019, 5, 1), date(2019, 12, 1), 7),
            ("bigartm_science_two_years", "Two last years science", 150,
             date(2017, 11, 1), date(2019, 12, 1), 8),
            ("bigartm_science_one_year", "One last year science", 100,
             date(2018, 11, 1), date(2019, 12, 1), 8),
            ("bigartm_science_half_year", "One half year science", 100,
             date(2019, 5, 1), date(2019, 12, 1), 8),
        )
        for name, description, topics, since, until, group_id in fixed_models:
            add_model(
                name,
                description,
                topics,
                since,
                until,
                None if group_id is None else by_group(group_id),
            )

        # Models for groups attached to "bigartm_two_years"
        # (original note: "two_year Zhazira's folders").
        attached_groups = (
            entry for entry in topic_groups
            if entry['topic_modelling_name'] == "bigartm_two_years"
        )
        for group in attached_groups:
            add_model(
                f"bigartm_{group['name']}_two_years",
                f"Two years {group['name']}",
                100,
                date(2010, 5, 1),
                date(2020, 1, 1),
                by_group(group['id']),
                name_translit=f"bigartm_{group['name_translit']}_two_years",
                topic_modelling_translit=group[
                    'topic_modelling_name_translit'],
            )

        # Group with id 85: 50-topic "IT" model.
        for group in (entry for entry in topic_groups if entry['id'] == 85):
            add_model(
                f"bigartm_{group['name']}_it_two_years",
                f"IT two years {group['name']}",
                50,
                date(2010, 5, 1),
                date(2020, 1, 1),
                by_group(group['id']),
                name_translit=f"bigartm_{group['name_translit']}_it_two_years",
                topic_modelling_translit=group[
                    'topic_modelling_name_translit'],
            )

        # Group with id 86: 25-topic second-level "IT" model.
        for group in (entry for entry in topic_groups if entry['id'] == 86):
            add_model(
                f"bigartm_{group['name']}_2_level_it_two_years",
                f"IT two 2 level years {group['name']}",
                25,
                date(2010, 5, 1),
                date(2020, 1, 1),
                by_group(group['id']),
                name_translit=(
                    f"bigartm_{group['name_translit']}_2_level_it_two_years"),
                topic_modelling_translit=group[
                    'topic_modelling_name_translit'],
            )
    return two_years_dag
Exemplo n.º 4
0
def fill_dags_news_and_gos(actualizable_bigartms, comboable_bigartms):
    """Build the 'NLPmonitor_BigARTMs_news_and_gos' DAG.

    Inside the DAG context a ``wait_for_basic_tms`` PythonOperator (callable
    returns 0) is created and ``gen_bigartm_operator`` is called three times.
    Each call combines the "main" corpus with a second corpus ("gos" or
    "gos2"); the second corpus is also listed under
    ``corpus_datetime_ignore``. All windows start on 2018-01-01.

    Args:
        actualizable_bigartms: Passed through to ``gen_bigartm_operator``.
        comboable_bigartms: Passed through to ``gen_bigartm_operator``.

    Returns:
        The DAG object created here.
    """
    regularizers = (
        "SmoothSparseThetaRegularizer",
        "SmoothSparsePhiRegularizer",
        "DecorrelatorPhiRegularizer",
        "ImproveCoherencePhiRegularizer",
    )
    news_gos_dag = DAG(
        'NLPmonitor_BigARTMs_news_and_gos',
        catchup=False,
        max_active_runs=1,
        concurrency=7,
        default_args=default_args,
        schedule_interval=None,
    )
    with news_gos_dag:
        barrier = PythonOperator(
            python_callable=lambda: 0,
            task_id="wait_for_basic_tms",
        )

        # (name, description, number_of_topics, second corpus, datetime_to)
        # Descriptions are runtime data; their spelling is kept verbatim.
        combined_models = (
            ("bigartm_two_years_main_and_gos",
             "Main and gos 2 yearts",
             200, "gos", date(2020, 4, 1)),
            ("bigartm_two_years_main_and_gos2",
             "Main and gos2 2 yearts",
             200, "gos2", date(2020, 4, 1)),
            ("bigartm_two_years_1000_main_and_gos2",
             "Main and gos2 2 years, 1000 topics",
             1000, "gos2", date(2020, 6, 10)),
        )
        for name, description, topics, second_corpus, until in combined_models:
            # Lists and dicts are created per iteration so each call
            # receives its own objects.
            filters = {
                "corpus": ["main", second_corpus],
                "corpus_datetime_ignore": [second_corpus],
                "source": None,
                "datetime_from": date(2018, 1, 1),
                "datetime_to": until,
            }
            gen_bigartm_operator(
                actualizable_bigartms,
                comboable_bigartms,
                name=name,
                description=description,
                number_of_topics=topics,
                filters=filters,
                regularization_params=dict.fromkeys(regularizers, 0.15),
                wait_for_basic_tms=barrier,
                is_actualizable=False,
            )
    return news_gos_dag
def fill_dags_scientometrics(actualizable_bigartms, comboable_bigartms):
    """Build the 'NLPmonitor_BigARTMs_scientometrics' DAG.

    Inside the DAG context a ``wait_for_basic_tms`` PythonOperator (callable
    returns 0) is created and ``gen_bigartm_operator`` is called for 15, 25,
    50 and 75 topics, in that order. Every call targets the "scientometrics"
    corpus with the window 2004-01-01 .. 2020-04-01 and 0.15 for each of the
    four regularizers.

    Args:
        actualizable_bigartms: Passed through to ``gen_bigartm_operator``.
        comboable_bigartms: Passed through to ``gen_bigartm_operator``.

    Returns:
        The DAG object created here.
    """
    regularizers = (
        "SmoothSparseThetaRegularizer",
        "SmoothSparsePhiRegularizer",
        "DecorrelatorPhiRegularizer",
        "ImproveCoherencePhiRegularizer",
    )
    scientometrics_dag = DAG(
        'NLPmonitor_BigARTMs_scientometrics',
        catchup=False,
        max_active_runs=1,
        concurrency=7,
        default_args=default_args,
        schedule_interval=None,
    )
    with scientometrics_dag:
        barrier = PythonOperator(
            python_callable=lambda: 0,
            task_id="wait_for_basic_tms",
        )
        for topics in (15, 25, 50, 75):
            corpus_window = {
                "corpus": "scientometrics",
                "source": None,
                "datetime_from": date(2004, 1, 1),
                "datetime_to": date(2020, 4, 1),
            }
            gen_bigartm_operator(
                actualizable_bigartms,
                comboable_bigartms,
                name=f"bigartm_two_years_scientometrics_{topics}",
                description=f"scientometrics 17k {topics} topics",
                number_of_topics=topics,
                filters=corpus_window,
                regularization_params=dict.fromkeys(regularizers, 0.15),
                wait_for_basic_tms=barrier,
                is_actualizable=False,
            )
    return scientometrics_dag
Exemplo n.º 6
0
def fill_dags_scopus(actualizable_bigartms, comboable_bigartms):
    """Build the Scopus BigARTM DAGs and return the hierarchy DAG.

    Two DAG objects are created, in this order:

    * 'NLPmonitor_BigARTMs_Scopus' (concurrency 1): one
      ``gen_bigartm_operator`` call for 100 topics and one for 500 topics.
    * 'NLPmonitor_BigARTMs_Scopus_hierarchy' (concurrency 3, ``default_args``
      copied with ``retries`` set to 4): one 50-topic call per topic group
      whose ``topic_modelling_name`` is "bigartm__scopus_100". Groups come
      from the ``topic_groups`` Variable (JSON, ``"[]"`` when unset).

    Every call uses the "scopus_real_real" corpus (also listed under
    ``corpus_datetime_ignore``), the window 1900-01-01 .. 2050-01-01, the
    text field "text_ngramized_en_scopus_extend" and 0.15 for each of the
    four regularizers.

    NOTE(review): only the hierarchy DAG is returned. The first DAG object is
    not handed back to the caller; unless ``gen_bigartm_operator`` or the
    caller keeps it reachable some other way, it will not be visible at
    module level. Changing the return value would break existing callers, so
    the behavior is kept as is — confirm whether this is intended.

    Args:
        actualizable_bigartms: Passed through to ``gen_bigartm_operator``.
        comboable_bigartms: Passed through to ``gen_bigartm_operator``.

    Returns:
        The 'NLPmonitor_BigARTMs_Scopus_hierarchy' DAG object.
    """
    import json
    from datetime import date

    from airflow import DAG
    from airflow.models import Variable
    from airflow.operators.python_operator import PythonOperator

    from dags.bigartm.fill_dags.utils import gen_bigartm_operator, default_args

    scopus_corpus = "scopus_real_real"
    text_field = "text_ngramized_en_scopus_extend"
    regularizers = (
        "SmoothSparseThetaRegularizer",
        "SmoothSparsePhiRegularizer",
        "DecorrelatorPhiRegularizer",
        "ImproveCoherencePhiRegularizer",
    )

    def scopus_filters(**group_filters):
        # Base filters shared by both DAGs; keyword extras are appended in
        # the order given. A new dict and list are built on every call.
        filters = {
            "corpus": scopus_corpus,
            "corpus_datetime_ignore": [scopus_corpus],
            "source": None,
            "datetime_from": date(1900, 1, 1),
            "datetime_to": date(2050, 1, 1),
        }
        filters.update(group_filters)
        return filters

    def make_barrier():
        # Must be called inside a ``with dag:`` block, as in the original.
        return PythonOperator(
            task_id="wait_for_basic_tms",
            python_callable=lambda: 0,
        )

    flat_dag = DAG('NLPmonitor_BigARTMs_Scopus',
                   catchup=False,
                   max_active_runs=1,
                   concurrency=1,
                   default_args=default_args,
                   schedule_interval=None)
    with flat_dag:
        wait_for_basic_tms = make_barrier()
        for num_topics in [100, 500]:
            gen_bigartm_operator(
                actualizable_bigartms,
                comboable_bigartms,
                name=f"bigartm__scopus_{num_topics}",
                description=f"scopus {num_topics} topics",
                number_of_topics=num_topics,
                filters=scopus_filters(),
                regularization_params=dict.fromkeys(regularizers, 0.15),
                wait_for_basic_tms=wait_for_basic_tms,
                is_actualizable=False,
                text_field=text_field)

    groups = json.loads(Variable.get('topic_groups', default_var="[]"))
    # Copy so the shared default_args dict is not mutated.
    default_args_retries = default_args.copy()
    default_args_retries['retries'] = 4
    hierarchy_dag = DAG('NLPmonitor_BigARTMs_Scopus_hierarchy',
                        catchup=False,
                        max_active_runs=1,
                        concurrency=3,
                        default_args=default_args_retries,
                        schedule_interval=None)
    with hierarchy_dag:
        wait_for_basic_tms = make_barrier()
        scopus_100_groups = (
            group for group in groups
            if group['topic_modelling_name'] == "bigartm__scopus_100"
        )
        for group in scopus_100_groups:
            gen_bigartm_operator(
                actualizable_bigartms,
                comboable_bigartms,
                name=f"bigartm__scopus_100_{group['name_translit']}",
                description=(
                    f"scopus 100 topics hierarchy, {group['name_translit']}"),
                number_of_topics=50,
                filters=scopus_filters(group_id=group['id'],
                                       topic_weight_threshold=0.05),
                regularization_params=dict.fromkeys(regularizers, 0.15),
                wait_for_basic_tms=wait_for_basic_tms,
                is_actualizable=False,
                text_field=text_field)

    return hierarchy_dag
def fill_dags_rus_corpora(actualizable_bigartms, comboable_bigartms):
    """Build the 'NLPmonitor_BigARTMs_rus' DAG.

    Inside the DAG context a ``wait_for_basic_tms`` PythonOperator (whose
    callable just returns 0) is created, and ``gen_bigartm_operator`` is then
    called once per row of the model table below, in table order.

    :param actualizable_bigartms: forwarded as-is to ``gen_bigartm_operator``
    :param comboable_bigartms: forwarded as-is to ``gen_bigartm_operator``
    :return: the constructed DAG object
    """
    # Columns: name, description, number_of_topics, corpus filter,
    # datetime_from, datetime_to.  Every row owns its own corpus list so no
    # list object is shared between two gen_bigartm_operator calls.
    model_table = [
        # ---- "rus" corpus only (lenta models) ----
        ("bigartm_full_lenta", "Lenta full", 250,
         "rus", date(2000, 1, 1), date(2020, 5, 1)),
        ("bigartm_two_years_lenta", "Lenta full", 200,
         "rus", date(2018, 1, 1), date(2020, 5, 1)),
        # ---- rus vs rus_propaganda ----
        ("bigartm_full_rus_and_rus_propaganda", "", 250,
         ["rus", "rus_propaganda"], date(2000, 1, 1), date(2020, 4, 1)),
        ("bigartm_two_years_rus_and_rus_propaganda", "", 200,
         ["rus", "rus_propaganda"], date(2018, 1, 1), date(2020, 4, 1)),
        ("bigartm_two_years_1000_rus_and_rus_propaganda", "", 1000,
         ["rus", "rus_propaganda"], date(2018, 2, 1), date(2020, 4, 1)),
        ("bigartm_2020_rus_and_rus_propaganda", "", 150,
         ["rus", "rus_propaganda"], date(2020, 1, 1), date(2020, 4, 1)),
        # ---- rus vs kz ("main" corpus) ----
        ("bigartm_full_rus_and_main", "", 250,
         ["rus", "main"], date(2000, 1, 1), date(2020, 4, 1)),
        ("bigartm_two_years_rus_and_main", "", 200,
         ["rus", "main"], date(2018, 1, 1), date(2020, 4, 1)),
        ("bigartm_2020_rus_and_main", "", 150,
         ["rus", "main"], date(2020, 1, 1), date(2020, 4, 1)),
        # ---- rus_propaganda vs kz ("main" corpus) ----
        ("bigartm_full_rus_propaganda_and_main", "", 250,
         ["rus_propaganda", "main"], date(2000, 1, 1), date(2020, 4, 1)),
        ("bigartm_two_years_rus_propaganda_and_main", "", 200,
         ["rus_propaganda", "main"], date(2018, 1, 1), date(2020, 4, 1)),
        ("bigartm_2020_rus_propaganda_and_main", "", 150,
         ["rus_propaganda", "main"], date(2020, 1, 1), date(2020, 4, 1)),
    ]

    rus_dag = DAG(
        'NLPmonitor_BigARTMs_rus',
        catchup=False,
        max_active_runs=1,
        concurrency=7,
        default_args=default_args,
        schedule_interval=None,
    )
    with rus_dag:
        wait_for_basic_tms = PythonOperator(
            task_id="wait_for_basic_tms",
            python_callable=lambda: 0,
        )

        for (model_name, model_description, topic_count,
             corpus, window_start, window_end) in model_table:
            # Filters and regularizer weights are rebuilt on every iteration
            # so each call receives its own dict objects.
            gen_bigartm_operator(
                actualizable_bigartms,
                comboable_bigartms,
                name=model_name,
                description=model_description,
                number_of_topics=topic_count,
                filters={
                    "corpus": corpus,
                    "source": None,
                    "datetime_from": window_start,
                    "datetime_to": window_end,
                },
                regularization_params={
                    "SmoothSparseThetaRegularizer": 0.15,
                    "SmoothSparsePhiRegularizer": 0.15,
                    "DecorrelatorPhiRegularizer": 0.15,
                    "ImproveCoherencePhiRegularizer": 0.15,
                },
                wait_for_basic_tms=wait_for_basic_tms,
                is_actualizable=False,
            )
    return rus_dag
Exemplo n.º 8
0
def fill_dags_kz(actualizable_bigartms, comboable_bigartms):
    """Build the 'NLPmonitor_BigARTMs_kz' DAG.

    Creates a ``wait_for_basic_tms`` PythonOperator (callable returns 0) and
    then calls ``gen_bigartm_operator`` for three models on the "main" corpus
    that differ only in their name and date bounds.

    :param actualizable_bigartms: forwarded as-is to ``gen_bigartm_operator``
    :param comboable_bigartms: forwarded as-is to ``gen_bigartm_operator``
    :return: the constructed DAG object
    """

    def register_model(model_name, window_start, window_end, gate_task):
        # All three models share every setting except name and date window.
        # NOTE(review): the description is "Two last years" for all of them,
        # including the one with no date bounds; confirm this is intended.
        gen_bigartm_operator(
            actualizable_bigartms,
            comboable_bigartms,
            name=model_name,
            description="Two last years",
            number_of_topics=200,
            filters={
                "corpus": "main",
                "source": None,
                "datetime_from": window_start,
                "datetime_to": window_end,
            },
            regularization_params={
                "SmoothSparseThetaRegularizer": 0.15,
                "SmoothSparsePhiRegularizer": 0.15,
                "DecorrelatorPhiRegularizer": 0.15,
                "ImproveCoherencePhiRegularizer": 0.15,
            },
            wait_for_basic_tms=gate_task,
            is_actualizable=False,
            text_field="text_lemmatized_kz_apertium",
        )

    kz_dag = DAG(
        'NLPmonitor_BigARTMs_kz',
        catchup=False,
        max_active_runs=1,
        concurrency=7,
        default_args=default_args,
        schedule_interval=None,
    )
    with kz_dag:
        gate = PythonOperator(
            task_id="wait_for_basic_tms",
            python_callable=lambda: 0,
        )

        # Calls stay inside the DAG context and keep their original order.
        register_model("bigartm_test_kz", None, None, gate)
        register_model("bigartm_two_years_kz",
                       date(2018, 6, 1), date(2020, 6, 1), gate)
        register_model("bigartm_2020_kz",
                       date(2019, 9, 1), date(2020, 6, 1), gate)
    return kz_dag
Exemplo n.º 9
0
def fill_dags_2020(actualizable_bigartms, comboable_bigartms):
    """Build the 'NLPmonitor_BigARTMs_2020' DAG.

    Creates a ``wait_for_basic_tms`` PythonOperator (callable returns 0) and
    then calls ``gen_bigartm_operator`` once per variant listed below.  All
    variants filter the "main" corpus over 2020-01-01 .. 2020-12-31; some add
    ``group_id`` / ``topic_weight_threshold`` filter entries.

    :param actualizable_bigartms: forwarded as-is to ``gen_bigartm_operator``
    :param comboable_bigartms: forwarded as-is to ``gen_bigartm_operator``
    :return: the constructed DAG object
    """
    # Columns: name, description, number_of_topics, extra filter entries that
    # are appended after the common corpus/source/date keys.
    variants = (
        ("bigartm_2020", "2020", 175, {}),
        # NOTE(review): name and description say 2019 but the date window
        # applied below is 2020; confirm which one is intended.
        ("bigartm_education_2019", "2019 education", 75,
         {"group_id": 95}),
        ("bigartm_education_2_2020", "2020 education 2 distilled", 60,
         {"group_id": 96, "topic_weight_threshold": 0.025}),
        ("bigartm_education_3_2020", "2020 education 3 distilled", 60,
         {"group_id": 98, "topic_weight_threshold": 0.025}),
    )

    dag_2020 = DAG(
        'NLPmonitor_BigARTMs_2020',
        catchup=False,
        max_active_runs=1,
        concurrency=7,
        default_args=default_args,
        schedule_interval=None,
    )
    with dag_2020:
        wait_for_basic_tms = PythonOperator(
            task_id="wait_for_basic_tms",
            python_callable=lambda: 0,
        )

        for model_name, model_description, topic_count, extra_filters in variants:
            # Start from the common keys, then append the variant-specific
            # ones so the key order matches a hand-written literal.
            model_filters = {
                "corpus": "main",
                "source": None,
                "datetime_from": date(2020, 1, 1),
                "datetime_to": date(2020, 12, 31),
            }
            model_filters.update(extra_filters)

            gen_bigartm_operator(
                actualizable_bigartms,
                comboable_bigartms,
                name=model_name,
                description=model_description,
                number_of_topics=topic_count,
                filters=model_filters,
                regularization_params={
                    "SmoothSparseThetaRegularizer": 0.15,
                    "SmoothSparsePhiRegularizer": 0.15,
                    "DecorrelatorPhiRegularizer": 0.15,
                    "ImproveCoherencePhiRegularizer": 0.15,
                },
                wait_for_basic_tms=wait_for_basic_tms,
                is_actualizable=False,
            )
    return dag_2020
Exemplo n.º 10
0
def fill_dags_healthcare(actualizable_bigartms, comboable_bigartms):
    """Build the 'NLPmonitor_BigARTMs_Healthcare' DAG.

    Creates a ``wait_for_basic_tms`` PythonOperator (callable returns 0) and
    then calls ``gen_bigartm_operator`` once per row of the table below, in
    table order.  Rows are grouped into four levels (zero to third); within a
    level the rows differ by corpus selection and ``group_id``.

    :param actualizable_bigartms: forwarded as-is to ``gen_bigartm_operator``
    :param comboable_bigartms: forwarded as-is to ``gen_bigartm_operator``
    :return: the constructed DAG object
    """
    # Columns: name, number_of_topics, corpus filter, datetime_to, extra
    # filter entries appended after the common corpus/source/date keys.
    # Each row has its own list/dict literals, so nothing is shared between
    # two gen_bigartm_operator calls.
    model_rows = [
        # ZERO LEVEL
        ("bigartm_2020_2021_kaz", 200,
         "main", date(2021, 4, 15), {}),
        ("bigartm_2020_2021_rus", 200,
         ["rus", "rus_propaganda"], date(2021, 4, 15), {}),
        ("bigartm_2020_2021_rus_kaz", 200,
         ["main", "rus", "rus_propaganda"], date(2021, 4, 15), {}),
        # FIRST LEVEL
        ("bigartm_2020_2021_rus_health_1", 150,
         ["rus", "rus_propaganda"], date(2021, 4, 15),
         {"group_id": 101}),
        ("bigartm_2020_2021_kaz_health_1", 150,
         "main", date(2021, 4, 15),
         {"group_id": 102}),
        ("bigartm_2020_2021_rus_kaz_health_1", 150,
         ["main", "rus", "rus_propaganda"], date(2021, 4, 15),
         {"group_id": 104}),
        # SECOND LEVEL
        ("bigartm_2020_2021_rus_health_2", 100,
         ["rus", "rus_propaganda"], date(2021, 4, 15),
         {"group_id": 105, "topic_weight_threshold": 0.04}),
        ("bigartm_2020_2021_kaz_health_2", 100,
         "main", date(2021, 4, 15),
         {"group_id": 106, "topic_weight_threshold": 0.04}),
        ("bigartm_2020_2021_rus_kaz_health_2", 100,
         ["main", "rus", "rus_propaganda"], date(2021, 4, 15),
         {"group_id": 107, "topic_weight_threshold": 0.04}),
        # THIRD LEVEL
        ("bigartm_2020_2021_rus_health_3", 50,
         ["rus", "rus_propaganda"], date(2021, 4, 20),
         {"group_id": 108, "topic_weight_threshold": 0.1}),
        ("bigartm_2020_2021_kaz_health_3", 50,
         "main", date(2021, 4, 20),
         {"group_id": 109, "topic_weight_threshold": 0.1}),
        ("bigartm_2020_2021_rus_kaz_health_3", 50,
         ["main", "rus", "rus_propaganda"], date(2021, 4, 20),
         {"group_id": 110, "topic_weight_threshold": 0.1}),
    ]

    healthcare_dag = DAG(
        'NLPmonitor_BigARTMs_Healthcare',
        catchup=False,
        max_active_runs=1,
        concurrency=7,
        default_args=default_args,
        schedule_interval=None,
    )
    with healthcare_dag:
        wait_for_basic_tms = PythonOperator(
            task_id="wait_for_basic_tms",
            python_callable=lambda: 0,
        )

        for model_name, topic_count, corpus, window_end, extra_filters in model_rows:
            # Common keys first, level-specific keys appended afterwards so
            # the resulting key order matches a hand-written literal.
            model_filters = {
                "corpus": corpus,
                "source": None,
                "datetime_from": date(2020, 1, 1),
                "datetime_to": window_end,
            }
            model_filters.update(extra_filters)

            gen_bigartm_operator(
                actualizable_bigartms,
                comboable_bigartms,
                name=model_name,
                description="2020-2021",
                number_of_topics=topic_count,
                filters=model_filters,
                regularization_params={
                    "SmoothSparseThetaRegularizer": 0.15,
                    "SmoothSparsePhiRegularizer": 0.15,
                    "DecorrelatorPhiRegularizer": 0.15,
                    "ImproveCoherencePhiRegularizer": 0.15,
                },
                wait_for_basic_tms=wait_for_basic_tms,
                is_actualizable=False,
                text_field="text_ngramized_kz_rus_yandex_ngrams_dict",
            )

    return healthcare_dag
Exemplo n.º 11
0
def fill_dags_ngramized(actualizable_bigartms, comboable_bigartms):
    """Build the two n-gram BigARTM DAGs and return the first one.

    Two DAGs are constructed, ``NLPmonitor_BigARTMs_ngramized`` and
    ``NLPmonitor_BigARTMs_ngramized_yandex``. Each one gets a
    ``wait_for_basic_tms`` task (its callable just returns 0) followed by
    a series of ``gen_bigartm_operator`` calls made inside the DAG's
    ``with`` block.

    Args:
        actualizable_bigartms: Passed through unchanged as the first
            positional argument of every ``gen_bigartm_operator`` call.
        comboable_bigartms: Passed through unchanged as the second
            positional argument of every ``gen_bigartm_operator`` call.

    Returns:
        The ``NLPmonitor_BigARTMs_ngramized`` DAG.

    NOTE(review): the ``NLPmonitor_BigARTMs_ngramized_yandex`` DAG is
    built here but is not part of the return value -- confirm that
    callers do not need it.
    """
    pymorphy_field = (
        "text_ngramized_kz_rus_ngrams_dict_pymorphy_2_4_393442_3710985")
    yandex_field = "text_ngramized_kz_rus_yandex_ngrams_dict"

    def new_dag(dag_id):
        # Both DAGs share every setting except their id.
        return DAG(dag_id,
                   catchup=False,
                   max_active_runs=1,
                   concurrency=7,
                   default_args=default_args,
                   schedule_interval=None)

    def new_wait_task():
        # Must be called inside a ``with <dag>:`` block, as in the
        # original layout of this function.
        return PythonOperator(
            task_id="wait_for_basic_tms",
            python_callable=lambda: 0,
        )

    def uniform_regularizers():
        # A fresh dict per call, so no two models share one object.
        return {
            "SmoothSparseThetaRegularizer": 0.15,
            "SmoothSparsePhiRegularizer": 0.15,
            "DecorrelatorPhiRegularizer": 0.15,
            "ImproveCoherencePhiRegularizer": 0.15
        }

    def main_corpus_filters(date_from, date_to, **extra):
        # Extra keys are appended after the four base keys, keeping the
        # key order of the hand-written dict literals.
        filters = {
            "corpus": "main",
            "source": None,
            "datetime_from": date_from,
            "datetime_to": date_to,
        }
        filters.update(extra)
        return filters

    def add_main_corpus_models(wait_task, name_suffix, text_field):
        # The three "main" corpus models are identical in both DAGs apart
        # from the name suffix and the text field they read.
        period_start = date(2017, 11, 1)
        period_end = date(2019, 12, 1)
        model_specs = (
            ("bigartm_test_ngram", "Two last years", 200,
             main_corpus_filters(None, None)),
            ("bigartm_two_years_ngram", "Two last years", 200,
             main_corpus_filters(period_start, period_end)),
            ("bigartm_education_two_years_ngram", "Two last years education",
             150,
             main_corpus_filters(period_start, period_end,
                                 group_id=7,
                                 topic_weight_threshold=0.05)),
        )
        for base_name, summary, topic_count, model_filters in model_specs:
            gen_bigartm_operator(
                actualizable_bigartms,
                comboable_bigartms,
                name=base_name + name_suffix,
                description=summary,
                number_of_topics=topic_count,
                filters=model_filters,
                regularization_params=uniform_regularizers(),
                wait_for_basic_tms=wait_task,
                is_actualizable=False,
                text_field=text_field,
            )

    dag = new_dag('NLPmonitor_BigARTMs_ngramized')
    with dag:
        wait_for_basic_tms = new_wait_task()
        add_main_corpus_models(wait_for_basic_tms, "", pymorphy_field)

        # The hate-speech model only exists in the first DAG; it has its
        # own corpora, regularizer weights and text field.
        gen_bigartm_operator(
            actualizable_bigartms,
            comboable_bigartms,
            name="bigartm_hate",
            description="Hate speech dataset",
            number_of_topics=100,
            filters={
                "corpus": [
                    "hate_hate",
                    "hate_offensive",
                    "hate_neither",
                    "hate_test",
                ],
                "source": None,
                "datetime_from": None,
                "datetime_to": None,
            },
            regularization_params={
                "SmoothSparseThetaRegularizer": 0,
                "SmoothSparsePhiRegularizer": 0.5,
                "DecorrelatorPhiRegularizer": 10,
                "ImproveCoherencePhiRegularizer": 0
            },
            wait_for_basic_tms=wait_for_basic_tms,
            is_actualizable=False,
            text_field="text_ngramized_en_lemminflect",
        )

    yandex_dag = new_dag('NLPmonitor_BigARTMs_ngramized_yandex')
    with yandex_dag:
        wait_for_basic_tms = new_wait_task()
        add_main_corpus_models(wait_for_basic_tms, "_yandex", yandex_field)

    return dag