Example #1
def test_check_unique_names(test_graph_spec):
    test_graph_spec['node_lists'][0]["name"] = "chocolate node"
    test_graph_spec['node_lists'][1]["name"] = "chocolate node"

    with pytest.raises(ValueError) as excinfo:
        _ = GraphSpec.from_dict(test_graph_spec)
    assert excinfo.value.message == {
        '_schema': ["Names are not unique: [u'chocolate node']."]}
Example #2
def test_env_graph_spec_loading_conflict(test_conflict_graph_spec):
    with pytest.raises(ValueError) as excinfo:
        _ = GraphSpec.from_dict(test_conflict_graph_spec)
    assert excinfo.value.message == {
        '_schema': [
            'Graph specification cannot contain both data_uri and '
            'data_uri_env. Only one of them should be specified.'
        ]
    }
Example #3
def test_invalid_poll(test_graph_spec):
    test_cases = ['1 1', '', '1 1 a b', '100 100 * *']

    for test_case in test_cases:
        test_graph_spec['poll'] = test_case

        with pytest.raises(ValueError) as excinfo:
            _ = GraphSpec.from_dict(test_graph_spec)
        assert excinfo.value.message == {
            'poll': ['Invalid cron expression.']}
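The cron validation behind the 'poll' field is not shown in these examples. A minimal sketch of such a validator, assuming the croniter package and a Marshmallow-style error (both assumptions about the underlying implementation):

from croniter import croniter
from marshmallow import ValidationError


def validate_poll(expression):
    # Hypothetical validator: croniter.is_valid() rejects malformed
    # expressions such as '1 1', '' and '100 100 * *'.
    if not croniter.is_valid(expression):
        raise ValidationError('Invalid cron expression.')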
Example #4
def test_graph_spec_loading(test_graph_spec):
    graph_spec = GraphSpec.from_dict(test_graph_spec)

    assert graph_spec.name == 'test_data'
    assert graph_spec.poll == '0 0 * * *'
    assert graph_spec.data_uri == 'data_uri_value'
    assert graph_spec.graph_uri == 'graph_uri_value'

    assert len(list(graph_spec.node_lists)) == 3
    assert len(list(graph_spec.edge_lists)) == 3
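The tests in this section all mutate a shared test_graph_spec fixture whose definition is not included here. The assertions imply roughly the following shape; this is an abbreviated, hypothetical sketch (the real fixture defines three node lists and three edge lists with full column definitions):

import pytest


@pytest.fixture
def test_graph_spec():
    # Illustrative shape only; anything beyond the asserted fields is a guess.
    return {
        'name': 'test_data',
        'poll': '0 0 * * *',
        'data_uri': 'data_uri_value',
        'graph_uri': 'graph_uri_value',
        'node_lists': [
            {'name': 'chocolate nodes',
             'index_column': {'name': 'id',
                              'variable_definition': 'String'}},
            # 'sweets nodes' and 'toffee nodes' entries omitted
        ],
        'edge_lists': [
            # three edge-list entries omitted
        ],
    }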
Example #5
def test_invalid_variable_definition(test_graph_spec):
    test_graph_spec['node_lists'][0]['index_column'][
        'variable_definition'] = 'not String'

    with pytest.raises(ValueError) as excinfo:
        _ = GraphSpec.from_dict(test_graph_spec)
    assert excinfo.value.message == {
        'node_lists': {0: {'index_column': {'variable_definition': [
            "Variable definition must be "
            "'String', 'Price' or 'Categorical'."
        ]}}}}
Example #6
def test_safe_name(test_graph_spec):
    graph_spec = GraphSpec.from_dict(test_graph_spec)

    # data set has 'chocolate nodes', 'sweets nodes' and 'toffee nodes'
    intended_node_safe_names = [
        'fn_chocolate_nodes',
        'fn_sweets_nodes',
        'fn_toffee_nodes'
    ]

    node_safe_names = [node_list.safe_name
                       for node_list
                       in graph_spec.node_lists]

    for safe_name in node_safe_names:
        assert safe_name in intended_node_safe_names
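The mapping from display names such as 'chocolate nodes' to safe names such as 'fn_chocolate_nodes' is not part of these examples. One plausible implementation, offered only as an assumption about how safe_name might be derived:

import re


def make_safe_name(name, prefix='fn_'):
    # Hypothetical helper: lowercase the name and collapse runs of
    # non-alphanumeric characters into underscores,
    # e.g. 'chocolate nodes' -> 'fn_chocolate_nodes'.
    return prefix + re.sub(r'[^0-9a-z]+', '_', name.lower()).strip('_')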
Example #7
def test_graph_spec_dumping(test_graph_spec):
    graph_spec = GraphSpec.from_dict(test_graph_spec)
    dumped_graph_spec = graph_spec.to_dict()

    assert graph_spec.name == dumped_graph_spec['name']
    assert graph_spec.poll == dumped_graph_spec['poll']
    assert graph_spec.data_uri == dumped_graph_spec['data_uri']
    assert graph_spec.graph_uri == dumped_graph_spec['graph_uri']

    assert len(list(graph_spec.node_lists)) == len(dumped_graph_spec['node_lists'])
    assert len(list(graph_spec.edge_lists)) == len(dumped_graph_spec['edge_lists'])

    for node_list, dumped_node_list in zip(graph_spec.node_lists,
                                           dumped_graph_spec['node_lists']):
        assert node_list.name == dumped_node_list['name']
        assert node_list.safe_name == dumped_node_list['safe_name']

    for edge_list, dumped_edge_list in zip(graph_spec.edge_lists,
                                           dumped_graph_spec['edge_lists']):
        assert edge_list.name == dumped_edge_list['name']
        assert edge_list.safe_name == dumped_edge_list['safe_name']
Example #8
def test_table_details(test_graph_spec):
    graph_spec = GraphSpec.from_dict(test_graph_spec)

    assert graph_spec.table_details == {
        'connection': 'data_uri_value',
        'poll_frequency': '0 2 * * *',
        'tables': {
            ('test_data_chocolate_edge_list',
             'fn_test_data_chocolate_edge_list'): {
                ('chocolate_s', 'fn_chocolate_s'),
                ('chocolate_t', 'fn_chocolate_t')
            },
            ('test_data_chocolate_node_list',
             'fn_test_data_chocolate_node_list'): {
                ('id', 'fn_id')
            },
            ('test_data_sweets_edge_list',
             'fn_test_data_sweets_edge_list'): {
                ('sweets_s', 'fn_sweets_s'),
                ('sweets_t', 'fn_sweets_t')
            },
            ('test_data_sweets_node_list',
             'fn_test_data_sweets_node_list'): {
                ('id', 'fn_id'),
                ('prop', 'fn_prop')
            },
            ('test_data_toffee_edge_list',
             'fn_test_data_toffee_edge_list'): {
                ('toffee_s', 'fn_toffee_s'),
                ('toffee_t', 'fn_toffee_t')
            },
            ('test_data_toffee_node_list',
             'fn_test_data_toffee_node_list'): {
                ('hide', 'fn_hide'),
                ('id', 'fn_id'),
                ('prop', 'fn_prop')
            }
        }
    }
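table_details appears to map (table name, safe table name) pairs to sets of (column name, safe column name) pairs, alongside the connection URI and poll frequency. A small consumer illustrating that assumed structure:

def iter_table_columns(table_details):
    # Hypothetical helper: yields (table, safe_table, column, safe_column)
    # for every column declared in the specification's table details.
    for (table, safe_table), columns in sorted(table_details['tables'].items()):
        for column, safe_column in sorted(columns):
            yield table, safe_table, column, safe_column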
Example #9
PIPELINE_ARGS = {
    'owner': 'finnet',
    'depends_on_past': False,
    'start_date': datetime(2017, 1, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=10)
}


# pylint: disable=invalid-name
try:
    with open(SPEC_FILE) as spec_data:
        graph_spec = GraphSpec.from_dict(json.load(spec_data))
except (ValueError, IOError, KeyError) as e:
    logging.warning(e)
    graph_spec = None


if graph_spec:
    dag = DAG('fn_figov_grey_list_detection',
              catchup=True,
              default_args=PIPELINE_ARGS,
              schedule_interval=graph_spec.poll)

    num_days_between_runs = compute_days_between_runs(graph_spec.poll)

    # 1. Create task to wait for data ingestion
    grey_list_wait_for_data_ingestion = ExternalTaskSensor(
Example #10
def test_pipeline_tasks():
    """
    Test the pipeline
    """

    # Set the list of tasks to test
    dolist = [
        'build_lists', 'resolve_entities',
        'neo4j_purger', 'neo4j_writer',
        'graph_tools'
    ]

    # Get neo4j ssh username and port
    neo4j_ssh_username = os.environ.get('NEO4J_SSH_USERNAME', 'neo4j')
    neo4j_ssh_port = int(os.environ.get('NEO4J_SSH_PORT', 9000))

    # Setup the spark configuration
    config = dict()
    config['SparkConfiguration'] = (SparkConf()
                                    .setMaster('local[*]')
                                    .setAppName("test create data")
                                    .set("spark.executor.memory", "1024m"))

    # Get the graph specs
    datalist = os.listdir(LOCAL_DATA_PATH)
    jsonlist = [k for k in datalist if re.match(r'.*\.json$', k)]

    # Read in the graph spec
    for gspec in jsonlist:
        # Load the graph spec
        with open(os.path.join(LOCAL_DATA_PATH, gspec), 'r') as f:
            graph_spec = GraphSpec.from_dict(json.load(f))
            spec = graph_spec.to_dict()

        tables_path = os.path.join(DATA_PATH, graph_spec.name, 'tables')
        n_path = os.path.join(DATA_PATH, graph_spec.name, 'node_list')
        e_path = os.path.join(DATA_PATH, graph_spec.name, 'edge_list')
        n_path_res = os.path.join(DATA_PATH, graph_spec.name, 'node_list_resolved')
        e_path_res = os.path.join(DATA_PATH, graph_spec.name, 'edge_list_resolved')

        logging.info("Processing " + gspec)

        # Use graph specification's neo4j connection
        neo_config = {
            'uri': spec['graph_uri'],
            'max_retries': config.get('neo4j.max_retries', 5),
            'max_batchsize': config.get('neo4j.max_batchsize', 10000)
        }

        # Build list
        if 'build_lists' in dolist:
            logging.info("Building lists...")
            build_node_lists(
                graph_specification=graph_spec,
                spark_config=(SparkConfFactory()
                              .set_master('local[*]')
                              .set_app_name('test create data')
                              .set('spark.executor.memory', '1g')),
                tables_path=tables_path,
                node_path=n_path,
                data_format=DATA_FORMAT,
            )
            build_edge_lists(
                graph_specification=graph_spec,
                spark_config=(SparkConfFactory()
                              .set_master('local[*]')
                              .set_app_name('test create data')
                              .set('spark.executor.memory', '1g')),
                tables_path=tables_path,
                edge_path=e_path,
                data_format=DATA_FORMAT,
            )
            logging.info("Checking build_lists...")
            with get_spark_context(config['SparkConfiguration']) as spark_ctx:
                sql_context = SQLContext(spark_ctx, sparkSession=SparkSession(spark_ctx))
                assert test_build_lists(spark_ctx, sql_context, spec)

        # Resolve entities
        if 'resolve_entities' in dolist:
            logging.info("Resolving entities...")
            resolve_node_entities(
                graph_specification=graph_spec,
                spark_config=(SparkConfFactory()
                              .set_master('local[*]')
                              .set_app_name('test create data')
                              .set('spark.executor.memory', '1g')),
                entity_maps=dict(),
                input_node_path=n_path,
                output_node_path=n_path_res,
                output_node_id='_canonical_id',
                data_format=DATA_FORMAT
            )
            resolve_edge_entities(
                graph_specification=graph_spec,
                spark_config=(SparkConfFactory()
                              .set_master('local[*]')
                              .set_app_name('test create data')
                              .set('spark.executor.memory', '1g')),
                entity_maps=dict(),
                input_edge_path=e_path,
                output_edge_path=e_path_res,
                output_edge_source_id='_canonical_id_source',
                output_edge_target_id='_canonical_id_target',
                data_format=DATA_FORMAT
            )

        # Purging the graph
        if 'neo4j_purger' in dolist:
            logging.info("Purging Neo4j...")
            neo4j_manager.purge(graph_spec,
                                username=neo4j_ssh_username,
                                port=neo4j_ssh_port)
            logging.info("Checking purging neo4j...")
            with get_neo4j_context(neo_config['uri']) as neo_context:
                assert test_neo4j_purger(neo_context)

        # Graph writer
        if 'neo4j_writer' in dolist:
            logging.info("Writing to Neo4j...")
            graph_to_neo4j.graph_to_neo4j(graph_specification=graph_spec,
                                          spark_config=SparkConfFactory()
                                          .set_master('local[*]')
                                          .set_app_name('write neo4j nodes')
                                          .set("spark.driver.maxResultSize",
                                               "1g")
                                          .set('spark.executor.memory',
                                               '1g'),
                                          input_node_path=n_path_res,
                                          input_edge_path=e_path_res,
                                          username=neo4j_ssh_username,
                                          port=neo4j_ssh_port
                                          )

            # Insert node properties that were not captured above.
            neo4j_writer.write_neo4j_nodes(graph_specification=spec,
                                           spark_config=SparkConfFactory()
                                           .set_master('local[*]')
                                           .set_app_name('write neo4j nodes')
                                           .set('spark.executor.memory',
                                                '1g')
                                           )

            datetime_now = datetime.now()
            logging.info("Backing up db, then purge it...")
            neo4j_manager.backup(graph_spec, datetime_now,
                                 username=neo4j_ssh_username,
                                 port=neo4j_ssh_port)
            neo4j_manager.purge(graph_spec,
                                username=neo4j_ssh_username,
                                port=neo4j_ssh_port)
            logging.info("Restoring the backup to db...")
            neo4j_manager.restore(graph_spec,
                                  datetime_now,
                                  username=neo4j_ssh_username,
                                  port=neo4j_ssh_port)

            logging.info("Checking write neo4j...")
            with get_spark_context(config['SparkConfiguration']) as spark_ctx:
                sql_context = SQLContext(spark_ctx, sparkSession=SparkSession(spark_ctx))
                with get_neo4j_context(neo_config['uri']) as neo_context:
                    assert test_neo4j_writer(
                        spark_ctx, sql_context, neo_context, spec
                    )

        if 'graph_tools' in dolist:
            # Test graph_construction_coi.get_graph_dataframes
            data_path = os.environ['PIPELINE_DATA_PATH']
            graph_name = graph_spec.name
            node_path_resolved = os.path.join(data_path, graph_name, 'node_list_resolved')
            edge_path_resolved = os.path.join(data_path, graph_name, 'edge_list_resolved')
            with get_spark_context(config['SparkConfiguration']) as spark_ctx:
                sql_context = SQLContext(spark_ctx, sparkSession=SparkSession(spark_ctx))
                graph = get_graph_dataframes(graph_spec, sql_context,
                                             node_path_resolved, edge_path_resolved,
                                             DATA_FORMAT)

                assert 'node_list' in graph
                assert 'edge_list' in graph
                assert len(graph['node_list']) == len(graph_spec.node_lists)
                for cur_node_list in graph_spec.node_lists:
                    assert cur_node_list.safe_name in graph['node_list']
                assert len(graph['edge_list']) == len(graph_spec.edge_lists)
                for cur_edge_list in graph_spec.edge_lists:
                    assert cur_edge_list.safe_name in graph['edge_list']

            # Test graph_construction_coi.data_loading
            with get_spark_context(config['SparkConfiguration']) as spark_ctx:
                sql_context = SQLContext(spark_ctx, sparkSession=SparkSession(spark_ctx))
                tables = load_node_edge_lists(sql_context, graph_spec,
                                              node_path_resolved, edge_path_resolved,
                                              DATA_FORMAT)
                for cur_edge_list in graph_spec.edge_lists:
                    assert (cur_edge_list.safe_table_name,
                            cur_edge_list.source_column.safe_name,
                            cur_edge_list.target_column.safe_name) in tables
                assert len(tables) == len(graph_spec.node_lists) + len(graph_spec.edge_lists)
    logging.info("Completed run_tests()")