def test_check_unique_names(test_graph_spec):
    test_graph_spec['node_lists'][0]['name'] = 'chocolate node'
    test_graph_spec['node_lists'][1]['name'] = 'chocolate node'
    with pytest.raises(ValueError) as excinfo:
        _ = GraphSpec.from_dict(test_graph_spec)
    assert excinfo.value.message == {
        '_schema': ["Names are not unique: [u'chocolate node']."]}
def test_env_graph_spec_loading_conflict(test_conflict_graph_spec):
    with pytest.raises(ValueError) as excinfo:
        _ = GraphSpec.from_dict(test_conflict_graph_spec)
    assert excinfo.value.message == {
        '_schema': [
            'Graph specification cannot contain both data_uri and '
            'data_uri_env. Only one of them should be specified.'
        ]
    }
def test_invalid_poll(test_graph_spec):
    test_cases = ['1 1', '', '1 1 a b', '100 100 * *']
    for test_case in test_cases:
        test_graph_spec['poll'] = test_case
        with pytest.raises(ValueError) as excinfo:
            _ = GraphSpec.from_dict(test_graph_spec)
        assert excinfo.value.message == {
            'poll': ['Invalid cron expression.']}
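
# Aside: the 'Invalid cron expression.' errors asserted above imply that the
# schema validates the ``poll`` field as a cron string. The stand-alone check
# below shows the same idea using the ``croniter`` package; it is an
# illustrative assumption, not the library's actual validator.
from croniter import croniter


def is_valid_cron(expression):
    """Return True if the expression parses as a five-field cron schedule."""
    return croniter.is_valid(expression)


assert not is_valid_cron('1 1')
assert is_valid_cron('0 0 * * *')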
def test_graph_spec_loading(test_graph_spec):
    graph_spec = GraphSpec.from_dict(test_graph_spec)
    assert graph_spec.name == 'test_data'
    assert graph_spec.poll == '0 0 * * *'
    assert graph_spec.data_uri == 'data_uri_value'
    assert graph_spec.graph_uri == 'graph_uri_value'
    assert len(list(graph_spec.node_lists)) == 3
    assert len(list(graph_spec.edge_lists)) == 3
def test_invalid_variable_definition(test_graph_spec):
    test_graph_spec['node_lists'][0]['index_column'][
        'variable_definition'] = 'not String'
    with pytest.raises(ValueError) as excinfo:
        _ = GraphSpec.from_dict(test_graph_spec)
    assert excinfo.value.message == {
        'node_lists': {0: {'index_column': {'variable_definition': [
            "Variable definition must be "
            "'String', 'Price' or 'Categorical'."
        ]}}}}
def test_safe_name(test_graph_spec):
    graph_spec = GraphSpec.from_dict(test_graph_spec)
    # Data set has 'chocolate nodes', 'sweets nodes' and 'toffee nodes'.
    intended_node_safe_names = [
        'fn_chocolate_nodes',
        'fn_sweets_nodes',
        'fn_toffee_nodes'
    ]
    node_safe_names = [node_list.safe_name
                       for node_list in graph_spec.node_lists]
    for safe_name in node_safe_names:
        assert safe_name in intended_node_safe_names
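
# Illustrative only: one plausible way ``safe_name`` could be derived, consistent
# with the expectation above ('chocolate nodes' -> 'fn_chocolate_nodes'). The
# helper name and prefix are assumptions for this sketch, not the library's
# actual implementation.
import re


def to_safe_name(name, prefix='fn_'):
    """Prefix the name and replace runs of non-word characters with underscores."""
    return prefix + re.sub(r'\W+', '_', name.strip()).lower()


assert to_safe_name('chocolate nodes') == 'fn_chocolate_nodes'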
def test_graph_spec_dumping(test_graph_spec):
    graph_spec = GraphSpec.from_dict(test_graph_spec)
    dumped_graph_spec = graph_spec.to_dict()

    assert graph_spec.name == dumped_graph_spec['name']
    assert graph_spec.poll == dumped_graph_spec['poll']
    assert graph_spec.data_uri == dumped_graph_spec['data_uri']
    assert graph_spec.graph_uri == dumped_graph_spec['graph_uri']
    assert len(list(graph_spec.node_lists)) == len(dumped_graph_spec['node_lists'])
    assert len(list(graph_spec.edge_lists)) == len(dumped_graph_spec['edge_lists'])

    for node_list, dumped_node_list in zip(graph_spec.node_lists,
                                           dumped_graph_spec['node_lists']):
        assert node_list.name == dumped_node_list['name']
        assert node_list.safe_name == dumped_node_list['safe_name']

    for edge_list, dumped_edge_list in zip(graph_spec.edge_lists,
                                           dumped_graph_spec['edge_lists']):
        assert edge_list.name == dumped_edge_list['name']
        assert edge_list.safe_name == dumped_edge_list['safe_name']
def test_table_details(test_graph_spec):
    graph_spec = GraphSpec.from_dict(test_graph_spec)
    assert graph_spec.table_details == {
        'connection': 'data_uri_value',
        'poll_frequency': '0 2 * * *',
        'tables': {
            ('test_data_chocolate_edge_list',
             'fn_test_data_chocolate_edge_list'): {
                ('chocolate_s', 'fn_chocolate_s'),
                ('chocolate_t', 'fn_chocolate_t')
            },
            ('test_data_chocolate_node_list',
             'fn_test_data_chocolate_node_list'): {
                ('id', 'fn_id')
            },
            ('test_data_sweets_edge_list',
             'fn_test_data_sweets_edge_list'): {
                ('sweets_s', 'fn_sweets_s'),
                ('sweets_t', 'fn_sweets_t')
            },
            ('test_data_sweets_node_list',
             'fn_test_data_sweets_node_list'): {
                ('id', 'fn_id'),
                ('prop', 'fn_prop')
            },
            ('test_data_toffee_edge_list',
             'fn_test_data_toffee_edge_list'): {
                ('toffee_s', 'fn_toffee_s'),
                ('toffee_t', 'fn_toffee_t')
            },
            ('test_data_toffee_node_list',
             'fn_test_data_toffee_node_list'): {
                ('hide', 'fn_hide'),
                ('id', 'fn_id'),
                ('prop', 'fn_prop')
            }
        }
    }
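
# The tests above rely on a ``test_graph_spec`` fixture supplied elsewhere
# (typically conftest.py). The sketch below shows the rough shape the
# assertions imply: a name, a cron ``poll``, data/graph URIs, and node and
# edge lists whose entries carry a ``name`` and an ``index_column`` with a
# ``variable_definition``. Field names shown are inferred from the tests;
# the real fixture contains more detail and may differ.
@pytest.fixture
def example_graph_spec_dict():
    """Hypothetical, trimmed-down graph specification dictionary."""
    return {
        'name': 'test_data',
        'poll': '0 0 * * *',
        'data_uri': 'data_uri_value',
        'graph_uri': 'graph_uri_value',
        'node_lists': [
            {'name': 'chocolate nodes',
             'index_column': {'name': 'id',
                              'variable_definition': 'String'}},
            # 'sweets nodes' and 'toffee nodes' entries omitted in this sketch
        ],
        'edge_lists': [
            # edge-list entries omitted in this sketch
        ]
    }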
PIPELINE_ARGS = {
    'owner': 'finnet',
    'depends_on_past': False,
    'start_date': datetime(2017, 1, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=10)
}

# pylint: disable=invalid-name
try:
    with open(SPEC_FILE) as spec_data:
        graph_spec = GraphSpec.from_dict(json.load(spec_data))
except (ValueError, IOError, KeyError) as e:
    logging.warning(e)
    graph_spec = None

if graph_spec:
    dag = DAG('fn_figov_grey_list_detection',
              catchup=True,
              default_args=PIPELINE_ARGS,
              schedule_interval=graph_spec.poll)

    num_days_between_runs = compute_days_between_runs(graph_spec.poll)

    # 1. Create task to wait for data ingestion
    grey_list_wait_for_data_ingestion = ExternalTaskSensor(
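
# Note: ``compute_days_between_runs`` is defined elsewhere in the pipeline. A
# minimal sketch of what it might do, assuming the ``croniter`` package, is
# shown below as a comment; it is an illustrative assumption, not the
# pipeline's actual helper.
#
#     from datetime import datetime
#     from croniter import croniter
#
#     def compute_days_between_runs(cron_expression):
#         """Days between two consecutive runs of a cron schedule."""
#         it = croniter(cron_expression, datetime(2017, 1, 1))
#         first = it.get_next(datetime)
#         second = it.get_next(datetime)
#         return (second - first).days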
def test_pipeline_tasks():
    """Test the pipeline end to end."""
    # Set the list of tasks to test
    dolist = [
        'build_lists',
        'resolve_entities',
        'neo4j_purger',
        'neo4j_writer',
        'graph_tools'
    ]

    # Get neo4j ssh username and port
    neo4j_ssh_username = os.environ.get('NEO4J_SSH_USERNAME', 'neo4j')
    neo4j_ssh_port = int(os.environ.get('NEO4J_SSH_PORT', 9000))

    # Set up the spark configuration
    config = dict()
    config['SparkConfiguration'] = (SparkConf()
                                    .setMaster('local[*]')
                                    .setAppName('test create data')
                                    .set('spark.executor.memory', '1024m'))

    # Get the graph specs
    datalist = os.listdir(LOCAL_DATA_PATH)
    jsonlist = [k for k in datalist if re.match(r'.*\.json$', k)]

    # Read in each graph spec
    for gspec in jsonlist:
        # Load the graph spec
        with open(os.path.join(LOCAL_DATA_PATH, gspec), 'r') as f:
            graph_spec = GraphSpec.from_dict(json.load(f))
            spec = graph_spec.to_dict()

        tables_path = os.path.join(DATA_PATH, graph_spec.name, 'tables')
        n_path = os.path.join(DATA_PATH, graph_spec.name, 'node_list')
        e_path = os.path.join(DATA_PATH, graph_spec.name, 'edge_list')
        n_path_res = os.path.join(DATA_PATH, graph_spec.name,
                                  'node_list_resolved')
        e_path_res = os.path.join(DATA_PATH, graph_spec.name,
                                  'edge_list_resolved')

        logging.info("Processing " + gspec)

        # Use the graph specification's neo4j connection
        neo_config = {
            'uri': spec['graph_uri'],
            'max_retries': config.get('neo4j.max_retries', 5),
            'max_batchsize': config.get('neo4j.max_batchsize', 10000)
        }

        # Build lists
        if 'build_lists' in dolist:
            logging.info("Building lists...")
            build_node_lists(
                graph_specification=graph_spec,
                spark_config=(SparkConfFactory()
                              .set_master('local[*]')
                              .set_app_name('test create data')
                              .set('spark.executor.memory', '1g')),
                tables_path=tables_path,
                node_path=n_path,
                data_format=DATA_FORMAT,
            )
            build_edge_lists(
                graph_specification=graph_spec,
                spark_config=(SparkConfFactory()
                              .set_master('local[*]')
                              .set_app_name('test create data')
                              .set('spark.executor.memory', '1g')),
                tables_path=tables_path,
                edge_path=e_path,
                data_format=DATA_FORMAT,
            )

            logging.info("Checking build_lists...")
            with get_spark_context(config['SparkConfiguration']) as spark_ctx:
                sql_context = SQLContext(spark_ctx,
                                         sparkSession=SparkSession(spark_ctx))
                assert test_build_lists(spark_ctx, sql_context, spec)

        # Resolve entities
        if 'resolve_entities' in dolist:
            logging.info("Resolving entities...")
            resolve_node_entities(
                graph_specification=graph_spec,
                spark_config=(SparkConfFactory()
                              .set_master('local[*]')
                              .set_app_name('test create data')
                              .set('spark.executor.memory', '1g')),
                entity_maps=dict(),
                input_node_path=n_path,
                output_node_path=n_path_res,
                output_node_id='_canonical_id',
                data_format=DATA_FORMAT
            )
            resolve_edge_entities(
                graph_specification=graph_spec,
                spark_config=(SparkConfFactory()
                              .set_master('local[*]')
                              .set_app_name('test create data')
                              .set('spark.executor.memory', '1g')),
                entity_maps=dict(),
                input_edge_path=e_path,
                output_edge_path=e_path_res,
                output_edge_source_id='_canonical_id_source',
                output_edge_target_id='_canonical_id_target',
                data_format=DATA_FORMAT
            )

        # Purge the graph
        if 'neo4j_purger' in dolist:
            logging.info("Purging Neo4j...")
            neo4j_manager.purge(graph_spec,
                                username=neo4j_ssh_username,
                                port=neo4j_ssh_port)

            logging.info("Checking purging neo4j...")
            with get_neo4j_context(neo_config['uri']) as neo_context:
                assert test_neo4j_purger(neo_context)

        # Graph writer
        if 'neo4j_writer' in dolist:
            logging.info("Writing to Neo4j...")
            graph_to_neo4j.graph_to_neo4j(graph_specification=graph_spec,
                                          spark_config=SparkConfFactory()
                                          .set_master('local[*]')
                                          .set_app_name('write neo4j nodes')
.set("spark.driver.maxResultSize", "1g") .set('spark.executor.memory', '1g'), input_node_path=n_path_res, input_edge_path=e_path_res, username=neo4j_ssh_username, port=neo4j_ssh_port ) # This inserts node properties that were not captured above, more convenient like this??? neo4j_writer.write_neo4j_nodes(graph_specification=spec, spark_config=SparkConfFactory() .set_master('local[*]') .set_app_name('write neo4j nodes') .set('spark.executor.memory', '1g') ) datetime_now = datetime.now() logging.info("Backing up db, then purge it...") neo4j_manager.backup(graph_spec, datetime_now, username=neo4j_ssh_username, port=neo4j_ssh_port) neo4j_manager.purge(graph_spec, username=neo4j_ssh_username, port=neo4j_ssh_port) logging.info("Restoring the backup to db...") neo4j_manager.restore(graph_spec, datetime_now, username=neo4j_ssh_username, port=neo4j_ssh_port) logging.info("Checking write neo4j...") with get_spark_context(config['SparkConfiguration']) as spark_ctx: sql_context = SQLContext(spark_ctx, sparkSession=SparkSession(spark_ctx)) with get_neo4j_context(neo_config['uri']) as neo_context: assert test_neo4j_writer( spark_ctx, sql_context, neo_context, spec ) if 'graph_tools' in dolist: # Test graph_construction_coi.get_graph_dataframes data_path = os.environ['PIPELINE_DATA_PATH'] graph_name = graph_spec.name node_path_resolved = os.path.join(data_path, graph_name, 'node_list_resolved') edge_path_resolved = os.path.join(data_path, graph_name, 'edge_list_resolved') with get_spark_context(config['SparkConfiguration']) as spark_ctx: sql_context = SQLContext(spark_ctx, sparkSession=SparkSession(spark_ctx)) graph = get_graph_dataframes(graph_spec, sql_context, node_path_resolved, edge_path_resolved, DATA_FORMAT) assert 'node_list' in graph assert 'edge_list' in graph assert len(graph['node_list']) == len(graph_spec.node_lists) for cur_node_list in graph_spec.node_lists: assert cur_node_list.safe_name in graph['node_list'] assert len(graph['edge_list']) == len(graph_spec.edge_lists) for cur_edge_list in graph_spec.edge_lists: assert cur_edge_list.safe_name in graph['edge_list'] # Test graph_construction_coi.data_loading with get_spark_context(config['SparkConfiguration']) as spark_ctx: sql_context = SQLContext(spark_ctx, sparkSession=SparkSession(spark_ctx)) tables = load_node_edge_lists(sql_context, graph_spec, node_path_resolved, edge_path_resolved, DATA_FORMAT) for cur_edge_list in graph_spec.edge_lists: assert (cur_edge_list.safe_table_name, cur_edge_list.source_column.safe_name, cur_edge_list.target_column.safe_name) in tables assert len(tables) == len(graph_spec.node_lists) + len(graph_spec.edge_lists) logging.info("Completed run_tests()")