예제 #1
0
    def test_get_pipeline_spec_from_config(self, mock_notebook_item):
        """Verify the spec built by ``bq._get_pipeline_spec_from_config``.

        Each scenario passes a pipeline config made of some combination of
        'input', 'transformation' and 'output' sections and compares the
        returned spec against a hand-written expectation with
        ``assertDictEqual``. An empty config is expected to raise.

        Args:
          mock_notebook_item: mock whose ``return_value`` is set below to a
            ``Query`` wrapping 'foo_query_sql_string'; presumably injected
            by a patch decorator that is outside this view.
        """
        mock_notebook_item.return_value = google.datalab.bigquery.Query(
            'foo_query_sql_string')

        # empty pipeline_spec
        with self.assertRaisesRegexp(Exception,
                                     'Pipeline has no tasks to execute.'):
            bq._get_pipeline_spec_from_config({})

        # empty input, transformation, output as path
        pipeline_config = {
            'transformation': {
                'query': 'foo_query'
            },
            'output': {
                'path': 'foo_table'
            }
        }

        expected = {
            'tasks': {
                'bq_pipeline_execute_task': {
                    'sql': u'foo_query_sql_string',
                    'type': 'pydatalab.bq.execute'
                },
                'bq_pipeline_extract_task': {
                    'path': 'foo_table',
                    'type': 'pydatalab.bq.extract',
                    'up_stream': ['bq_pipeline_execute_task']
                }
            }
        }

        self.assertDictEqual(
            bq._get_pipeline_spec_from_config(pipeline_config), expected)

        # input as path, transformation, output as path
        pipeline_config = {
            'input': {
                'path': 'foo_path',
                'data_source': 'foo_data_source',
            },
            'transformation': {
                'query': 'foo_query'
            },
            'output': {
                'path': 'foo_table'
            }
        }

        expected = {
            'tasks': {
                'bq_pipeline_execute_task': {
                    'sql': u'foo_query_sql_string',
                    'data_source': 'foo_data_source',
                    'path': 'foo_path',
                    'type': 'pydatalab.bq.execute'
                },
                'bq_pipeline_extract_task': {
                    'path': 'foo_table',
                    'type': 'pydatalab.bq.extract',
                    'up_stream': ['bq_pipeline_execute_task']
                }
            }
        }

        self.assertDictEqual(
            bq._get_pipeline_spec_from_config(pipeline_config), expected)

        # input as path->table, transformation, output as path
        pipeline_config = {
            'input': {
                'path': 'foo_path',
                'table': 'foo_table_1'
            },
            'transformation': {
                'query': 'foo_query'
            },
            'output': {
                'path': 'foo_path_2'
            }
        }

        expected = {
            'tasks': {
                'bq_pipeline_load_task': {
                    'type': 'pydatalab.bq.load',
                    'path': 'foo_path',
                    'table': 'foo_table_1',
                },
                'bq_pipeline_execute_task': {
                    'sql': u'foo_query_sql_string',
                    'type': 'pydatalab.bq.execute',
                    'up_stream': ['bq_pipeline_load_task'],
                },
                'bq_pipeline_extract_task': {
                    'path': 'foo_path_2',
                    'type': 'pydatalab.bq.extract',
                    'up_stream': ['bq_pipeline_execute_task']
                }
            }
        }

        # This scenario must be asserted like the others; without the check
        # the config/expectation pair above would be overwritten unverified.
        self.assertDictEqual(
            bq._get_pipeline_spec_from_config(pipeline_config), expected)

        # input as table, transformation, output as path
        pipeline_config = {
            'input': {
                'table': 'foo_table_1'
            },
            'transformation': {
                'query': 'foo_query'
            },
            'output': {
                'path': 'foo_path_2'
            }
        }

        expected = {
            'tasks': {
                'bq_pipeline_execute_task': {
                    'sql': u'foo_query_sql_string',
                    'type': 'pydatalab.bq.execute',
                },
                'bq_pipeline_extract_task': {
                    'path': 'foo_path_2',
                    'type': 'pydatalab.bq.extract',
                    'up_stream': ['bq_pipeline_execute_task']
                }
            }
        }

        self.assertDictEqual(
            bq._get_pipeline_spec_from_config(pipeline_config), expected)

        # input as table, transformation, output as table
        pipeline_config = {
            'input': {
                'table': 'foo_table_1'
            },
            'transformation': {
                'query': 'foo_query'
            },
            'output': {
                'table': 'foo_table_1'
            }
        }

        expected = {
            'tasks': {
                'bq_pipeline_execute_task': {
                    'sql': u'foo_query_sql_string',
                    'type': 'pydatalab.bq.execute',
                    'table': 'foo_table_1'
                },
            }
        }

        self.assertDictEqual(
            bq._get_pipeline_spec_from_config(pipeline_config), expected)

        # input as table, no transformation, output as path
        pipeline_config = {
            'input': {
                'table': 'foo_table'
            },
            'output': {
                'path': 'foo_path'
            }
        }

        expected = {
            'tasks': {
                'bq_pipeline_extract_task': {
                    'type': 'pydatalab.bq.extract',
                    'path': 'foo_path',
                    'table': 'foo_table'
                },
            }
        }

        self.assertDictEqual(
            bq._get_pipeline_spec_from_config(pipeline_config), expected)

        # output only; the expectation is identical to the case above, so
        # `expected` is deliberately reused rather than redefined
        pipeline_config = {
            'output': {
                'table': 'foo_table',
                'path': 'foo_path'
            }
        }

        self.assertDictEqual(
            bq._get_pipeline_spec_from_config(pipeline_config), expected)

        # input as path, no transformation, output as table
        pipeline_config = {
            'input': {
                'path': 'foo_path'
            },
            'output': {
                'table': 'foo_table'
            }
        }

        expected = {
            'tasks': {
                'bq_pipeline_load_task': {
                    'type': 'pydatalab.bq.load',
                    'path': 'foo_path',
                    'table': 'foo_table'
                },
            }
        }

        self.assertDictEqual(
            bq._get_pipeline_spec_from_config(pipeline_config), expected)

        # input only; the expectation is identical to the case above, so
        # `expected` is deliberately reused rather than redefined
        pipeline_config = {
            'input': {
                'path': 'foo_path',
                'table': 'foo_table'
            },
        }

        self.assertDictEqual(
            bq._get_pipeline_spec_from_config(pipeline_config), expected)

        # only transformation
        pipeline_config = {
            'transformation': {
                'query': 'foo_query'
            },
        }

        expected = {
            'tasks': {
                'bq_pipeline_execute_task': {
                    'sql': u'foo_query_sql_string',
                    'type': 'pydatalab.bq.execute',
                },
            }
        }

        self.assertDictEqual(
            bq._get_pipeline_spec_from_config(pipeline_config), expected)
예제 #2
0
    def test_get_pipeline_spec_from_config(self, mock_notebook_item):
        """Check the pipeline spec produced for each supported config shape.

        Every scenario hands a config dict to
        ``bq._get_pipeline_spec_from_config`` and compares the returned spec
        with a hand-built expectation through ``assertPipelineConfigEquals``.
        An empty config is expected to raise.

        Args:
          mock_notebook_item: mock whose ``return_value`` is set below to a
            ``Query`` wrapping 'foo_query_sql_string'; presumably injected
            by a patch decorator that is outside this view.
        """
        mock_notebook_item.return_value = google.datalab.bigquery.Query(
            'foo_query_sql_string')

        # Task names used as keys in the expected specs.
        load_name = 'bq_pipeline_load_task'
        execute_name = 'bq_pipeline_execute_task'
        extract_name = 'bq_pipeline_extract_task'

        # SQL expected when no input table is configured ...
        plain_sql = u'foo_query_sql_string'
        # ... and when the input section names the table `foo_table_1`.
        wrapped_sql = (
            u'WITH input AS (\n  SELECT * FROM `foo_table_1`\n)\n\nfoo_query_sql_string')
        # Templated table reference expected on an extract task that runs
        # downstream of an execute task.
        xcom_table = (
            """{{ ti.xcom_pull(task_ids='bq_pipeline_execute_task_id').get('table') }}""")

        def query_section():
            # Fresh 'transformation' section for each scenario.
            return {'query': 'foo_query'}

        def execute_spec(sql, **extras):
            # Expected execute task; `extras` carries scenario-specific keys.
            spec = {'sql': sql, 'type': 'pydatalab.bq.execute'}
            spec.update(extras)
            return spec

        def chained_extract_spec(path):
            # Expected extract task that depends on the execute task.
            return {
                'table': xcom_table,
                'path': path,
                'type': 'pydatalab.bq.extract',
                'up_stream': [execute_name],
            }

        def load_spec(table):
            # Expected load task reading 'foo_path' into `table`.
            return {
                'type': 'pydatalab.bq.load',
                'path': 'foo_path',
                'table': table,
            }

        def verify(config, wanted, parameters=None):
            # Build the spec from `config` and compare it with `wanted`.
            produced = bq._get_pipeline_spec_from_config(config)
            self.assertPipelineConfigEquals(produced, wanted, parameters)

        # An empty config has nothing to run.
        with self.assertRaisesRegexp(Exception,
                                     'Pipeline has no tasks to execute.'):
            bq._get_pipeline_spec_from_config({})

        # No input; transformation; output as path.
        verify(
            {
                'transformation': query_section(),
                'output': {'path': 'foo_table'},
            },
            {
                'tasks': {
                    execute_name: execute_spec(plain_sql),
                    extract_name: chained_extract_spec('foo_table'),
                }
            })

        # Input as path (with data source); transformation; output as path.
        verify(
            {
                'input': {
                    'path': 'foo_path',
                    'data_source': 'foo_data_source',
                },
                'transformation': query_section(),
                'output': {'path': 'foo_table'},
            },
            {
                'tasks': {
                    execute_name: execute_spec(
                        plain_sql,
                        data_source='foo_data_source',
                        path='foo_path'),
                    extract_name: chained_extract_spec('foo_table'),
                }
            })

        # Input as path->table; transformation; output as path.
        verify(
            {
                'input': {
                    'path': 'foo_path',
                    'table': 'foo_table_1',
                },
                'transformation': query_section(),
                'output': {'path': 'foo_path_2'},
            },
            {
                'tasks': {
                    load_name: load_spec('foo_table_1'),
                    execute_name: execute_spec(
                        wrapped_sql, up_stream=[load_name]),
                    extract_name: chained_extract_spec('foo_path_2'),
                }
            })

        # Input as table; transformation; output as path.
        verify(
            {
                'input': {'table': 'foo_table_1'},
                'transformation': query_section(),
                'output': {'path': 'foo_path_2'},
            },
            {
                'tasks': {
                    execute_name: execute_spec(wrapped_sql),
                    extract_name: chained_extract_spec('foo_path_2'),
                }
            })

        # Input as table; transformation; output as table.
        verify(
            {
                'input': {'table': 'foo_table_1'},
                'transformation': query_section(),
                'output': {'table': 'foo_table_1'},
            },
            {
                'tasks': {
                    execute_name: execute_spec(
                        wrapped_sql, table='foo_table_1'),
                }
            })

        # Extract-only expectation, shared by the next three scenarios.
        extract_only = {
            'tasks': {
                extract_name: {
                    'type': 'pydatalab.bq.extract',
                    'path': 'foo_path',
                    'table': 'foo_table',
                },
            }
        }

        # Input as table; no transformation; output as path.
        verify(
            {
                'input': {'table': 'foo_table'},
                'output': {'path': 'foo_path'},
            },
            extract_only)

        # Output only; same expectation as above.
        verify(
            {'output': {'table': 'foo_table', 'path': 'foo_path'}},
            extract_only)

        # 'extract' is accepted as another name for 'output'.
        verify(
            {'extract': {'table': 'foo_table', 'path': 'foo_path'}},
            extract_only)

        # Load-only expectation, shared by the next three scenarios.
        load_only = {'tasks': {load_name: load_spec('foo_table')}}

        # Input as path; no transformation; output as table.
        verify(
            {
                'input': {'path': 'foo_path'},
                'output': {'table': 'foo_table'},
            },
            load_only)

        # Input only; same expectation as above.
        verify(
            {'input': {'path': 'foo_path', 'table': 'foo_table'}},
            load_only)

        # 'load' is accepted as another name for 'input'.
        verify(
            {'load': {'path': 'foo_path', 'table': 'foo_table'}},
            load_only)

        # Transformation only.
        verify(
            {'transformation': query_section()},
            {'tasks': {execute_name: execute_spec(plain_sql)}})

        # Transformation only, with user parameters supplied in the config
        # and handed to the comparison helper.
        user_parameters = [
            {'name': 'foo1', 'value': 'foo1', 'type': 'STRING'},
            {'name': 'foo2', 'value': 'foo2', 'type': 'INTEGER'},
        ]
        verify(
            {
                'transformation': query_section(),
                'parameters': user_parameters,
            },
            {'tasks': {execute_name: execute_spec(plain_sql)}},
            user_parameters)
예제 #3
0
  def test_get_pipeline_spec_from_config(self, mock_notebook_item):
    """Check the pipeline spec produced for each supported config shape.

    Every scenario hands a config dict to ``bq._get_pipeline_spec_from_config``
    and compares the returned spec with a hand-built expectation through
    ``assertPipelineConfigEquals``. An empty config is expected to raise.

    Args:
      mock_notebook_item: mock whose ``return_value`` is set below to a
        ``Query`` wrapping 'foo_query_sql_string'; presumably injected by a
        patch decorator that is outside this view.
    """
    mock_notebook_item.return_value = google.datalab.bigquery.Query('foo_query_sql_string')

    # Task names used as keys in the expected specs.
    load_name = 'bq_pipeline_load_task'
    execute_name = 'bq_pipeline_execute_task'
    extract_name = 'bq_pipeline_extract_task'

    # SQL expected when no input table is configured ...
    plain_sql = u'foo_query_sql_string'
    # ... and when the input section names the table `foo_table_1`.
    wrapped_sql = u'WITH input AS (\n  SELECT * FROM `foo_table_1`\n)\n\nfoo_query_sql_string'
    # Templated table reference expected on an extract task that runs
    # downstream of an execute task.
    xcom_table = """{{ ti.xcom_pull(task_ids='bq_pipeline_execute_task_id').get('table') }}"""

    def query_section():
      # Fresh 'transformation' section for each scenario.
      return {'query': 'foo_query'}

    def execute_spec(sql, **extras):
      # Expected execute task; `extras` carries scenario-specific keys.
      spec = {'sql': sql, 'type': 'pydatalab.bq.execute'}
      spec.update(extras)
      return spec

    def chained_extract_spec(path):
      # Expected extract task that depends on the execute task.
      return {
        'table': xcom_table,
        'path': path,
        'type': 'pydatalab.bq.extract',
        'up_stream': [execute_name],
      }

    def load_spec(table):
      # Expected load task reading 'foo_path' into `table`.
      return {'type': 'pydatalab.bq.load', 'path': 'foo_path', 'table': table}

    def verify(config, wanted, parameters=None):
      # Build the spec from `config` and compare it with `wanted`.
      produced = bq._get_pipeline_spec_from_config(config)
      self.assertPipelineConfigEquals(produced, wanted, parameters)

    # An empty config has nothing to run.
    with self.assertRaisesRegexp(Exception, 'Pipeline has no tasks to execute.'):
      bq._get_pipeline_spec_from_config({})

    # No input; transformation; output as path.
    verify(
      {
        'transformation': query_section(),
        'output': {'path': 'foo_table'},
      },
      {
        'tasks': {
          execute_name: execute_spec(plain_sql),
          extract_name: chained_extract_spec('foo_table'),
        }
      })

    # Input as path (with data source); transformation; output as path.
    verify(
      {
        'input': {'path': 'foo_path', 'data_source': 'foo_data_source'},
        'transformation': query_section(),
        'output': {'path': 'foo_table'},
      },
      {
        'tasks': {
          execute_name: execute_spec(
            plain_sql, data_source='foo_data_source', path='foo_path'),
          extract_name: chained_extract_spec('foo_table'),
        }
      })

    # Input as path->table; transformation; output as path.
    verify(
      {
        'input': {'path': 'foo_path', 'table': 'foo_table_1'},
        'transformation': query_section(),
        'output': {'path': 'foo_path_2'},
      },
      {
        'tasks': {
          load_name: load_spec('foo_table_1'),
          execute_name: execute_spec(wrapped_sql, up_stream=[load_name]),
          extract_name: chained_extract_spec('foo_path_2'),
        }
      })

    # Input as table; transformation; output as path.
    verify(
      {
        'input': {'table': 'foo_table_1'},
        'transformation': query_section(),
        'output': {'path': 'foo_path_2'},
      },
      {
        'tasks': {
          execute_name: execute_spec(wrapped_sql),
          extract_name: chained_extract_spec('foo_path_2'),
        }
      })

    # Input as table; transformation; output as table.
    verify(
      {
        'input': {'table': 'foo_table_1'},
        'transformation': query_section(),
        'output': {'table': 'foo_table_1'},
      },
      {
        'tasks': {
          execute_name: execute_spec(wrapped_sql, table='foo_table_1'),
        }
      })

    # Extract-only expectation, shared by the next three scenarios.
    extract_only = {
      'tasks': {
        extract_name: {
          'type': 'pydatalab.bq.extract',
          'path': 'foo_path',
          'table': 'foo_table',
        },
      }
    }

    # Input as table; no transformation; output as path.
    verify(
      {
        'input': {'table': 'foo_table'},
        'output': {'path': 'foo_path'},
      },
      extract_only)

    # Output only; same expectation as above.
    verify({'output': {'table': 'foo_table', 'path': 'foo_path'}}, extract_only)

    # 'extract' is accepted as another name for 'output'.
    verify({'extract': {'table': 'foo_table', 'path': 'foo_path'}}, extract_only)

    # Load-only expectation, shared by the next three scenarios.
    load_only = {'tasks': {load_name: load_spec('foo_table')}}

    # Input as path; no transformation; output as table.
    verify(
      {
        'input': {'path': 'foo_path'},
        'output': {'table': 'foo_table'},
      },
      load_only)

    # Input only; same expectation as above.
    verify({'input': {'path': 'foo_path', 'table': 'foo_table'}}, load_only)

    # 'load' is accepted as another name for 'input'.
    verify({'load': {'path': 'foo_path', 'table': 'foo_table'}}, load_only)

    # Transformation only.
    verify(
      {'transformation': query_section()},
      {'tasks': {execute_name: execute_spec(plain_sql)}})

    # Transformation only, with user parameters supplied in the config and
    # handed to the comparison helper.
    user_parameters = [
      {
        'name': 'foo1',
        'value': 'foo1',
        'type': 'STRING',
      },
      {
        'name': 'foo2',
        'value': 'foo2',
        'type': 'INTEGER',
      },
    ]
    verify(
      {
        'transformation': query_section(),
        'parameters': user_parameters,
      },
      {'tasks': {execute_name: execute_spec(plain_sql)}},
      user_parameters)