def test_shard_variants(self):
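    # Shards variants using the default homo sapiens sharding config and checks
    # that every output shard contains exactly the expected variants.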
    expected_shards = self._get_expected_variant_shards()
    variants = [variant
                for variant_list in expected_shards.values()
                for variant in variant_list]

    sharding = variant_sharding.VariantSharding(
        'gcp_variant_transforms/data/sharding_configs/'
        'homo_sapiens_default.yaml')
    pipeline = TestPipeline()
    shards = (
        pipeline
        | Create(variants, reshuffle=False)
        | 'ShardVariants' >> beam.Partition(
            shard_variants.ShardVariants(sharding),
            sharding.get_num_shards()))
    for i in range(sharding.get_num_shards()):
      assert_that(shards[i], equal_to(expected_shards.get(i, [])),
                  label=str(i))
    pipeline.run()
Example #2
    def expand(self, pcoll):
        # This is a composite transform that involves the following:
        #   1. Create a singleton of the user-provided `query` and apply a ``ParDo``
        #   that splits the query into `num_splits` queries if possible.
        #
        #   If the value of `num_splits` is 0, the number of splits will be
        #   computed dynamically based on the size of the data for the `query`.
        #
        #   2. The resulting ``PCollection`` is sharded across workers using a
        #   ``Reshuffle`` operation.
        #
        #   3. In the third step, a ``ParDo`` reads entities for each query and
        #   outputs a ``PCollection[Entity]``.

        return (pcoll.pipeline
                | 'UserQuery' >> Create([self._query])
                | 'SplitQuery' >> ParDo(
                    ReadFromDatastore._SplitQueryFn(self._num_splits))
                | Reshuffle()
                | 'Read' >> ParDo(ReadFromDatastore._QueryFn()))
    def test_partition_variants(self):
        expected_partitions = self._get_standard_variant_partitions()
        expected_partitions.update(self._get_nonstandard_variant_partitions())
        variants = [
            variant for variant_list in expected_partitions.values()
            for variant in variant_list
        ]

        partitioner = variant_partition.VariantPartition()
        pipeline = TestPipeline()
        partitions = (pipeline
                      | Create(variants)
                      | 'PartitionVariants' >> Partition(
                          partition_variants.PartitionVariants(partitioner),
                          partitioner.get_num_partitions()))
        for i in range(partitioner.get_num_partitions()):
            assert_that(partitions[i],
                        equal_to(expected_partitions.get(i, [])),
                        label=str(i))
        pipeline.run()
    def test_combine_pipeline(self):
        headers_1 = self._get_header_from_lines(FILE_1_LINES)
        headers_2 = self._get_header_from_lines(FILE_2_LINES)

        # TODO(nmousavi): Either use TestPipeline or combiner_fn.* everywhere.
        # After moving _HeaderMerger out to its own file, it makes sense to use
        # TestPipeline everywhere.
        header_merger = HeaderMerger(
            vcf_field_conflict_resolver.FieldConflictResolver(
                split_alternate_allele_info_fields=True))
        expected = vcf_header_io.VcfHeader()
        header_merger.merge(expected, headers_1)
        header_merger.merge(expected, headers_2)

        pipeline = TestPipeline()
        merged_headers = (pipeline
                          | Create([headers_1, headers_2])
                          | 'MergeHeaders' >> merge_headers.MergeHeaders())

        assert_that(merged_headers, equal_to([expected]))
        pipeline.run()
  def test_header_fields_inferred_one_variant(self):
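    # With defined_headers=None, every INFO and FORMAT field is inferred
    # directly from the variant's data.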
    with TestPipeline() as p:
      variant = self._get_sample_variant_1()
      inferred_headers = (
          p
          | Create([variant])
          | 'InferUndefinedHeaderFields' >>
          infer_undefined_headers.InferUndefinedHeaderFields(
              defined_headers=None))

      expected_infos = {'IS': Info('IS', 1, 'String', '', '', ''),
                        'IF': Info('IF', 0, 'Flag', '', '', ''),
                        'IA': Info('IA', None, 'String', '', '', '')}
      expected_formats = {'FI': Format('FI', 1, 'String', ''),
                          'FU': Format('FU', None, 'String', '')}

      expected = vcf_header_io.VcfHeader(
          infos=expected_infos, formats=expected_formats)
      assert_that(inferred_headers, equal_to([expected]))
      p.run()
Example #6
    def test_pipeline_sdk_not_overridden(self):
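        # A user-supplied worker harness container image should be left alone,
        # i.e. not replaced with the default Dataflow container image.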
        pipeline_options = PipelineOptions([
            '--experiments=beam_fn_api', '--experiments=use_unified_worker',
            '--temp_location', 'gs://any-location/temp',
            '--worker_harness_container_image=dummy_prefix/dummy_name:dummy_tag'
        ])

        pipeline = Pipeline(options=pipeline_options)
        pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

        proto_pipeline, _ = pipeline.to_runner_api(return_context=True)

        dummy_env = beam_runner_api_pb2.Environment(
            urn=common_urns.environments.DOCKER.urn,
            payload=(beam_runner_api_pb2.DockerPayload(
                container_image='dummy_prefix/dummy_name:dummy_tag')
                     ).SerializeToString())
        proto_pipeline.components.environments['dummy_env_id'].CopyFrom(
            dummy_env)

        dummy_transform = beam_runner_api_pb2.PTransform(
            environment_id='dummy_env_id')
        proto_pipeline.components.transforms['dummy_transform_id'].CopyFrom(
            dummy_transform)

        # Accessing non-public method for testing.
        apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
            proto_pipeline, dict(), pipeline_options)

        self.assertEqual(2, len(proto_pipeline.components.environments))

        from apache_beam.utils import proto_utils
        found_override = False
        for env in proto_pipeline.components.environments.values():
            docker_payload = proto_utils.parse_Bytes(
                env.payload, beam_runner_api_pb2.DockerPayload)
            if docker_payload.container_image.startswith(
                    names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY):
                found_override = True

        self.assertFalse(found_override)
Example #7
    def test_memory_usage(self):
        try:
            import resource
        except ImportError:
            # Skip the test if the resource module is not available
            # (e.g. on a non-Unix OS).
            self.skipTest('resource module not available.')
        if platform.mac_ver()[0]:
            # Skip the test on macOS; depending on the version, ru_maxrss is
            # reported in different units.
            self.skipTest('ru_maxrss is not in standard units.')

        def get_memory_usage_in_bytes():
            return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss * (2**10)

        def check_memory(value, memory_threshold):
            memory_usage = get_memory_usage_in_bytes()
            if memory_usage > memory_threshold:
                raise RuntimeError('High memory usage: %d > %d' %
                                   (memory_usage, memory_threshold))
            return value

        len_elements = 1000000
        num_elements = 10
        num_maps = 100

        pipeline = TestPipeline(runner='DirectRunner')

        # Consumed memory should not be proportional to the number of maps.
        memory_threshold = (get_memory_usage_in_bytes() +
                            (3 * len_elements * num_elements))

        biglist = pipeline | 'oom:create' >> Create(
            ['x' * len_elements] * num_elements)
        for i in range(num_maps):
            biglist = biglist | ('oom:addone-%d' % i) >> Map(lambda x: x + 'y')
        result = biglist | 'oom:check' >> Map(check_memory, memory_threshold)
        assert_that(
            result,
            equal_to(['x' * len_elements + 'y' * num_maps] * num_elements))

        pipeline.run()
Example #8
    def test_densify_variants_pipeline(self):
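        # After densification every variant should carry a call for each of the
        # three call names, even though each input variant has only two calls.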
        call_names = ['sample1', 'sample2', 'sample3']
        variant_calls = [
            vcfio.VariantCall(name=call_names[0]),
            vcfio.VariantCall(name=call_names[1]),
            vcfio.VariantCall(name=call_names[2]),
        ]
        variants = [
            vcfio.Variant(calls=[variant_calls[0], variant_calls[1]]),
            vcfio.Variant(calls=[variant_calls[1], variant_calls[2]]),
        ]

        pipeline = TestPipeline()
        densified_variants = (
            pipeline
            | Create(variants)
            | 'DensifyVariants' >> densify_variants.DensifyVariants(call_names))
        assert_that(densified_variants, asserts.has_calls(call_names))

        pipeline.run()
Example #9
 def test_convert_variant_to_bigquery_row(self):
     variant_1, row_1, header_num_dict_1 = self._get_sample_variant_1()
     variant_2, row_2, header_num_dict_2 = self._get_sample_variant_2()
     variant_3, row_3, header_num_dict_3 = self._get_sample_variant_3()
     header_num_dict = header_num_dict_1.copy()
     header_num_dict.update(header_num_dict_2)
     header_num_dict.update(header_num_dict_3)
     header_fields = vcf_header_util.make_header(header_num_dict)
     proc_var_1 = processed_variant.ProcessedVariantFactory(
         header_fields).create_processed_variant(variant_1)
     proc_var_2 = processed_variant.ProcessedVariantFactory(
         header_fields).create_processed_variant(variant_2)
     proc_var_3 = processed_variant.ProcessedVariantFactory(
         header_fields).create_processed_variant(variant_3)
     pipeline = TestPipeline(blocking=True)
     bigquery_rows = (pipeline
                      | Create([proc_var_1, proc_var_2, proc_var_3])
                      | 'ConvertToRow' >> beam.ParDo(
                          ConvertVariantToRow(self._row_generator)))
     assert_that(bigquery_rows, equal_to([row_1, row_2, row_3]))
     pipeline.run()
Example #10
    def test_window_param(self):
        class TestDoFn(DoFn):
            def process(self, element, window=DoFn.WindowParam):
                yield (element, (float(window.start), float(window.end)))

        pipeline = TestPipeline()
        pcoll = (pipeline
                 | Create([1, 7])
                 | Map(lambda x: TimestampedValue(x, x))
                 | WindowInto(windowfn=SlidingWindows(10, 5))
                 | ParDo(TestDoFn()))
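        # With SlidingWindows(size=10, period=5), the element at timestamp 1
        # lands in windows [-5, 5) and [0, 10); the element at timestamp 7
        # lands in [0, 10) and [5, 15).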
        assert_that(
            pcoll,
            equal_to([(1, (-5, 5)), (1, (0, 10)), (7, (0, 10)), (7, (5, 15))]))
        pcoll2 = pcoll | 'Again' >> ParDo(TestDoFn())
        assert_that(pcoll2,
                    equal_to([((1, (-5, 5)), (-5, 5)), ((1, (0, 10)), (0, 10)),
                              ((7, (0, 10)), (0, 10)),
                              ((7, (5, 15)), (5, 15))]),
                    label='doubled windows')
        pipeline.run()
Example #11
    def test_pipeline(self):
        infos = {
            'IS': createInfo('IS', 1, 'String', ''),
            'ISI': createInfo('ISI', 1, 'Integer', ''),
            'ISF': createInfo('ISF', 1, 'Float', ''),
            'IB': createInfo('IB', 0, 'Flag', ''),
            'IA': createInfo('IA', 'A', 'Integer', '')
        }
        formats = OrderedDict([
            ('FS', createFormat('FS', 1, 'String', 'desc')),
            ('FI', createFormat('FI', 2, 'Integer', 'desc')),
            ('GT', createFormat('GT', 2, 'Integer', 'Special GT key')),
            ('PS', createFormat('PS', 1, 'Integer', 'Special PS key'))
        ])

        with TestPipeline() as p:
            variant_1 = self._get_sample_variant_info_ia_cardinality_mismatch()
            variant_2 = self._get_sample_variant_format_fi_float_value()
            inferred_headers = (
                p
                | Create([variant_1, variant_2])
                | 'InferHeaderFields' >> infer_headers.InferHeaderFields(
                    defined_headers=vcf_header_io.VcfHeader(infos=infos,
                                                            formats=formats),
                    allow_incompatible_records=True,
                    infer_headers=True))
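            # 'IA' and 'FI' conflict with their defined number/type, so
            # corrected definitions are inferred for them; the undefined 'IF'
            # and 'FU' fields are inferred from the data.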

            expected_infos = {
                'IA': createInfo('IA', '.', 'Float', ''),
                'IF': createInfo('IF', 1, 'Float', '')
            }
            expected_formats = {
                'FI': createFormat('FI', 2, 'Float', 'desc'),
                'FU': createFormat('FU', '.', 'Float', '')
            }
            expected = vcf_header_io.VcfHeader(infos=expected_infos,
                                               formats=expected_formats)
            assert_that(inferred_headers,
                        asserts.header_fields_equal_ignore_order([expected]))
            p.run()
    def test_merge_header_definitions_save_five_copies(self):
        lines_1 = [
            '##INFO=<ID=NS,Number=1,Type=Float,Description="Number samples">\n',
            '#CHROM  POS ID  REF ALT QUAL  FILTER  INFO  FORMAT  Sample1 Sample2\n'
        ]
        lines_2 = [
            '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number samples">\n',
            '#CHROM  POS ID  REF ALT QUAL  FILTER  INFO  FORMAT  Sample3\n'
        ]

        vcf_reader_1 = vcf.Reader(fsock=iter(lines_1))
        vcf_reader_2 = vcf.Reader(fsock=iter(lines_2))
        file_names = ['file1', 'file2', 'file3', 'file4', 'file5', 'file6']
        headers = []
        for file_name in file_names:
            headers.append(
                self._get_vcf_header_from_reader(vcf_reader_1, file_name))
        headers.append(self._get_vcf_header_from_reader(vcf_reader_2, 'file7'))

        pipeline = TestPipeline()
        merged_definitions = (
            pipeline
            | Create(headers)
            | 'MergeDefinitions' >> merge_header_definitions.MergeDefinitions())

        expected = VcfHeaderDefinitions()
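        # Only five file names are expected to be saved per definition, so
        # 'file6' does not appear in the merged result.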
        expected._infos = {
            'NS': {
                Definition(1, 'Float'):
                ['file1', 'file2', 'file3', 'file4', 'file5'],
                Definition(1, 'Integer'): ['file7']
            }
        }
        assert_that(merged_definitions, equal_to([expected]))
        pipeline.run()
Example #13
 def test_timestamp_param_map(self):
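     # Elements produced by Create carry MIN_TIMESTAMP by default, so
     # DoFn.TimestampParam resolves to MIN_TIMESTAMP for every element.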
     with TestPipeline() as p:
         assert_that(
             p | Create([1, 2])
             | beam.Map(lambda _, t=DoFn.TimestampParam: t),
             equal_to([MIN_TIMESTAMP, MIN_TIMESTAMP]))
Example #14
 def test_apply_custom_transform(self):
     with TestPipeline() as pipeline:
         pcoll = pipeline | 'pcoll' >> Create([1, 2, 3])
         result = pcoll | PipelineTest.CustomTransform()
         assert_that(result, equal_to([2, 3, 4]))
Example #15
 def test_create_singleton_pcollection(self):
     with TestPipeline() as pipeline:
         pcoll = pipeline | 'label' >> Create([[1, 2, 3]])
         assert_that(pcoll, equal_to([[1, 2, 3]]))
Example #16
 def timestamped_key_values(self, pipeline, key, *timestamps):
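   # Helper that emits (key, timestamp) pairs, each wrapped in a WindowedValue
   # stamped with its own timestamp in the global window.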
   return (pipeline | 'start' >> Create(timestamps)
           | Map(lambda x: WindowedValue((key, x), x, [GlobalWindow()])))
Example #17
 def test_eager_pipeline(self):
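     # With the eager runner the chained expression evaluates immediately to
     # the output elements, hence the direct assertEqual on the result.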
     p = Pipeline('EagerRunner')
     self.assertEqual([1, 4, 9],
                      p | Create([1, 2, 3]) | Map(lambda x: x * x))
Example #18
 def test_run(self):
     elems = [
         {
             'a': 1, 'b': 1,
             'c': {'x': 1, 'y': 1},
             'd': {'x': 1, 'y': 1},
             'e': 1, 'f': 1, 'g': 1, 'h': 1
         },
         {
             'a': 2, 'b': 2,
             'c': {'x': 2, 'y': 2},
             'd': {'x': 2, 'y': 2},
             'e': 2, 'f': 2, 'g': 2, 'h': 2
         },
     ]
     with TestPipeline() as p:
         pc = (p
               | Create(elems)
               | RestructDict(
                   mappings={
                       'a': 'moved_a',
                       'b': 'nested.moved_b',
                       'c.x': 'nested.moved_c_x',
                       'c.y': 'moved_c_y',
                       'd': True,
                       'e': False,
                       'f': None,
                   }))
         assert_that(
             pc,
             equal_to([
                 {
                     'moved_a': 1,
                     'nested': {'moved_b': 1, 'moved_c_x': 1},
                     'moved_c_y': 1,
                     'd': {'x': 1, 'y': 1}
                 },
                 {
                     'moved_a': 2,
                     'nested': {'moved_b': 2, 'moved_c_x': 2},
                     'moved_c_y': 2,
                     'd': {'x': 2, 'y': 2}
                 },
             ]))
Example #19
 def test_create_singleton_pcollection(self):
     pipeline = TestPipeline()
     pcoll = pipeline | 'label' >> Create([[1, 2, 3]])
     assert_that(pcoll, equal_to([[1, 2, 3]]))
     pipeline.run()