def test_read_variants_large_mode(self):
  """Reads a small VCF with the LARGE pipeline mode and checks the count.

  The fixture 'valid-4.0.vcf' contains exactly 5 records.
  """
  test_p = test_pipeline.TestPipeline()
  patterns = [testdata_util.get_full_file_path('valid-4.0.vcf')]
  read_result = pipeline_common.read_variants(
      test_p, patterns, PipelineModes.LARGE, False)
  assert_that(read_result, asserts.count_equals_to(5))
  test_p.run()
def _read_variants(
    all_patterns,  # type: List[str]
    pipeline,  # type: beam.Pipeline
    known_args,  # type: argparse.Namespace
    pipeline_mode,  # type: int
    pre_infer_headers=False,  # type: bool
    keep_raw_sample_names=False,  # type: bool
    use_1_based_coordinate=True  # type: bool
    ):
  # type: (...) -> pvalue.PCollection
  """Returns a PCollection of Variants parsed from the input VCF patterns.

  Representative header lines are loaded only when a representative header
  file was supplied on the command line; otherwise None is passed through.
  When keep_raw_sample_names is set, sample names are left unencoded
  regardless of the configured encoding.
  """
  header_lines = None
  if known_args.representative_header_file:
    header_lines = vcf_header_parser.get_metadata_header_lines(
        known_args.representative_header_file)
  # Resolve the sample-name encoding up front instead of inline.
  if keep_raw_sample_names:
    encoding = SampleNameEncoding.NONE
  else:
    encoding = SampleNameEncoding[known_args.sample_name_encoding]
  return pipeline_common.read_variants(
      pipeline,
      all_patterns,
      pipeline_mode,
      known_args.allow_malformed_records,
      header_lines,
      pre_infer_headers=pre_infer_headers,
      sample_name_encoding=encoding,
      use_1_based_coordinate=use_1_based_coordinate)
def test_read_variants_use_1_based_coordinate(self):
  """Checks the variant count when reading with 1-based coordinates.

  The fixture 'valid-4.0.vcf' contains exactly 5 records.
  """
  test_p = test_pipeline.TestPipeline()
  patterns = [testdata_util.get_full_file_path('valid-4.0.vcf')]
  read_result = pipeline_common.read_variants(
      test_p,
      patterns,
      PipelineModes.SMALL,
      False,
      use_1_based_coordinate=True)
  assert_that(read_result, asserts.count_equals_to(5))
  test_p.run()
def run(argv=None):
  # type: (List[str]) -> (str, str)
  """Runs preprocess pipeline.

  Reads and merges VCF headers from all input patterns, merges the header
  definitions, and emits a conflicts report. When --report_all_conflicts is
  set, the variants themselves are also read so the report can include
  inferred headers and malformed records. Optionally writes the resolved
  (merged) headers to --resolved_headers_path.
  """
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = pipeline_common.parse_args(
      argv, _COMMAND_LINE_OPTIONS)
  beam_options = pipeline_options.PipelineOptions(pipeline_args)
  all_patterns = known_args.all_patterns
  pipeline_mode = pipeline_common.get_pipeline_mode(all_patterns)

  with beam.Pipeline(options=beam_options) as p:
    headers = pipeline_common.read_headers(p, pipeline_mode, all_patterns)
    merged_headers = pipeline_common.get_merged_headers(headers)
    merged_definitions = (
        headers
        | 'MergeDefinitions' >> merge_header_definitions.MergeDefinitions())

    if not known_args.report_all_conflicts:
      # Header-only report: no variant reading is needed.
      _ = (merged_definitions
           | 'GenerateConflictsReport' >> beam.ParDo(
               preprocess_reporter.generate_report,
               known_args.report_path,
               beam.pvalue.AsSingleton(merged_headers)))
    else:
      # Full report: read the variants so inferred headers and malformed
      # records can be included. Note merged_headers is replaced by the
      # headers returned from _get_inferred_headers.
      variants = pipeline_common.read_variants(
          p,
          all_patterns,
          pipeline_mode,
          allow_malformed_records=True,
          pre_infer_headers=True)
      malformed_records = (
          variants | filter_variants.ExtractMalformedVariants())
      inferred_headers, merged_headers = _get_inferred_headers(
          variants, merged_headers)
      _ = (merged_definitions
           | 'GenerateConflictsReport' >> beam.ParDo(
               preprocess_reporter.generate_report,
               known_args.report_path,
               beam.pvalue.AsSingleton(merged_headers),
               beam.pvalue.AsSingleton(inferred_headers),
               beam.pvalue.AsIter(malformed_records)))

    if known_args.resolved_headers_path:
      pipeline_common.write_headers(
          merged_headers, known_args.resolved_headers_path)
def _read_variants(all_patterns,  # type: List[str]
                   pipeline,  # type: beam.Pipeline
                   known_args,  # type: argparse.Namespace
                   pipeline_mode  # type: int
                   ):
  # type: (...) -> pvalue.PCollection
  """Returns a PCollection of Variants parsed from the input VCF patterns.

  Representative header lines are loaded only when a representative header
  file was supplied on the command line; otherwise None is passed through.
  The VCF parser implementation is selected via --vcf_parser.
  """
  header_lines = None
  if known_args.representative_header_file:
    header_lines = vcf_header_parser.get_metadata_header_lines(
        known_args.representative_header_file)
  parser_type = vcfio.VcfParserType[known_args.vcf_parser]
  return pipeline_common.read_variants(
      pipeline,
      all_patterns,
      pipeline_mode,
      known_args.allow_malformed_records,
      header_lines,
      parser_type)