def _read_variants(
    all_patterns,  # type: List[str]
    pipeline,  # type: beam.Pipeline
    known_args,  # type: argparse.Namespace
    pipeline_mode  # type: int
    ):
  # type: (...) -> pvalue.PCollection
  """Builds a PCollection of Variants read from the input VCF patterns.

  Large inputs are fanned out via `beam.Create` + `ReadAllFromVcf` so each
  pattern is expanded and read in parallel; otherwise a single `ReadFromVcf`
  source reads the first (and only expected) pattern.
  """
  header_lines = None
  if known_args.representative_header_file:
    header_lines = vcf_header_parser.get_metadata_header_lines(
        known_args.representative_header_file)

  if pipeline_mode != pipeline_common.PipelineModes.LARGE:
    # Small/medium inputs: one splittable source is sufficient.
    return pipeline | 'ReadFromVcf' >> vcfio.ReadFromVcf(
        all_patterns[0],
        representative_header_lines=header_lines,
        allow_malformed_records=known_args.allow_malformed_records,
        vcf_parser_type=vcfio.VcfParserType[known_args.vcf_parser])

  # Large inputs: distribute the patterns and read them all in parallel.
  return (pipeline
          | 'InputFilePattern' >> beam.Create(all_patterns)
          | 'ReadAllFromVcf' >> vcfio.ReadAllFromVcf(
              representative_header_lines=header_lines,
              allow_malformed_records=known_args.allow_malformed_records))
def read_variants(
    pipeline,  # type: beam.Pipeline
    all_patterns,  # type: List[str]
    pipeline_mode,  # type: PipelineModes
    allow_malformed_records,  # type: bool
    representative_header_lines=None,  # type: List[str]
    pre_infer_headers=False,  # type: bool
    sample_name_encoding=SampleNameEncoding.WITHOUT_FILE_PATH,  # type: int
    use_1_based_coordinate=False,  # type: bool
    move_hom_ref_calls=False  # type: bool
    ):
  # type: (...) -> pvalue.PCollection
  """Returns a PCollection of Variants by reading VCFs.

  Splittable BGZF inputs are read directly via `ReadFromBGZF`. Otherwise the
  read fans out through `ReadAllFromVcf` (LARGE mode) or a single
  `ReadFromVcf` source, and a fusion break is appended after gzip reads.
  """
  comp_type = get_compression_type(all_patterns)
  if comp_type == filesystem.CompressionTypes.GZIP:
    splittable_bgzf = _get_splittable_bgzf(all_patterns)
    if splittable_bgzf:
      # BGZF blocks can be read in parallel without further fan-out.
      return (pipeline
              | 'ReadVariants' >> vcfio.ReadFromBGZF(
                  splittable_bgzf,
                  representative_header_lines,
                  allow_malformed_records,
                  pre_infer_headers,
                  sample_name_encoding,
                  use_1_based_coordinate,
                  move_hom_ref_calls))

  # Both read transforms share the same keyword arguments.
  read_kwargs = dict(
      representative_header_lines=representative_header_lines,
      compression_type=comp_type,
      allow_malformed_records=allow_malformed_records,
      pre_infer_headers=pre_infer_headers,
      sample_name_encoding=sample_name_encoding,
      use_1_based_coordinate=use_1_based_coordinate,
      move_hom_ref_calls=move_hom_ref_calls)
  if pipeline_mode == PipelineModes.LARGE:
    variants = (pipeline
                | 'InputFilePattern' >> beam.Create(all_patterns)
                | 'ReadAllFromVcf' >> vcfio.ReadAllFromVcf(**read_kwargs))
  else:
    variants = pipeline | 'ReadFromVcf' >> vcfio.ReadFromVcf(
        all_patterns[0], **read_kwargs)

  if comp_type == filesystem.CompressionTypes.GZIP:
    # Gzip sources are not splittable; presumably the fusion break stops
    # downstream stages from being fused onto the single reader.
    variants |= 'FusionBreak' >> fusion_break.FusionBreak()
  return variants
def _read_variants(pipeline, known_args):
  """Builds a ``PCollection`` of Variants read from the input VCF pattern.

  With --optimize_for_large_inputs the pattern is expanded through
  ``ReadAllFromVcf``; otherwise a single ``ReadFromVcf`` source is used.
  """
  allow_malformed = known_args.allow_malformed_records
  if not known_args.optimize_for_large_inputs:
    return pipeline | 'ReadFromVcf' >> vcfio.ReadFromVcf(
        known_args.input_pattern,
        allow_malformed_records=allow_malformed)
  return (pipeline
          | 'InputFilePattern' >> beam.Create([known_args.input_pattern])
          | 'ReadAllFromVcf' >> vcfio.ReadAllFromVcf(
              allow_malformed_records=allow_malformed))
def run(argv=None):
  # type: (List[str]) -> (str, str)
  """Runs preprocess pipeline.

  Parses command-line args, reads and merges VCF headers, and writes a
  header-conflicts report. With --report_all_conflicts the variants
  themselves are also read so that malformed records and inferred headers
  can be included in the report. Optionally writes the resolved headers.
  """
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = pipeline_common.parse_args(
      argv, _COMMAND_LINE_OPTIONS)
  options = pipeline_options.PipelineOptions(pipeline_args)
  all_patterns = known_args.all_patterns
  pipeline_mode = pipeline_common.get_pipeline_mode(all_patterns)
  with beam.Pipeline(options=options) as p:
    # Headers from all inputs, plus a single merged view of them.
    headers = pipeline_common.read_headers(p, pipeline_mode, all_patterns)
    merged_headers = pipeline_common.get_merged_headers(headers)
    merged_definitions = (
        headers
        | 'MergeDefinitions' >> merge_header_definitions.MergeDefinitions())
    if known_args.report_all_conflicts:
      # Full report: read the variant records too (malformed ones are kept
      # so they can be listed in the report).
      if len(all_patterns) == 1:
        variants = p | 'ReadFromVcf' >> vcfio.ReadFromVcf(
            all_patterns[0], allow_malformed_records=True)
      else:
        variants = (p
                    | 'InputFilePattern' >> beam.Create(all_patterns)
                    | 'ReadAllFromVcf' >>
                    vcfio.ReadAllFromVcf(allow_malformed_records=True))
      malformed_records = variants | filter_variants.ExtractMalformedVariants(
      )
      # NOTE: rebinds merged_headers to include headers inferred from the
      # variant data; the rebound value is also what gets written below.
      inferred_headers, merged_headers = (_get_inferred_headers(
          variants, merged_headers))
      _ = (merged_definitions
           | 'GenerateConflictsReport' >> beam.ParDo(
               preprocess_reporter.generate_report,
               known_args.report_path,
               beam.pvalue.AsSingleton(merged_headers),
               beam.pvalue.AsSingleton(inferred_headers),
               beam.pvalue.AsIter(malformed_records)))
    else:
      # Header-only report: no variant reading required.
      _ = (merged_definitions
           | 'GenerateConflictsReport' >> beam.ParDo(
               preprocess_reporter.generate_report,
               known_args.report_path,
               beam.pvalue.AsSingleton(merged_headers)))
  if known_args.resolved_headers_path:
    pipeline_common.write_headers(merged_headers,
                                  known_args.resolved_headers_path)
def _read_variants(pipeline, known_args):
  # type: (beam.Pipeline, argparse.Namespace) -> pvalue.PCollection
  """Builds a PCollection of Variants read from the input VCF pattern.

  Honors --representative_header_file if given, and fans the read out via
  ``ReadAllFromVcf`` when --optimize_for_large_inputs is set.
  """
  header_lines = None
  if known_args.representative_header_file:
    header_lines = vcf_header_parser.get_metadata_header_lines(
        known_args.representative_header_file)

  if not known_args.optimize_for_large_inputs:
    # Default path: one splittable source for the single pattern.
    return pipeline | 'ReadFromVcf' >> vcfio.ReadFromVcf(
        known_args.input_pattern,
        representative_header_lines=header_lines,
        allow_malformed_records=known_args.allow_malformed_records)

  return (pipeline
          | 'InputFilePattern' >> beam.Create([known_args.input_pattern])
          | 'ReadAllFromVcf' >> vcfio.ReadAllFromVcf(
              representative_header_lines=header_lines,
              allow_malformed_records=known_args.allow_malformed_records))
def read_variants(
    pipeline,  # type: beam.Pipeline
    all_patterns,  # type: List[str]
    pipeline_mode,  # type: PipelineModes
    allow_malformed_records,  # type: bool
    representative_header_lines=None,  # type: List[str]
    vcf_parser=vcfio.VcfParserType.PYVCF  # type: vcfio.VcfParserType
    ):
  # type: (...) -> pvalue.PCollection
  """Returns a PCollection of Variants by reading VCFs.

  Splittable BGZF inputs go straight through `ReadFromBGZF`. Otherwise the
  read uses `ReadAllFromVcf` (LARGE mode) or a single `ReadFromVcf` source,
  with a fusion break appended after gzip reads.
  """
  comp_type = get_compression_type(all_patterns)
  if comp_type == filesystem.CompressionTypes.GZIP:
    splittable_bgzf = _get_splittable_bgzf(all_patterns)
    if splittable_bgzf:
      # BGZF blocks can be read in parallel without further fan-out.
      return (pipeline
              | 'ReadVariants' >> vcfio.ReadFromBGZF(
                  splittable_bgzf,
                  representative_header_lines,
                  allow_malformed_records))

  # Keyword arguments common to both read transforms.
  read_kwargs = dict(
      representative_header_lines=representative_header_lines,
      compression_type=comp_type,
      allow_malformed_records=allow_malformed_records)
  if pipeline_mode == PipelineModes.LARGE:
    variants = (pipeline
                | 'InputFilePattern' >> beam.Create(all_patterns)
                | 'ReadAllFromVcf' >> vcfio.ReadAllFromVcf(**read_kwargs))
  else:
    # The parser choice only applies to the single-source read path.
    variants = pipeline | 'ReadFromVcf' >> vcfio.ReadFromVcf(
        all_patterns[0], vcf_parser_type=vcf_parser, **read_kwargs)

  if comp_type == filesystem.CompressionTypes.GZIP:
    # Gzip sources are not splittable; presumably the fusion break stops
    # downstream stages from being fused onto the single reader.
    variants |= 'FusionBreak' >> fusion_break.FusionBreak()
  return variants