def __init__(self,
             min_bundle_size=0,
             desired_bundle_size=DEFAULT_DESIRED_BUNDLE_SIZE,
             use_fastavro=_use_fastavro(),
             label='ReadAllFiles'):
  """Initializes ``ReadAllFromAvro``.

  Args:
    min_bundle_size: the minimum size in bytes, to be considered when
      splitting the input into bundles.
    desired_bundle_size: the desired size in bytes, to be considered when
      splitting the input into bundles.
    use_fastavro: when True, use the ``fastavro`` library to read; the
      default is whatever ``_use_fastavro()`` resolves to at import time.
    label: label for the underlying ``ReadAllFiles`` transform.
  """
  # avro-python3 has known issues on Python 3 (BEAM-6522); steer users
  # toward fastavro there.
  on_python_3 = sys.version_info[0] >= 3
  if on_python_3 and not use_fastavro:
    warnings.warn(
        "Due to a known issue in avro-python3 package, it is "
        "recommended to use fastavro with Beam Avro IO on "
        "Python 3 until BEAM-6522 is addressed.")
  make_source = partial(
      _create_avro_source,
      min_bundle_size=min_bundle_size,
      use_fastavro=use_fastavro)
  self._read_all_files = filebasedsource.ReadAllFiles(
      True,  # splittable
      CompressionTypes.AUTO,
      desired_bundle_size,
      min_bundle_size,
      make_source)
  self.label = label
def __init__(self,
             min_bundle_size=0,
             desired_bundle_size=DEFAULT_DESIRED_BUNDLE_SIZE,
             use_fastavro=True,
             with_filename=False,
             label='ReadAllFiles'):
  """Initializes ``ReadAllFromAvro``.

  Args:
    min_bundle_size: the minimum size in bytes, to be considered when
      splitting the input into bundles.
    desired_bundle_size: the desired size in bytes, to be considered when
      splitting the input into bundles.
    use_fastavro (bool): when set, use the `fastavro` library for IO, which
      is significantly faster, and is now the default.
    with_filename: If True, returns a Key Value with the key being the file
      name and the value being the actual data. If False, it only returns
      the data.
    label: label for the underlying ``ReadAllFiles`` transform.
  """
  make_source = partial(
      _create_avro_source,
      min_bundle_size=min_bundle_size,
      use_fastavro=use_fastavro)
  self._read_all_files = filebasedsource.ReadAllFiles(
      True,  # splittable
      CompressionTypes.AUTO,
      desired_bundle_size,
      min_bundle_size,
      make_source,
      with_filename)
  self.label = label
def __init__(
    self,
    representative_header_lines=None,  # type: List[str]
    desired_bundle_size=DEFAULT_DESIRED_BUNDLE_SIZE,  # type: int
    compression_type=CompressionTypes.AUTO,  # type: str
    allow_malformed_records=False,  # type: bool
    pre_infer_headers=False,  # type: bool
    sample_name_encoding=SampleNameEncoding.WITHOUT_FILE_PATH,  # type: int
    use_1_based_coordinate=False,  # type: bool
    move_hom_ref_calls=False,  # type: bool
    **kwargs  # type: **str
    ):
  # type: (...) -> None
  """Initialize the :class:`ReadAllFromVcf` transform.

  Args:
    representative_header_lines: Header definitions to be used for parsing
      VCF files. If supplied, header definitions in VCF files are ignored.
    desired_bundle_size: Desired size of bundles that should be generated
      when splitting this source into bundles. See
      :class:`~apache_beam.io.filebasedsource.FileBasedSource` for more
      details.
    compression_type: Used to handle compressed input files. Typical value
      is :attr:`CompressionTypes.AUTO
      <apache_beam.io.filesystem.CompressionTypes.AUTO>`, in which case the
      underlying file_path's extension will be used to detect the
      compression.
    allow_malformed_records: If true, malformed records from VCF files will
      be returned as :class:`MalformedVcfRecord` instead of failing the
      pipeline.
    pre_infer_headers: If true, drop headers and make sure PySam return the
      exact data for variants and calls, without type matching.
    sample_name_encoding: specify how we want to encode sample_name mainly
      to deal with same sample_name used across multiple VCF files.
    use_1_based_coordinate: specify whether the coordinates should be
      stored in BQ using 0-based exclusive (default) or 1-based inclusive
      coordinate.
    move_hom_ref_calls: If true, filter out 0 GT data out of call list and
      add the call name to a hom_ref_calls column.
  """
  super().__init__(**kwargs)
  # Bind all per-file parsing options now; ReadAllFiles calls this factory
  # once per matched file.
  make_vcf_source = partial(
      _create_vcf_source,
      representative_header_lines=representative_header_lines,
      compression_type=compression_type,
      allow_malformed_records=allow_malformed_records,
      pre_infer_headers=pre_infer_headers,
      sample_name_encoding=sample_name_encoding,
      use_1_based_coordinate=use_1_based_coordinate,
      move_hom_ref_calls=move_hom_ref_calls)
  self._read_all_files = filebasedsource.ReadAllFiles(
      True,  # splittable
      CompressionTypes.AUTO,
      desired_bundle_size,
      0,  # min_bundle_size
      make_vcf_source)
def __init__(self,
             min_bundle_size=0,
             desired_bundle_size=DEFAULT_DESIRED_BUNDLE_SIZE):
  """Initializes ``ReadAllFromAvro``.

  Args:
    min_bundle_size: the minimum size in bytes, to be considered when
      splitting the input into bundles.
    desired_bundle_size: the desired size in bytes, to be considered when
      splitting the input into bundles.
  """
  make_source = partial(_create_avro_source, min_bundle_size=min_bundle_size)
  self._read_all_files = filebasedsource.ReadAllFiles(
      True,  # splittable
      CompressionTypes.AUTO,
      desired_bundle_size,
      min_bundle_size,
      make_source)