def read(infile, fmt="tab", into=None, sample_id=None, meta=None, **kwargs):
    """Read tabular data from a file or stream into a genome object.

    Supported formats: see `READERS`

    If a format supports multiple samples, return the sample specified by
    `sample_id`, or if unspecified, return the first sample and warn if there
    were other samples present in the file.

    Parameters
    ----------
    infile : handle or string
        Filename or opened file-like object to read.
    fmt : string
        File format. The value 'auto' hands `infile` to `read_auto` and
        returns its result directly; no other argument is forwarded.
    into : class
        GenomicArray class or subclass to instantiate, overriding the
        default for the target file format.
    sample_id : string
        Sample identifier.
    meta : dict
        Metadata, as arbitrary key-value pairs. The given dict is copied,
        not modified; "sample_id" and "filename" are filled in on the copy
        when absent and derivable from `sample_id` or `infile`.
    **kwargs :
        Additional keyword arguments to the format-specific reader function.

    Returns
    -------
    GenomicArray or subclass
        The data from the given file instantiated as `into`, if specified,
        or the default base class for the given file format (usually
        GenomicArray).

    Raises
    ------
    ValueError
        If `fmt` is neither 'auto' nor a key of `READERS`.
    """
    from cnvlib.core import fbase

    if fmt == 'auto':
        return read_auto(infile)
    elif fmt in READERS:
        reader, suggest_into = READERS[fmt]
    else:
        raise ValueError("Unknown format: %s" % fmt)

    # Work on a private copy: the defaults filled in below must not leak into
    # a dict the caller may reuse for another file (a stale "sample_id" or
    # "filename" would then suppress the defaults for that next file).
    meta = {} if meta is None else dict(meta)

    if "sample_id" not in meta:
        if sample_id:
            meta["sample_id"] = sample_id
        elif isinstance(infile, basestring):
            meta["sample_id"] = fbase(infile)
        elif hasattr(infile, "name"):
            meta["sample_id"] = fbase(infile.name)
        # Otherwise there is nothing to derive an ID from; leave it unset.

    if "filename" not in meta:
        if isinstance(infile, basestring):
            meta["filename"] = infile
        elif hasattr(infile, "name"):
            meta["filename"] = infile.name

    if fmt in ("seg", "vcf") and sample_id is not None:
        # Multi-sample formats: choose one sample
        kwargs["sample_id"] = sample_id

    try:
        dframe = reader(infile, **kwargs)
    # NOTE(review): current pandas documents this exception as
    # pandas.errors.EmptyDataError -- confirm the pd.io.common alias exists
    # in every pandas version this module supports.
    except pd.io.common.EmptyDataError:
        # File is blank/empty, most likely
        logging.info("Blank %s file?: %s", fmt, infile)
        dframe = []

    if fmt == "vcf":
        # VCF always defaults to VariantArray, whatever READERS suggested;
        # imported here rather than at module level.
        from cnvlib.vary import VariantArray as VA
        suggest_into = VA

    result = (into or suggest_into)(dframe, meta)
    result.sort_columns()
    result.sort()
    return result
def read(infile, fmt="tab", into=None, sample_id=None, meta=None, **kwargs):
    """Read tabular data from a file or stream into a genome object.

    Supported formats: see `READERS`

    If a format supports multiple samples, return the sample specified by
    `sample_id`, or if unspecified, return the first sample and warn if there
    were other samples present in the file.

    Parameters
    ----------
    infile : handle or string
        Filename or opened file-like object to read.
    fmt : string
        File format. The value 'auto' hands `infile` to `read_auto` and
        returns its result directly; no other argument is forwarded.
    into : class
        GenomicArray class or subclass to instantiate, overriding the
        default for the target file format.
    sample_id : string
        Sample identifier.
    meta : dict
        Metadata, as arbitrary key-value pairs. The given dict is copied,
        not modified; "sample_id" and "filename" are filled in on the copy
        when absent and derivable from `sample_id` or `infile`.
    **kwargs :
        Additional keyword arguments to the format-specific reader function.

    Returns
    -------
    GenomicArray or subclass
        The data from the given file instantiated as `into`, if specified,
        or the default base class for the given file format (usually
        GenomicArray).

    Raises
    ------
    ValueError
        If `fmt` is neither 'auto' nor a key of `READERS`.
    """
    from cnvlib.core import fbase

    if fmt == 'auto':
        return read_auto(infile)
    elif fmt in READERS:
        reader, suggest_into = READERS[fmt]
    else:
        raise ValueError("Unknown format: %s" % fmt)

    # Work on a private copy: the defaults filled in below must not leak into
    # a dict the caller may reuse for another file (a stale "sample_id" or
    # "filename" would then suppress the defaults for that next file).
    meta = {} if meta is None else dict(meta)

    if "sample_id" not in meta:
        if sample_id:
            meta["sample_id"] = sample_id
        else:
            # presumably get_filename returns a path string, or something
            # falsy when `infile` has no usable name -- verify against helper
            fname = get_filename(infile)
            if fname:
                meta["sample_id"] = fbase(fname)

    if "filename" not in meta:
        fname = get_filename(infile)
        if fname:
            # Store the resolved name, not `infile` itself: `infile` may be
            # an open handle, which is not a filename.
            meta["filename"] = fname

    if fmt in ("seg", "vcf") and sample_id is not None:
        # Multi-sample formats: choose one sample
        kwargs["sample_id"] = sample_id

    try:
        dframe = reader(infile, **kwargs)
    # NOTE(review): current pandas documents this exception as
    # pandas.errors.EmptyDataError -- confirm the pd.io.common alias exists
    # in every pandas version this module supports.
    except pd.io.common.EmptyDataError:
        # File is blank/empty, most likely
        logging.info("Blank %s file?: %s", fmt, infile)
        dframe = []

    if fmt == "vcf":
        # VCF always defaults to VariantArray, whatever READERS suggested;
        # imported here rather than at module level.
        from cnvlib.vary import VariantArray as VA
        suggest_into = VA

    result = (into or suggest_into)(dframe, meta)
    result.sort_columns()
    result.sort()
    return result