def save_enriched_motifs(df, fname: str) -> None:
    """
    Write enriched motifs to disk in the format implied by the file name.

    Supported file formats are CSV, TSV, GMT, DAT (pickle), JSON or YAML.

    :param df: The enriched motifs table. It is written directly with ``to_csv`` for
        the tabular formats and converted with ``df2regulons`` for all other formats.
    :param fname: The name of the output file; its suffixes select the format.
    :return: None.
    :raises ValueError: If the suffixes of ``fname`` match none of the supported formats.
    """
    suffixes = PurePath(fname).suffixes

    # Tabular output: dump the table as-is, no regulon conversion required.
    if is_valid_suffix(suffixes, 'ctx'):
        df.to_csv(fname, sep=suffixes_to_separator(suffixes))
        return

    # Every remaining format stores regulons rather than the raw table.
    regulons = df2regulons(df)

    if '.json' in suffixes:
        # JSON only keeps the target gene names of each regulon, not the weights.
        targets_by_name = {
            regulon.name: list(regulon.gene2weight.keys()) for regulon in regulons
        }
        serialized = json.dumps(targets_by_name)
        with openfile(fname, 'w') as handle:
            handle.write(serialized)
        return

    if '.dat' in suffixes:
        with openfile(fname, 'wb') as handle:
            pickle.dump(regulons, handle)
        return

    if '.gmt' in suffixes:
        GeneSignature.to_gmt(fname, regulons)
        return

    if is_valid_suffix(suffixes, 'ctx_yaml'):
        save_to_yaml(regulons, fname)
        return

    raise ValueError("Unknown file format \"{}\".".format(fname))
def load_signatures(fname: str) -> Sequence[Type[GeneSignature]]:
    """
    Load genes signatures from disk.

    Supported file formats are GMT, DAT (pickled), YAML or CSV (enriched motifs).

    :param fname: The name of the file that contains the signatures.
    :return: A list of gene signatures.
    :raises ValueError: If the suffixes of ``fname`` match none of the supported formats.
    """
    extension = PurePath(fname).suffixes
    if is_valid_suffix(extension, 'ctx'):
        # csv/tsv: enriched motifs are converted to regulons after loading.
        return df2regulons(load_motifs(fname, sep=suffixes_to_separator(extension)))
    elif is_valid_suffix(extension, 'ctx_yaml'):
        return load_from_yaml(fname)
    elif '.gmt' in extension:
        # The same guessed separator is used between fields and between genes.
        sep = guess_separator(fname)
        return GeneSignature.from_gmt(fname, field_separator=sep, gene_separator=sep)
    elif '.dat' in extension:
        # `extension` is the list returned by PurePath.suffixes, so this must be a
        # membership test; comparing the list to the string '.dat' is never true.
        # NOTE(security): unpickling can execute arbitrary code; only load trusted files.
        with openfile(fname, 'rb') as f:
            return pickle.load(f)
    else:
        raise ValueError("Unknown file format \"{}\".".format(fname))
def load_modules(fname: str) -> Sequence[Type[GeneSignature]]:
    """
    Load modules from disk, picking the reader from the suffixes of the file name.

    YAML, DAT (pickled) and GMT files are handled.

    :param fname: The name of the file that contains the modules.
    :return: Whatever the reader for the detected format returns.
    :raises ValueError: If the suffixes of ``fname`` match none of the handled formats.
    """
    # Loading from YAML is extremely slow. Therefore this is a potential performance improvement.
    # Potential improvements are switching to JSON or to use a CLoader:
    # https://stackoverflow.com/questions/27743711/can-i-speedup-yaml
    # The alternative for which was opted in the end is binary pickling.
    suffixes = PurePath(fname).suffixes

    if is_valid_suffix(suffixes, 'ctx_yaml'):
        return load_from_yaml(fname)

    if '.dat' in suffixes:
        with openfile(fname, 'rb') as handle:
            modules = pickle.load(handle)
        return modules

    if '.gmt' in suffixes:
        return GeneSignature.from_gmt(fname)

    raise ValueError("Unknown file format for \"{}\".".format(fname))
def guess_separator(fname: str) -> str:
    """
    Guess the field separator used in a delimited text file.

    Tab, semicolon and comma are tried in that order. The first candidate that splits
    every non-blank, non-comment line into at least three columns is returned.

    :param fname: The name of the file to inspect.
    :return: The first candidate separator that fits all data lines.
    :raises ValueError: If no candidate separator fits, or if the file contains no
        data lines at all (empty file or only blank/comment lines).
    """
    with openfile(fname, 'r') as f:
        lines = f.readlines()

    # Keep only data lines, and do it once rather than once per candidate separator.
    data_lines = []
    for line in lines:
        # Lines may come back as bytes (e.g. from a gzipped file); decode them to str.
        if isinstance(line, (bytes, bytearray)):
            line = line.decode()
        stripped = line.strip()
        if stripped and not stripped.startswith('#'):
            data_lines.append(line)

    for sep in ('\t', ';', ','):
        # default=0 makes a file without data lines fall through to the explicit
        # error below instead of failing inside min() with an unrelated message.
        if min((len(line.split(sep)) for line in data_lines), default=0) >= 3:
            return sep

    raise ValueError("Unknown file format \"{}\".".format(fname))