def __init__(self, name, **kwargs):
    """Initialize a new Assembly with the given name.

    Sets up CLI state, ipcluster launch defaults (overridable via
    kwargs), stats containers, sample/population/barcode registries,
    output-file locations, params, and project directory paths.
    """
    # Running under the CLI is signalled by a truthy "cli" kwarg.
    self._cli = bool(kwargs.get("cli"))

    # Stays False here; only updated during .run().
    self.quiet = False

    # Assembly names may not contain special characters.
    check_name(name)
    self.name = name
    if not (kwargs.get("quiet") or self._cli):
        self._print("New Assembly: {}".format(self.name))

    # Default ipcluster launch info.
    self.ipcluster = {
        "cluster_id": "",
        "profile": "default",
        "engines": "Local",
        "quiet": 0,
        "timeout": 120,
        "cores": 0,  # detect_cpus(),
        "threads": 2,
        "pids": {},
    }

    # Any recognized ipcluster settings may be overridden at init time.
    for setting, value in kwargs.items():
        if setting in self.ipcluster:
            self.ipcluster[setting] = value

    # stats_files maps names to file locations;
    # stats_dfs maps names to pandas dataframes.
    self.stats_files = ObjDict({})
    self.stats_dfs = ObjDict({})

    # Linked samples: {sample-name: sample-object}.
    self.samples = {}

    # Populations: {popname: poplist}.
    self.populations = {}

    # Linked multiplex (barcode) files.
    self.barcodes = {}

    # Output file locations.
    self.outfiles = ObjDict()
    self.outfiles.loci = ""

    # Location of the supercatg database files.
    self.clust_database = ""
    self.database = ""

    # Default parameter and hackers settings for this Assembly.
    self.params = Params(self)
    self.hackersonly = Hackers()

    # Data directories for this Assembly; project defaults to the
    # resolved project_dir, the rest are filled in by later steps.
    self.dirs = ObjDict({
        "project": os.path.realpath(self.params.project_dir),
        "fastqs": "",
        "edits": "",
        "clusts": "",
        "consens": "",
        "across": "",
        "outfiles": "",
    })
def __init__(self, name=""): self.name = name self.barcode = "" # link to files self.files = ObjDict({ "fastqs": [], "edits": [], "mapped_reads": [], "unmapped_reads": [], "clusters": [], "consens": [], "database": [] }) ## summary stats dictionary self.stats = pd.Series(index=[ "state", "reads_raw", "reads_passed_filter", "reads_merged", "refseq_mapped_reads", "refseq_unmapped_reads", "clusters_total", "clusters_hidepth", "hetero_est", "error_est", "reads_consens", ], dtype=object) ## stats for each step self.stats_dfs = ObjDict({ "s1": pd.Series(index=[ "reads_raw", ], dtype=object), "s2": pd.Series(index=[ "reads_raw", "trim_adapter_bp_read1", "trim_adapter_bp_read2", "trim_quality_bp_read1", "trim_quality_bp_read2", "reads_filtered_by_Ns", "reads_filtered_by_minlen", "reads_passed_filter", ], dtype=object), #"filtered_by_qscore", #"filtered_by_adapter", "s3": pd.Series(index=[ "merged_pairs", "clusters_total", "hidepth_min", "clusters_hidepth", "avg_depth_total", "avg_depth_mj", "avg_depth_stat", "sd_depth_total", "sd_depth_mj", "sd_depth_stat", "filtered_bad_align", ], dtype=object), "s4": pd.Series(index=[ "hetero_est", "error_est", ], dtype=object), "s5": pd.Series(index=[ "clusters_total", "filtered_by_depth", "filtered_by_maxH", "filtered_by_maxAlleles", "filtered_by_maxN", "reads_consens", "nsites", "nhetero", "heterozygosity", ], dtype=object), }) ## store cluster depth information (biggest memory cost), self.depths = {}