def set_tooltips_to_column(self, tooltips_col, target_col):
    """Hide a column with tooltips and connect it with a column.

    :param str tooltips_col: column with your tooltips.
    :param str target_col: column to connect.
    """
    # hide the tooltips column
    try:
        self.datatable_columns[tooltips_col]['visible'] = 'false'
    except KeyError:
        logger.warning(
            "KeyError: Column name '{0}' does not exist.".format(
                tooltips_col))

    # render function that adds the tooltips to the target column
    fct = """function(data, type, row, meta){{
    return '<a href="#" data-toggle="tooltip" title="'+row.{0}+'">'+data+'</a>';
}}
""".format(tooltips_col)
    try:
        self.datatable_columns[target_col]['render'] = fct
    except KeyError:
        logger.warning(
            "KeyError: Column name '{0}' does not exist.".format(
                target_col))
def set_links_to_column(self, link_col, target_col):
    """Hide a column with URLs and connect it with a column.

    :param str link_col: column with your URLs.
    :param str target_col: column to connect.
    """
    # hide the link column
    try:
        self.datatable_columns[link_col]['visible'] = 'false'
    except KeyError:
        logger.warning(
            "KeyError: Column name '{0}' does not exist.".format(
                link_col))

    # render function that adds the link to the target column
    fct = """function(data, type, row, meta){{
    return '<a href="'+row.{0}+'" target="_blank">'+data+'</a>';
}}
""".format(link_col)
    try:
        self.datatable_columns[target_col]['render'] = fct
    except KeyError:
        logger.warning(
            "KeyError: Column name '{0}' does not exist.".format(
                target_col))
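# Illustrative sketch, not part of the class above: the two helpers only build
# a small JavaScript "render" callback for DataTables and store it under
# datatable_columns[target_col]['render']. This standalone function reproduces
# that string construction; the column name used below is made up.
def _build_link_render(link_col):
    return """function(data, type, row, meta){{
    return '<a href="'+row.{0}+'" target="_blank">'+data+'</a>';
}}
""".format(link_col)

# usage (hypothetical column name):
#   fct = _build_link_render("URL")   # callback reading row.URL for each cell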
def get_df(self):
    import pandas as pd
    data = {}
    for sample, filename in zip(self.sample_names, self.filenames):
        df = pd.read_csv(filename)
        df = df.groupby("kingdom")['percentage'].sum()

        # If a taxon is obsolete, the kingdom is empty. We set the kingdom
        # to Unclassified and raise a warning if the count is > 5%.
        if " " in df.index:
            percent = df.loc[" "]
            if percent > 5:
                logger.warning(
                    "Found {}% of taxons in obsolete category".format(
                        percent))
            if "Unclassified" in df.index:
                df.loc['Unclassified'] += df.loc[' ']
            else:
                df.loc['Unclassified'] = df.loc[' ']
            df.drop(" ", inplace=True)
        data[sample] = df

    df = pd.DataFrame(data)
    #df.to_json(output.data)
    df = df.sort_index(ascending=False)
    return df
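# Illustrative sketch on toy data, not part of the class above: get_df()
# builds one Series per sample (percentage summed per kingdom) and assembles
# them into a single DataFrame whose columns are the samples. The same
# pattern on in-memory data; the sample names and values below are made up.
import pandas as pd

def kingdom_table(sample_frames):
    """sample_frames: dict {sample_name: DataFrame with 'kingdom' and
    'percentage' columns}. Returns a kingdoms x samples DataFrame."""
    data = {sample: df.groupby("kingdom")["percentage"].sum()
            for sample, df in sample_frames.items()}
    return pd.DataFrame(data).sort_index(ascending=False)

# example:
#   toy = {"S1": pd.DataFrame({"kingdom": ["Bacteria", "Viruses", "Bacteria"],
#                              "percentage": [60.0, 10.0, 20.0]})}
#   kingdom_table(toy)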
def _parse_data(self):
    """Parse the YAML file to get the block content (comments) before each
    top-level section. See the documentation in the constructor.

    Removes all # characters so that the block of comments can be
    interpreted as a standard docstring in Sequanix.
    """
    current_block = []
    current_section = "docstring"

    # If we get a line that starts with #, this is a new comment or part of
    # a block comment. Otherwise, the current block comment has ended.
    for this in self.data:
        # beginning of a new section at top level
        if self.regex_section.findall(this):
            name = self.regex_section.findall(this)[0]
            current_section = name.strip(":")
            self.sections[current_section] = "".join(current_block)
            current_block = []
            current_section = None
        elif this.startswith('#'):
            # a comment at top level
            current_block.append(this)
        elif this.strip() == "":
            # an empty line; this was the main comment, or an isolated comment
            current_block = []
        else:
            # a non-empty line to skip
            current_block = []

    for key in self._get_expected_sections():
        if key not in self.sections.keys():
            logger.warning("section %s not handled by the parsing function" % key)
def _get_specials(self, section):
    """This method extracts data from the docstring.

    Lines such as::

        field_choice__ = ["a", "b"]

    are extracted, where ``_choice`` is a special keyword to be found.
    """
    if section not in self.sections.keys():
        logger.warning("%s not found in the yaml" % section)
        return

    comments = self.sections[section]
    specials = {}
    for line in comments.split("\n"):
        if "#############" in line:
            pass
        elif sum([this in line for this in self._specials]):
            line = line[2:]  # strip the leading "# "
            key, value = line.split("=", 1)
            key = key.strip().rstrip("__")
            value = value.strip()
            specials[key] = list(eval(value))
    return specials
def save_significant_pathways(self, mode, cutoff=0.05, nmax=20,
                              background=None):  #pragma: no cover
    """mode should be up, down or all"""
    if background is None:
        background = self.background

    # select the relevant pathways
    df = self._enrichr(mode, background).results
    df = self._get_final_df(df, cutoff=cutoff, nmax=nmax)
    logger.warning("Found {} pathways to save".format(len(df)))
    if len(df) == nmax:
        logger.warning("Restricted pathways to {}".format(nmax))

    logger.info("saving {} deregulated pathways".format(len(df)))
    summaries = {}
    # save them
    for ID in df['Term']:
        summary = self.save_pathway(ID, filename="{}_{}.png".format(ID, mode))
        summaries[ID] = summary
    return summaries
def get_sequana_adapters(type_, direction):
    """Return the path to a list of adapters in FASTA format.

    :param type_: PCRFree, Rubicon, Nextera
    :param direction: fwd, rev, revcomp
    :return: path to the adapter filename
    """
    # search possible types
    registered = _get_registered_adapters()
    if type_ not in registered:
        logger.error("This adapter type (%s) is not valid" % type_)
        logger.error("choose one in %s types" % registered)
        raise ValueError

    directions = ["fwd", "rev", "revcomp"]
    if direction not in directions:
        logger.error("This kind of tag (%s) is not valid" % direction)
        logger.error("choose one in %s " % directions)
        raise ValueError

    try:
        this = sequana_data("adapters_%s_%s.fa" % (type_, direction))
        logger.warning("Rename {} (remove the adapters_ prefix)".format(this))
        return this
    except:
        return sequana_data("%s_%s.fa" % (type_, direction))
def df(self):
    # Tags stored in the BAM file:
    # RG: read group ID
    # np: number of passes
    # rq: ?
    # rs: list of 6 numbers ?
    # za: ?
    # zm: ID of the ZMW
    # sn: list of A, C, G, T SNRs (in that order). How is this computed ?
    # zs: ?
    if self._df is not None:
        return self._df

    logger.info("Scanning input file. Please wait")
    self.reset()
    N = 0
    all_results = []
    # This takes 60% of the time... could use cython ?
    for read in self.data:
        tags = dict(read.tags)  # 11% of the time
        res = []
        # count reads
        N += 1
        if (N % 10000) == 0:
            logger.info("Read %d sequences" % N)

        # res[0] = read length (also stored in tags["qe"] - tags["qs"])
        res.append(read.query_length)

        # res[1] = GC content; collections.Counter is slow, let us do it ourselves
        res.append(100. / read.qlen * sum(
            [read.query_sequence.count(letter) if read.query_sequence else 0
             for letter in "CGcgSs"]))

        # res[2:6] contains the SNR stored in tags['sn'] in the order A, C, G, T
        try:
            snr = list(tags['sn'])
        except:
            snr = [None] * 4
        res = res + snr

        # res[6] = ZMW name, also stored in tags["zm"]
        res.append(int(tags['zm']))
        res.append(tags['np'])

        # aggregate results
        all_results.append(res)

    self._df = pd.DataFrame(all_results,
        columns=['read_length', 'GC_content', 'snr_A', 'snr_C', 'snr_G',
                 'snr_T', 'ZMW', "nb_passes"])
    self._df.ZMW = self._df.ZMW.astype(int)

    if len(self._df.ZMW.unique()) != len(self._df):
        logger.warning("Found non unique ZMW. This may not be a CCS but "
                       "a subread file. Consider using the PacbioSubreads class")
    self.reset()
    return self._df
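# Illustrative sketch, not part of the class above: the GC content computed
# in the loop is just the percentage of C/G (and the S ambiguity code) letters
# in the read sequence. Standalone version on a plain string:
def gc_content(sequence):
    if not sequence:
        return 0.0
    return 100.0 * sum(sequence.count(c) for c in "CGcgSs") / len(sequence)

# gc_content("ACGT")  -> 50.0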
def _parse_data(self):
    taxonomy = {}

    logger.info("Reading kraken data")
    columns = ["status", "taxon", "length"]
    # We select only columns 0, 2 and 3 to save memory, which is required
    # on very large files.
    try:
        # Each call to concat in the for loop below takes time, and the
        # cost increases with the chunk position. For 15M reads, this is
        # significant, so a chunksize of 1M is better than 1000 while still
        # being reasonable in memory.
        reader = pd.read_csv(self.filename, sep="\t", header=None,
                             usecols=[0, 2, 3], chunksize=1000000)
    except pd.parser.CParserError:
        raise NotImplementedError
        # This section is for the --only-classified-output case, when no
        # classified reads are found.
        self.unclassified = N  # size of the input data set
        self.classified = 0
        self._df = pd.DataFrame([], columns=columns)
        self._taxons = self._df.taxon
        return

    for chunk in reader:
        try:
            self._df
            self._df = pd.concat([self._df, chunk])
        except AttributeError:
            self._df = chunk

    self._df.columns = columns

    count = sum(self._df.taxon == 1)
    if count:
        logger.warning("Found %s taxons with root ID (1)" % count)

    # This gives the list of taxons as index and their amount.
    self._taxons = self._df.groupby("taxon").size()
    try:
        self._taxons.drop(0, inplace=True)
    except:
        pass  # 0 may not be there
    self._taxons.sort_values(ascending=False, inplace=True)

    category = self.df.groupby("status").size()

    if 'C' in category.index:
        self.classified = category['C']
    else:
        self.classified = 0

    if 'U' in category.index:
        self.unclassified = category['U']
    else:
        self.unclassified = 0
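# Illustrative sketch on a generic TSV, not part of the class above: the
# parser reads a large kraken output in 1M-line chunks to bound memory. The
# same chunked-read pattern, collecting the chunks and concatenating once
# (cheaper than concatenating inside the loop, which re-copies the
# accumulated frame at every step); the filename is an assumption.
import pandas as pd

def read_in_chunks(filename, chunksize=1000000):
    reader = pd.read_csv(filename, sep="\t", header=None,
                         usecols=[0, 2, 3], chunksize=chunksize)
    chunks = [chunk for chunk in reader]
    df = pd.concat(chunks, ignore_index=True)
    df.columns = ["status", "taxon", "length"]
    return df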
def _get_adapter_by_index(self, index_name, prefix):
    """Return the adapter(s) corresponding to the unique index.

    :param index_name: the unique index name to be found. If several
        sequences match, this is an error meaning the FASTA file with all
        adapters is not correctly formatted.
    :return: a list of :class:`Adapter` instances if index_name matches
        one or several adapters; returns None otherwise

    ::

        from sequana import sequana_data, AdapterReader
        filename = sequana_data("adapters_Nextera_fwd.fa")
        ar = AdapterReader(filename)
        ar.get_adapter_by_identifier("N712")

    """
    # there should be only one
    adapters = []
    for this in self._data:
        if prefix + str(index_name) in this.identifier.split("|"):
            this_adapter = Adapter(identifier=this.identifier,
                                   sequence=this.sequence,
                                   comment=this.comment)
            adapters.append(this_adapter)

    if len(adapters) == 0:
        return None
    elif len(adapters) >= 2:
        logger.warning(
            "Found several adapters matching index {}. This may happen "
            "with Nextera adapters".format(index_name))
    return adapters
def switch_header_to_gi(self, acc):
    """Kraken will only accept the GI from NCBI so we need to convert the
    ENA accession to GI numbers."""
    # Accession may have a version .1, .2, hence this try/except: first
    # without the version and then with the version.
    # Note also that some accessions differ from an earlier version. For
    # instance, AF525933 is in the virus.txt list from ENA but the updated
    # accession is AH012103, showing that the list and the DB may not be
    # fully synchronised.
    # http://www.ebi.ac.uk/ena/data/search?query=AF525933
    # In such a case, the results attribute will be missing that accession,
    # which needs to be searched for specifically. We cannot know its name
    # before downloading the fasta.
    if acc in self.results.keys():
        res = self.results[acc]
    else:
        try:
            res = self.results[acc.split(".")[0]]
        except:
            logger.warning(
                "\nUnknown accession (%s). May be an updated version. "
                "Checking..." % acc)
            res = self.ena_id_to_gi_number([acc])
            self.results.update(res)
            res = res[acc]
            logger.info('Found %s using GI number' % acc)

    return ">" + res['identifier'] + " " + res['comment']
def exists(self, filename, exit_on_error=True, warning_only=False):
    if os.path.exists(filename) is False:
        if warning_only is False:
            logger.error("{} file does not exist".format(filename))
            if exit_on_error:
                sys.exit(1)
        elif warning_only is True:
            logger.warning("{} file does not exist".format(filename))
def window_size(self, n):
    if n % 2 == 0:
        logger.warning("Window size must be an odd number.")
        self._window_size = n + 1
        logger.warning("{0} is incremented to {1}".format(
            n, self._window_size))
    else:
        self._window_size = n
def run(self):
    # To normalise, one needs to ignore the insertions since they are
    # already included in the ACGT nucleotides
    cols = ["A", "C", "G", "T", "N", "DEL"]

    df = self.get_bases()
    deletions = self.identify_deletions()

    # consensus without deletions
    dd = df.apply(lambda x: x.idxmax(), axis=1)

    # check that deletions are consistent with the data
    for d in deletions:
        pos = int(d.resume["position"])
        ref = d.resume["reference"]
        # compare the reference of the deletion with the consensus
        if "".join(dd.loc[pos:pos + len(ref) - 1]) != ref:
            logger.warning("reference string {} not found in consensus "
                           "at position {}".format(ref, pos))

    # Now, we insert the deletions, removing the reference and then adding
    # the alternate. Be aware that some deletions may overlap.
    for d in deletions:
        pos = int(d.resume["position"])
        ref = d.resume["reference"]
        alt = d.resume["alternative"]

        # The data up to the position of the reference/alternate SNP.
        # Indices may not start at zero so we use loc instead of iloc.
        dfA = df.loc[0:pos - 1]

        # The alternate data needs a dummy dataframe. Its indices are e.g.
        # 0,1,2,3,4,5; we reset them to start at dfA's last position and to
        # be constant. For instance, a dataframe dfB of 3 rows to be
        # appended after position 1500 will have the indices 1500,1500,1500.
        # This guarantees that the following dataframe dfC has indices
        # greater than those of dfB, while allowing the next iteration to
        # use the same consistent indices when searching for the next
        # deletion.
        dfB = df.iloc[0:len(alt)].copy()
        dfB.index = [pos] * len(dfB)
        dfB *= 0
        for i, nucleotide in enumerate(alt):
            dfB.iloc[i][nucleotide] = 10000

        # the rest of the data
        dfC = df.loc[pos + len(ref):]

        # !! do not reset the indices !! so that the inserted dfB is still
        # sorted and the next accesses with iloc/loc are still correct in
        # the next iteration
        df = dfA.append(dfB).append(dfC)  # .reset_index(drop=True)

    # now we can reset the indices
    df.reset_index(drop=True, inplace=True)
    dd = df.apply(lambda x: x.idxmax(), axis=1)
    return dd
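# Illustrative sketch on toy data, not part of the class above: the consensus
# call is simply the per-position argmax over the base-count columns
# (df.apply(idxmax, axis=1)). The counts below are made up.
import pandas as pd

toy_counts = pd.DataFrame(
    {"A": [90, 2, 1], "C": [3, 85, 2], "G": [4, 5, 3], "T": [3, 8, 94],
     "N": [0, 0, 0], "DEL": [0, 0, 0]})
toy_consensus = toy_counts.idxmax(axis=1)   # -> A, C, T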
def download_taxonomic_file(self, overwrite=False):
    """Load the entire flat file from EBI.

    Do not overwrite the file by default.
    """
    import ftplib
    from sequana import sequana_config_path
    if os.path.exists(self.database) and overwrite is False:
        logger.info(
            "Found taxonomy.dat file in your sequana path {}".format(
                sequana_config_path))
        return
    else:
        logger.info(
            "Downloading and extracting the taxonomy file from the web. "
            "Please be patient.")

    if self.source == "ena":
        url = 'ftp.ebi.ac.uk'
    else:
        url = 'ftp.ncbi.nlm.nih.gov'

    self.ftp = ftplib.FTP(url)
    self.ftp.login()
    if self.source == "ena":
        # for the EBI ftp only:
        self.ftp.cwd('databases')
        self.ftp.cwd('pub')
        self.ftp.cwd('databases')
        self.ftp.cwd('taxonomy')
        logger.warning(
            'Downloading and saving in %s. This is from ebi and may be '
            'behind the NCBI taxonomy' % self.database)
        self.ftp.retrbinary('RETR taxonomy.dat',
                            open(self.database, 'wb').write)
        self.ftp.close()
    else:
        self.ftp.cwd('pub')
        self.ftp.cwd('taxonomy')
        logger.warning('Downloading and saving in %s from ncbi ftp' %
                       self.database)
        import tempfile
        import shutil
        with tempfile.TemporaryDirectory() as tmpdir:
            filename = tmpdir + os.sep + "taxdump.tar.gz"
            self.ftp.retrbinary('RETR taxdump.tar.gz',
                                open(filename, "wb").write)
            import tarfile
            tf = tarfile.open(filename)
            assert "nodes.dmp" in tf.getnames()
            assert "names.dmp" in tf.getnames()
            tf.extract("nodes.dmp", tmpdir)
            tf.extract("names.dmp", tmpdir)
            ncbi = NCBITaxonomy(tmpdir + os.sep + "names.dmp",
                                tmpdir + os.sep + "nodes.dmp")
            ncbi.create_taxonomy_file(tmpdir + os.sep + "taxonomy.dat")
            shutil.move(tmpdir + os.sep + "taxonomy.dat", self.database)
        self.ftp.close()
def get_cond_from_sample(self, sample_name):
    try:
        candidates = [x for x in self.condition_names
                      if sample_name.startswith(x)]
        if len(candidates) == 1:
            return candidates[0]
        else:
            raise ValueError("ambiguous sample name found in several "
                             "conditions")
    except:
        logger.warning("{} not found".format(sample_name))
        return None
def _check_if_joint(self):
    try:
        # Needs a try/except for empty VCF files
        line = next(self)
        self.rewind()
        if len(line.samples) > 1:
            return True
    except:
        logger.warning("Your input VCF may be empty")
    return False
def copy_config_from_sequana(module, source="config.yaml",
                             target="config.yaml"):
    # identify the config name from the requested module
    user_config = module.path + os.sep + source
    if os.path.exists(user_config):
        shutil.copy(user_config, target)
        txt = "copied %s from sequana %s pipeline"
        logger.info(txt % (source, module.name))
    else:
        logger.warning(user_config + " not found")
def get_roi(self):
    """Keep positions with zscore outside of the thresholds range.

    :return: a dataframe from :class:`FilteredGenomeCov`

    .. note:: depends on the :attr:`thresholds` low and high values.
    """
    features = self.bed.feature_dict
    try:
        second_high = self.thresholds.high2
        second_low = self.thresholds.low2
        query = "zscore > @second_high or zscore < @second_low"

        # In the genbank, the names appear as e.g. JB12345 but in the fasta
        # or BED files, it may be something like
        # gi|269939526|emb|FN433596.1|, so they do not match. We can try to
        # guess it.
        alternative = None

        if features:
            if self.chrom_name not in features.keys():
                msg = """Chromosome name (%s) not found in the genbank.
Make sure the chromosome names in the BAM/BED files are compatible with
the genbank content. Genbank files contain the following keys """
                for this in features.keys():
                    msg += "\n - %s" % this

                alternative = [x for x in self.chrom_name.split("|") if x]
                alternative = alternative[-1]  # assume the accession is last
                alternative = alternative.split('.')[0]  # remove the version
                if alternative in features.keys():
                    msg += "\n Guessed the chromosome name to be: %s" % alternative
                else:
                    features = None
                logger.warning(msg % self.chrom_name)

        if features:
            if alternative:
                return FilteredGenomeCov(self.df.query(query),
                                         self.thresholds,
                                         features[alternative])
            else:
                return FilteredGenomeCov(self.df.query(query),
                                         self.thresholds,
                                         features[self.chrom_name])
        else:
            return FilteredGenomeCov(self.df.query(query), self.thresholds)
    except KeyError:
        logger.error("Column zscore is missing in the data frame.\n"
                     "You must run compute_zscore before getting low "
                     "coverage regions.\n\n", self.__doc__)
        sys.exit(1)
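# Illustrative sketch on toy data, not part of the class above: get_roi()
# relies on DataFrame.query with "@variable" references to the enclosing
# Python scope to keep positions whose zscore is outside the secondary
# thresholds. Standalone version of that filter; the column name follows the
# code above, the toy values are made up.
import pandas as pd

def filter_outside_thresholds(df, low2, high2):
    """Return the rows of df whose 'zscore' column is outside [low2, high2]."""
    return df.query("zscore > @high2 or zscore < @low2")

# example:
#   toy = pd.DataFrame({"pos": [1, 2, 3], "zscore": [0.1, 5.2, -6.0]})
#   filter_outside_thresholds(toy, low2=-4, high2=4)   # keeps pos 2 and 3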
def _get_files(self, pattern):
    filenames = glob.glob(os.sep.join([self.directory, self.phix_directory,
                                       pattern]))
    if len(filenames) == 4:
        mode = "pe"
    elif len(filenames) == 2:
        mode = "se"
    elif len(filenames) == 0:
        return
    else:
        logger.warning("PhixModule: more than 4 files "
                       "matched the pattern %s" % pattern)
        return
    return filenames, mode
def _download_minikraken(self, verbose=True):
    dv = DevTools()
    base = sequana_config_path + os.sep + ""
    taxondir = base + os.sep + "taxonomy"
    dv.mkdir(base)
    dv.mkdir(taxondir)

    logger.info("Downloading minikraken (4Gb)")
    filename = base + os.sep + "minikraken.tgz"
    if os.path.exists(filename) and md5(filename) == "30eab12118158d0b31718106785195e2":
        logger.warning("%s already present" % filename)
    else:
        wget("https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz", filename)
def _get_files(self, pattern):
    # !! need to sort the files so that R1 appears before R2
    filenames = sorted(glob.glob(self.directory + os.sep + pattern))
    if len(filenames) == 2:
        mode = "pe"
    elif len(filenames) == 1:
        mode = "se"
    elif len(filenames) == 0:
        return
    else:
        logger.warning("FastQStatsModule: more than 2 files "
                       "matched the pattern %s" % pattern)
        return
    return filenames, mode
def kraken_to_krona(self, output_filename=None, nofile=False):
    """
    :return: status: True if everything went fine, False otherwise
    """
    if output_filename is None:
        output_filename = self.filename + ".summary"

    taxon_to_find = list(self.taxons.index)
    if len(taxon_to_find) == 0:
        logger.warning(
            "No reads were identified. You will need a more complete "
            "database")
        self.output_filename = output_filename
        with open(output_filename, "w") as fout:
            fout.write("%s\t%s" % (self.unclassified, "Unclassified"))
        return False

    # classified reads as root (1)
    """try:
        logger.warning("Removing taxon 1 (%s values) " % self.taxons.iloc[1])
        logger.info("Found %s taxons " % len(taxon_to_find))
        taxon_to_find.pop(taxon_to_find.index(1))
    except:
        pass
    """

    if len(taxon_to_find) == 0:
        return False

    df = self.get_taxonomy_db(taxon_to_find)
    self.lineage = [";".join(this) for this in df[df.columns[0:-1]].values]
    self.scnames = list(df['name'].values)  # do we need a cast ?

    # Now save the file
    self.output_filename = output_filename
    with open(output_filename, "w") as fout:
        for i, this in enumerate(self.lineage):
            taxon = taxon_to_find[i]
            count = self.taxons.loc[taxon]
            line = str(count) + "\t" + "\t".join(this.split(';'))
            line += " " + self.scnames[i]
            fout.write(line + '\n')
        try:
            fout.write("%s\t%s" % (self.unclassified, "Unclassified"))
        except:
            pass  # unclassified may not exist if all reads are classified

    self._data_created = True
    return True
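# Illustrative sketch with made-up values, not part of the class above: the
# krona summary written in the loop is one line per taxon, i.e. the read
# count followed by the lineage split on ';' and the scientific name.
def krona_line(count, lineage, scname):
    return str(count) + "\t" + "\t".join(lineage.split(";")) + " " + scname

# krona_line(1234, "Bacteria;Proteobacteria", "Escherichia coli")
#   -> "1234\tBacteria\tProteobacteria Escherichia coli"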
def parse_cutadapt(self):
    d = {}  # output
    tobefound = self._get_data_tobefound()
    adapters = []

    data = self._rawdata.splitlines()
    # some metadata to extract
    for this in tobefound:
        key, pattern = this
        found = [line for line in data if line.startswith(pattern)]
        if len(found) == 0:
            logger.warning("ReportCutadapt: %s (not found)" % pattern)
        elif len(found) == 1:
            text = found[0].split(":", 1)[1].strip()
            try:
                this, percent = text.split()
                self.jinja[key] = this
                self.jinja[key + '_percent'] = percent
            except:
                self.jinja[key] = text
                self.jinja[key + '_percent'] = "?"

    dd = {}
    positions = []
    executable = "cutadapt"
    for pos, this in enumerate(data):
        if "This is Atropos" in this:
            executable = "atropos"
        if "Command line parameters: " in this:
            cmd = this.split("Command line parameters: ")[1]
            self.jinja['command'] = executable + " " + cmd
        if this.startswith("=== ") and "Adapter" in this:
            name = this.split("=== ")[1].split(" ===")[0].strip()
            dd['name'] = name
            continue
        if this.startswith('Sequence:'):
            info = this.split("Sequence:", 1)[1].strip()
            info = info.split(";")
            dd['info'] = {
                'Sequence': info[0].strip(),
                'Type': info[1].split(':', 1)[1].strip(),
                'Length': info[2].split(':', 1)[1].strip(),
                'Trimmed': info[3].split(':', 1)[1].strip()
            }
            adapters.append(dd.copy())

    self.data["adapters"] = adapters
def __init__(self, design_filename, adapters):
    """.. rubric:: Constructor

    :param str design_filename: a CSV file that is compatible with our
        :class:`sequana.expdesign.ExpDesignAdapter`
    :param adapters: the type of adapters (PCRFree, Nextera, Rubicon,
        TruSeq, SMARTer, Small)

    The files of adapters are stored in Sequana and accessible with the
    sequana_data function. So, for instance, if adapters is set to Nextera,
    the following file is used to identify the adapters::

        sequana_data("adapters_Nextera_fwd.fa")

    New adapter files can be added on request. See resources/data/adapters
    for the full list. You can also use::

        from sequana.adapters import _get_registered_adapters
        _get_registered_adapters()

    """
    from sequana.expdesign import ExpDesignAdapter
    self.design = ExpDesignAdapter(design_filename)

    if self.design.df.index.name == "Sample_ID" or \
            "Sample_ID" in self.design.df.columns:
        self.design.df.set_index("Sample_ID", inplace=True)
    else:
        raise ValueError("Incorrect design file. Missing Sample_ID field")

    self.adapters = adapters

    try:
        file1 = sequana_data("adapters_%s_fwd.fa" % adapters)
        logger.warning("rename your file removing the adapters_ prefix")
    except:
        file1 = sequana_data("%s_fwd.fa" % adapters)

    try:
        file2 = sequana_data("adapters_%s_revcomp.fa" % adapters)
        logger.warning("rename your file removing the adapters_ prefix")
    except:
        file2 = sequana_data("%s_revcomp.fa" % adapters)

    self._adapters_fwd = AdapterReader(file1)
    self._adapters_revc = AdapterReader(file2)  # !!! revcomp
def _block2docstring(self, section):
    if section not in self.sections.keys():
        logger.warning("%s not found in the yaml" % section)
        return

    comments = self.sections[section]
    docstring = []
    for line in comments.split("\n"):
        if "#############" in line:
            pass
        elif sum([this in line for this in self._specials]):
            pass
        else:
            if len(line) < 2:  # an empty line (to keep)
                docstring.append("")
            else:
                docstring.append(line[2:])  # strip the "# " characters

    docstring = "\n".join(docstring).strip()
    return docstring
def scanner(self):
    data = {}
    # shlex removes all blank lines and splits on newlines; strip is also
    # applied
    rawdata = shlex.split(open(self.filename, "r"))
    for line in rawdata:
        # sometimes, IEM will store ;;; at the end, so we can get
        # [HEADER];;;;;;;;;;;
        if line.startswith('[') and "]" in line:
            line = line.strip(";").strip(",").strip()
            currentkey = line.replace("[", "").replace("]", "")
            data[currentkey] = []
        else:
            data[currentkey].append(line)

    for key in data.keys():
        data[key] = "\n".join(data[key])

    for this in ["Header", "Reads", "Settings", "Data"]:
        if this not in data.keys():
            logger.warning("%s not found in the DesignExpMiSeq file" % this)

    self.data = data
    self.df = pd.read_csv(io.StringIO(data["Data"]))

    ncols = [8, 9, 10, 12]
    if self.df.shape[1] not in ncols:
        self.df = pd.read_csv(io.StringIO(data["Data"]), ";")
        if self.df.shape[1] not in ncols:
            logger.warning(
                "Data section must have 8, 9, 10 or 12 columns. Check the "
                "samplesheet")

    # Fixes https://github.com/sequana/sequana/issues/507
    self.df["Sample_ID"] = self.df["Sample_ID"].astype(str)

    self.df.rename(columns={"I7_Index_ID": "Index1_ID",
                            "index": "Index1_Seq",
                            "I5_Index_ID": "Index2_ID",
                            "index2": "Index2_Seq"}, inplace=True)
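# Illustrative sketch, not part of the class above: scanner() splits an
# Illumina sample sheet into [Section] blocks and feeds the [Data] block to
# pandas via io.StringIO. A simplified standalone version of that idea
# (comma-separated [Data] only; the section handling is deliberately minimal):
import io
import pandas as pd

def read_samplesheet_data(text):
    """text: full content of a sample sheet; returns the [Data] section as a
    DataFrame."""
    sections, current = {}, None
    for line in text.splitlines():
        stripped = line.strip()
        if stripped.startswith("[") and "]" in stripped:
            # section headers may carry trailing ;;; or ,,, separators
            current = stripped.strip(";").strip(",").strip("[]")
            sections[current] = []
        elif stripped and current:
            sections[current].append(stripped)
    return pd.read_csv(io.StringIO("\n".join(sections.get("Data", []))))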
def splitter_mapped_unmapped(self, filename, prefix):
    # helpful resource:
    # https://broadinstitute.github.io/picard/explain-flags.html
    logger.info("Creating 2 files (mapped and unmapped reads)")
    data = SAM(filename)

    results = {"flags": [], "mapped": 0, "unmapped": 0, "bad": 0}

    logger.info("Please wait while creating output files")
    with open("{}/{}.unmapped.fastq".format(self.outdir, prefix), "w") as fnosirv:
        with open("{}/{}.mapped.fastq".format(self.outdir, prefix), "w") as fsirv:
            for a in data:
                if a.flag & 2048:    # supplementary alignment
                    # a bad read, we can just drop it
                    results['bad'] += 1
                elif a.flag & 1024:  # PCR duplicate
                    results['bad'] += 1
                elif a.flag & 256:   # secondary alignment
                    results["bad"] += 1
                elif a.flag & 16:    # mapped (reverse strand)
                    read = "@{}\n{}\n+\n{}\n".format(
                        a.qname, a.query_sequence, a.qual)
                    assert len(a.query_sequence) == len(a.qual)
                    fsirv.write(read)
                    results["mapped"] += 1
                elif a.flag & 4:     # unmapped
                    read = "@{}\n{}\n+\n{}\n".format(
                        a.qname, a.query_sequence, a.qual)
                    assert len(a.query_sequence) == len(a.qual)
                    fnosirv.write(read)
                    results["unmapped"] += 1
                elif a.flag == 0:    # mapped (forward strand)
                    read = "@{}\n{}\n+\n{}\n".format(
                        a.qname, a.query_sequence, a.qual)
                    assert len(a.query_sequence) == len(a.qual)
                    fsirv.write(read)
                    results["mapped"] += 1
                else:
                    logger.warning("{} flag not handled".format(a.flag))
                results["flags"].append(a.flag)
    return results
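# Illustrative sketch, not part of the class above: the dispatch above is
# plain bitwise testing of SAM flags (see the picard explain-flags page
# linked in the code). A flag can be decoded like this; only the bits used
# above are listed.
SAM_FLAGS = {4: "unmapped", 16: "reverse strand", 256: "secondary",
             1024: "PCR duplicate", 2048: "supplementary"}

def decode_flag(flag):
    """Return the names of the bits set in a SAM flag (subset shown above)."""
    return [name for bit, name in SAM_FLAGS.items() if flag & bit]

# decode_flag(2064)  -> ['reverse strand', 'supplementary']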
def _scanner(self):
    current_section = None
    data = collections.defaultdict(list)
    with open(self.filename, "r") as fin:
        for line in fin.readlines():
            line = self._line_cleaner(line)
            if len(line) == 0:
                continue
            if line.startswith("[") and line.endswith("]"):
                name = line.lstrip("[").rstrip("]")
                current_section = name
            else:
                data[current_section] += [line]

    if "Header" not in data.keys():
        logger.warning("Input file must contain [Header]")

    if "Data" not in data.keys():
        logger.warning("Input file must contain [Data]")

    self.data = data
def _plot(self, Xr, pca=None, pc1=0, pc2=1, colors=None, show_labels=True):
    if colors is None:
        colors = [self.colors[k] for k in self.labels]
        if len(colors) != len(Xr):
            colors = ["r"] * len(Xr[:, 0])
    else:
        for k in self.labels:
            if k not in colors.keys():
                logger.warning("No key color for this sample: {}. "
                               "Set to red".format(k))
                colors[k] = "r"
        colors = [colors[k] for k in self.labels]

    pylab.scatter(Xr[:, pc1], Xr[:, pc2], c=colors)
    ax = pylab.gca()
    X1, X2 = pylab.xlim()
    dX = X2 - X1
    pylab.xlim([X1 + X1 * 0.05, X2 + X2 * 0.05])

    Y1, Y2 = pylab.ylim()
    dY = Y2 - Y1
    pylab.ylim([Y1 + Y1 * 0.05, Y2 + Y2 * 0.05])

    count = 0
    if show_labels:
        for x, y in zip(Xr[:, pc1], Xr[:, pc2]):
            x += dX / 40
            y += dY / 40
            ax.annotate(self.labels[count], (x, y))
            count += 1
            if count > 100:
                break

    if pca:
        pylab.xlabel("PC{} ({}%)".format(pc1 + 1,
            round(pca.explained_variance_ratio_[pc1] * 100, 2)))
        pylab.ylabel("PC{} ({}%)".format(pc2 + 1,
            round(pca.explained_variance_ratio_[pc2] * 100, 2)))
    pylab.grid(True)
def _download_kraken_toydb(self, verbose=True):
    """Download the kraken DB toy example from sequana_data into the
    .config/sequana directory.

    Checks the md5 checksums. About 32Mb of data.
    """
    dv = DevTools()
    base = sequana_config_path + os.sep + "kraken_toydb"
    taxondir = base + os.sep + "taxonomy"
    dv.mkdir(base)
    dv.mkdir(taxondir)

    baseurl = "https://github.com/sequana/data/raw/master/"

    # download only if required
    logger.info("Downloading the database into %s" % base)

    md5sums = [
        "28661f8baf0514105b0c6957bec0fc6e",
        "97a39d44ed86cadea470352d6f69748d",
        "d91a0fcbbc0f4bbac918755b6400dea6",
        "c8bae69565af2170ece194925b5fdeb9"]
    filenames = [
        "database.idx",
        "database.kdb",
        "taxonomy/names.dmp",
        "taxonomy/nodes.dmp"]

    for filename, md5sum in zip(filenames, md5sums):
        url = baseurl + "kraken_toydb/%s" % filename
        filename = base + os.sep + filename
        if os.path.exists(filename) and md5(filename) == md5sum:
            logger.warning("%s already present" % filename)
        else:
            logger.info("Downloading %s" % url)
            wget(url, filename)
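# Illustrative sketch, not part of the class above: the download loop skips
# files whose md5 checksum already matches. The md5() helper used above is
# sequana's; an equivalent based on the standard library might look like this:
import hashlib

def file_md5(path, chunk=65536):
    h = hashlib.md5()
    with open(path, "rb") as fin:
        for block in iter(lambda: fin.read(chunk), b""):
            h.update(block)
    return h.hexdigest()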
def kraken_to_krona(self, output_filename=None, mode=None, nofile=False):
    """
    :return: status: True if everything went fine, False otherwise
    """
    if output_filename is None:
        output_filename = self.filename + ".summary"

    taxon_to_find = list(self.taxons.index)
    if len(taxon_to_find) == 0:
        logger.warning("No reads were identified. You will need a more "
                       "complete database")
        self.output_filename = output_filename
        with open(output_filename, "w") as fout:
            fout.write("%s\t%s" % (self.unclassified, "Unclassified"))
        return False

    # classified reads as root (1)
    """try:
        logger.warning("Removing taxon 1 (%s values) " % self.taxons.ix[1])
        logger.info("Found %s taxons " % len(taxon_to_find))
        taxon_to_find.pop(taxon_to_find.index(1))
    except:
        pass
    """

    if len(taxon_to_find) == 0:
        return False

    if mode != "adapters":
        df = self.get_taxonomy_biokit(taxon_to_find)
        self.lineage = [";".join(this) for this in df[df.columns[0:-1]].values]
        self.scnames = list(df['name'].values)  # do we need a cast ?
    else:
        # Let us get the known adapters and their identifiers
        from sequana.adapters import AdapterDB
        adapters = AdapterDB()
        adapters.load_all()

        self.scnames = []
        for taxon in self.taxons.index:
            if str(taxon) in [1, "1"]:
                self.scnames.append('unknown')
                continue
            if str(taxon) not in list(adapters.df.identifier):
                self.scnames.append('unknown')
                continue
            self.scnames.append(adapters.get_name(taxon))
        self.lineage = ["Adapters;%s" % x for x in self.scnames]

        assert len(self.lineage) == len(self.taxons)
        assert len(self.scnames) == len(self.taxons)

    # Now save the file
    self.output_filename = output_filename
    with open(output_filename, "w") as fout:
        for i, this in enumerate(self.lineage):
            taxon = taxon_to_find[i]
            count = self.taxons.loc[taxon]
            line = str(count) + "\t" + "\t".join(this.split(';'))
            line += " " + self.scnames[i]
            fout.write(line + '\n')
        try:
            fout.write("%s\t%s" % (self.unclassified, "Unclassified"))
        except:
            pass  # unclassified may not exist if all reads are classified

    self._data_created = True
    return True
def __len__(self):
    if self._N is None:
        logger.warning("Scanning the BAM. Please wait")
        self._N = sum(1 for _ in self._data)
        self.reset()
    return self._N
def run_analysis(chrom, options, feature_dict):
    logger.info("Computing some metrics")
    if chrom.DOC < 8:
        logger.warning("The depth of coverage is below 8. sequana_coverage "
                       "is not optimised for such depth. You may want to "
                       "increase the threshold to avoid too many false "
                       "detections")
    logger.info(chrom.__str__())

    if options.w_median > len(chrom.df) / 4:
        NW = int(len(chrom.df) / 4)
        if NW % 2 == 0:
            NW += 1
        logger.warning("median window length is too long. \n"
                       "  Setting the window length automatically to a "
                       "quarter of\n  the chromosome length ({})".format(NW))
        options.w_median = NW

    # compute the running median, zscore and ROIs for each chunk,
    # summarizing the results in a ChromosomeCovMultiChunk instance
    logger.info('Using running median (w=%s)' % options.w_median)
    logger.info("Number of mixture models %s " % options.k)
    results = chrom.run(options.w_median, options.k,
                        circular=options.circular, binning=options.binning,
                        cnv_delta=options.cnv_clustering)

    # Print some info related to the fitted mixture models
    try:
        mu = results.data[0][0].as_dict()['data']['fit_mu']
        sigma = results.data[0][0].as_dict()['data']['fit_sigma']
        pi = results.data[0][0].as_dict()['data']['fit_pi']
        logger.info("Fitted central distribution (first chunk): mu=%s, "
                    "sigma=%s, pi=%s" % (round(mu, 3), round(sigma, 3),
                                         round(pi, 3)))
    except:
        pass

    # some information about the ROIs found
    high = chrom.thresholds.high2
    low = chrom.thresholds.low2
    logger.info("Searching for ROIs (threshold=[{},{}] ; double =[{},{}])".format(
        chrom.thresholds.low, chrom.thresholds.high, low, high))
    ROIs = results.get_rois()  # results is a ChromosomeCovMultiChunk instance
    logger.info("Number of ROIs found: {}".format(len(ROIs.df)))
    logger.info("  - below average: {}".format(len(ROIs.get_low_rois())))
    logger.info("  - above average: {}".format(len(ROIs.get_high_rois())))

    # Create the directory and save the ROIs
    directory = options.output_directory
    directory += os.sep + "coverage_reports"
    directory += os.sep + chrom.chrom_name
    mkdirs(directory)
    ROIs.df.to_csv("{}/rois.csv".format(directory))

    # save summary and metrics
    logger.info("Computing extra metrics")
    summary = results.get_summary()

    summary.to_json(directory + os.sep + "sequana_summary_coverage.json")
    logger.info("Evenness: {}".format(summary.data['evenness']))
    logger.info("Centralness (3 sigma): {}".format(summary.data['C3']))
    logger.info("Centralness (4 sigma): {}".format(summary.data['C4']))

    if options.skip_html:
        return

    logger.info("Creating report in %s. Please wait" % config.output_dir)
    if chrom._mode == "chunks":
        logger.warning("This chromosome is large. "
                       "Plots in the HTML reports are skipped")

    datatable = CoverageModule.init_roi_datatable(ROIs)
    ChromosomeCoverageModule(chrom, datatable,
                             options={"W": options.w_median,
                                      "k": options.k,
                                      "ROIs": ROIs,
                                      "circular": options.circular},
                             command=" ".join(["sequana_coverage"] + sys.argv[1:]))
def main(args=None):
    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="sequana")

    # If --help or no options are provided, show the help
    if len(args) == 1:
        user_options.parse_args(["prog", "--help"])
    else:
        options = user_options.parse_args(args[1:])

    logger.level = options.logging_level

    if options.download_reference:
        logger.info("Downloading reference %s from %s\n" %
                    (options.download_reference, options.database))
        from bioservices.apps import download_fasta as df
        df.download_fasta(options.download_reference, method=options.database)
        if options.download_genbank is None:
            return

    if options.download_genbank:
        logger.info("Downloading genbank %s from %s\n" %
                    (options.download_genbank, options.database))
        from sequana.snpeff import download_fasta_and_genbank
        download_fasta_and_genbank(options.download_genbank,
                                   options.download_genbank,
                                   genbank=True, fasta=False)
        return

    if options.genbank:
        assert os.path.exists(options.genbank), \
            "%s does not exist" % options.genbank

    logger.info("Reading %s. This may take time depending on "
                "your input file" % options.input)

    # Convert BAM to BED
    if options.input.endswith(".bam"):
        bedfile = options.input.replace(".bam", ".bed")
        logger.info("Converting BAM into BED file")
        shellcmd("bedtools genomecov -d -ibam %s > %s" %
                 (options.input, bedfile))
    elif options.input.endswith(".bed"):
        bedfile = options.input
    else:
        raise ValueError("Input file must be a BAM or BED file")

    # Set the thresholds
    if options.low_threshold is None:
        options.low_threshold = -options.threshold

    if options.high_threshold is None:
        options.high_threshold = options.threshold

    # and the output directory
    config.output_dir = options.output_directory
    config.sample_name = os.path.basename(options.input).split('.')[0]

    # Now we can create the instance of GenomeCoverage
    if options.chromosome == -1:
        chrom_list = []
    else:
        chrom_list = [options.chromosome]
    gc = GenomeCov(bedfile, options.genbank, options.low_threshold,
                   options.high_threshold, options.double_threshold,
                   options.double_threshold, chunksize=options.chunksize,
                   chromosome_list=chrom_list)

    # if we have the reference, let us use it
    if options.reference:
        logger.info('Computing GC content')
        gc.compute_gc_content(options.reference, options.w_gc,
                              options.circular)

    # Now we scan the chromosomes
    if len(gc.chrom_names) == 1:
        logger.warning("There is only one chromosome. Selected automatically.")
        run_analysis(gc.chr_list[0], options, gc.feature_dict)
    elif options.chromosome < -1 or options.chromosome > len(gc.chrom_names):
        msg = "invalid chromosome index; must be in [1;{}]".format(
            len(gc.chrom_names))
        logger.error(msg)
        sys.exit(1)
    else:
        if options.chromosome == -1:
            chromosomes = gc.chrom_names  # take all chromosomes
        else:
            # For the user, we start at position 1 but in python we start at zero
            chromosomes = [gc.chrom_names[options.chromosome - 1]]

        logger.info("There are %s chromosomes/contigs." % len(gc))
        for this in gc.chrom_names:
            data = (this, gc.positions[this]["start"], gc.positions[this]["end"])
            logger.info("  {} (starting pos: {}, ending pos: {})".format(*data))

        # here we read chromosome by chromosome to save memory.
        # However, if the data is small.
        for i, chrom in enumerate(chromosomes):
            logger.info("==================== analysing chrom/contig %s/%s (%s)"
                        % (i + 1, len(gc), gc.chrom_names[i]))
            # since we read just one contig/chromosome, the chr_list contains
            # only one contig, so we access it with index 0
            run_analysis(gc.chr_list[i], options, gc.feature_dict)

    if options.skip_multiqc is False:
        logger.info("=========================")
        logger.info("Creating multiqc report")
        pathtocfg = sequana_data("multiqc_config.yaml", "../multiqc/")
        cmd = 'multiqc . -m sequana_coverage -f -c {}'.format(pathtocfg)
        import subprocess
        proc = subprocess.Popen(cmd.split(), cwd=options.output_directory)
        proc.wait()
def __init__(self, filename_fastq, fof_databases, threads=1,
             output_directory="./kraken_hierarchical/",
             keep_temp_files=False, force=False):
    """.. rubric:: **constructor**

    :param filename_fastq: FastQ file to analyse
    :param fof_databases: file that contains a list of database paths
        (one per line). The order is important. Note that you may also
        provide a list of database paths.
    :param threads: number of threads to be used by Kraken
    :param output_directory: name of the output directory
    :param keep_temp_files: bool, if True, will keep intermediate files
        from each Kraken analysis, and save an HTML report at each step
    :param bool force: if the output directory already exists, the
        instantiation fails so that the existing data is not overwritten.
        If you wish to overwrite the existing directory, set this
        parameter to True.
    """
    # When running kraken in paired mode and saving the unclassified reads
    # in a file, the output file (fastq) contains both R1 and R2, so they
    # are concatenated in the same file. Actually, if there are R1 and R2,
    # they are concatenated as R1 N R2 (with the letter N as a link).
    # So, in the hierarchical search, paired case, the first iteration has
    # 2 input files but most subsequent iterations will have only one file
    # as input, that is the output of the previous run (provided by the
    # --unclassified-out option).
    self.filename_fastq = filename_fastq

    # input databases may be stored in a file
    if isinstance(fof_databases, str) and os.path.exists(fof_databases):
        with open(fof_databases, 'r') as fof:
            self.databases = [absolute_path.split('\n')[0]
                              for absolute_path in fof.readlines()]
    # or simply provided as a list
    elif isinstance(fof_databases, list):
        self.databases = fof_databases[:]
    else:
        raise TypeError("input databases must be a list of valid kraken "
                        "databases or a file (see documentation)")

    self.threads = threads
    self.output_directory = output_directory
    self.keep_temp_files = keep_temp_files

    # check if the output directory already exists
    try:
        os.mkdir(output_directory)
    except OSError:
        if os.path.isdir(output_directory) and force is False:
            logger.error('Output directory %s already exists' % output_directory)
            raise Exception
        elif force is True:
            logger.warning("Output directory %s already exists. You may "
                           "overwrite existing results" % output_directory)

    # list of input fastq files
    if isinstance(filename_fastq, list) and len(filename_fastq) in [1, 2]:
        self.inputs = filename_fastq[:]
    elif isinstance(filename_fastq, str):
        self.inputs = [filename_fastq]
    else:
        msg = "input file must be a string or list of 2 filenames"
        msg += "\nYou provided {}".format(filename_fastq)
        raise TypeError(msg)
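# Illustrative sketch, not part of the class above: the constructor accepts
# either a list of database paths or a "file of files" (one path per line).
# Reading such a file boils down to the following (empty lines are dropped
# here, which the constructor above does not do):
def read_fof(path):
    with open(path) as fof:
        return [line.strip() for line in fof if line.strip()]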