def update_config(self, config, options, section_name):
    """Overwrite one config section with values taken from *options*.

    For each option of ``config[section_name]``, look for an attribute named
    ``<section_name>_<option_name>`` on *options* (typically an argparse
    namespace) and copy its value into the config. Options with no matching
    attribute are left untouched.

    :param dict config: nested mapping section -> {option: value};
        modified in place.
    :param options: object exposing ``section_option`` attributes.
    :param str section_name: section of *config* to update.
    """
    for option_name in config[section_name]:
        try:
            config[section_name][option_name] = getattr(
                options, section_name + "_" + option_name)
        except AttributeError:
            # Only getattr is expected to fail here; the previous bare
            # `except:` also hid unrelated errors (e.g. KeyError, typos).
            logger.debug(
                "update_config. Could not find {}".format(option_name))
def setup(self): """Initialise the pipeline. - Create a directory (usually named after the pipeline name) - Copy the pipeline and associated files (e.g. config file) - Create a script in the directory ready to use If there is a "requirements" section in your config file, it looks like:: requirements: - path to file1 - path to file2 It means that those files will be required by the pipeline to run correctly. If the file exists, use it , otherwise look into the pipeline itself. """ # First we create the beginning of the command with the optional # parameters for a run on a SLURM scheduler cmd = "#!/bin/bash\nsnakemake -s {}.rules --stats stats.txt" self.command = cmd.format(self.name) # FIXME a job is not a core. Ideally, we should add a core option if self._guess_scheduler() == "local": self.command += " -p --cores {} ".format(self.options.jobs) else: self.command += " -p --jobs {}".format(self.options.jobs) if self.options.run_mode is None: self.options.run_mode = self._guess_scheduler() logger.debug("Guessed scheduler is {}".format( self.options.run_mode)) if self.options.run_mode == "slurm": if self.options.slurm_queue == "common": slurm_queue = "" else: slurm_queue = "-A {} --qos {} -p {}".format( self.options.slurm_queue, self.options.slurm_queue, self.options.slurm_queue) if self.module.cluster_config: self.command += ' --cluster "sbatch --mem={{cluster.ram}} --cpus-per-task={{threads}}"'.format( slurm_queue) self.command += " --cluster-config cluster_config.json " else: self.command += ' --cluster "sbatch --mem {} -c {} {}"'.format( self.options.slurm_memory, self.options.slurm_cores_per_job, slurm_queue) # This should be in the setup, not in the teardown since we may want to # copy files when creating the pipeline. This is the case e.g. in the # rnaseq pipeline self._create_directories()
def __init__(self, rule_name, rule_dict, count=0, browser_keywords=None,
             generic=False, specials=None):
    """Build a form (group box) with one widget per option of a rule.

    :param str rule_name: title of the group box.
    :param dict rule_dict: mapping option name -> current value; nested
        dicts trigger a recursive sub-form.
    :param int count: kept for backward compatibility (unused here).
    :param list browser_keywords: option names that must be rendered with a
        file-browser widget; defaults to an empty list.
    :param bool generic: if True, unknown values are rendered as text.
    :param dict specials: mapping rule name -> list of valid choices for
        options ending in ``_choice``.
    """
    super().__init__(rule_name)
    # BUGFIX(review): the previous signature used a mutable default
    # (browser_keywords=[]) shared across all calls; None is the safe
    # sentinel and keeps the interface backward compatible.
    if browser_keywords is None:
        browser_keywords = []
    self.setStyleSheet("""QGroupBox {
        font-weight:bold;
        font-size: 18px;
        border: 2px solid gray;
        border-radius: 4px;
        margin-top: 0.5em;
    }
    QGroupBox::Title {
        subcontrol-origin: margin;
        color:red;
        left: 20px;
        padding: 0 3px 0 3px
    }
    """)
    # to handle recursive case
    self.do_widget = None
    self.rule_name = rule_name
    self.rule_dict = rule_dict
    self.layout = QW.QVBoxLayout(self)
    self.layout.setSpacing(2)
    self.setAutoFillBackground(True)

    # Put the "do" option first so its widget can lock/unlock the others.
    rules = list(self.rule_dict.keys())
    rules.sort()
    if "do" in rules:
        rules.remove("do")
        rules.insert(0, "do")

    for rule in rules:
        option = rule
        value = self.rule_dict[rule]

        # The option-name suffix drives the widget type.
        if option.endswith("_directory"):
            logger.debug("adding directory widget")
            option_widget = FileBrowserOption(option, value,
                                              directory=True)
        elif option.endswith("_file"):
            option_widget = FileBrowserOption(option, value,
                                              directory=False)
        elif option.endswith("_choice"):
            try:
                values = specials[rule]
                option_widget = ComboboxOptions(option, value, values)
            except Exception as err:
                # specials may be None or lack the key; fall back to text
                print(err)
                option_widget = TextOption(option, value)
        elif option in browser_keywords:
            option_widget = FileBrowserOption(option, value,
                                              directory=False)
        elif isinstance(value, bool) or option == "do":
            # for the do option, we need to check its value
            option_widget = BooleanOption(option, value)
            if option == Ruleform.do_option:
                self.do_widget = option_widget
                option_widget.connect(self._widget_lock)
        elif generic is True:
            value = str(value)
            option_widget = TextOption(option, value)
        elif isinstance(value, (list)):
            option_widget = ListOption(option, value)
        else:
            try:
                option_widget = NumberOption(option, value)
            except TypeError:
                try:
                    option_widget = TextOption(option, value)
                except TypeError:
                    # nested dictionary: recurse into a sub-form
                    option_widget = Ruleform(option, value)
        self.layout.addWidget(option_widget)

    # Apply the initial lock state driven by the "do" widget, if any.
    try:
        self._widget_lock(self.do_widget.get_value())
    except AttributeError:
        pass
def add_section(self):
    """Build the multi-summary section: one summary table for all samples.

    Populates ``self.jinja`` (canvasjs javascript and sub-sections),
    ``self.df`` (one dict per ``populate_*`` call), ``self.intro`` and
    finally appends a single entry to ``self.sections``.
    """
    logger.info("Found %s projects/samples/ directories" % len(self.summaries))
    for filename in self.filenames:
        logger.info(filename)

    self.jinja = {}
    # canvasjs drives the interactive plots; the window.onload wrapper
    # opened here is closed further down, after the table is built.
    self.jinja['canvas'] = '<script type="text/javascript" src="js/canvasjs.min.js"></script>'
    self.jinja['canvas'] += """<script type="text/javascript">
    window.onload = function () {"""

    # Information to put on top of the page (added later in a module.intro)
    # We should get the link name from the project name contained in the json
    # NOTE(review): `links` is currently unused since the block below is
    # commented out; kept for a possible re-activation.
    links = [{'href': filename.replace(".json", ".html"),
              'caption': project}
             for filename, project in zip(self.filenames, self.projects)]
    introhtml = "<div><b>Number of samples:</b>{}</div>".format(len(self.summaries))
    #introhtml += '<div class="multicolumns"><ul>'
    #for link in links:
    #    introhtml += ' <li><a href="{}">{}</a></li> '.format(
    #        link["href"], link["caption"])
    #introhtml += '\n</ul>\n</div>'

    self.jinja['sections'] = []

    # This will used to stored all information
    self.df = {}

    # The order does not matter here, everything is done in JINJA.
    # Each populate_* call is best-effort: a missing metric in the input
    # JSON simply skips the corresponding column.
    try:
        self.populate_nreads_raw()
    except Exception as err:
        print(err)
    try:
        self.populate_phix()
    except Exception as err:
        logger.debug("multi_summary: skip phix")
    try:
        self.populate_gc_samples()
    except Exception as err:
        logger.debug("multi_summary: skip gc samples")
    try:
        self.populate_trimming()
    except Exception as err:
        logger.debug("multi_summary: skip trimming")
    try:
        self.populate_mean_quality()
    except Exception as err:
        logger.debug("multi_summary: skip mean quality")
    try:
        self.populate_adapters()
    except Exception as err:
        logger.debug("multi_summary: skip adapters")
    try:
        self.populate_output_total_reads()
    except Exception as err:
        logger.debug("multi_summary: skip total reads")

    # Now we have all data in df as dictionaries. Let us merge them together
    # (each populate_* stored a dict keyed on 'name' and 'url').
    keys = list(self.df.keys())
    if len(keys) >= 1:
        df = pd.DataFrame(self.df[keys[0]])
    if len(keys) > 1:
        # we can merge things
        for key in keys[1:]:
            df = pd.merge(df, pd.DataFrame(self.df[key]),
                          on=['name', 'url'])

    # For the quality_control pipeline: keep a fixed column order, ignoring
    # columns that were not populated.
    columns = []
    for this in ["name", "url", "N_raw", "GC_raw_(%)",
                 "Mean_quality_raw", 'Phix_content_(%)',
                 "Adapters_content_(%)", "Trimmed_reads_(%)", "N_final"]:
        if this in df.columns:
            columns.append(this)
    df = df[columns]
    df.rename(columns={"name": "Sample name"}, inplace=True)

    from sequana.utils.datatables_js import DataTable
    datatable = DataTable(df, "multi_summary")
    datatable.datatable.datatable_options = {
        'scrollX': '300px',
        'pageLength': 30,
        'scrollCollapse': 'true',
        'dom': 'rtpB',
        "paging": "false",
        'buttons': ['copy', 'csv']}
    datatable.datatable.set_links_to_column("url", "Sample name")
    js = datatable.create_javascript_function()
    html_tab = datatable.create_datatable(float_format='%.3g')
    html = "{} {}".format(html_tab, js)

    # Close the window.onload wrapper opened at the top of this method.
    self.jinja['canvas'] += """
    function onClick(e){
        window.open(e.dataPoint.url)
    }
    }</script>"""

    caption = """<p>The table below gives a brief summary of the analysis. The
first column contains clickable sample name that redirects to complete
summary page. The table contains the following columns:</p>
<b>Table caption</b>
<table>
<tr><td>N_raw</td><td>Number of reads in the data</td></tr>
<tr><td>GC_raw_(%)</td><td>GC content in percentage in the raw data across all reads</td></tr>
<tr><td>Mean_quality_raw</td><td>Mean quality across all reads all bases in the raw data</td></tr>
<tr><td>Phix_content_(%)</td><td>Percentage of reads found with Phix174</td></tr>
<tr><td>Adapters_content_(%)</td><td>Percentage of reads with adapters (after phix removal if applied) </td></tr>
<tr><td>Trimmed_reads_(%)</td><td>Percentage of reads trimmed (after phix and adapter removal)</td></tr>
<tr><td>N_final</td><td>Final number of reads (after phix and adapter removal and trimming)</td></tr>
</table>
"""
    infohtml = self.create_hide_section('information',
                                        '(Show information)', caption, True)
    infohtml = "\n".join(infohtml)

    self.intro = introhtml + """ <hr><b>Summary</b>: """ + infohtml + html

    self.sections.append({
        'name': None,
        'anchor': None,
        'content': self.jinja['canvas'] + "\n".join(self.jinja['sections'])
    })
def teardown(self, check_schema=True, check_input_files=True):
    """Save all files required to run the pipeline and perform sanity checks

    We copy the following files into the working directory:

    * the config file (config.yaml)
    * a NAME.sh that contains the snakemake command
    * the Snakefile (NAME.rules)

    For book-keeping and some parts of the pipelines, we copied the config
    file and its snakefile into the .sequana directory. We also copy the
    logo.png file if present into this .sequana directory

    and if present:

    * the cluster_config configuration files for snakemake
    * multiqc_config file for mutliqc reports
    * the schema.yaml file used to check the content of the
      config.yaml file

    if the config.yaml contains a requirements section, the files requested
    are copied in the working directory

    :param bool check_schema: validate config.yaml against schema.yaml.
    :param bool check_input_files: verify input files before anything else.
    """
    if check_input_files:
        self.check_input_files()

    # the config file: master copy in <workdir>/.sequana plus a symlink in
    # the working directory itself.
    self.config._update_yaml()
    self.config.save("{}/{}/config.yaml".format(self.workdir, ".sequana"))
    try:
        os.symlink("{}/config.yaml".format(".sequana"),
                   "{}/config.yaml".format(self.workdir))
    except:
        # best effort: the link may already exist from a previous run.
        # NOTE(review): a bare except also hides real OS errors; consider
        # narrowing to `except OSError`.
        pass

    # the command
    with open("{}/{}.sh".format(self.workdir, self.name), "w") as fout:
        fout.write(self.command)

    # the snakefile (same copy + symlink scheme as the config file)
    shutil.copy(self.module.snakefile,
                "{}/{}".format(self.workdir, ".sequana"))
    try:
        os.symlink("{}/{}.rules".format(".sequana", self.name),
                   "{}/{}.rules".format(self.workdir, self.name))
    except:
        pass

    # the logo if any
    if self.module.logo:
        shutil.copy(self.module.logo,
                    "{}/{}".format(self.workdir, ".sequana"))

    # the cluster config if any
    if self.module.cluster_config:
        shutil.copy(self.module.cluster_config, "{}".format(self.workdir))

    # the multiqc if any
    if self.module.multiqc_config:
        shutil.copy(self.module.multiqc_config, "{}".format(self.workdir))

    # the schema if any
    if self.module.schema_config:
        shutil.copy(self.module.schema_config, "{}".format(self.workdir))

    # This is the place where we can check the entire validity of the
    # inputs based on the schema
    if check_schema:
        #logger.info("Checking config file with schema")
        from sequana import SequanaConfig
        cfg = SequanaConfig("{}/config.yaml".format(self.workdir))
        cfg.check_config_with_schema("{}/schema.yaml".format(
            self.workdir))

    # finally, we copy the files be found in the requirements section of the
    # config file.
    self.copy_requirements()

    # some information printed for the user (next steps to run the pipeline)
    msg = "Check the script in {}/{}.sh as well as "
    msg += "the configuration file in {}/config.yaml.\n"
    print(
        self.colors.purple(
            msg.format(self.workdir, self.name, self.workdir)))

    msg = "Once ready, execute the script {}.sh using \n\n\t".format(
        self.name)
    if self.options.run_mode == "slurm":
        msg += "cd {}; sbatch {}.sh\n\n".format(self.workdir, self.name)
    else:
        msg += "cd {}; sh {}.sh\n\n".format(self.workdir, self.name)
    print(self.colors.purple(msg))

    # Save an info.txt with the command used
    with open(self.workdir + "/.sequana/info.txt", "w") as fout:
        # from sequana import version
        # `version` is expected to be available at module level
        fout.write("# sequana version: {}\n".format(version))
        fout.write("# sequana_{} version: {}\n".format(
            self.name, self._get_package_version()))
        cmd1 = os.path.basename(sys.argv[0])
        fout.write(" ".join([cmd1] + sys.argv[1:]))

    # save environement: conda first, pip as fallback
    try:
        cmd = "conda list"
        with open("{}/.sequana/env.yml".format(self.workdir), "w") as fout:
            subprocess.call(cmd.split(), stdout=fout)
        logger.debug("Saved your conda environment into env.yml")
    except:
        cmd = "pip freeze"
        with open("{}/.sequana/pip.yml".format(self.workdir), "w") as fout:
            subprocess.call(cmd.split(), stdout=fout)
        logger.debug(
            "Saved your pip environement into pip.txt (conda not found)")

    # General information: warn if the installed shell-completion file is
    # older than the installed pipeline.
    from easydev import CustomConfig
    configuration = CustomConfig("sequana", verbose=False)
    sequana_config_path = configuration.user_config_dir
    completion = sequana_config_path + "/pipelines/{}.sh".format(self.name)
    if os.path.exists(completion):
        with open(completion, "r") as fin:
            line = fin.readline()
            if line.startswith("#version:"):
                # NOTE(review): this rebinds the local name `version`,
                # shadowing the module-level sequana version used above.
                version = line.split("#version:")[1].strip()
                from distutils.version import StrictVersion
                if StrictVersion(version) < StrictVersion(
                        self._get_package_version()):
                    msg = (
                        "The version {} of your completion file for the"
                        " {} pipeline seems older than the installed"
                        " pipeline itself ({}). "
                        "Please, consider updating the completion file {}"
                        " using the following command: \n\t sequana_completion --name {}\n"
                        "available in the sequana_pipetools package (pip "
                        "install sequana_completion)")
                    msg = msg.format(version, self.name,
                                     self._get_package_version(),
                                     completion, self.name)
                    logger.info(msg)
    else:
        # we could print a message to use the sequana_completion tools
        # be maybe boring on the long term
        pass
def _filter_line(self, vcf_line, filter_dict=None, iline=None):
    """Return False if the variant should be filtered out, True otherwise.

    :param vcf_line: a VCF record exposing ``QUAL`` and ``INFO``.
    :param dict filter_dict: ``{"QUAL": threshold, "INFO": {expr: cond}}``;
        defaults to a copy of ``self.filter_dict``. A QUAL of -1 disables
        the quality filter.
    :param int iline: 0-based record index, used in log messages only.
    """
    VT = self._get_variant_tag(vcf_line)

    if filter_dict is None:
        # a copy to avoid side effects
        filter_dict = self.filter_dict.copy()

    # quality threshold (-1 means disabled)
    if filter_dict["QUAL"] != -1 and vcf_line.QUAL < filter_dict["QUAL"]:
        logger.debug("{} filtered variant with QUAL below {}".format(
            VT, filter_dict["QUAL"]))
        return False

    if self.apply_indel_filter:
        if self.is_indel(vcf_line) is True:
            logger.debug("{}: filter out line {} (INDEL)".format(
                VT, iline))
            return False

    # DP4
    if self.apply_dp4_filter and "DP4" in vcf_line.INFO:
        status = self.is_valid_dp4(vcf_line, self.dp4_minimum_depth,
                                   self.dp4_minimum_depth_strand,
                                   self.dp4_minimum_ratio)
        if status is False:
            logger.debug("{}: filter out DP4 line {} {}".format(
                VT, iline, vcf_line.INFO['DP4']))
            return False

    # AF1
    if self.apply_af1_filter and "AF1" in vcf_line.INFO:
        status = self.is_valid_af1(vcf_line, self.minimum_af1)
        if status is False:
            logger.debug("{}: filter out AF1 {} on line {}".format(
                VT, vcf_line.INFO['AF1'], iline))
            return False

    for key, value in filter_dict["INFO"].items():
        # valid expr is e.g. sum(DP4[2],DP4[0])
        # here, we first extract the variable, then add missing [ ]
        # brackets to make a list and use eval function after setting
        # the local variable DP4 in the locals namespace

        # PV4 skip non morphic cases (no need to filter)
        if key == "PV4" and self.is_polymorphic(vcf_line) is False:
            return True

        # Filter such as " sum(DP[0], DP4[2])<60 "
        if key.startswith("sum("):
            # add the missing [] to create an array
            expr = key.replace("sum(", "sum([")[0:-1] + "])"
            # identify the key
            mykey = expr[5:].split("[")[0]
            lcl = locals()
            lcl[mykey] = vcf_line.INFO[mykey]
            # SECURITY NOTE(review): eval() executes the user-supplied
            # filter expression; never feed untrusted filter strings here.
            result = eval(expr)
            if self._filter_info_field(result, value):
                logger.debug(
                    "{} filtered variant {},{} with value {}".format(
                        VT, result, expr, value))
                return False
            else:
                # NOTE(review): returns True after the first passing sum()
                # filter, skipping any remaining filters -- confirm intended.
                return True

        # key could be with an index e.g. "DP4[0]<4"
        if "[" in key:
            if "]" not in key:
                raise ValueError("Found invalid filter %s" % key)
            else:
                key, index = key.split("[", 1)
                key = key.strip()
                index = int(index.replace("]", "").strip())
        else:
            index = 0

        # otherwise, this is probably a valid simple filter such as "DP<4"
        try:
            if (type(vcf_line.INFO[key]) != list):
                if (self._filter_info_field(vcf_line.INFO[key], value)):
                    val = vcf_line.INFO[key]
                    logger.debug(
                        "{}: filtered variant {},{} with value {}".format(
                            VT, key, value, val))
                    return False
            else:
                Nlist = len(vcf_line.INFO[key])
                if index > Nlist - 1:
                    raise ValueError(
                        "Index must be less than %s (starts at zero)"
                        % Nlist)
                if (self._filter_info_field(vcf_line.INFO[key][index],
                                            value)):
                    return False
        except KeyError:
            # missing INFO key: keep the variant, just log it
            logger.debug(
                "The information key {} doesn't exist in VCF file (line {})."
                .format(key, iline + 1))
    return True
def _filter_line(self, vcf_line, filter_dict=None):
    """Return False if the variant (VCF record) should be filtered out.

    :param vcf_line: record exposing ``QUAL`` and ``INFO`` attributes.
    :param dict filter_dict: ``{"QUAL": threshold, "INFO": {key: cond}}``;
        defaults to a copy of ``self.filter_dict``.
    :return: True to keep the variant, False to drop it.
    """
    if filter_dict is None:
        # a copy to avoid side effects
        filter_dict = self.filter_dict.copy()

    if (vcf_line.QUAL < filter_dict["QUAL"]):
        logger.debug("filtered variant with QUAL below {}".format(
            filter_dict["QUAL"]))
        return False

    for key, value in filter_dict["INFO"].items():
        # valid expr is e.g. sum(DP4[2],DP4[0])
        # here, we first extract the variable, then add missing [ ]
        # brackets to make a list and use eval function after setting
        # the local variable DP4 in the locals namespace

        # Filter such as " sum(DP[0], DP4[2])<60 "
        if key.startswith("sum("):
            # add the missing [] to create an array
            expr = key.replace("sum(", "sum([")[0:-1] + "])"
            # identify the key
            mykey = expr[5:].split("[")[0]
            lcl = locals()
            lcl[mykey] = vcf_line.INFO[mykey]
            # SECURITY NOTE(review): eval() executes the user-supplied
            # filter expression; never feed untrusted filter strings here.
            result = eval(expr)
            if self._filter_info_field(result, value):
                logger.debug("filtered variant {},{} with value {}".format(
                    result, expr, value))
                return False
            else:
                # NOTE(review): stops after the first passing sum() filter,
                # skipping remaining keys -- mirrors the sibling
                # implementation; confirm this is intended.
                return True

        # key could be with an index e.g. "DP4[0]<4"
        if "[" in key:
            if "]" not in key:
                # BUGFIX(review): error message typo corrected
                # ("innvalid" -> "invalid"), now consistent with the
                # sibling _filter_line implementation.
                raise ValueError("Found invalid filter %s" % key)
            else:
                key, index = key.split("[", 1)
                key = key.strip()
                index = int(index.replace("]", "").strip())
        else:
            index = 0

        # otherwise, this is probably a valid simple filter such as "DP<4"
        try:
            if (type(vcf_line.INFO[key]) != list):
                if (self._filter_info_field(vcf_line.INFO[key], value)):
                    val = vcf_line.INFO[key]
                    logger.debug(
                        "filtered variant {},{} with value {}".format(
                            key, value, val))
                    return False
            else:
                Nlist = len(vcf_line.INFO[key])
                if index > Nlist - 1:
                    raise ValueError(
                        "Index must be less than %s (starts at zero)"
                        % Nlist)
                if (self._filter_info_field(vcf_line.INFO[key][index],
                                            value)):
                    return False
        except KeyError:
            # missing INFO key: keep the variant, just warn
            logger.warning(
                "The information key doesn't exist in VCF file.")
    return True
def get_df_concordance(self, max_align=-1):
    """This methods returns a dataframe with Insert, Deletion, Match,
    Substitution, read length, concordance (see below for a definition)

    :param int max_align: stop after this many alignments have been
        processed (-1, the default, means no limit).
    :return: a pandas DataFrame with columns concordance, length, I, D,
        M, mapq, flags, NM, mismatch (one row per kept alignment).

    Be aware that the SAM or BAM file must be created using minimap2 and the
    --cs option to store the CIGAR in a new CS format, which also contains
    the information about substitution. Other mapper are also handled (e.g.
    bwa) but the substitution are solely based on the NM tag if it exists.

    alignment that have no CS tag or CIGAR are ignored.
    """
    from sequana import Cigar
    count = 0
    I, D, M, L, mapq, flags, NM = [], [], [], [], [], [], []
    S = []
    for a in self._data:
        # tags and cigar populated if there is a match
        # if we use --cs cigar is not populated so we can only look at tags
        # tags can be an empty list
        if a.tags is None or len(a.tags) == 0:
            continue
        count += 1
        mapq.append(a.mapq)
        L.append(a.qlen)
        try:
            NM.append([x[1] for x in a.tags if x[0] == "NM"][0])
        except IndexError:
            # no NM tag on this alignment; -1 marks "unknown"
            # (was a bare except, which could mask unrelated errors)
            NM.append(-1)
        flags.append(a.flag)
        if 'cs' in dict(a.tags):
            # minimap2 --cs output: substitutions are available directly
            cs = CS(dict(a.tags)['cs'])
            S.append(cs['S'])
            I.append(cs['I'])
            D.append(cs['D'])
            M.append(cs['M'])
        elif a.cigarstring:
            cigar = Cigar(a.cigarstring).as_dict()
            I.append(cigar["I"])
            D.append(cigar['D'])
            M.append(cigar['M'])
            S.append(None)  # no info about substitutions in the cigar
        else:
            I.append(0)
            D.append(0)
            M.append(0)
            S.append(0)
        if max_align > 0 and count == max_align:
            break
        if count % 10000 == 0:
            logger.debug("Read {} alignments".format(count))

    I = np.array(I)
    D = np.array(D)
    M = np.array(M)
    NM = np.array(NM)
    try:
        # fails (TypeError on the arithmetic) when S contains None entries,
        # i.e. when some alignments had no CS tag
        S = np.array(S)
        C = 1 - (I + D + S) / (S + I + D + M)
        logger.info("computed Concordance based on minimap2 --cs option")
    except Exception:
        logger.info("computed Concordance based on standard CIGAR information using INDEL and NM tag")
        computed_S = NM - D - I
        C = 1 - (I + D + computed_S) / (computed_S + I + D + M)

    df = pd.DataFrame([C, L, I, D, M, mapq, flags, NM, S])
    df = df.T
    df.columns = ["concordance", 'length', "I", "D", "M", "mapq",
                  "flags", "NM", "mismatch"]
    return df
def teardown(self, check_schema=True, check_input_files=True):
    """Save all files required to run the pipeline and perform sanity checks

    We copy the following files into the working directory:

    * the config file (config.yaml)
    * a NAME.sh that contains the snakemake command
    * the Snakefile (NAME.rules)

    For book-keeping and some parts of the pipelines, we copied the config
    file and its snakefile into the .sequana directory. We also copy the
    logo.png file if present into this .sequana directory

    and if present:

    * the cluster_config configuration files for snakemake
    * multiqc_config file for mutliqc reports
    * the schema.yaml file used to check the content of the
      config.yaml file

    if the config.yaml contains a requirements section, the files requested
    are copied in the working directory

    NOTE(review): this is a near-duplicate of another teardown method in
    this file (which additionally checks the completion-file version);
    consider factoring the shared logic.

    :param bool check_schema: validate config.yaml against schema.yaml.
    :param bool check_input_files: verify input files before anything else.
    """
    if check_input_files:
        self.check_input_files()

    # the config file: master copy in <workdir>/.sequana plus a symlink in
    # the working directory itself.
    self.config._update_yaml()
    self.config.save("{}/{}/config.yaml".format(self.workdir, ".sequana"))
    try:
        os.symlink("{}/config.yaml".format(".sequana"),
                   "{}/config.yaml".format(self.workdir))
    except:
        # best effort: the link may already exist from a previous run.
        # NOTE(review): consider narrowing to `except OSError`.
        pass

    # the command
    with open("{}/{}.sh".format(self.workdir, self.name), "w") as fout:
        fout.write(self.command)

    # the snakefile (same copy + symlink scheme as the config file)
    shutil.copy(self.module.snakefile,
                "{}/{}".format(self.workdir, ".sequana"))
    try:
        os.symlink("{}/{}.rules".format(".sequana", self.name),
                   "{}/{}.rules".format(self.workdir, self.name))
    except:
        pass

    # the logo if any
    if self.module.logo:
        shutil.copy(self.module.logo,
                    "{}/{}".format(self.workdir, ".sequana"))

    # the cluster config if any
    if self.module.cluster_config:
        shutil.copy(self.module.cluster_config, "{}".format(self.workdir))

    # the multiqc if any
    if self.module.multiqc_config:
        shutil.copy(self.module.multiqc_config, "{}".format(self.workdir))

    # the schema if any
    if self.module.schema_config:
        shutil.copy(self.module.schema_config, "{}".format(self.workdir))

    # This is the place where we can check the entire validity of the
    # inputs based on the schema
    if check_schema:
        #logger.info("Checking config file with schema")
        from sequana import SequanaConfig
        cfg = SequanaConfig("{}/config.yaml".format(self.workdir))
        cfg.check_config_with_schema("{}/schema.yaml".format(
            self.workdir))

    # finally, we copy the files be found in the requirements section of the
    # config file.
    self.copy_requirements()

    # some information printed for the user (next steps to run the pipeline)
    msg = "Check the script in {}/{}.sh as well as "
    msg += "the configuration file in {}/config.yaml.\n"
    print(
        self.colors.purple(
            msg.format(self.workdir, self.name, self.workdir)))

    msg = "Once ready, execute the script {}.sh using \n\n\t".format(
        self.name)
    if self.options.run_mode == "slurm":
        msg += "cd {}; sbatch {}.sh\n\n".format(self.workdir, self.name)
    else:
        msg += "cd {}; sh {}.sh\n\n".format(self.workdir, self.name)
    print(self.colors.purple(msg))

    # Save an info.txt with the command used
    with open(self.workdir + "/.sequana/info.txt", "w") as fout:
        # from sequana import version
        # `version` is expected to be available at module level
        fout.write("# sequana version: {}\n".format(version))
        fout.write("# sequana_{} version: {}\n".format(
            self.name, self._get_package_version()))
        cmd1 = os.path.basename(sys.argv[0])
        fout.write(" ".join([cmd1] + sys.argv[1:]))

    # save environement: conda first, pip as fallback
    try:
        cmd = "conda list"
        with open("{}/.sequana/env.yml".format(self.workdir), "w") as fout:
            subprocess.call(cmd.split(), stdout=fout)
        logger.debug("Saved your conda environment into env.yml")
    except:
        cmd = "pip freeze"
        with open("{}/.sequana/pip.yml".format(self.workdir), "w") as fout:
            subprocess.call(cmd.split(), stdout=fout)
        logger.debug(
            "Saved your pip environement into pip.txt (conda not found)")