def make_config(database_dir, threads, assembler, data_type='metagenome',
                interleaved_fastq=False, config='config.yaml'):
    """
    Reads the template config file (with comments) from ./template_config.yaml
    and updates it with the parameters provided.

    Args:
        config (str): output file path for yaml
        database_dir (str): location of downloaded databases
        threads (int): number of threads per node to utilize
        assembler (str): either spades or megahit
        data_type (str): either metagenome or metatranscriptome
    """
    from ruamel.yaml import YAML  # used for yaml reading with comments

    yaml = YAML()
    yaml.version = (1, 1)
    yaml.default_flow_style = False

    template_conf_file = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "template_config.yaml")

    with open(template_conf_file) as template_config:
        conf = yaml.load(template_config)

    conf["tmpdir"] = tempfile.gettempdir()
    conf["threads"] = multiprocessing.cpu_count() if not threads else threads
    conf["preprocess_adapters"] = os.path.join(database_dir, "adapters.fa")

    conf["contaminant_references"] = {
        "PhiX": os.path.join(database_dir, "phiX174_virus.fa")
    }

    if data_type == 'metatranscriptome':
        conf["contaminant_references"]["rRNA"] = os.path.join(
            database_dir, "silva_rfam_all_rRNAs.fa")

    conf["data_type"] = data_type
    conf["interleaved_fastqs"] = interleaved_fastq

    conf["assembler"] = assembler
    conf["database_dir"] = database_dir
    # conf["refseq_namemap"] = os.path.join(database_dir, "refseq.db")
    # conf["refseq_tree"] = os.path.join(database_dir, "refseq.tree")
    # conf["diamond_db"] = os.path.join(database_dir, "refseq.dmnd")

    if os.path.exists(config):
        logging.warning(
            f"Config file {config} already exists; not overwriting it. Continuing..."
        )
    else:
        with open(config, "w") as f:
            yaml.dump(conf, f)

        logging.info("Configuration file written to %s\n"
                     "You may want to edit it using any text editor." % config)
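# Hedged usage sketch (not part of the snippet above): how make_config might be
# called from a CLI wrapper. The database path and argument values are hypothetical.
if __name__ == "__main__":
    make_config(
        database_dir="/path/to/databases",  # hypothetical download location
        threads=8,
        assembler="megahit",
        data_type="metagenome",
        config="config.yaml",  # only written if the file does not already exist
    )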
def yaml(self, yaml_version=None):
    from ruamel.yaml import YAML

    y = YAML()
    y.preserve_quotes = True
    if yaml_version:
        y.version = yaml_version
    return y
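# Minimal illustration (an assumption-labelled sketch, not taken from the snippets
# above) of what pinning the version on a ruamel.yaml round-trip instance does:
# the dumper emits a leading "%YAML 1.1" directive, and round-trip mode together
# with preserve_quotes keeps comments and quoting intact across load/dump.
import sys
from ruamel.yaml import YAML

rt = YAML()                  # default typ="rt": round-trip loader/dumper
rt.preserve_quotes = True
rt.version = (1, 1)          # dumped documents start with a %YAML 1.1 directive
rt.default_flow_style = False

doc = rt.load("threads: 8  # per-node threads\n")
rt.dump(doc, sys.stdout)     # the comment survives the round trip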
def convert_yaml(
    yaml_data: str, output: Any, array=True, inject_comments=False
) -> None:
    yaml = YAML(typ="rt")
    yaml.version = "1.1"  # type: ignore  # yaml.version is mis-typed as None
    events = yaml.parse(yaml_data)
    output = JsonnetRenderer(events, output, array, inject_comments).render()
    return output
def get_loader(*args, **kwargs) -> YAML:
    yaml = YAML()
    yaml.version = (1, 2)  # type: ignore
    yaml.Parser = Parser
    yaml._constructor = CustomConstructor(yaml, *args, **kwargs)  # type: ignore
    return yaml
def make_config(config, path, data_type, database_dir, threads, assembler):
    """
    Reads the template config file (with comments) from ./template_config.yaml
    and updates it with the parameters provided. Writes the file `config` and
    completes the sample names and paths for all files in `path`.

    Args:
        config (str): output file path for yaml
        path (str): fastq/fasta data directory
        data_type (str): either metagenome or metatranscriptome
        database_dir (str): location of downloaded databases
        threads (int): number of threads per node to utilize
        assembler (str): either spades or megahit
    """
    config = os.path.realpath(os.path.expanduser(config))
    os.makedirs(os.path.dirname(config), exist_ok=True)

    path = os.path.realpath(os.path.expanduser(path))
    database_dir = os.path.realpath(os.path.expanduser(database_dir))

    yaml = YAML()
    yaml.version = (1, 1)
    yaml.default_flow_style = False

    template_conf_file = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "template_config.yaml")

    with open(template_conf_file) as template_config:
        conf = yaml.load(template_config)

    samples = get_sample_files(path, data_type)
    logging.info("Found %d samples under %s" % (len(samples), path))

    conf["samples"] = samples
    conf["tmpdir"] = tempfile.gettempdir()
    conf["threads"] = multiprocessing.cpu_count() if not threads else threads
    conf["preprocess_adapters"] = os.path.join(database_dir, "adapters.fa")

    conf["contaminant_references"] = {
        "rRNA": os.path.join(database_dir, "silva_rfam_all_rRNAs.fa"),
        "PhiX": os.path.join(database_dir, "phiX174_virus.fa")
    }

    conf["assembler"] = assembler

    conf["refseq_namemap"] = os.path.join(database_dir, "refseq.db")
    conf["refseq_tree"] = os.path.join(database_dir, "refseq.tree")
    conf["diamond_db"] = os.path.join(database_dir, "refseq.dmnd")

    with open(config, "w") as f:
        yaml.dump(conf, f)
    logging.info("Configuration file written to %s" % config)
def make_config(self):
    """
    Reads the template config file (with comments) from ./template_config.yaml
    and updates it with the parameters provided.

    Args:
        config (str): output file path for yaml
        database_dir (str): location of downloaded databases
        threads (int): number of threads per node to utilize
        assembler (str): either spades or megahit
        data_type (str): either metagenome or metatranscriptome
    """
    self.config = os.path.join(self.output, 'template_config.yaml')

    yaml = YAML()
    yaml.version = (1, 1)
    yaml.default_flow_style = False

    template_conf_file = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "template_config.yaml")

    with open(template_conf_file) as template_config:
        conf = yaml.load(template_config)

    if self.assembly != "none":
        self.assembly = [os.path.abspath(p) for p in self.assembly]
    if self.pe1 != "none":
        self.pe1 = [os.path.abspath(p) for p in self.pe1]
    if self.pe2 != "none":
        self.pe2 = [os.path.abspath(p) for p in self.pe2]
    if self.longreads != "none":
        self.longreads = [os.path.abspath(p) for p in self.longreads]

    conf["fasta"] = self.assembly
    conf["max_threads"] = self.threads
    conf["pplacer_threads"] = self.pplacer_threads
    conf["max_memory"] = self.max_memory
    conf["short_reads_1"] = self.pe1
    conf["short_reads_2"] = self.pe2
    conf["long_reads"] = self.longreads
    conf["long_read_type"] = self.longread_type
    conf["min_contig_size"] = self.min_contig_size
    conf["min_bin_size"] = self.min_bin_size
    conf["gtdbtk_folder"] = os.path.abspath(self.gtdbtk)

    with open(self.config, "w") as f:
        yaml.dump(conf, f)

    logging.info("Configuration file written to %s\n"
                 "You may want to edit it using any text editor." % self.config)
def get_loader(macros_root: str = None, context: ContextType = {}) -> YAML:
    yaml = YAML()
    yaml.version = (1, 2)  # type: ignore
    yaml.Parser = Parser
    yaml._constructor = CustomConstructor(  # type: ignore
        yaml,
        # macros_root=macros_root,
        macro_provider=MacroProvider(macros_root).get_macro,
        context=context
    )
    return yaml
def get_yaml_instance(
    version: VersionType = (1, 2),
    indent: Any = {'mapping': 2, 'sequence': 4, 'offset': 2},
    **kwargs: Any
) -> YAML:
    yaml = YAML(**kwargs)
    yaml.version = version  # type: ignore
    yaml.Representer = CustomRepresenter
    yaml.indent(**indent)
    return yaml
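# Hedged illustration (not from the snippets above) of the indent settings that
# get_yaml_instance applies: mapping=2, sequence=4, offset=2 indents block
# sequence items four spaces under their key, with the dash offset by two.
import sys
from ruamel.yaml import YAML

y = YAML()
y.indent(mapping=2, sequence=4, offset=2)
y.dump({"steps": ["build", "test"]}, sys.stdout)
# expected shape:
# steps:
#   - build
#   - test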
def get_yaml_instance(version=(1, 2),
                      indent={
                          'mapping': 2,
                          'sequence': 4,
                          'offset': 2
                      },
                      **kwargs):
    yaml = YAML(**kwargs)
    yaml.Representer = CustomRepresenter
    yaml.version = version
    yaml.indent(**indent)
    return yaml
def write_yaml(self, p_data, p_filename, addnew=False):
    """
    @param p_data: is the yaml data to be written.
    @param p_filename: is the name of the read in yaml file 'rooms.yaml'
    @param addnew: defaults to false, will add '-new' to the saved filename.
    """
    l_now = datetime.datetime.now()
    l_node = self.m_pyhouse_obj._Config.YamlTree[p_filename]
    l_filename = l_node.YamlPath
    l_node.Yaml.insert(0, 'Skip', 'x', comment="Updated: " + str(l_now))
    if addnew:
        l_filename += '-new'
    l_yaml = YAML(typ='rt')
    l_yaml.indent(mapping=2, sequence=4, offset=2)
    l_yaml.version = (1, 2)
    with open(l_filename, 'w+') as l_file:
        l_yaml.dump(p_data, l_file)
    LOG.debug('Saved Yaml file "{}"'.format(p_filename))
def get_constructor():
    yaml = YAML()
    yaml._constructor = CustomConstructor(loader=yaml)
    yaml.version = (1, 2)
    return yaml.constructor
def convert_yaml_to_jsonnet(file, initialize=settings["initialize"]):
    # Initialize variables
    all_key_df = []
    all_alias_df = []
    f = StringIO()

    # Set file name
    file_name = Path(file).stem
    with open(file, "r") as source:
        read_file = source.read().replace("infinity", repr("infinity"))
    read_file = clean_comments(read_file)

    yaml = YAML(typ="rt")
    yaml.version = "1.1"  # type: ignore  # yaml.version is mis-typed as None

    # Parse and get events from the yml file.
    # These events are Scalar event, Sequence Start/End event, Mapping event etc.
    events = yaml.parse(read_file)
    events_df = (
        pd.DataFrame(events, columns=["event"])
        .reset_index()
        .rename({"index": "event_id"}, axis=1)
    )

    post_processed_read_file = copy.deepcopy(read_file).splitlines()
    output, keys, alias = y2j.convert_yaml(
        yaml_data=read_file, output=f, array=False, inject_comments=False
    )
    output = output[: int(len(output) / 2)]
    output[-1] = output[-1].replace(",\n", "")
    output = [item for item in output if item != "#insert_comment"]
    copy_output = copy.deepcopy(output)
    output = "".join(output).splitlines()

    # Get all keys from the YML file
    # eg: { geo_K10_: "geo"} the key would be geo and is the 10th in position
    all_key_df = pd.DataFrame(keys, columns=["raw_key_id", "key_event"])
    all_key_df["key_id"] = all_key_df.raw_key_id.apply(
        lambda x: x.replace("[", "").replace("]", "").replace("'", "")
    )
    all_key_df[
        ["key_value", "key_line_start", "key_col_start", "key_line_end", "key_col_end"]
    ] = pd.DataFrame(
        all_key_df.key_event.apply(
            lambda c: [
                c.value,
                c.start_mark.line,
                c.start_mark.column,
                c.end_mark.line,
                c.end_mark.column,
            ]
        ).to_list()
    )
    all_key_df["key_file_id"] = Path(file).stem
    all_key_df["line"] = all_key_df.key_line_start.apply(
        lambda x: post_processed_read_file[x]
    )

    # Get all aliases from the YML file
    # eg: default: { <<: *geo} the alias would be *geo
    all_alias_df = pd.DataFrame(alias, columns=["event"])
    all_alias_df[
        ["value", "line_start", "col_start", "line_end", "col_end"]
    ] = pd.DataFrame(
        all_alias_df.event.apply(
            lambda c: [
                c.anchor,
                c.start_mark.line,
                c.start_mark.column,
                c.end_mark.line,
                c.end_mark.column,
            ]
        ).to_list()
    )
    all_alias_df["file_id"] = Path(file).stem
    all_alias_df["line"] = all_alias_df.line_start.apply(
        lambda x: post_processed_read_file[x]
    )
    all_alias_df["key"] = all_alias_df.line.apply(lambda x: x.lstrip().split()[0])

    # Convert jsonnet to json and get the path for each key
    _jsonnet = importlib.import_module("_jsonnet")
    jsonnet_str = _jsonnet.evaluate_snippet("default", "\n".join(output))
    obj = json.loads(jsonnet_str)
    d = benedict(obj, keypath_separator="|")
    k = d.keypaths(indexes=True)

    # Associate path to key
    # eg: default: { geo: &geo} (where the path to geo is default.geo)
    path_df = pd.DataFrame(k, columns=["path"])
    path_df["key_id"] = path_df.path.apply(lambda x: x.split("|")[-1])

    # Create a dataframe that links the reference tag to the actual key
    # eg: geo: &geo (where geo is the reference variable)
    refer_by = []
    for line_id, line in enumerate(post_processed_read_file):
        if len(re.findall(r": &\S*|- &\S*", line)) > 0:
            pad = len(line) - len(line.lstrip())
            refer_by.append([line, line_id, pad, re.findall(r": &\S*|- &\S*", line)])
    refer_by_df = pd.DataFrame(
        refer_by, columns=["line", "ref_line_start", "ref_col_start", "reference"]
    ).explode("reference")
    refer_by_df["reference"] = refer_by_df.reference.apply(
        lambda x: re.sub("[^._|A-Za-z0-9/-]+", "", x)
    )
    refer_by_df["ref_file_id"] = Path(file).stem
    refer_by_df["list_flag"] = refer_by_df.reference.apply(
        lambda x: True if x.lstrip().startswith("-") else False
    )
    refer_by_df["reference"] = refer_by_df.reference.apply(
        lambda x: x[1:] if x.lstrip().startswith("-") else x
    )

    original_read_file = read_file.splitlines()

    # Get all aliases which have to be merged into the associated key
    # eg: default: { <<: *geo} -> geo needs to be merged to default
    indirect_ref = all_alias_df[all_alias_df.key.isin(["<<:"])]
    indirect_ref_associated_key_df = indirect_ref.apply(
        lambda x: get_associated_key(
            all_key_df,
            original_read_file[x.line_start],
            x.file_id,
            x.line_start,
            x.col_start,
        ),
        axis=1,
    )
    indirect_ref_key_reference_df = pd.concat(
        [indirect_ref, indirect_ref_associated_key_df], axis=1
    )
    for line_start in indirect_ref_key_reference_df.line_start.unique():
        post_processed_read_file[line_start] = ""

    f = StringIO()
    read_file = "\n".join(post_processed_read_file)
    processed_output, processed_keys, processed_alias = y2j.convert_yaml(
        yaml_data=read_file, output=f, array=False, inject_comments=False
    )
    processed_output = processed_output[: int(len(processed_output) / 2)]
    processed_output[-1] = processed_output[-1].replace(",\n", "")
    processed_output = [item for item in processed_output if item != "#insert_comment"]
    processed_output = "".join(processed_output)

    # Initialize the reference paths for each key
    if file_name == "default":
        if initialize:
            default_reference_key = pd.DataFrame()
        else:
            default_reference_key = pd.read_csv(settings["reference_file"])
    else:
        default_reference_key = pd.read_csv(settings["reference_file"])

    if len(refer_by_df) > 0:
        if refer_by_df.list_flag.any():
            reference_key_list = refer_by_df[refer_by_df.list_flag].apply(
                lambda x: get_associated_key(
                    all_key_df,
                    original_read_file[x.ref_line_start],
                    x.ref_file_id,
                    x.ref_line_start,
                    x.ref_col_start,
                ),
                axis=1,
            )
            reference_key_list = pd.concat(
                [
                    refer_by_df[refer_by_df.list_flag],
                    reference_key_list[
                        [
                            "key_id",
                            "key_line_start",
                            "key_col_start",
                            "key_col_end",
                            "line",
                        ]
                    ],
                ],
                axis=1,
            ).reset_index(drop=True)
            reference_key_list["key_id"] = reference_key_list.apply(
                lambda x: x.key_id + "[" + str(x.name) + "]", axis=1
            )
        reference_key_no_list = pd.merge(
            refer_by_df[~refer_by_df.list_flag],
            all_key_df[
                ["key_id", "key_line_start", "key_col_start", "key_col_end", "line"]
            ],
            on="line",
        )
        if refer_by_df.list_flag.any():
            reference_key = pd.concat(
                [
                    reference_key_no_list[
                        ["reference", "ref_file_id", "list_flag", "key_id"]
                    ],
                    reference_key_list[
                        ["reference", "ref_file_id", "list_flag", "key_id"]
                    ],
                ]
            ).reset_index(drop=True)
        else:
            reference_key = reference_key_no_list[
                ["reference", "ref_file_id", "list_flag", "key_id"]
            ]
        reference_key = pd.merge(reference_key, path_df, on="key_id")
        reference_key["reference_path"] = reference_key.apply(
            lambda x: (
                x.ref_file_id
                + "."
                + ".".join(
                    [
                        "[" + repr(p.replace("_h_", "-")) + "]" if "_h_" in p else p
                        for p in x.path.split("|")
                    ]
                )
            ).replace(".[", "["),
            axis=1,
        )
        default_reference_key = pd.concat(
            [reference_key[["reference", "reference_path"]], default_reference_key]
        )

    if file_name == "default":
        if initialize:
            default_reference_key.to_csv(settings["reference_file"])

    direct_df = all_alias_df[all_alias_df.key != "<<:"]
    indirect_df = all_alias_df[all_alias_df.key == "<<:"]
    indirect_df["key_id"] = indirect_df.apply(
        lambda x: get_associated_key(
            all_key_df,
            original_read_file[x.line_start],
            x.file_id,
            x.line_start,
            x.col_start,
        ),
        axis=1,
    )["key_id"]

    processed_output_list = processed_output.splitlines()
    for idx, val in indirect_df[["key_id", "value"]].iterrows():
        l_id = [
            line_id
            for line_id, e in enumerate(processed_output_list)
            if val.key_id in e
        ][0]
        ob = processed_output_list[l_id].split(":")
        if "*" in processed_output_list[l_id]:
            processed_output_list[l_id] = (
                (ob[0] + ": " + repr("*" + val.value) + " + " + "".join(ob[1:]))
                .replace("null", "")
                .replace(".[", "[")
            )
        else:
            processed_output_list[l_id] = (
                (ob[0] + ": " + repr("*" + val.value) + " " + "".join(ob[1:]))
                .replace("null", "")
                .replace(".[", "[")
            )

    # Associate each comment with its associated key
    comments_df = (
        pd.DataFrame(
            [e for e in flatten(events_df.event.apply(lambda x: x.comment)) if e],
            columns=["event"],
        )
        .reset_index()
        .rename({"index": "event_id"}, axis=1)
    )
    comments_df[
        ["comment", "line_start", "col_start", "line_end", "col_end"]
    ] = pd.DataFrame(
        comments_df.event.apply(
            lambda c: [
                c.value,
                c.start_mark.line,
                c.start_mark.column,
                c.end_mark.line,
                c.end_mark.column,
            ]
        ).to_list()
    )
    comments_df["file_id"] = file_name
    associated_comments_df = comments_df.apply(
        lambda x: get_associated_key_for_comment(
            all_key_df,
            original_read_file[x.line_start],
            x.file_id,
            x.line_start,
            x.col_start,
        ),
        axis=1,
    )
    comments_df = pd.concat([comments_df, associated_comments_df], axis=1)
    comments_df = comments_df[
        comments_df.comment.apply(lambda x: not bool(re.match("^\n+$", x)))
    ]
    for idx, val in comments_df[["comment", "raw_key_id"]].iterrows():
        l_id = [
            line_id
            for line_id, e in enumerate(processed_output_list)
            if val.raw_key_id in e
        ][0]
        comment_line = (
            "".join(
                [
                    "\n// " + line.lstrip()[1:].lstrip()
                    for line in val.comment.splitlines()
                    if len(line) > 0
                ]
            )
            + "\n"
        )
        processed_output_list[l_id] = comment_line + processed_output_list[l_id]

    final_op = "\n".join(processed_output_list)

    # Perform clean-up operations
    for idx, val in default_reference_key.iterrows():
        final_op = final_op.replace(repr("*" + val.reference), val.reference_path)
    for f in re.findall("_K(.+?)_", final_op):
        final_op = final_op.replace("_K" + f + "_", "")
    for f in re.findall("_M(.+?)_", final_op):
        final_op = final_op.replace("_M" + f + "_", "")
    final_op = final_op.replace("_h_", "-").replace("// \n", "")

    final_op_lines = final_op.splitlines()
    for line_id, line in enumerate(final_op_lines):
        if "['<<']" in line:
            final_op_lines[line_id] = line.replace("['<<']:", "").rstrip()[:-1] + " + {"
            final_op_lines[line_id - 1] = ""

    # Create reference local variables
    if file_name != "default":
        final_op_lines = [
            "local default = import 'default.jsonnet';",
            "{",
            "local " + file_name + " = $ ,",
        ] + final_op_lines[1:]
    else:
        final_op_lines = ["{", "local " + file_name + " = $ ,"] + final_op_lines[1:]

    # Format lists within the jsonnet
    complete_jsonnet = ""
    for line_id, line in enumerate(final_op_lines):
        if (
            (":" not in line)
            & ("//" not in line)
            & ("{" not in line)
            & ("}" not in line)
            & (len(line) > 0)
            & ("=" not in line)
            & ("[" not in line)
            & ("]" not in line)
            & (".yml" not in line)
            & ("|||" not in line)
            & (line.rstrip().endswith(","))
        ):
            final_op_lines[line_id - 1] = final_op_lines[line_id - 1].replace("\n", "")
        else:
            final_op_lines[line_id] = final_op_lines[line_id] + "\n"
    complete_jsonnet = "".join(final_op_lines)
    complete_jsonnet = complete_jsonnet.replace("\n\n//", "\n//")
    complete_jsonnet = complete_jsonnet.replace("\n\n|||", "\n|||")

    # Write the output to a jsonnet file
    with open(settings["output_path"] + file_name + ".jsonnet", "w") as f:
        f.write(complete_jsonnet)
    print(file_name + " has been processed!")