def get_tree_from_fasta(in_fasta):
    """
    @summary: Builds a taxonomy tree from the record descriptions of a FASTA
              databank. Each description is read as a ";"-separated lineage
              (one trailing ";" is ignored): the first field is the root and
              each following field is a child of the previous one. The ID of
              each record is stored in metadata["seq_ids"] of the node that
              ends its lineage.
    @param in_fasta: [str] Path to the databank (format: fasta).
    @return: [Node] The root of the tree, or None if the file has no record.
    @warning: The root node must be present. The root is created from the
              first record only; the first field of later records is not
              checked against it.
    """
    databank_tree = None
    FH_databank = FastaIO(in_fasta)
    try:
        for record in FH_databank:
            if record.description.endswith(";"):
                record.description = record.description[:-1]
            taxonomy = record.description.split(";")
            if databank_tree is None:
                databank_tree = Node(taxonomy[0])
            parent = databank_tree
            # Index (in taxonomy[1:]) of the last rank, i.e. the leaf.
            leaf_depth = len(taxonomy) - 2
            for rank_depth, taxa in enumerate(taxonomy[1:]):
                if parent.has_child(taxa):
                    taxa_node = parent.get_child(taxa)
                else:
                    # Presumably Node(name, parent) registers itself as a
                    # child of parent; get_child() below relies on it.
                    taxa_node = Node(taxa, parent)
                if rank_depth == leaf_depth:  # Current node is leaf
                    # setdefault: the node may already exist as an internal
                    # node (created by a longer lineage) without "seq_ids".
                    taxa_node.metadata.setdefault("seq_ids", []).append(record.id)
                parent = parent.get_child(taxa)
    finally:
        # Release the file handle even if a record cannot be processed.
        FH_databank.close()
    return databank_tree
def write_subset(in_path, out_path, selected):
    """
    @summary: Writes to out_path the records of in_path whose ID is in
              selected. The order of the input file is kept.
    @param in_path: [str] Path to the input sequences file (format: fasta).
    @param out_path: [str] Path to the output sequences file (format: fasta).
    @param selected: Container of the IDs to keep. It is only used with the
                     "in" operator; a set or a dict gives constant-time lookups.
    """
    FH_in = FastaIO(in_path)
    try:
        FH_out = FastaIO(out_path, "w")
        try:
            for record in FH_in:
                if record.id in selected:
                    FH_out.write(record)
        finally:
            FH_out.close()
    finally:
        # Both handles are released even if reading or writing fails.
        FH_in.close()
def biom_fasta_update(biom_in, fasta_in, fasta_out, log_file):
    """
    @summary: Writes to fasta_out the sequences of fasta_in whose ID is an
              observation of the BIOM file, and logs the number of sequences
              read and written.
    @param biom_in: [str] Path to the BIOM file (format: BIOM JSON).
    @param fasta_in: [str] Path to the input sequences file (format: fasta).
    @param fasta_out: [str] Path to the filtered sequences file (format: fasta).
    @param log_file: [str] Path to the log file (overwritten).
    """
    seq_in = 0
    seq_out = 0
    FH_in = FastaIO(fasta_in)
    try:
        FH_out = FastaIO(fasta_out, "w")
        try:
            biom = BiomIO.from_json(biom_in)
            for record in FH_in:
                seq_in += 1
                try:
                    # A ValueError from find_idx is treated as "this ID is
                    # not an observation of the BIOM": the record is skipped.
                    biom.find_idx("observation", record.id)
                except ValueError:
                    pass
                else:
                    FH_out.write(record)
                    seq_out += 1
        finally:
            FH_out.close()
    finally:
        FH_in.close()
    # The context manager guarantees that the log is flushed and closed.
    with open(log_file, "w") as FH_log:
        FH_log.write("Number of sequence in :" + str(seq_in) + "\n")
        FH_log.write("Number of sequence out :" + str(seq_out) + "\n")
def biom_fasta_update(biom_in, fasta_in, fasta_out, log_file):
    """
    @summary: Keeps in fasta_out only the sequences of fasta_in whose ID is
              found as an observation in the BIOM file. The counts of
              sequences read and kept are written in log_file.
    @param biom_in: [str] Path to the BIOM file (format: BIOM JSON).
    @param fasta_in: [str] Path to the input sequences file (format: fasta).
    @param fasta_out: [str] Path to the filtered sequences file (format: fasta).
    @param log_file: [str] Path to the log file (overwritten).
    """
    seq_in = 0
    seq_out = 0
    FH_in = FastaIO(fasta_in)
    try:
        FH_out = FastaIO(fasta_out, "w")
        try:
            biom = BiomIO.from_json(biom_in)
            for record in FH_in:
                seq_in += 1
                try:
                    # find_idx raising ValueError is interpreted as an ID
                    # that is absent from the BIOM observations.
                    biom.find_idx("observation", record.id)
                except ValueError:
                    pass
                else:
                    FH_out.write(record)
                    seq_out += 1
        finally:
            FH_out.close()
    finally:
        # Handles are released even when the BIOM loading or a write fails.
        FH_in.close()
    with open(log_file, "w") as FH_log:
        FH_log.write("Number of sequence in :" + str(seq_in) + "\n")
        FH_log.write("Number of sequence out :" + str(seq_out) + "\n")
# Grinder to BIOM
# The commands are built as argument lists and executed without a shell:
# paths or sample names containing spaces, quotes or shell metacharacters
# are passed verbatim instead of being re-interpreted by /bin/sh.
cmd_grinder2biom = [
    os.path.join(os.path.dirname(os.path.abspath(__file__)), "grinder2biom.py"),
    "--affiliation", os.path.abspath(args.databank),
    "--output", real_biom,
    "--samples"
]
for current_sample in samples:
    # One "<name>:<path>" argument per sample.
    cmd_grinder2biom.append(current_sample['name'] + ":" + current_sample['path'])
subprocess.check_call(cmd_grinder2biom)

# Add reference id in checked BIOM
biom = BiomIO.from_json(args.checked_biom)
reference_pattern = re.compile(r"reference=(\S+)")
fasta = FastaIO(args.checked_fasta)
try:
    for record in fasta:
        match = reference_pattern.search(record.description or "")
        if match is None:
            # Explicit message instead of an AttributeError on None.group().
            raise Exception("The tag 'reference=<REF_ID>' cannot be retrieved from the description of '" + record.id + "'")
        biom.add_metadata(record.id, "grinder_source", match.group(1), "observation")
finally:
    fasta.close()
BiomIO.write(checked_biom, biom)
del(biom)

# Compare expected to obtained
for current_sample in samples:
    # print(<single expression>) behaves the same as the Python 2 print
    # statement used by this script.
    print(current_sample['name'])
    cmd_compareSample = [
        os.path.join(os.path.dirname(os.path.abspath(__file__)), "biomCmpTax.py"),
        "--real-biom", os.path.abspath(real_biom),
        "--real-tax-key", "real_taxonomy",
        "--checked-biom", os.path.abspath(checked_biom),
        "--checked-tax-key", args.taxonomy_key
    ]
    if args.multi_affiliations:
        cmd_compareSample.append("--multi-affiliations")
    if args.uniq_groups is not None:
        cmd_compareSample.extend(["--uniq-groups", args.uniq_groups])
    cmd_compareSample.extend(["--sample", current_sample['name']])
    print(subprocess.check_output(cmd_compareSample))
help='Path to the sequence file outputed by UTAX (format: fasta).') group_input.add_argument('-b', '--input-biom', required=True, help='Path to the abundance file (format: BIOM).') # Outputs group_output = parser.add_argument_group('Outputs') group_output.add_argument( '-o', '--output-biom', required=True, help='Path to the abundance file with taxonomy (format: BIOM).') args = parser.parse_args() # Process biom = BiomIO.from_json(args.input_biom) fasta = FastaIO(args.input_fasta) for record in fasta: # record.id example: Cluster_1;size=19714;tax=d:Bacteria(1.0000),p:"Proteobacteria"(0.9997),c:Alphaproteobacteria(0.9903),o:Rhodospirillales(0.9940),f:Acetobacteraceae(0.9887),g:Humitalea(0.9724); match = re.search("^([^\;]+)\;size\=\d+\;tax=(.+)$", record.id) if match is None: fasta.close() raise Exception("ID and taxonomy cannot be retrieved from '" + record.id + "'") record.id = match.group(1) record.description = match.group(2) biom.add_metadata(record.id, args.taxonomy_tag, record.description, "observation") fasta.close() BiomIO.write(args.output_biom, biom)
def process(params):
    """
    @summary: Aggregates the OTUs that share a confident taxonomy. OTUs are
              processed from the most abundant to the least abundant one:
                - an OTU with a poor affiliation (minimum identity or minimum
                  coverage under the thresholds) is kept as is;
                - an OTU whose taxonomies are all unknown is kept and its
                  taxonomies are registered;
                - otherwise the OTU is merged (affiliations and counts) into
                  the first kept OTU that shares one of its taxonomies.
              Writes the aggregated BIOM, the FASTA restricted to the kept
              OTUs, the composition of each kept OTU and summary lines in
              the log.
    @param params: Object with the attributes input_biom, input_fasta,
                   output_biom, output_fasta, output_compo, log_file,
                   identity, coverage and taxon_ignored.
    """
    def tax_to_str(taxonomy):
        # A taxonomy is handled either as a list of ranks or as a string.
        return ";".join(taxonomy) if isinstance(taxonomy, list) else taxonomy

    biom_in = BiomIO.from_json(params.input_biom)
    # check if biom_in has blast_taxonomy affiliations
    if not biom_in.has_metadata("blast_affiliations"):
        raise_exception(Exception("\n\n#ERROR : Your input biom file, " + os.path.basename(params.input_biom) + ", does not contain any blast_affiliations metadata.\n\n"))

    biom_out = Biom(generated_by='FROGS_aggregate_affiliated_otu', matrix_type="sparse")
    # add samples in biom_out
    # get_samples_names() is called again for each loop on purpose: its
    # result is not cached in case it is a one-shot iterator.
    for sample_name in biom_in.get_samples_names():
        biom_out.add_sample(sample_name)

    # compute observation sum
    otu_sums = {}
    for otu_name, count_sum in biom_in.get_observations_counts():
        otu_sums[otu_name] = count_sum

    # save "confident" taxonomy: taxonomy string -> name of the kept OTU
    otu_by_tax = dict()
    # save aggregated_otu_composition: kept OTU -> names of the merged OTUs
    aggregated_otu = OrderedDict()
    otu_in = 0
    otu_out = 0
    otu_aggregated = 0

    # parse otu from most abundant to less ones
    for otu_name in sorted(otu_sums, key=lambda i: int(otu_sums[i]), reverse=True):
        otu_in += 1
        observation = biom_in.get_observations_by_name(otu_name)

        # is this OTU poorly affiliated
        min_id = 100
        min_cov = 100
        tax = list()
        for affiliation in observation["metadata"]["blast_affiliations"]:
            if params.taxon_ignored:
                # Same list/string normalisation as everywhere else in the
                # function: joining a string taxonomy would interleave ";"
                # between its characters and the filter would never match.
                affiliation_tax = tax_to_str(affiliation["taxonomy"])
                if any(t in affiliation_tax for t in params.taxon_ignored):
                    continue
            if affiliation["taxonomy"] not in tax:
                tax.append(affiliation["taxonomy"])
            percent_id = affiliation["perc_identity"]
            percent_cov = affiliation["perc_query_coverage"]
            if percent_id < min_id:
                min_id = percent_id
            if percent_cov < min_cov:
                min_cov = percent_cov
        # Taxonomies as strings, usable as keys of otu_by_tax.
        tax_keys = [tax_to_str(taxonomy) for taxonomy in tax]

        # Add otu because of poor affiliations stat
        if min_id < params.identity or min_cov < params.coverage:
            otu_out += 1
            biom_out.add_observation(otu_name, observation["metadata"])
            for sample_name in biom_in.get_samples_names():
                count = biom_in.get_count(otu_name, sample_name)
                biom_out.add_count(otu_name, sample_name, count)
            aggregated_otu[otu_name] = list()
        # for confident taxonomy
        else:
            # check if all taxonomies are new
            is_new_tax = True
            equivalent_otu_name = ""
            for taxonomy in tax_keys:
                if taxonomy in otu_by_tax:
                    is_new_tax = False
                    if equivalent_otu_name == "":
                        equivalent_otu_name = otu_by_tax[taxonomy]
                    elif otu_by_tax[taxonomy] != equivalent_otu_name:
                        Logger.static_write(params.log_file, '\tWarning: observation ' + otu_name + ' shares taxonomy ( ' + taxonomy + ' with an other OTU : ' + otu_by_tax[taxonomy] + ', first detected OTU will be kept : ' + equivalent_otu_name + '\n')

            # if new tax, add OTU and save taxonomies
            if is_new_tax:
                otu_out += 1
                biom_out.add_observation(otu_name, observation["metadata"])
                for sample_name in biom_in.get_samples_names():
                    count = biom_in.get_count(otu_name, sample_name)
                    if count > 0:
                        biom_out.add_count(otu_name, sample_name, count)
                aggregated_otu[otu_name] = list()
                for taxonomy in tax_keys:
                    otu_by_tax[taxonomy] = otu_name
            # else aggregation of OTU
            else:
                otu_aggregated += 1
                equivalent_otu = biom_out.get_observations_by_name(equivalent_otu_name)
                # add blast_affiliations
                aggregated_blast_affi = equivalent_otu["metadata"]["blast_affiliations"] + observation["metadata"]["blast_affiliations"]
                biom_out.add_metadata(equivalent_otu_name, "blast_affiliations", aggregated_blast_affi, subject_type="observation", erase_warning=False)
                # update consensus tax
                consensus_tax = get_tax_consensus([affi["taxonomy"] for affi in aggregated_blast_affi])
                biom_out.add_metadata(equivalent_otu_name, "blast_taxonomy", consensus_tax, subject_type="observation", erase_warning=False)
                # update counts
                for sample_name in biom_in.get_samples_names():
                    count = biom_out.get_count(equivalent_otu_name, sample_name) + biom_in.get_count(otu_name, sample_name)
                    biom_out.change_count(equivalent_otu_name, sample_name, count)
                # save aggregated composition
                aggregated_otu[equivalent_otu_name].append(otu_name)
                # update known taxonomies (already known ones keep their OTU)
                for taxonomy in tax_keys:
                    if taxonomy not in otu_by_tax:
                        otu_by_tax[taxonomy] = equivalent_otu_name

    # write biom output file
    BiomIO.write(params.output_biom, biom_out)

    # update fasta: only the sequences of the kept OTUs are written
    FH_in = FastaIO(params.input_fasta)
    try:
        FH_out = FastaIO(params.output_fasta, "wt")
        try:
            for record in FH_in:
                if record.id in aggregated_otu:
                    FH_out.write(record)
        finally:
            FH_out.close()
    finally:
        # Handles are released even if a read or a write fails.
        FH_in.close()

    # write otu composition: one line per kept OTU, followed by merged OTUs
    with open(params.output_compo, "wt") as FH_compo:
        for OTU in aggregated_otu:
            FH_compo.write(OTU + " " + " ".join(aggregated_otu[OTU]) + "\n")

    # simple log stat
    Logger.static_write(params.log_file, "# nb OTU in : " + str(otu_in) + "\n")
    Logger.static_write(params.log_file, "# nb OTU out : " + str(otu_out) + "\n")
    Logger.static_write(params.log_file, "# nb OTU aggregated : " + str(otu_aggregated) + "\n")
required=True,
help=
'The sequence of the OTU seeds with reference id in description (format: fasta).'
)
args = parser.parse_args()

# Get observation sequences
# Maps each seed sequence to its observation ID; the ID is the part of the
# record ID located before ";size=". Two seeds with the same sequence are
# rejected because the sequence is used as the lookup key below.
observation_id_by_seq = dict()
FH_seeds = FastaIO(args.seeds_fasta)
for record in FH_seeds:
    if record.string in observation_id_by_seq:
        raise Exception("The OTU '" + observation_id_by_seq[record.string] +
                        "' and '" + record.id + "' have the same sequence.")
    observation_id_by_seq[record.string] = record.id.split(";size=")[0]
FH_seeds.close()

# Get centroids of observation
# For each read whose sequence is exactly a seed sequence, reads the
# "reference=<value>" tag of its description. When several reads match the
# same observation, the value with the fewest comma-separated items is kept.
reference_by_observation_id = dict()
for file in args.reads:
    FH_reads = SequenceFileReader.factory(file)
    for record in FH_reads:
        if record.string in observation_id_by_seq:
            observation_id = observation_id_by_seq[record.string]
            # NOTE(review): re.search() returns None when the description has
            # no "reference=" tag; .group(1) then raises AttributeError.
            reference_id = re.search("reference=([^\s]+)",
                                     record.description).group(1)
            if observation_id not in reference_by_observation_id:
                reference_by_observation_id[observation_id] = reference_id
            elif len(reference_by_observation_id[observation_id].split(
                    ",")) > len(reference_id.split(",")):
                # The new reference lists fewer IDs than the stored one.
                reference_by_observation_id[observation_id] = reference_id
parser.add_argument(
    '-i',
    '--input',
    required=True,
    help='Sequences file from mothur get.oturep (format: fasta).'
)
parser.add_argument(
    '-t',
    '--trimmed-reads',
    required=True,
    nargs="+",
    help='Reads after all sequence modifications like aln (format: fasta or fastq). It is used to find the ID of clusters centroids by exact comparison between OTU sequences and reads sequences.'
)
parser.add_argument(
    '-r',
    '--reads',
    required=True,
    nargs="+",
    help='Simulated reads used as input in mothur pipeline (format: fasta or fastq). These reads are used to retrieve simulation reference of the centroids. The link between centroids and reads is the sequence ID. The description of reads must contain the tag "reference=<REF_ID>".'
)
parser.add_argument(
    '-o',
    '--output',
    required=True,
    help='Output file (format: fasta).'
)
args = parser.parse_args()

# Get observation sequences
# Maps each OTU sequence to the list of the observation IDs that have this
# sequence (several observations may share the same sequence).
nb_observations = 0
observation_ids_by_seq = dict()
FH_seeds = FastaIO(args.input)
for record in FH_seeds:
    nb_observations += 1
    if record.string not in observation_ids_by_seq:
        observation_ids_by_seq[record.string] = list()
    observation_ids_by_seq[record.string].append(record.id)
FH_seeds.close()

# Get centroids (the real centroid and identical sequences) ID by observation
# The "-" and "." characters are removed from the trimmed reads before the
# exact comparison with the OTU sequences (presumably alignment gaps, TODO
# confirm). Each matching read ID is mapped to the observation IDs that
# have this sequence.
observation_ids_by_centroid_id = dict()
for file in args.trimmed_reads:
    FH_reads = SequenceFileReader.factory(file)
    for record in FH_reads:
        record_seq = record.string.replace("-", "").replace(".", "")
        if record_seq in observation_ids_by_seq:
            observation_ids_by_centroid_id[record.id] = observation_ids_by_seq[record_seq]
    FH_reads.close()

# Get reference by observation
reference_by_observation_id = dict()
for file in args.reads:
    FH_reads = SequenceFileReader.factory(file)