Example #1
def write_subset(in_path, out_path, selected):
    """Write the records of in_path whose ID is in selected to out_path."""
    FH_in = FastaIO(in_path)
    FH_out = FastaIO(out_path, "w")
    for record in FH_in:
        if record.id in selected:
            FH_out.write(record)
    FH_in.close()
    FH_out.close()
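A minimal usage sketch (the paths and record IDs are hypothetical, and FastaIO is assumed to come from FROGS' frogsSequenceIO module):

from frogsSequenceIO import FastaIO  # assumed import; adjust to your environment

selected_ids = {"Cluster_1", "Cluster_8"}  # hypothetical record IDs
write_subset("all_seeds.fasta", "subset.fasta", selected_ids)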
Example #2
def biom_fasta_update(biom_in, fasta_in, fasta_out, log_file):
    """Keep only the FASTA records that exist as observations in the BIOM file."""
    FH_in = FastaIO(fasta_in)
    FH_out = FastaIO(fasta_out, "w")
    biom = BiomIO.from_json(biom_in)
    seq_in = 0
    seq_out = 0

    for record in FH_in:
        seq_in += 1
        try:
            biom.find_idx("observation", record.id)
        except ValueError:
            # the record is not a BIOM observation: skip it
            pass
        else:
            FH_out.write(record)
            seq_out += 1
    FH_in.close()
    FH_out.close()
    FH_log = open(log_file, "w")
    FH_log.write("Number of sequences in: " + str(seq_in) + "\n")
    FH_log.write("Number of sequences out: " + str(seq_out) + "\n")
    FH_log.close()
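A minimal end-to-end sketch (assuming FastaIO and BiomIO come from FROGS' frogsSequenceIO and frogsBiom modules; the file names are hypothetical):

from frogsBiom import BiomIO          # assumed import
from frogsSequenceIO import FastaIO   # assumed import

biom_fasta_update("clusters.biom", "seeds.fasta", "seeds_kept.fasta", "update.log")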
Example #3
def process(params):

    biom_in = BiomIO.from_json(params.input_biom)
    # check that biom_in has blast_affiliations metadata
    if not biom_in.has_metadata("blast_affiliations"):
        raise_exception(
            Exception(
                "\n\n#ERROR : Your input biom file, " +
                os.path.basename(params.input_biom) +
                ", does not contain any blast_affiliations metadata.\n\n"))

    biom_out = Biom(generated_by='FROGS_aggregate_affiliated_otu',
                    matrix_type="sparse")

    # add samples to biom_out
    for sample_name in biom_in.get_samples_names():
        biom_out.add_sample(sample_name)

    # parse biom from the most abundant OTU to the least abundant one
    # save taxonomy
    # add OTU to biom_out if its affiliation has a poor %id or %cov, or if its taxonomy is not already saved
    # aggregate OTU with a previous one if %id and %cov are high enough and it shares a taxonomy with that OTU

    # compute observation sum
    otu_sums = {}
    for otu_name, count_sum in biom_in.get_observations_counts():
        otu_sums[otu_name] = count_sum

    # save "confident" taxonomy
    otu_by_tax = dict()
    # save aggregated_otu_composition
    aggregated_otu = OrderedDict()
    otu_in = 0
    otu_out = 0
    otu_aggregated = 0

    # parse OTUs from the most abundant to the least abundant
    for otu_name in sorted(otu_sums,
                           key=lambda i: int(otu_sums[i]),
                           reverse=True):
        otu_in += 1
        observation = biom_in.get_observations_by_name(otu_name)

        # is this OTU poorly affiliated?
        min_id = 100
        min_cov = 100
        tax = list()
        for affiliation in observation["metadata"]["blast_affiliations"]:
            if params.taxon_ignored and any(
                    t in ";".join(affiliation["taxonomy"])
                    for t in params.taxon_ignored):
                continue
            if affiliation["taxonomy"] not in tax:
                tax.append(affiliation["taxonomy"])
            percent_id = affiliation["perc_identity"]
            percent_cov = affiliation["perc_query_coverage"]
            if percent_id < min_id:
                min_id = percent_id
            if percent_cov < min_cov:
                min_cov = percent_cov

        # add the OTU because of its poor affiliation statistics
        if min_id < params.identity or min_cov < params.coverage:
            otu_out += 1
            biom_out.add_observation(otu_name, observation["metadata"])
            for sample_name in biom_in.get_samples_names():
                count = biom_in.get_count(otu_name, sample_name)
                biom_out.add_count(otu_name, sample_name, count)
            aggregated_otu[otu_name] = list()
        # for confident taxonomy
        else:
            # check if all taxonomies are new
            is_new_tax = True
            equivalent_otu_name = ""

            for taxonomy in tax:
                if isinstance(taxonomy, list):
                    taxonomy = ";".join(taxonomy)
                if taxonomy in otu_by_tax:
                    is_new_tax = False
                    if equivalent_otu_name == "":
                        equivalent_otu_name = otu_by_tax[taxonomy]
                    elif otu_by_tax[taxonomy] != equivalent_otu_name:
                        Logger.static_write(
                            params.log_file, '\tWarning: observation ' +
                            otu_name + ' shares taxonomy (' + taxonomy +
                            ') with another OTU: ' + otu_by_tax[taxonomy] +
                            '; the first detected OTU will be kept: ' +
                            equivalent_otu_name + '\n')

            # if new tax, add OTU and save taxonomies
            if is_new_tax:
                otu_out += 1
                biom_out.add_observation(otu_name, observation["metadata"])
                for sample_name in biom_in.get_samples_names():
                    count = biom_in.get_count(otu_name, sample_name)
                    if count > 0:
                        biom_out.add_count(otu_name, sample_name, count)
                aggregated_otu[otu_name] = list()
                for taxonomy in tax:
                    if isinstance(taxonomy, list):
                        taxonomy = ";".join(taxonomy)
                    otu_by_tax[taxonomy] = otu_name
            # else aggregation of OTU
            else:
                otu_aggregated += 1
                equivalent_otu = biom_out.get_observations_by_name(
                    equivalent_otu_name)
                # add blast_affiliations
                aggregated_blast_affi = equivalent_otu["metadata"][
                    "blast_affiliations"] + observation["metadata"][
                        "blast_affiliations"]
                biom_out.add_metadata(equivalent_otu_name,
                                      "blast_affiliations",
                                      aggregated_blast_affi,
                                      subject_type="observation",
                                      erase_warning=False)
                # update consensus tax
                consensus_tax = get_tax_consensus(
                    [affi["taxonomy"] for affi in aggregated_blast_affi])
                biom_out.add_metadata(equivalent_otu_name,
                                      "blast_taxonomy",
                                      consensus_tax,
                                      subject_type="observation",
                                      erase_warning=False)
                # update counts
                for sample_name in biom_in.get_samples_names():
                    count = biom_out.get_count(
                        equivalent_otu_name, sample_name) + biom_in.get_count(
                            otu_name, sample_name)
                    biom_out.change_count(equivalent_otu_name, sample_name,
                                          count)
                # save aggregated composition
                aggregated_otu[equivalent_otu_name].append(otu_name)
                # update known taxonomies
                for taxonomy in tax:
                    if isinstance(taxonomy, list):
                        taxonomy = ";".join(taxonomy)
                    if taxonomy not in otu_by_tax:
                        otu_by_tax[taxonomy] = equivalent_otu_name

    # write biom output file
    BiomIO.write(params.output_biom, biom_out)

    # update fasta
    FH_in = FastaIO(params.input_fasta)
    FH_out = FastaIO(params.output_fasta, "wt")
    for record in FH_in:
        if record.id in aggregated_otu:
            FH_out.write(record)
    FH_in.close()
    FH_out.close()

    # write otu composition
    FH_compo = open(params.output_compo, "wt")
    for OTU in aggregated_otu:
        FH_compo.write(OTU + " " + " ".join(aggregated_otu[OTU]) + "\n")
    FH_compo.close()

    # simple log stat
    Logger.static_write(params.log_file, "# nb OTU in : " + str(otu_in) + "\n")
    Logger.static_write(params.log_file,
                        "# nb OTU out : " + str(otu_out) + "\n")
    Logger.static_write(params.log_file,
                        "# nb OTU aggregated : " + str(otu_aggregated) + "\n")
Example #4
    # NOTE: the enclosing loop header below is reconstructed from context
    # (assumption: it mirrors the equivalent read-scanning loop in the next example)
    for file in args.reads:
        FH_reads = SequenceFileReader.factory(file)
        for record in FH_reads:
            if record.string in observation_id_by_seq:
                observation_id = observation_id_by_seq[record.string]
                reference_id = re.search(r"reference=([^\s]+)",
                                         record.description).group(1)
                if observation_id not in reference_by_observation_id:
                    reference_by_observation_id[observation_id] = reference_id
                elif len(reference_by_observation_id[observation_id].split(
                        ",")) > len(reference_id.split(",")):
                    reference_by_observation_id[observation_id] = reference_id
        FH_reads.close()
    if len(observation_id_by_seq) != len(reference_by_observation_id):
        missing = list()
        for seed_seq in observation_id_by_seq:
            if observation_id_by_seq[
                    seed_seq] not in reference_by_observation_id:
                missing.append(observation_id_by_seq[seed_seq])
        raise Exception(
            "Not all centroid sequences could be retrieved from the read files. "
            "Centroids without reads: '" + "' '".join(missing) + "'.")

    # Write seeds fasta with reference information
    FH_seeds = FastaIO(args.seeds_fasta)
    FH_seeds_with_ref = FastaIO(args.annotated_fasta, "w")
    for record in FH_seeds:
        record.id = record.id.split(";size=")[0]
        record.description = "reference=" + reference_by_observation_id[
            record.id]
        FH_seeds_with_ref.write(record)
    FH_seeds.close()
    FH_seeds_with_ref.close()
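A quick sketch of the reference= extraction used above, run on a made-up description string:

import re

description = "reference=refA,refB barcode=ACGT"  # hypothetical read description
reference_id = re.search(r"reference=([^\s]+)", description).group(1)
print(reference_id)  # prints: refA,refB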
Example #5
    # NOTE: the enclosing loop header below is reconstructed from context
    # (assumption: it mirrors the second read-scanning loop later in this example)
    for file in args.reads:
        FH_reads = SequenceFileReader.factory(file)
        for record in FH_reads:
            record_seq = record.string.replace("-", "").replace(".", "")
            if record_seq in observation_ids_by_seq:
                observation_ids_by_centroid_id[record.id] = observation_ids_by_seq[record_seq]
        FH_reads.close()

    # Get reference by observation
    reference_by_observation_id = dict()
    for file in args.reads:
        FH_reads = SequenceFileReader.factory(file)
        for record in FH_reads:
            if record.id in observation_ids_by_centroid_id:
                observation_ids = observation_ids_by_centroid_id[record.id]
                reference_id = re.search(r"reference=([^\s]+)", record.description).group(1)
                for current_obs_id in observation_ids:
                    if current_obs_id not in reference_by_observation_id:
                        reference_by_observation_id[current_obs_id] = reference_id
                    elif len(reference_by_observation_id[current_obs_id].split(",")) > len(reference_id.split(",")):
                        reference_by_observation_id[current_obs_id] = reference_id
        FH_reads.close()
    if nb_observations != len(reference_by_observation_id):
        raise Exception("Not all centroid sequences could be retrieved from the read files.")

    # Write seeds fasta with reference information
    FH_seeds = FastaIO(args.input)
    FH_seeds_with_ref = FastaIO(args.output, "w")
    for record in FH_seeds:
        record.description = "reference=" + reference_by_observation_id[record.id]
        FH_seeds_with_ref.write(record)
    FH_seeds.close()
    FH_seeds_with_ref.close()