Exemplo n.º 1
0
def vsearch(percent_id, genes, num_threads=num_vcpu):
    """Cluster ``genes`` with ``vsearch --cluster_fast`` at a given identity.

    The two output file names are derived from ``percent_id``:
    ``centroids.<percent_id>.ffn`` and ``uclust.<percent_id>.txt``.  If
    ``find_files`` reports both of them, vsearch is not invoked again.

    Args:
        percent_id: Identity threshold expressed as a percentage; it is
            divided by 100.0 before being handed to ``--id``.
        genes: Value passed as the ``--cluster_fast`` input.
        num_threads: Value passed to ``--threads``.

    Returns:
        The tuple ``(centroids, uclust)`` of output file names.

    Raises:
        Re-raises whatever interrupted the vsearch invocation, after the
        output files have been renamed with a ``.bogus`` suffix.
    """
    centroids = f"centroids.{percent_id}.ffn"
    uclust = f"uclust.{percent_id}.txt"

    # Fast path: both outputs are already present, so reuse them.
    if find_files(centroids) and find_files(uclust):
        tsprint(
            f"Found vsearch results at percent identity {percent_id} from prior run."
        )
        return centroids, uclust

    try:
        command(
            f"vsearch --quiet --cluster_fast {genes} --id {percent_id/100.0} --threads {num_threads} --centroids {centroids} --uc {uclust}"
        )
    except BaseException:
        # Catch everything (including interrupts) on purpose: partial or
        # zero-length outputs would be mistaken for valid results by the
        # fast path above on an in-place rerun.  The renames are best-effort.
        command(f"mv {centroids} {centroids}.bogus", check=False)
        command(f"mv {uclust} {uclust}.bogus", check=False)
        raise

    return centroids, uclust
Exemplo n.º 2
0
def init(args):
    """
    Build the genomes table at ``outputs.genomes``.

    Input spec: https://github.com/czbiohub/iggtools/wiki#inputs
    Output spec: https://github.com/czbiohub/iggtools/wiki#target-layout-in-s3

    Reads the ``alt_species_id`` / ``species_id`` columns of
    ``inputs.alt_species_ids`` to map each ``species_id`` to its
    ``alt_species_id``, then reads the ``MAG_code`` / ``Species_id`` columns
    of ``inputs.genomes2species`` and writes one tab-separated row per genome
    with the columns ``genome``, ``species``, ``representative`` and
    ``genome_is_representative`` ("1" when the genome is its own
    representative, otherwise "0").

    If the destination already exists, nothing is written unless
    ``args.force`` is set.
    """
    rebuilding = bool(find_files(outputs.genomes))
    if rebuilding and not args.force:
        tsprint(
            f"Destination {outputs.genomes} already exists.  Specify --force to overwrite."
        )
        return
    if rebuilding:
        tsprint(f"Rebuilding {outputs.genomes}.")
    else:
        tsprint(f"Building {outputs.genomes}.")

    # Map the id used in the genomes2species table to its replacement id.
    with InputStream(inputs.alt_species_ids) as ids:
        alt_id_rows = select_from_tsv(
            ids, selected_columns=["alt_species_id", "species_id"])
        species_by_old_id = {old_id: new_id for new_id, old_id in alt_id_rows}

    seen_genomes = set()
    seen_species = set()
    header = ["genome", "species", "representative", "genome_is_representative"]

    with OutputStream(outputs.genomes) as out:
        out.write("\t".join(header) + "\n")

        with InputStream(inputs.genomes2species) as g2s:
            genome_rows = select_from_tsv(
                g2s, selected_columns=["MAG_code", "Species_id"])
            for genome, representative in genome_rows:
                # A representative missing from the remap table raises KeyError.
                species = species_by_old_id[representative]
                is_representative = "1" if genome == representative else "0"
                fields = (genome, species, representative, is_representative)
                out.write("\t".join(fields) + "\n")
                seen_genomes.add(genome)
                seen_species.add(species)

    tsprint(
        f"Emitted {len(seen_genomes)} genomes and {len(seen_species)} species to {outputs.genomes}."
    )
Exemplo n.º 3
0
def hmmsearch(genome_id, species_id, marker_genes_hmm, num_threads=1):
    """Run ``hmmsearch`` for one genome's annotated genes.

    The annotated genes file (``<genome_id>.faa.lz4`` as located by
    ``input_annotations_file``) is fetched with ``download_reference`` and
    searched against ``marker_genes_hmm``.  Results go to
    ``<genome_id>.hmmsearch`` via ``--domtblout``.

    Args:
        genome_id: Genome identifier; used to name the input and output files.
        species_id: Passed to ``input_annotations_file`` with ``genome_id``.
        marker_genes_hmm: HMM file argument given to ``hmmsearch``.
        num_threads: Value passed to ``--cpu``.

    Returns:
        The output file name, ``<genome_id>.hmmsearch``.

    Raises:
        Re-raises whatever interrupted the hmmsearch invocation, after the
        output file has been renamed with a ``.bogus`` suffix.
    """
    # The input is fetched unconditionally, before the output check below.
    source_path = input_annotations_file(genome_id, species_id, f"{genome_id}.faa.lz4")
    annotated_genes = download_reference(source_path)

    hmmsearch_file = f"{genome_id}.hmmsearch"

    if find_files(hmmsearch_file):
        # Only expected in debug mode, where a pre-existing file may be reused.
        tsprint(f"Found hmmsearch results for genome {genome_id} from prior run.")
        return hmmsearch_file

    try:
        command(f"hmmsearch --noali --cpu {num_threads} --domtblout {hmmsearch_file} {marker_genes_hmm} {annotated_genes}")
    except BaseException:
        # Catch everything (including interrupts) on purpose: a partial or
        # zero-length output would be picked up as valid on an in-place rerun.
        command(f"mv {hmmsearch_file} {hmmsearch_file}.bogus", check=False)
        raise

    return hmmsearch_file
Exemplo n.º 4
0
def find_files_with_retry(f):
    """Return the result of ``find_files(f)``.

    NOTE(review): despite the name, the visible body performs a single call
    and contains no retry logic; presumably retrying is supplied elsewhere
    (e.g. by a decorator not shown here) -- confirm.
    """
    found = find_files(f)
    return found