Example #1
    def download_ref_sequences_from_s3(self, accession_dict, output_reference_fasta, db_type,
                               loc_db, db_s3_path):
        ''' Download from S3 the sequences for the accessions in accession_dict. '''
        threads = []
        error_flags = {}
        semaphore = threading.Semaphore(64)
        mutex = threading.RLock()

        bucket, key = db_s3_path[5:].split("/", 1)
        loc_dict = shelve.open(loc_db.replace('.db', ''), 'r')
        accession_dir = os.path.join(self.output_dir_local, db_type, 'accessions')
        command.execute(f"mkdir -p {accession_dir}")
        for accession, taxinfo in accession_dict.items():
            accession_out_file = os.path.join(accession_dir, accession)
            semaphore.acquire()
            t = threading.Thread(
                target=PipelineStepDownloadAccessions.fetch_sequence_for_thread,
                args=[
                    error_flags, accession, accession_out_file, loc_dict,
                    bucket, key, semaphore, mutex
                ])
            t.start()
            threads.append(t)
        for t in threads:
            t.join()
        if error_flags:
            raise RuntimeError("Error in getting sequences by accession list.")
        # Combine all the downloaded accessions to a fasta file
        command.execute(f"find {accession_dir}/ -type f | xargs -n 32 -P 1 cat >> {output_reference_fasta}")
Example #2
    def run(self):
        """
          Generate GSNAP index. To be called from idseq-infra
        """
        nt_db = self.input_files_local[0][0]
        output_nt_index_tar = self.output_files_local()[0]
        output_nt_index_parent_dir = os.path.dirname(output_nt_index_tar)
        output_tar_base = os.path.basename(output_nt_index_tar)
        output_nt_index_dir_base = output_tar_base[:-4]  # strip the ".tar" extension
        k = self.additional_attributes.get("k", 16)  # kmer k
        log.write(f"input: {nt_db} output: {output_nt_index_tar}")
        command.execute(
            command_patterns.SingleCommand(cmd="gmap_build",
                                           args=[
                                               "-D",
                                               output_nt_index_parent_dir,
                                               "-d", output_nt_index_dir_base,
                                               "-k", k, nt_db
                                           ]))

        output_nt_index_dir = os.path.join(output_nt_index_parent_dir,
                                           output_nt_index_dir_base)
        self.additional_output_folders_hidden.append(output_nt_index_dir)

        command.execute(
            command_patterns.SingleCommand(
                cd=output_nt_index_parent_dir,
                cmd="tar",
                args=["cvf", output_tar_base, output_nt_index_dir_base]))
Example #3
 def fq2fa(input_fastq, output_fasta):
     ''' FASTQ to FASTA conversion '''
     step = "FASTQ to FASTA conversion"
     log.write(f"Starting {step}...")
     cmd = f"sed -n '1~4s/^@/>/p;2~4p' <{input_fastq} >{output_fasta}"
     command.execute(cmd)
     log.write(f"Finished {step}.")
Example #4
 def generate_read_to_contig_mapping(assembled_contig, fasta_file,
                                     read2contig,
                                     duplicate_cluster_sizes_path,
                                     output_bowtie_sam,
                                     output_contig_stats):
     ''' read -> contig mapping through bowtie2 alignment '''
     base_output_dir = os.path.dirname(fasta_file)
     # build bowtie index based on assembled_contig
     bowtie_index_path = os.path.join(base_output_dir, 'bowtie-contig')
     command.make_dirs(bowtie_index_path)
     command.execute(
         command_patterns.SingleCommand(
             cmd='bowtie2-build',
             args=[assembled_contig, bowtie_index_path]))
     command.execute(
         command_patterns.ShellScriptCommand(
             script=
             r'''bowtie2 -x "${bowtie_index_path}" -f -U "${fasta_file}" --very-sensitive -p 32 > "${output_bowtie_sam}";''',
             named_args={
                 'bowtie_index_path': bowtie_index_path,
                 'fasta_file': fasta_file,
                 'output_bowtie_sam': output_bowtie_sam
             }))
     contig_stats = PipelineStepRunAssembly.generate_info_from_sam(
         output_bowtie_sam, read2contig, duplicate_cluster_sizes_path)
     with open(output_contig_stats, 'w') as ocf:
         json.dump(contig_stats, ocf)
Example #5
def touch_s3_file(s3_file_path):
    try:
        command.execute("aws s3 cp --metadata '{\"touched\":\"now\"}' %s %s" %
                        (s3_file_path, s3_file_path))
        return True
    except Exception:
        return False
Example #6
def upload_log_file(sample_s3_output_path, lock=threading.RLock()):
    with lock:
        logh = logging.getLogger().handlers[0]
        logh.flush()
        command.execute(
            f"aws s3 cp --only-show-errors {logh.baseFilename} {sample_s3_output_path}/"
        )
Example #7
 def multilinefa2singlelinefa(input_fasta, output_fasta):
     ''' Multi-line FASTA to Single-line FASTA conversion '''
     step = "Multi-line FASTA to single-line FASTA conversion"
     log.write(f"Starting {step}...")
     cmd = f"awk 'NR==1 {{print $0}} NR>1 && /^>/ {{printf(\"\\n%s\\n\",$0);next; }} NR>1 {{ printf(\"%s\",$0);}}  END {{printf(\"\\n\");}}' <{input_fasta} > {output_fasta}"
     command.execute(cmd)
     log.write(f"Finished {step}.")
Example #8
    def generate_mapped_reads_tsv(self):
        """Use bedtools to generate a table of mapped reads for each genome in the ARG ANNOT database.
            If a new resistance gene db is used, the .bed file will need to be updated manually."""
        bed_file_path = fetch_reference(
            self.additional_files["resist_genome_bed"],
            self.ref_dir_local,
            allow_s3mi=False)
        sample_bam_file_path = self.output_files_local()[5]

        tmp_sort_dir = os.path.join(self.output_dir_local, "tmp_sort")
        command.make_dirs(tmp_sort_dir)

        # Convert the sorted.bam output from SRST2 to the bed format, then sort the bed file.
        # This allows us to use the "sorted" mode of bedtools coverage, which is memory-efficient.
        # Otherwise, large sorted.bam files will cause our machines to run out of RAM.
        #
        # Note that despite being called "sorted.bam", the bam is not sorted the way we need it to be.
        #
        # env LC_ALL=C ensures that the sort command uses the same sort order on all machines.
        #
        # The -T flag with tmp_sort_dir ensures that we make tmp files inside /mnt, which is where our huge AWS volumes are mounted.
        # By default, the sort command creates temp files in /tmp, which has very little disk space.
        command.execute(
            command_patterns.ShellScriptCommand(
                script='''
                    bedtools bamtobed -i "$1" |
                    env LC_ALL=C sort -T "$2" -k1,1 -k2,2n |
                    bedtools coverage -sorted -a "$3" -b stdin > "$4";''',
                args=[
                    sample_bam_file_path, tmp_sort_dir, bed_file_path,
                    os.path.join(self.output_dir_local, MATCHED_READS_FILE)
                ]))

        command.remove_rf(tmp_sort_dir)
Example #9
    def subsample_fastas(input_fas, output_fas, max_fragments):
        ''' In memory subsampling '''
        paired = len(input_fas) >= 2
        # Count records: the unwrapped FASTA has 2 lines (header + sequence) per read
        cmd = "wc -l %s | cut -f1 -d ' '" % input_fas[0]
        total_records = int(command.execute_with_output(cmd)) // 2
        log.write("total reads: %d" % total_records)
        log.write("target reads: %d" % max_fragments)
        if total_records <= max_fragments:
            for infile, outfile in zip(input_fas, output_fas):
                command.execute("cp %s %s" % (infile, outfile))
            return

        # total_records > max_fragments, sample
        randgen = random.Random(x=hash(input_fas[0]))
        records_to_keep = randgen.sample(range(total_records), max_fragments)
        PipelineStepRunSubsample.subset(input_fas[0], output_fas[0],
                                        records_to_keep)
        if paired:
            PipelineStepRunSubsample.subset(input_fas[1], output_fas[1],
                                            records_to_keep)
            if len(input_fas) == 3 and len(output_fas) == 3:
                # Subset the merged fasta, where pair r occupies records 2r and 2r+1
                records_to_keep_merged = []
                for r in records_to_keep:
                    records_to_keep_merged += [2 * r, 2 * r + 1]
                PipelineStepRunSubsample.subset(input_fas[2], output_fas[2],
                                                records_to_keep_merged)
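`PipelineStepRunSubsample.subset` is not shown here. A minimal sketch, assuming the inputs are unwrapped FASTA files with exactly two lines per record (which the line-count-divided-by-two logic above already relies on) and that `records_to_keep` holds 0-based record indices:

    @staticmethod
    def subset(input_fa, output_fa, records_to_keep):
        ''' Hypothetical sketch: stream the FASTA and keep only the selected record indices. '''
        keep = set(records_to_keep)
        with open(input_fa) as inf, open(output_fa, "w") as outf:
            for record_index, header in enumerate(inf):
                sequence = next(inf)  # unwrapped FASTA: one header line, one sequence line
                if record_index in keep:
                    outf.write(header)
                    outf.write(sequence)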
Example #10
    def run(self):
        input_fas = self.input_files_local[0]
        output_files = self.output_files_local()
        assert len(output_files) == len(
            input_fas) + 2, f"Context: {input_fas} -> {output_files}."
        output_fas = output_files[:len(input_fas)]
        duplicate_cluster_sizes_path = output_files[-1]
        assert duplicate_cluster_sizes_path.endswith(".tsv"), str(output_files)
        duplicate_clusters_path = output_files[-2]
        assert duplicate_clusters_path.endswith(".csv"), str(output_files)

        # See docstring above for explanation of these options.
        idseq_dedup_params = [
            '-i',
            input_fas[0],
            '-o',
            output_fas[0],
            '-l',
            '70',
            '-c',
            duplicate_clusters_path,
        ]
        if len(input_fas) == 2:
            idseq_dedup_params += ['-i', input_fas[1], '-o', output_fas[1]]
        command.execute(
            command_patterns.SingleCommand(cmd='idseq-dedup',
                                           args=idseq_dedup_params))

        # Emit cluster sizes.  One line per cluster.  Format "<cluster_size> <cluster_read_id>".
        # This info is loaded in multiple subsequent steps using m8.load_duplicate_cluster_sizes,
        # and used to convert unique read counts to original read counts, and also to compute
        # per-taxon DCRs emitted alongside taxon_counts.
        clusters_dict = parse_clusters_file(duplicate_clusters_path)
        save_duplicate_cluster_sizes(duplicate_cluster_sizes_path,
                                     clusters_dict)
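`parse_clusters_file` and `save_duplicate_cluster_sizes` are imported helpers. A minimal sketch of the writer, assuming `clusters_dict` maps each cluster's representative read id to the collection of read ids in that cluster, and that the ".tsv" extension means the two fields in the documented format are tab-separated:

def save_duplicate_cluster_sizes(duplicate_cluster_sizes_path, clusters_dict):
    ''' Hypothetical sketch: one tab-separated "<cluster_size> <cluster_read_id>" line per cluster. '''
    with open(duplicate_cluster_sizes_path, "w") as f:
        for representative_read_id, cluster in clusters_dict.items():  # assumed dict layout
            f.write(f"{len(cluster)}\t{representative_read_id}\n")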
Example #11
 def run_blast_nt(blast_index_path, blast_m8, assembled_contig,
                  reference_fasta, blast_top_m8):
     blast_type = 'nucl'
     blast_command = 'blastn'
     min_alignment_length = NT_MIN_ALIGNMENT_LEN
     min_pident = NT_MIN_PIDENT
     max_evalue = MAX_EVALUE_THRESHOLD
     command.execute(
         command_patterns.SingleCommand(
             cmd="makeblastdb",
             args=[
                 "-in", reference_fasta, "-dbtype", blast_type, "-out",
                 blast_index_path
             ],
         ))
     command.execute(
         command_patterns.SingleCommand(
             cmd=blast_command,
             args=[
                 "-query", assembled_contig, "-db", blast_index_path,
                 "-out", blast_m8, "-outfmt",
                 '6 ' + ' '.join(m8.BLAST_OUTPUT_NT_SCHEMA.keys()),
                 '-evalue', 1e-10, '-max_target_seqs', 5000, "-num_threads",
                 16
             ],
             # We can only pass BATCH_SIZE as an env var.  The default is 100,000 for blastn;  10,000 for blastp.
             # Blast concatenates input queries until they exceed this size, then runs them together, for efficiency.
             # Unfortunately, if the input contains many short, low-complexity queries, this can greatly
             # expand the memory required.  We have found empirically that 10,000 is a better default.  It is also the
             # value used as default for remote blast.
             env=dict(os.environ, BATCH_SIZE="10000")))
     # Further processing: keep only the top m8 entry for each contig.
     PipelineStepBlastContigs.get_top_m8_nt(blast_m8, blast_top_m8,
                                            min_alignment_length,
                                            min_pident, max_evalue)
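`get_top_m8_nt` is referenced but not shown. A minimal sketch, assuming the standard outfmt-6 column order (the real order comes from m8.BLAST_OUTPUT_NT_SCHEMA, which is not shown here): drop hits below the alignment-length and percent-identity thresholds or above the e-value threshold, then keep the highest-bitscore hit per contig. The real helper may aggregate HSPs differently; the _sketch suffix marks this as illustrative.

def get_top_m8_nt_sketch(blast_m8, blast_top_m8, min_alignment_length, min_pident, max_evalue):
    ''' Hypothetical sketch: filter an outfmt-6 file and keep the best hit per query contig. '''
    best = {}  # query (contig) id -> (bitscore, line)
    with open(blast_m8) as f:
        for line in f:
            row = line.rstrip("\n").split("\t")
            # Assumed default outfmt-6 columns: 2=pident, 3=length, 10=evalue, 11=bitscore (0-based).
            pident, length = float(row[2]), int(row[3])
            evalue, bitscore = float(row[10]), float(row[11])
            if length < min_alignment_length or pident < min_pident or evalue > max_evalue:
                continue
            if row[0] not in best or bitscore > best[row[0]][0]:
                best[row[0]] = (bitscore, line)
    with open(blast_top_m8, "w") as out:
        for _score, line in best.values():
            out.write(line)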
Example #12
 def execute_srst2(self, is_paired, is_fasta, is_zipped):
     """Executes srst2 with appropriate parameters based on whether input files are zipped,
        paired reads and on file type."""
     srst2_params = []
     srst2_params.extend(self.get_common_params())
     if is_fasta:
         file_ext = '.fasta.gz' if is_zipped else '.fasta'
         srst2_params.extend(['--read_type', 'f'])
     else:
         file_ext = '.fastq.gz' if is_zipped else '.fastq'
     if is_paired:
         srst2_params.extend(['--input_pe'])
     else:
         srst2_params.extend(['--input_se'])
     for i, rd in enumerate(self.input_files_local[0]):
         link_name = f"_R{i+1}_001{file_ext}"
         command.execute(
             command_patterns.SingleCommand(cmd='ln',
                                            args=['-sf', rd, link_name]))
         srst2_params.append(link_name)
     if is_paired:
         srst2_params.extend(
             ['--forward', '_R1_001', '--reverse', '_R2_001'])
     command.execute(
         command_patterns.SingleCommand(cmd='srst2', args=srst2_params))
Example #13
    def lzw_compute(input_files, threshold_readlength, slice_step=NUM_SLICES):
        """Spawn subprocesses on NUM_SLICES of the input files, then coalesce the
        scores into a temp file, and return that file's name."""

        temp_file_names = [f"lzwslice_{slice_step}_{slice_start}.txt" for slice_start in range(slice_step + 1)]
        for tfn in temp_file_names:
            assert not os.path.exists(tfn)

        @run_in_subprocess
        def lzw_compute_slice(slice_start):
            """For each read, or read pair, in input_files, such that read_index % slice_step == slice_start,
            output the lzw score for the read, or the min lzw score for the pair."""
            lzw_score = PipelineStepRunLZW.lzw_score
            with open(temp_file_names[slice_start], "a") as slice_output:
                for i, reads in enumerate(fasta.synchronized_iterator(input_files)):
                    if i % slice_step == slice_start:
                        lzw_min_score = min(lzw_score(r.sequence, threshold_readlength) for r in reads)
                        slice_output.write(str(lzw_min_score) + "\n")

        # slices run in parallel
        mt_map(lzw_compute_slice, range(slice_step))

        slice_outputs = temp_file_names[:-1]
        coalesced_score_file = temp_file_names[-1]
        # Paste emits blank lines when the slices have unequal lengths; grep them out so the scores stay line-aligned with the reads.
        command.execute("paste -d '\n' " + " ".join(slice_outputs) + " | grep -v ^$ > " + coalesced_score_file)
        for tfn in slice_outputs:
            os.remove(tfn)
        return coalesced_score_file
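`PipelineStepRunLZW.lzw_score` is the per-read scoring function called by the slice workers. A minimal sketch of one common formulation, an LZW compression ratio used as a low-complexity measure; the handling of reads shorter than `threshold_readlength` and the exact formula are assumptions, not the pipeline's verified scoring:

    @staticmethod
    def lzw_score(sequence, threshold_readlength):
        ''' Hypothetical sketch: fraction of LZW phrases per base (low ratio = repetitive read). '''
        if not sequence or len(sequence) < threshold_readlength:
            return 0.0  # assumed handling of short reads
        dictionary = {ch: True for ch in set(sequence)}  # seed with the single characters present
        word = ""
        phrases = 0
        for ch in sequence:
            candidate = word + ch
            if candidate in dictionary:
                word = candidate
            else:
                phrases += 1  # emit the longest known prefix
                dictionary[candidate] = True
                word = ch
        if word:
            phrases += 1
        return phrases / len(sequence)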
Example #14
    def run(self):
        # Setup
        input_fa = self.input_files_local[0][0]
        out_files = self.output_files_local()
        tmp = os.path.join(self.output_dir_local, "scratch_taxid_locator")
        command.execute(f"mkdir -p {tmp}")

        # TODO: Design a way to map in/out files more robustly, e.g. by name/type
        # Generate locator files for species NT, species NR, genus NT...
        i = 0
        for level in ["species", "genus", "family"]:
            for name in ("NT", "NR"):
                taxid_field = f"{level}_{name.lower()}"
                output_fa = out_files[i]
                output_json = out_files[i + 1]
                PipelineStepGenerateTaxidLocator.generate_locator_work(
                    input_fa, taxid_field, name, output_fa, output_json, tmp)
                i += 2

        # Combine the individual JSON locator files (the odd-indexed entries in out_files)
        input_jsons = [f for i, f in enumerate(out_files) if i % 2 == 1]
        output_json = out_files[-1]  # Last combined file
        PipelineStepGenerateTaxidLocator.combine_json(input_jsons, output_json)

        # Cleanup
        command.execute(f"rm -rf {tmp}")
Example #15
    def run(self):
        """
          1. Extract contigs.fasta and read-contig.sam.
          2. Run pileup to compute per-contig coverage.
        """
        contigs, _scaffolds, read_contig_sam, _stats = self.input_files_local[
            0]
        coverage_json, coverage_summary_csv = self.output_files_local()

        if os.path.getsize(contigs) < MIN_CONTIG_FILE_SIZE:
            command.write_text_to_file('{}', coverage_json)
            command.write_text_to_file('No Contigs', coverage_summary_csv)
            return

        # generate bam files
        bam_file = read_contig_sam.replace(".sam", ".bam")
        command.execute(
            command_patterns.ShellScriptCommand(
                script=
                r'''samtools view -S -b "${read_contig_sam}" | samtools sort - -o "${bam_file}";''',
                named_args={
                    'read_contig_sam': read_contig_sam,
                    'bam_file': bam_file
                }))
        command.execute(
            command_patterns.SingleCommand(cmd="samtools",
                                           args=["index", bam_file]))
        # run coverage info
        output_csv, output_json = self.calc_contig2coverage(bam_file)
        os.rename(output_csv, coverage_summary_csv)
        os.rename(output_json, coverage_json)
Example #16
    def run(self):
        input_fas = self.input_files_local[0][0:2]
        output_fas = self.output_files_local()
        output_sam_file = os.path.join(
            self.output_dir_local,
            self.additional_attributes["output_sam_file"])
        self.additional_output_files_hidden.append(output_sam_file)

        genome_dir = fetch_reference(self.additional_files["gsnap_genome"],
                                     self.ref_dir_local,
                                     allow_s3mi=True,
                                     auto_untar=True)
        gsnap_base_dir = os.path.dirname(genome_dir)
        gsnap_index_name = os.path.basename(genome_dir)
        # Run Gsnap
        gsnap_params = [
            '-A', 'sam', '--batch=0', '--use-shared-memory=0',
            '--gmap-mode=all', '--npaths=1', '--ordered', '-t', 32,
            '--max-mismatches=40', '-D', gsnap_base_dir, '-d',
            gsnap_index_name, '-o', output_sam_file
        ] + input_fas
        command.execute(
            command_patterns.SingleCommand(cmd='gsnapl', args=gsnap_params))
        log.write("Finished GSNAP alignment.")

        # Extract out unmapped files from sam
        if len(input_fas) == 2:
            convert.generate_unmapped_pairs_from_sam(output_sam_file,
                                                     output_fas)
        else:
            convert.generate_unmapped_singles_from_sam(output_sam_file,
                                                       output_fas[0])
Example #17
 def generate_unidentified_fasta(input_fa, output_fa):
     # TODO: remove the annotated fasta intermediate file and replace '>' with ':' below
     command.execute(
         command_patterns.ShellScriptCommand(
             script=
             r'''grep -A 1 '>NR::NT::' "$1" | sed '/^--$/d' > "$2";''',
             args=[input_fa, output_fa]))
Example #18
 def get_genbank_genomes(self,
                         reference_taxids,
                         destination_dir,
                         superkingdom_name,
                         n=10):
     '''
     Retrieve up to n GenBank reference genomes under the reference_taxids.
     Assumes reference_taxids are species-level or below.
     Also assumes they are all in the same superkingdom, which is the only thing we need in our application.
     Saves the references under file names compatible with MakeKSNP3infile.
     TODO: Retrieve the genomes from S3 rather than ftp.ncbi.nih.gov (JIRA/IDSEQ-334).
     '''
     if n == 0 or not reference_taxids:
         return {}
     n_per_taxid = max(n // len(reference_taxids), 1)
     genbank_categories_by_superkingdom = {
         "Viruses": ["viral"],
         "Bacteria": ["bacteria"],
         "Eukaryota": ["fungi", "protozoa"],
         None: ["bacteria", "viral", "fungi", "protozoa"]
     }
     # additional options in genbank that we probably don't need right now:
     # ["archaea", "plant",
     # "vertebrate_mammalian", "vertebrate_other", "invertebrate",
     # "other", "metagenomes"]
     categories = genbank_categories_by_superkingdom[superkingdom_name]
     for cat in categories:
         genome_list_path_s3 = f"s3://idseq-public-references/genbank/{cat}/assembly_summary.txt"  # source: ftp://ftp.ncbi.nih.gov/genomes/genbank/{cat}/assembly_summary.txt
         genome_list_local = s3.fetch_from_s3(genome_list_path_s3,
                                              destination_dir)
         genomes = []
         for taxid in reference_taxids:
             taxid_genomes = PipelineStepGeneratePhyloTree.get_taxid_genomes(
                 genome_list_local, taxid, n_per_taxid)
             genomes += [
                 entry for entry in taxid_genomes if entry not in genomes
             ]
         genomes = genomes[:n]
         command.remove_file(genome_list_local)
         if genomes:
             genbank_fastas = {}
             for line in genomes:
                 assembly_accession, taxid, _species_taxid, _organism_name, ftp_path = line.split(
                     "\t")
                 ftp_fasta_gz = f"{ftp_path}/{os.path.basename(ftp_path)}_genomic.fna.gz"
                 tree_node_name = f"genbank_{self.clean_name_for_ksnp3(assembly_accession)}"
                 local_fasta = f"{destination_dir}/{tree_node_name}.fasta"
                 if os.path.isfile(local_fasta):
                     local_fasta = f"{local_fasta.split('.')[0]}__I.fasta"
                 command.execute(
                     command_patterns.SingleCommand(
                         cmd='wget',
                         args=["-O", f"{local_fasta}.gz", ftp_fasta_gz]))
                 command.execute(
                     command_patterns.SingleCommand(
                         cmd='gunzip', args=[f"{local_fasta}.gz"]))
                 genbank_fastas[assembly_accession] = local_fasta
             return genbank_fastas
     return {}
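`get_taxid_genomes` is referenced above but not shown. The unpacking in the loop above implies each returned entry is a single tab-separated string with exactly five fields (assembly accession, taxid, species taxid, organism name, FTP path). A minimal sketch that projects those fields out of assembly_summary.txt; the 1-based NCBI column numbers used here (1, 6, 7, 8, 20) and the match against both taxid columns are assumptions:

 @staticmethod
 def get_taxid_genomes(genome_list_local, taxid, n_per_taxid):
     ''' Hypothetical sketch: pick up to n_per_taxid assembly_summary.txt rows matching taxid. '''
     matches = []
     with open(genome_list_local) as f:
         for line in f:
             if line.startswith("#"):  # assembly_summary.txt comment/header lines
                 continue
             fields = line.rstrip("\n").split("\t")
             # Assumed 1-based NCBI columns: 1 assembly_accession, 6 taxid,
             # 7 species_taxid, 8 organism_name, 20 ftp_path.
             if str(taxid) in (fields[5], fields[6]):
                 matches.append("\t".join([fields[0], fields[5], fields[6], fields[7], fields[19]]))
                 if len(matches) >= n_per_taxid:
                     break
     return matches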
Example #19
 def trim_adapters_in_place(local_file):
     local_file_trimmed = os.path.join(
         os.path.dirname(local_file),
         "trimmed_" + os.path.basename(local_file))
     command.execute(
         f"cutadapt -a AGATCGGAAGAGCACACGTCT -o {local_file_trimmed} {local_file}"
     )
     command.execute(f"mv {local_file_trimmed} {local_file}")
Example #20
    def __init__(self, *args, **kwrds):
        PipelineStep.__init__(self, *args, **kwrds)
        self.chunks_in_flight = threading.Semaphore(self.additional_attributes['chunks_in_flight'])
        self.chunks_result_dir_local = os.path.join(self.output_dir_local, "chunks")
        self.chunks_result_dir_s3 = os.path.join(self.output_dir_s3, "chunks")
        self.iostream_upload = multiprocessing.Semaphore(MAX_CONCURRENT_CHUNK_UPLOADS)

        command.execute("mkdir -p %s" % self.chunks_result_dir_local)
Example #21
    def run(self):
        output_files = self.output_files_local()
        taxid = self.additional_attributes["taxid"]

        # Retrieve IDseq taxon fasta files
        local_taxon_fasta_files = []
        for _pipeline_run_id, byterange in self.additional_attributes["taxon_byteranges"].items():
            first_byte = byterange[0]
            last_byte = byterange[1]
            s3_file = byterange[2]
            local_basename = byterange[3]
            bucket, key = s3.split_identifiers(s3_file)
            local_file = os.path.join(self.output_dir_local, local_basename)
            s3.fetch_byterange(first_byte, last_byte, bucket, key, local_file)
            local_taxon_fasta_files.append(local_file)

        # Trim Illumina adapters
        # TODO: consider moving this to the beginning of the main pipeline
        PipelineStepGeneratePhyloTree.trim_adapters_in_place(local_taxon_fasta_files)

        # kSNP3 has a command (MakeKSNP3infile) for making a kSNP3-compatible input file from a directory of fasta files.
        # Before we can use the command, we symlink all fasta files to a dedicated directory.
        # The command makes certain unreasonable assumptions we'll need to enforce:
        # - current directory is parent directory of the fasta file directory
        # - file names do not have dots except before extension (also no spaces)
        # - file names cannot be too long (for kSNP3 tree building).
        genome_name_map = PipelineStepGeneratePhyloTree.clean_filename_collection(local_taxon_fasta_files)
        input_dir_for_ksnp3 = f"{self.output_dir_local}/inputs_for_ksnp3"
        command.execute(f"mkdir {input_dir_for_ksnp3}")
        for local_file, genome_name in genome_name_map.items():
            command.execute(f"ln -s {local_file} {input_dir_for_ksnp3}/{genome_name}")

        # Retrieve Genbank references (full assembled genomes).
        # For now, we skip this using the option n=0 because
        # (a) sequences for the accession IDs actually matched by the sample are likely to be more relevant initially
        # (b) the downloads are slow
        # (c) the function only supports species-level taxids. If the phylo_tree's taxid in idseq-web is genus-level or higher,
        #     then we will need to decide on a list of species/strains to be included in the tree and pass those to the function.
        self.get_genbank_genomes(taxid, input_dir_for_ksnp3, 0)

        # Retrieve NCBI NT references for the accessions in the alignment viz files.
        # These are the accessions (not necessarily full genomes) that were actually matched
        # by the sample's reads during GSNAP alignment.
        self.get_accession_sequences(input_dir_for_ksnp3, 10)

        # Run MakeKSNP3infile.
        command.execute(f"cd {input_dir_for_ksnp3}/..; MakeKSNP3infile {os.path.basename(input_dir_for_ksnp3)} {self.output_dir_local}/inputs.txt A")

        # Now run ksnp3.
        # We can choose among 4 different output files, see http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0081760#s2:
        # (1) tree.parsimony.tre: basic, includes no node labels
        # (2) tree_AlleleCounts.parsimony.tre: labels the internal nodes with the number of SNPs that are shared exclusively by the descendants of that node
        # (3) tree_tipAlleleCounts.parsimony.tre: same as (2), but also labels the strain names at the tips with the number of SNPs that are exclusive to that strain.
        # (4) tree_AlleleCounts.parsimony.NodeLabel.tre: labels the internal nodes with the node number separated by an underscore from the number of SNPs that are
        #     shared exclusively by the descendants of that node.
        command.execute(f"cd {self.output_dir_local}; mkdir ksnp3_outputs; kSNP3 -in inputs.txt -outdir ksnp3_outputs -k 13")
        command.execute(f"mv {self.output_dir_local}/ksnp3_outputs/tree_tipAlleleCounts.parsimony.tre {output_files[0]}")
Example #22
 def grab_wgs_accessions(self, source_file, dest_file):
     command.execute(
         command_patterns.ShellScriptCommand(
             script=
             r'''grep '^>' "${source_file}" | grep 'complete genome' | cut -f 1 -d' ' > "${dest_file}";''',
             named_args={
                 'source_file': source_file,
                 'dest_file': dest_file
             }))
Example #23
    def run_star_part(self,
                      output_dir,
                      genome_dir,
                      input_files,
                      count_genes,
                      use_starlong):
        command.make_dirs(output_dir)

        cpus = str(multiprocessing.cpu_count())
        cd = output_dir
        cmd = 'STARlong' if use_starlong else 'STAR'
        params = [
            '--outFilterMultimapNmax', '99999',
            '--outFilterScoreMinOverLread', '0.5',
            '--outFilterMatchNminOverLread', '0.5',
            '--outReadsUnmapped', 'Fastx',
            '--outFilterMismatchNmax', '999',
            '--clip3pNbases', '0',
            '--runThreadN', cpus,
            '--genomeDir', genome_dir,
            '--readFilesIn', *input_files
        ]

        if self.collect_insert_size_metrics_for == "rna":
            params += [
                '--outSAMtype', 'BAM', 'Unsorted',
                '--outSAMmode', 'NoQS',
                # Based on experimentation we always want --quantMode TranscriptomeSAM GeneCounts
                #   for RNA to collect transcriptome-specific results to compute insert size metrics on
                #   https://czi.quip.com/4niiAhiJsFNx/2019-11-15-CollectInsertSizeMetrics-for-RNA
                '--quantMode', 'TranscriptomeSAM', 'GeneCounts',
            ]
        else:
            if self.collect_insert_size_metrics_for == "dna":
                params += ['--outSAMtype', 'BAM', 'Unsorted', '--outSAMmode', 'NoQS', ]
            else:
                params += ['--outSAMmode', 'None']

            count_file = f"{genome_dir}/sjdbList.fromGTF.out.tab"
            if count_genes and os.path.isfile(count_file):
                params += ['--quantMode', 'GeneCounts']

        if use_starlong:
            params += [
                '--seedSearchStartLmax', '20',
                '--seedPerReadNmax', '100000',
                '--seedPerWindowNmax', '1000',
                '--alignTranscriptsPerReadNmax', '100000',
                '--alignTranscriptsPerWindowNmax', '10000']

        command.execute(
            command_patterns.SingleCommand(
                cd=cd,
                cmd=cmd,
                args=params
            )
        )
Example #24
    def chunk_input(self, input_files, chunksize):
        """Chunk input files into pieces for performance and parallelism."""
        part_lists = []  # Lists of partial files
        known_nlines = None
        part_suffix = ""
        chunk_nlines = chunksize * 2

        for input_file in input_files:
            # Count number of lines in the file
            nlines = int(
                command.execute_with_output("wc -l %s" %
                                            input_file).strip().split()[0])
            # Number of lines should be the same in paired files
            if known_nlines is not None:
                msg = "Mismatched line counts in supposedly paired files: {}".format(
                    input_files)
                assert nlines == known_nlines, msg
            known_nlines = nlines

            # Set number of pieces and names
            numparts = (nlines + chunk_nlines - 1) // chunk_nlines
            ndigits = len(str(numparts - 1))
            part_suffix = "-chunksize-%d-numparts-%d-part-" % (chunksize,
                                                               numparts)
            out_prefix_base = os.path.basename(input_file) + part_suffix
            out_prefix = os.path.join(self.chunks_result_dir_local,
                                      out_prefix_base)

            # Split large file into smaller named pieces
            command.execute("split -a %d --numeric-suffixes -l %d %s %s" %
                            (ndigits, chunk_nlines, input_file, out_prefix))
            command.execute_with_retries(
                f"aws s3 sync --only-show-errors {self.chunks_result_dir_local}/ {self.chunks_result_dir_s3}/ --exclude '*' --include '{out_prefix_base}*'"
            )

            # Get the partial file names
            partial_files = []
            paths = command.execute_with_output(
                "ls %s*" % out_prefix).rstrip().split("\n")
            for pf in paths:
                partial_files.append(os.path.basename(pf))

            # Check that the partial files match our expected chunking pattern
            pattern = "{:0%dd}" % ndigits
            expected_partial_files = [(out_prefix_base + pattern.format(i))
                                      for i in range(numparts)]
            msg = "something went wrong with chunking: {} != {}".format(
                partial_files, expected_partial_files)
            assert expected_partial_files == partial_files, msg
            part_lists.append(partial_files)

        # Ex: [["input_R1.fasta-part-1", "input_R2.fasta-part-1"],
        # ["input_R1.fasta-part-2", "input_R2.fasta-part-2"],
        # ["input_R1.fasta-part-3", "input_R2.fasta-part-3"],...]
        input_chunks = [list(part) for part in zip(*part_lists)]
        return part_suffix, input_chunks
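For context, a hypothetical caller of `chunk_input`; the chunk size and the `run_chunk` dispatch method below are illustrative placeholders, not the pipeline's actual names. The point is that each chunk pairs up the matching R1/R2 pieces, so a chunk can be dispatched as a unit:

        # Hypothetical usage sketch (names below are placeholders):
        part_suffix, input_chunks = self.chunk_input(input_files, chunksize=15000)
        for chunk_id, chunk in enumerate(input_chunks):
            # chunk holds one partial file per paired input, e.g.
            # ["input_R1.fasta-part-1", "input_R2.fasta-part-1"]
            self.run_chunk(part_suffix, chunk_id, chunk)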
Example #25
def multilinefa2singlelinefa(input_fasta, output_fasta):
    ''' Multi-line FASTA to Single-line FASTA conversion '''
    command.execute(
        command_patterns.ShellScriptCommand(
            script=
            r'''awk 'NR==1 {print $0} NR>1 && /^>/ {printf("\n%s\n",$0);next; } NR>1 { printf("%s",$0);}  END {printf("\n");}' <"${input_fasta}" > "${output_fasta}";''',
            named_args={
                'input_fasta': input_fasta,
                'output_fasta': output_fasta
            }))
Example #26
def fq2fa(input_fastq, output_fasta):
    ''' FASTQ to FASTA conversion '''
    command.execute(
        command_patterns.ShellScriptCommand(
            script=
            r'''sed -n '1~4s/^@/>/p;2~4p' <"${input_fastq}" > "${output_fasta}";''',
            named_args={
                'input_fastq': input_fastq,
                'output_fasta': output_fasta
            }))
Example #27
 def __delete_remote_dir(self, remote_dir, key_path, remote_username,
                         instance_ip):
     """
     Delete a directory on a remote machine
     This needs to happen while we are holding the machine reservation,
     i.e., inside the "with ASGInstance" context.
     """
     rm_command = f"rm -rf {remote_dir}"
     command.execute(
         command.remote(rm_command, key_path, remote_username, instance_ip))
Example #28
 def run(self):
     # lz4 is "scalable with multi-cores CPU" so we let it parallelize
     # itself. See https://github.com/lz4/lz4 .
     for file_list in self.input_files_local:
         for input_file in file_list:
             if input_file.endswith(('.gz', '.zip', '.lz4')):
                 log.log_event(
                     f'Skipping already-compressed file {input_file}')
             else:
                 command.execute(self.get_command(input_file))
Example #29
 def run(self):
     '''
          Dummy implementation: just copy the files over.
          The real thing is to be implemented later.
     '''
     input_files = self.input_files_local[0]
     output_files = self.output_files_local()
     for i in range(len(input_files)):
         command.execute(f"cp {input_files[i]} {output_files[i]}")
     command.execute(f"echo 1234 > {output_files[4]}")
Example #30
 def validate(self):
     ''' Make sure all the output files are generated. '''
     for f in self.output_files_local():
         if not os.path.exists(f):
             raise RuntimeError(
                 "output file %s should be generated after run" % f)
         # Tag the done files
         done_file = self.done_file(f)
         command.execute("date > %s" % done_file)
     self.count_reads()