def fetch_ncbi(accession):
    query = accession
    base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
    search_url = f"{base}/esearch.fcgi?db=nuccore&term={query}&usehistory=y"
    output = command.execute_with_output(
        command_patterns.SingleCommand(
            cmd="curl",
            args=[search_url]
        )
    )
    root = ET.fromstring(output)
    web = root.find('WebEnv').text
    key = root.find('QueryKey').text
    fetch_url = f"{base}/efetch.fcgi?db=nuccore&query_key={key}&WebEnv={web}&rettype=gb&retmode=xml"
    genbank_xml = command.execute_with_output(
        command_patterns.SingleCommand(
            cmd="curl",
            args=[fetch_url]
        )
    )
    return {
        'search_url': search_url,
        'fetch_url': fetch_url,
        'genbank_xml': genbank_xml
    }
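A minimal usage sketch (the accession is purely illustrative, and ET is assumed to be xml.etree.ElementTree as used in the snippet above); pulling GBSeq_definition out of the fetched XML mirrors the parsing shown in Example #7:

result = fetch_ncbi("NC_001422.1")  # hypothetical accession, for illustration only
gbseq = ET.fromstring(result['genbank_xml']).find('GBSeq')
if gbseq is not None:
    print(gbseq.find('GBSeq_definition').text)  # the record's definition line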
Example #2
    def chunk_input(self, input_files, chunksize):
        """Chunk input files into pieces for performance and parallelism."""
        part_lists = []  # Lists of partial files
        known_nlines = None
        part_suffix = ""
        chunk_nlines = chunksize * 2

        for input_file in input_files:
            # Count number of lines in the file
            nlines = int(
                command.execute_with_output("wc -l %s" %
                                            input_file).strip().split()[0])
            # Number of lines should be the same in paired files
            if known_nlines is not None:
                msg = "Mismatched line counts in supposedly paired files: {}".format(
                    input_files)
                assert nlines == known_nlines, msg
            known_nlines = nlines

            # Set number of pieces and names
            numparts = (nlines + chunk_nlines - 1) // chunk_nlines
            ndigits = len(str(numparts - 1))
            part_suffix = "-chunksize-%d-numparts-%d-part-" % (chunksize,
                                                               numparts)
            out_prefix_base = os.path.basename(input_file) + part_suffix
            out_prefix = os.path.join(self.chunks_result_dir_local,
                                      out_prefix_base)

            # Split large file into smaller named pieces
            command.execute("split -a %d --numeric-suffixes -l %d %s %s" %
                            (ndigits, chunk_nlines, input_file, out_prefix))
            command.execute_with_retries(
                f"aws s3 sync --only-show-errors {self.chunks_result_dir_local}/ {self.chunks_result_dir_s3}/ --exclude '*' --include '{out_prefix_base}*'"
            )

            # Get the partial file names
            partial_files = []
            paths = command.execute_with_output(
                "ls %s*" % out_prefix).rstrip().split("\n")
            for pf in paths:
                partial_files.append(os.path.basename(pf))

            # Check that the partial files match our expected chunking pattern
            pattern = "{:0%dd}" % ndigits
            expected_partial_files = [(out_prefix_base + pattern.format(i))
                                      for i in range(numparts)]
            msg = "something went wrong with chunking: {} != {}".format(
                partial_files, expected_partial_files)
            assert expected_partial_files == partial_files, msg
            part_lists.append(partial_files)

        # Ex: [["input_R1.fasta-part-1", "input_R2.fasta-part-1"],
        # ["input_R1.fasta-part-2", "input_R2.fasta-part-2"],
        # ["input_R1.fasta-part-3", "input_R2.fasta-part-3"],...]
        input_chunks = [list(part) for part in zip(*part_lists)]
        return part_suffix, input_chunks
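A small illustration of the final transposition (file names invented for clarity): part_lists groups chunk files per input file, and zip(*part_lists) regroups them per chunk so paired R1/R2 pieces travel together.

part_lists = [["R1.fasta-part-00", "R1.fasta-part-01"],
              ["R2.fasta-part-00", "R2.fasta-part-01"]]
input_chunks = [list(part) for part in zip(*part_lists)]
# -> [["R1.fasta-part-00", "R2.fasta-part-00"], ["R1.fasta-part-01", "R2.fasta-part-01"]]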
Example #3
 def get_total_reads(self, is_zipped, is_fasta):
     """Gets the total number of reads in the sample by counting them directly from the
         fastq or fasta files."""
     # TODO: factor out into utility function, see nonhost_fastq
     input_filenames = self.input_files_local[0]
     if is_zipped:
         unzipped_filenames = []
         for filename in input_filenames:
             if not os.path.exists(filename[:len(filename) - 3]):
                 command.execute(
                     command_patterns.SingleCommand(
                         cmd='gunzip',
                         args=[
                             '-k',
                             filename
                         ]
                     )
                 )
             unzipped_filenames.append(filename[:len(filename) - 3])
         input_filenames = unzipped_filenames
     if is_fasta:  # Number of lines per read can vary, so we use grep
         grep_output = command.execute_with_output(
             command_patterns.SingleCommand(
                 cmd='grep',
                 args=[
                     '-c',
                     '^>',  # FASTA read headers start with ">"; "^" anchors the match to line start.
                     *input_filenames
                 ]
             )
         )
         output_lines = [line for line in grep_output.split("\n") if line != '']
         if ":" in output_lines[0]:
             # for paired fastas - when run on just one file, grep outputs only
             # a number. But when this command is run on two files, grep outputs
             # a string formatted as filename:count for each file, with count being
             # what we want to add up.
             read_counts = map(lambda line: int(line.split(":")[1]), output_lines)
             return reduce(lambda x, y: x + y, list(read_counts))
         else:
             return int(output_lines[0])
     else:  # fastqs have 4 lines for every read, so we count lines and divide by 4
         wc_params = ['wc', '-l']
         wc_params.extend(input_filenames)
         wc_output = command.execute_with_output(" ".join(wc_params))
         # Take the count from the last line of wc output: for paired reads this is
         # the combined total across files; for unpaired reads it is the only line.
         wc_lines = [line for line in wc_output.split("\n") if line != '']
         wc_target_line = [line for line in wc_lines[-1].split(" ") if line != '']
         total_line_count = int(wc_target_line[0])
         return total_line_count // 4
Example #4
 def get_genbank_genomes(self, reference_taxids, destination_dir, superkingdom_name, n=10):
     '''
     Retrieve up to n GenBank reference genomes under the reference_taxids.
     Assumes reference_taxids are species-level or below.
     Also assumes they all belong to the same superkingdom, which is all our application needs.
     Saves the references under file names compatible with MakeKSNP3infile.
     TODO: Retrieve the genomes from S3 rather than ftp.ncbi.nih.gov (JIRA/IDSEQ-334).
     '''
     if n == 0 or not reference_taxids:
         return {}
     n_per_taxid = max(n // len(reference_taxids), 1)
     genbank_categories_by_superkingdom = {
         "Viruses": ["viral"],
         "Bacteria": ["bacteria"],
         "Eukaryota": ["fungi", "protozoa"],
         None: ["bacteria", "viral", "fungi", "protozoa"]
     }
     # additional options in genbank that we probably don't need right now:
     # ["archaea", "plant", 
     # "vertebrate_mammalian", "vertebrate_other", "invertebrate",
     # "other", "metagenomes"]
     categories = genbank_categories_by_superkingdom[superkingdom_name]
     for cat in categories:
         genome_list_path_s3 = f"s3://idseq-database/genbank/{cat}/assembly_summary.txt" # source: ftp://ftp.ncbi.nih.gov/genomes/genbank/{cat}/assembly_summary.txt
         genome_list_local = s3.fetch_from_s3(genome_list_path_s3, destination_dir)
         genomes = []
         for taxid in reference_taxids:
             cmd = f"cut -f1,6,7,8,20 {genome_list_local}" # columns: 1 = assembly_accession; 6 = taxid; 7 = species_taxid, 8 = organism_name, 20 = ftp_path
             cmd += f" | awk -F '\t' '$2 == {taxid}'" # try to find taxid in the taxid column (2nd column of the piped input)
             cmd += f" | head -n {n_per_taxid}" # take only top n_per_taxid results
             taxid_genomes = list(filter(None, command.execute_with_output(cmd).split("\n")))
             genomes += [entry for entry in taxid_genomes if entry not in genomes]
         genomes = genomes[:n]
         command.execute_with_output(f"rm {genome_list_local}")
         if genomes:
             genbank_fastas = {}
             for line in genomes:
                 assembly_accession, taxid, species_taxid, organism_name, ftp_path = line.split("\t")
                 ftp_fasta_gz = f"{ftp_path}/{os.path.basename(ftp_path)}_genomic.fna.gz"
                 tree_node_name = f"genbank_{self.clean_name_for_ksnp3(assembly_accession)}"
                 local_fasta = f"{destination_dir}/{tree_node_name}.fasta"
                 if os.path.isfile(local_fasta):
                     local_fasta = f"{local_fasta.split('.')[0]}__I.fasta"
                 command.execute(f"wget -O {local_fasta}.gz {ftp_fasta_gz}")
                 command.execute(f"gunzip {local_fasta}.gz")
                 genbank_fastas[assembly_accession] = local_fasta
             return genbank_fastas
     return {}
Example #5
 def __check_if_output_is_corrupt(
         self,
         service,
         key_path,
         remote_username,
         instance_ip,  # self unused
         multihit_remote_outfile,
         chunk_id,
         try_number):
     # Check if every row has correct number of columns (12) in the output
     # file on the remote machine
     if service == "gsnap":
         verification_command = "cat %s" % shlex.quote(
             multihit_remote_outfile)
     else:
         # For rapsearch, first remove header lines starting with '#'
         verification_command = "grep -v '^#' %s" % shlex.quote(
             multihit_remote_outfile)
     verification_command += " | awk '{print NF}' | sort -nu | head -n 1"
     min_column_number_string = command.execute_with_output(
         command.remote(verification_command, key_path, remote_username,
                        instance_ip))
     min_column_number = PipelineStepRunAlignmentRemotely.__interpret_min_column_number_string(
         min_column_number_string, CORRECT_NUMBER_OF_OUTPUT_COLUMNS,
         try_number)
     error = None
     if min_column_number != CORRECT_NUMBER_OF_OUTPUT_COLUMNS:
         msg = "Chunk %s output corrupt; not copying to S3. min_column_number = %d -> expected = %d."
         msg += " Re-start pipeline to try again."
         error = msg % (chunk_id, min_column_number,
                        CORRECT_NUMBER_OF_OUTPUT_COLUMNS)
     return error
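The __interpret_min_column_number_string helper is not shown in this snippet; a sketch consistent with the inline version used in Example #29 (an empty string means awk saw no rows, i.e. no hits, which is treated as well-formed output):

@staticmethod
def __interpret_min_column_number_string(min_column_number_string,
                                          correct_number_of_output_columns,
                                          try_number):
    if min_column_number_string:
        min_column_number = float(min_column_number_string)
        log.write("Try no. %d: Smallest number of columns observed in any line was %d"
                  % (try_number, min_column_number))
    else:
        log.write("Try no. %d: No hits" % try_number)
        min_column_number = correct_number_of_output_columns
    return min_column_number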
Example #6
    def subsample_fastas(input_fas, output_fas, max_fragments):
        ''' In memory subsampling '''
        paired = len(input_fas) >= 2
        # count lines
        cmd = "wc -l %s | cut -f1 -d ' '" % input_fas[0]
        total_records = int(command.execute_with_output(cmd)) // 2
        log.write("total reads: %d" % total_records)
        log.write("target reads: %d" % max_fragments)
        if total_records <= max_fragments:
            for infile, outfile in zip(input_fas, output_fas):
                command.execute("cp %s %s" % (infile, outfile))
            return

        # total_records > max_fragments, sample
        randgen = random.Random(x=hash(input_fas[0]))
        records_to_keep = randgen.sample(range(total_records), max_fragments)
        PipelineStepRunSubsample.subset(input_fas[0], output_fas[0],
                                        records_to_keep)
        if paired:
            PipelineStepRunSubsample.subset(input_fas[1], output_fas[1],
                                            records_to_keep)
            if len(input_fas) == 3 and len(output_fas) == 3:
                # subset the merged fasta
                records_to_keep_merged = []
                for r in records_to_keep:
                    records_to_keep_merged += [2 * r, 2 * r + 1]
                PipelineStepRunSubsample.subset(input_fas[2], output_fas[2],
                                                records_to_keep_merged)
Example #7
 def get_accession_metadata(accession):
     '''
     Retrieve metadata of an NCBI accession (e.g. name, country, collection date)
     TODO: Put this data in S3 instead and get it from there.
     '''
     accession_metadata = {}
     efetch_command = ";".join([
         f"QUERY={accession}",
         "BASE=https://eutils.ncbi.nlm.nih.gov/entrez/eutils",
         "SEARCH_URL=${BASE}/esearch.fcgi?db=nuccore\&term=${QUERY}\&usehistory=y",
         "OUTPUT=$(curl $SEARCH_URL)",
         "WEB=$(echo $OUTPUT | sed -e 's/.*<WebEnv>\(.*\)<\/WebEnv>.*/\\1/')",
         "KEY=$(echo $OUTPUT | sed -e 's/.*<QueryKey>\(.*\)<\/QueryKey>.*/\\1/')",
         "FETCH_URL=${BASE}/efetch.fcgi?db=nuccore\&query_key=${KEY}\&WebEnv=${WEB}\&rettype=gb\&retmode=xml",
         f"curl $FETCH_URL"
     ])
     genbank_xml = command.execute_with_output(efetch_command)
     root = ET.fromstring(genbank_xml).find('GBSeq')
     if not root:
         log.write(f"WARNING: {efetch_command} did not give a result")
         return accession_metadata
     accession_metadata['name'] = root.find('GBSeq_definition').text
     qualifiers_needed = {'country', 'collection_date'}
     for entry in root.find('GBSeq_feature-table')[0].find('GBFeature_quals'):
         if all(key in accession_metadata for key in qualifiers_needed):
             break
         for key in qualifiers_needed - accession_metadata.keys():
             if entry.find('GBQualifier_name').text == key:
                 accession_metadata[key] = entry.find('GBQualifier_value').text
     return accession_metadata
Example #8
def reads(local_file_path, max_reads=None):
    '''
    Count reads in a local file based on file format inferred from extension,
    up to a maximum of max_reads.
    '''
    if local_file_path.endswith(".gz"):
        cmd = r'''zcat "${local_file_path}"'''
        file_format = local_file_path.split(".")[-2]
    else:
        cmd = r'''cat "${local_file_path}"'''
        file_format = local_file_path.split(".")[-1]

    named_args = {
        'local_file_path': local_file_path
    }

    if max_reads:
        max_lines = reads2lines(max_reads, file_format)
        assert max_lines is not None, "Could not convert max_reads to max_lines"
        cmd += r''' | head -n "${max_lines}"'''
        named_args.update({
            'max_lines': max_lines
        })

    cmd += " |  wc -l"

    cmd_output = command.execute_with_output(
        command_patterns.ShellScriptCommand(
            script=cmd,
            named_args=named_args
        )
    )
    line_count = int(cmd_output.strip().split(' ')[0])
    return lines2reads(line_count, file_format)
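The reads2lines and lines2reads helpers are not shown here; a plausible sketch, assuming single-line FASTA records (matching the division by 2 in Example #6) and 4-line FASTQ records (matching the division by 4 in Example #3):

def reads2lines(read_count, file_format):
    # Convert a read count to the corresponding number of lines; None for unknown formats.
    if file_format in ("fastq", "fq"):
        return read_count * 4
    if file_format in ("fasta", "fa"):
        return read_count * 2
    return None

def lines2reads(line_count, file_format):
    # Inverse of reads2lines for the same formats.
    if file_format in ("fastq", "fq"):
        return line_count // 4
    if file_format in ("fasta", "fa"):
        return line_count // 2
    return None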
Example #9
    def subsample_fastas(input_fas, output_fas, max_fragments):
        ''' In memory subsampling '''
        paired = len(input_fas) >= 2
        # count lines
        cmd_output = command.execute_with_output(
            command_patterns.SingleCommand(cmd="wc", args=["-l",
                                                           input_fas[0]]))
        lines_count = int(cmd_output.strip().split(' ')[0])
        total_records = lines_count // 2
        log.write("total reads: %d" % total_records)
        log.write("target reads: %d" % max_fragments)
        if total_records <= max_fragments:
            for infile, outfile in zip(input_fas, output_fas):
                command.copy_file(infile, outfile)
            return

        # total_records > max_fragments, sample
        randgen = random.Random(x=hash(input_fas[0]))
        records_to_keep = randgen.sample(range(total_records), max_fragments)
        PipelineStepRunSubsample.subset(input_fas[0], output_fas[0],
                                        records_to_keep)
        if paired:
            PipelineStepRunSubsample.subset(input_fas[1], output_fas[1],
                                            records_to_keep)
            if len(input_fas) == 3 and len(output_fas) == 3:
                # subset the merged fasta
                records_to_keep_merged = []
                for r in records_to_keep:
                    records_to_keep_merged += [2 * r, 2 * r + 1]
                PipelineStepRunSubsample.subset(input_fas[2], output_fas[2],
                                                records_to_keep_merged)
Example #10
    def test_execute_python_cmd(self):
        '''WHEN using SingleCommand to invoke a .py file, THEN it works as expected'''
        cp1 = command_patterns.SingleCommand(cmd=TESTSCRIPT_HAPPY_PY,
                                             args=["Hello!"])

        result = command.execute_with_output(cp1)

        self.assertEqual(result, "Python is happy to say: Hello!\n")
Example #11
def check_s3_presence(s3_path):
    """True if s3_path exists. False otherwise."""
    try:
        o = command.execute_with_output("aws s3 ls %s" % s3_path)
        if o:
            return True
    except Exception:
        pass
    return False
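The snippet above still passes a legacy command string; a minimal sketch of the same check rewritten with command_patterns.SingleCommand (the pattern the warning in Example #12 points to):

def check_s3_presence(s3_path):
    """True if s3_path exists. False otherwise."""
    try:
        # `aws s3 ls` prints matching keys; empty output means nothing was found.
        output = command.execute_with_output(
            command_patterns.SingleCommand(cmd="aws", args=["s3", "ls", s3_path]))
        return bool(output)
    except Exception:
        return False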
Example #12
    def test_execute_legacy_format(self, _mock_log):
        '''WHEN command is a string, THEN execute and log a warning'''
        result = command.execute_with_output("echo 123")

        self.assertEqual(result, "123\n")
        _mock_log.assert_any_call(
            warning=True,
            message=MATCH_RE(".*legacy.*Use.*command_patterns"),
            obj_data=ANY)
Example #13
    def test_execute_single_command_2(self):
        '''WHEN using SingleCommand with args that have special shell characters, THEN it doesn't execute subcommands'''
        assert " " in TESTFILE_ABC_TXT

        cp1 = command_patterns.SingleCommand(cmd="cat",
                                             args=[TESTFILE_ABC_TXT])

        result = command.execute_with_output(cp1)

        self.assertEqual(result, "abc")
Example #14
 def name_samples_vcf(self, input_file, output_file):
     # The VCF has standard columns CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT,
     # followed by 1 column for each of the pipeline_run_ids. This function replaces the pipeline_run_ids
     # by the corresponding sample names so that users can understand the file.
     sample_names_by_run_ids = self.additional_attributes["sample_names_by_run_ids"]
     vcf_columns = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"
     column_description_line = command.execute_with_output(f"awk /'^{vcf_columns}'/ {input_file}")
     run_ids_in_order = [id for id in column_description_line.split("FORMAT\t")[1].split("\t") if convert.can_convert_to_int(id)]
     sample_names_in_order = [sample_names_by_run_ids.get(id, f"pipeline_run_{id}") for id in run_ids_in_order]
     new_column_description = '\t'.join([vcf_columns] + sample_names_in_order)
     command.execute(f"sed 's/^#CHROM.*/{new_column_description}/' {input_file} > {output_file}")
Example #15
    def test_shellscript_with_param_array(self):
        '''WHEN using ShellScriptCommand to invoke a command with an array of parameters, THEN it works as expected'''
        cp1 = command_patterns.ShellScriptCommand(
            script=r'''paste "${slice_outputs[@]}"''',
            named_args={
                'slice_outputs':
                ["-d", r"\n", TESTFILE_ABC_TXT, TESTFILE_BCD_TXT]
            })

        result = command.execute_with_output(cp1)

        self.assertEqual(result, "abc\nbcd\n")
Example #16
    def test_specific_pattern_1(self):
        '''WHEN using ShellScriptCommand with multiline script, THEN it works as expected'''
        cp1 = command_patterns.ShellScriptCommand(script=r'''
                a=123;
                echo May $a the force be with you \
                | sed "s/a/Z/g"
            ''',
                                                  args=[])

        result = command.execute_with_output(cp1)

        self.assertEqual(result, "MZy 123 the force be with you\n")
Example #17
 def poll_server(ip):
     # ServerAliveInterval to fix issue with containers keeping open
     # an SSH connection even after worker machines had finished
     # running.
     commands = "ps aux | grep %s | grep -v bash || echo error" % service_name
     output = command.execute_with_output(
         command.remote(commands, key_path, remote_username, ip),
         timeout=MAX_POLLING_LATENCY).rstrip().split("\n")
     if output != ["error"]:
         with dict_mutex:
             if dict_writable:
                 ip_nproc_dict[ip] = len(output) - 1
Example #18
def get_server_ips_work(service_name, environment):
    tag = "service"
    value = "%s-%s" % (service_name, environment)
    describe_json = json.loads(
        command.execute_with_output(
            "aws ec2 describe-instances --filters 'Name=tag:%s,Values=%s' 'Name=instance-state-name,Values=running'"
            % (tag, value)))
    server_ips = [
        instance["NetworkInterfaces"][0]["PrivateIpAddress"]
        for reservation in describe_json["Reservations"] 
        for instance in reservation["Instances"]
    ]
    
    return server_ips
Example #19
 def get_genbank_genomes(self, taxid, destination_dir, n=10):
     '''
     Retrieve up to n GenBank reference genomes under taxid.
     Assumes taxid is species-level.
     Saves the references under file names compatible with MakeKSNP3infile.
     '''
     if n == 0:
         return []
     categories = ["bacteria", "viral", "fungi", "protozoa"]
     # additional options in genbank that we probably don't need right now:
     # ["archaea", "plant", 
     # "vertebrate_mammalian", "vertebrate_other", "invertebrate",
     # "other", "metagenomes"]
     for cat in categories:
         genome_list_path = f"ftp://ftp.ncbi.nih.gov/genomes/genbank/{cat}/assembly_summary.txt"
         genome_list_local = f"{destination_dir}/{os.path.basename(genome_list_path)}"
         cmd = f"wget -O {genome_list_local} {genome_list_path}; "
         cmd += f"cut -f6,7,8,20 {genome_list_local}" # columns: 6 = taxid; 7 = species_taxid, 8 = organism name, 20 = ftp_path
         cmd += f" | grep -P '\\t{taxid}\\t'" # try to find taxid in the species_taxids
         cmd += f" | head -n {n} | cut -f1,3,4" # take only top n results, keep name and ftp_path
         genomes = list(filter(None, command.execute_with_output(cmd).split("\n")))
         command.execute_with_output(f"rm {genome_list_local}")
         if genomes:
             local_genbank_fastas = []
             for line in genomes:
                 taxid, organism_name, ftp_path = line.split("\t")
                 clean_organism_name = PipelineStepGeneratePhyloTree.clean_name_for_ksnp3(organism_name)
                 ftp_fasta_gz = f"{ftp_path}/{os.path.basename(ftp_path)}_genomic.fna.gz"
                 local_fasta = f"{destination_dir}/genbank__{clean_organism_name}__taxid-{taxid}.fasta"
                 if os.path.isfile(local_fasta):
                     local_fasta = f"{local_fasta.split('.')[0]}__I.fasta"
                 command.execute(f"wget -O {local_fasta}.gz {ftp_fasta_gz}")
                 command.execute(f"gunzip {local_fasta}.gz")
                 local_genbank_fastas.append(local_fasta)
             return local_genbank_fastas
     return []
Example #20
def get_server_ips_work(service_name, environment, draining_tag):
    ''' return a dict of relevant instance IPs to instance IDs '''
    tag = "service"
    value = "%s-%s" % (service_name, environment)
    describe_json = json.loads(
        command.execute_with_output(
            "aws ec2 describe-instances --filters 'Name=tag:%s,Values=%s' 'Name=instance-state-name,Values=running'"
            % (tag, value)))
    server_ips = {
        instance["NetworkInterfaces"][0]["PrivateIpAddress"]: instance["InstanceId"]
        for reservation in describe_json["Reservations"] 
        for instance in reservation["Instances"]
        if draining_tag not in [tag["Key"] for tag in instance["Tags"]]
    }
    return server_ips
Example #21
 def _vcf_new_column_description(input_file, sample_names_by_run_ids):
     vcf_columns = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"
     column_description_line = command.execute_with_output(
         command_patterns.SingleCommand(
             cmd='awk', args=[f"/^{vcf_columns}/", input_file]))
     column_description_line = column_description_line.strip()
     additional_columns = column_description_line.split(
         "FORMAT\t")[1].split("\t")
     sample_names_in_order = [
         sample_names_by_run_ids.get(id, f"pipeline_run_{id}")
         if convert.can_convert_to_int(id) else id
         for id in additional_columns
     ]
     new_column_description = '\t'.join([vcf_columns] +
                                        sample_names_in_order)
     return new_column_description
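A sketch of how the rewritten header line could be applied to produce the output VCF, following the sed substitution from Example #14 but expressed with ShellScriptCommand named_args (the variables input_file, output_file and sample_names_by_run_ids are assumed from the calling context):

new_column_description = _vcf_new_column_description(input_file, sample_names_by_run_ids)
command.execute(
    command_patterns.ShellScriptCommand(
        script=r'''sed "s|^#CHROM.*|${new_column_description}|" "${input_file}" > "${output_file}"''',
        named_args={
            'new_column_description': new_column_description,
            'input_file': input_file,
            'output_file': output_file
        }))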
Example #22
    def test_execute_single_command_1(self):
        '''WHEN using SingleCommand with args that have special shell characters, THEN it doesn't execute subcommands'''
        cp1 = command_patterns.SingleCommand(cmd="echo",
                                             args=[
                                                 "1", 2, "$(pwd)", ";ls",
                                                 "&& ls", "`pwd`", ">",
                                                 "test.txt", "> test.txt",
                                                 ">> test.txt", "&&", "ls",
                                                 "$", "pwd"
                                             ])

        result = command.execute_with_output(cp1)

        self.assertEqual(
            result,
            "1 2 $(pwd) ;ls && ls `pwd` > test.txt > test.txt >> test.txt && ls $ pwd\n"
        )
Example #23
    def test_execute_shell_script_command_2(self):
        '''WHEN using SingleCommand with args that have special shell characters, THEN it doesn't execute subcommands'''
        cp1 = command_patterns.ShellScriptCommand(
            script=r'echo ${@: 1:$#-1} | sed ${@: $#}',
            args=[
                "1", 2, "$(pwd)", ";ls", ";", "ls", "\n", "ls", "&& ls",
                "`pwd`", ">", "test.txt", "> test.txt", ">> test.txt", "&&",
                "ls", "$", "abc\nls"
                "pwd", "s/w/a/g"
            ])

        result = command.execute_with_output(cp1)

        self.assertEqual(
            result,
            "1 2 $(pad) ;ls ; ls ls && ls `pad` > test.txt > test.txt >> test.txt && ls $ abc lspad\n"
        )
Example #24
    def run(self):
        input_fas = self.input_files_local[0][0:2]
        output_fas = self.output_files_local()
        genome_dir = fetch_from_s3(self.additional_files["bowtie2_genome"],
                                   self.ref_dir_local,
                                   allow_s3mi=True,
                                   auto_untar=True)
        output_sam_file = os.path.join(
            self.output_dir_local,
            self.additional_attributes["output_sam_file"])
        self.additional_files_to_upload.append(output_sam_file)
        # The file structure looks like
        # "bowtie2_genome/GRCh38.primary_assembly.genome.3.bt2"
        # The code below will handle up to "bowtie2_genome/GRCh38.primary_assembly.
        # genome.99.bt2" but not 100.
        cmd = "ls {genome_dir}/*.bt2*".format(genome_dir=genome_dir)
        local_genome_dir_ls = command.execute_with_output(cmd)
        genome_basename = local_genome_dir_ls.split("\n")[0][:-6]
        if genome_basename[-1] == '.':
            genome_basename = genome_basename[:-1]
        bowtie2_params = [
            'bowtie2', '-q', '-x', genome_basename, '-f',
            '--very-sensitive-local', '-S', output_sam_file
        ]

        seed = self.additional_attributes.get("random_seed")
        if seed:
            bowtie2_params.extend(['--seed', str(seed)])
        else:
            # Seed option won't work with -p threading option.
            bowtie2_params.extend(['-p', str(multiprocessing.cpu_count())])

        if len(input_fas) == 2:
            bowtie2_params.extend(['-1', input_fas[0], '-2', input_fas[1]])
        else:
            bowtie2_params.extend(['-U', input_fas[0]])
        command.execute(" ".join(bowtie2_params))
        log.write("Finished Bowtie alignment.")

        if len(input_fas) == 2:
            convert.generate_unmapped_pairs_from_sam(output_sam_file,
                                                     output_fas)
        else:
            convert.generate_unmapped_singles_from_sam(output_sam_file,
                                                       output_fas[0])
Example #25
 def get_taxid_genomes(genome_list_local, taxid, n_per_taxid):
     cmd = command_patterns.ShellScriptCommand(
         script=(
             # columns: 1 = assembly_accession; 6 = taxid; 7 = species_taxid, 8 = organism_name, 20 = ftp_path
             r'''cut -f1,6,7,8,20 "${genome_list_local}" '''
             # try to find taxid in the taxid column (2nd column of the piped input)
             r''' | awk -F '\t' "${awk_find_pattern}" '''
             # take only top n_per_taxid results
             r''' | head -n "${n_per_taxid}";'''
         ),
         named_args={
             'genome_list_local': genome_list_local,
             'awk_find_pattern': f'$2 == "{taxid}"',
             'n_per_taxid': n_per_taxid
         }
     )
     taxid_genomes = list(filter(None, command.execute_with_output(cmd).split("\n")))
     return taxid_genomes
Example #26
def reads(local_file_path, max_reads=None):
    '''
    Count reads in a local file based on file format inferred from extension,
    up to a maximum of max_reads.
    '''
    if local_file_path.endswith(".gz"):
        cmd = "zcat {}".format(local_file_path)
        file_format = local_file_path.split(".")[-2]
    else:
        cmd = "cat {}".format(local_file_path)
        file_format = local_file_path.split(".")[-1]

    if max_reads:
        max_lines = reads2lines(max_reads, file_format)
        assert max_lines is not None, "Could not convert max_reads to max_lines"
        cmd += " | head -n {}".format(max_lines)

    cmd += " |  wc -l"
    line_count = int(command.execute_with_output(cmd))
    return lines2reads(line_count, file_format)
Example #27
    def test_execute_shell_script_command_1(self):
        '''WHEN using ShellScriptCommand with args that contain spaces or special characters, THEN it doesn't split them into separate arguments'''
        cp1 = command_patterns.ShellScriptCommand(
            script=r'''
                echo May the force be with you >> "$1"
                echo The truth is out there > "$1"
                echo Live longer and prosper >> "$1"
                grep "${@:3}" "$1" > "$2"
                cat "$2"
                rm "$1" "$2"
            ''',
            args=[
                r'''/tmp/tmp file with spaces, 'quotes', "double-quotes" and other bizarre characters `~&>.txt''',
                r'''/tmp/another tmp output file.txt''', "-e", "is out", "-e",
                "longer and prosper"
            ])

        result = command.execute_with_output(cp1)

        self.assertEqual(result,
                         "The truth is out there\nLive longer and prosper\n")
Example #28
def get_server_ips_work(service_name, environment, draining_tag):
    ''' return a dict of relevant instance IPs to instance IDs '''
    value = "%s-%s" % (service_name, environment)
    describe_json = json.loads(
        command.execute_with_output(
            command_patterns.SingleCommand(
                cmd="aws",
                args=[
                    "ec2",
                    "describe-instances",
                    "--filters",
                    f"Name=tag:service,Values={value}",
                    "Name=instance-state-name,Values=running"
                ]
            )
        )
    )
    server_ips = {
        instance["NetworkInterfaces"][0]["PrivateIpAddress"]: instance["InstanceId"]
        for reservation in describe_json["Reservations"]
        for instance in reservation["Instances"]
        if draining_tag not in [tag["Key"] for tag in instance["Tags"]]
    }
    return server_ips
Example #29
    def run_chunk(self, part_suffix, remote_home_dir, remote_index_dir,
                  remote_work_dir, remote_username, input_files, key_path,
                  service, lazy_run):
        """Dispatch a chunk to worker machines for distributed GSNAP or RAPSearch
        group machines and handle their execution.
        """
        assert service in ("gsnap", "rapsearch2")

        chunk_id = input_files[0].split(part_suffix)[-1]
        # TODO: Switch to Python 3.6, which supports interpolation in string
        # formatting, and we will halve the number of lines below.
        multihit_basename = "multihit-{service}-out{part_suffix}{chunk_id}.m8".format(
            service=service,
            part_suffix=part_suffix,
            chunk_id=chunk_id,
        )
        multihit_local_outfile = os.path.join(self.chunks_result_dir_local,
                                              multihit_basename)
        multihit_remote_outfile = os.path.join(remote_work_dir,
                                               multihit_basename)
        multihit_s3_outfile = os.path.join(self.chunks_result_dir_s3,
                                           multihit_basename)

        base_str = "aws s3 cp --only-show-errors {s3_path}/{input_fa} {remote_work_dir}/{input_fa} "
        download_input_from_s3 = " ; ".join(
            base_str.format(s3_path=self.chunks_result_dir_s3,
                            input_fa=input_fa,
                            remote_work_dir=remote_work_dir)
            for input_fa in input_files)

        base_str = "mkdir -p {remote_work_dir} ; {download_input_from_s3} ; "
        if service == "gsnap":
            commands = base_str + "{remote_home_dir}/bin/gsnapl -A m8 --batch=0 --use-shared-memory=0 --gmap-mode=none --npaths=100 --ordered -t 36 --maxsearch=1000 --max-mismatches=40 -D {remote_index_dir} -d nt_k16 {remote_input_files} > {multihit_remote_outfile}"
        else:
            commands = base_str + "/usr/local/bin/rapsearch -d {remote_index_dir}/nr_rapsearch -e -6 -l 10 -a T -b 0 -v 50 -z 24 -q {remote_input_files} -o {multihit_remote_outfile}"

        commands = commands.format(
            remote_work_dir=remote_work_dir,
            download_input_from_s3=download_input_from_s3,
            remote_home_dir=remote_home_dir,
            remote_index_dir=remote_index_dir,
            remote_input_files=" ".join(remote_work_dir + "/" + input_fa
                                        for input_fa in input_files),
            multihit_remote_outfile=multihit_remote_outfile
            if service == "gsnap" else multihit_remote_outfile[:-3]
            # Strip the .m8 for RAPSearch as it adds that
        )

        if not lazy_run or not fetch_from_s3(multihit_s3_outfile,
                                             multihit_local_outfile):
            correct_number_of_output_columns = 12
            min_column_number = 0
            max_tries = 2
            try_number = 1
            instance_ip = ""

            def interpret_min_column_number_string(
                    min_column_number_string, correct_number_of_output_columns,
                    try_number):
                if min_column_number_string:
                    min_column_number = float(min_column_number_string)
                    log.write(
                        "Try no. %d: Smallest number of columns observed in any line was %d"
                        % (try_number, min_column_number))
                else:
                    log.write("Try no. %d: No hits" % try_number)
                    min_column_number = correct_number_of_output_columns
                return min_column_number

            # Check if every row has correct number of columns (12) in the output
            # file on the remote machine
            while min_column_number != correct_number_of_output_columns \
                    and try_number <= max_tries:
                log.write("waiting for {} server for chunk {}".format(
                    service, chunk_id))
                max_concurrent = self.additional_attributes["max_concurrent"]
                environment = self.additional_attributes["environment"]

                instance_ip = server.wait_for_server_ip(
                    service, key_path, remote_username, environment,
                    max_concurrent, chunk_id)
                log.write("starting alignment for chunk %s on %s server %s" %
                          (chunk_id, service, instance_ip))
                command.execute(
                    command.remote(commands, key_path, remote_username,
                                   instance_ip))

                if service == "gsnap":
                    verification_command = "cat %s" % multihit_remote_outfile
                else:
                    # For rapsearch, first remove header lines starting with '#'
                    verification_command = "grep -v '^#' %s" % multihit_remote_outfile
                verification_command += " | awk '{print NF}' | sort -nu | head -n 1"
                min_column_number_string = command.execute_with_output(
                    command.remote(verification_command, key_path,
                                   remote_username, instance_ip))
                min_column_number = interpret_min_column_number_string(
                    min_column_number_string, correct_number_of_output_columns,
                    try_number)
                try_number += 1

            # Move output from remote machine to local machine
            msg = "Chunk %s output corrupt; not copying to S3. Re-start pipeline " \
                  "to try again." % chunk_id
            assert min_column_number == correct_number_of_output_columns, msg

            with self.iostream_upload:  # Limit concurrent uploads so as not to stall the pipeline.
                command.execute(
                    command.scp(key_path, remote_username, instance_ip,
                                multihit_remote_outfile,
                                multihit_local_outfile))
                command.execute(
                    "aws s3 cp --only-show-errors %s %s/" %
                    (multihit_local_outfile, self.chunks_result_dir_s3))
            log.write("finished alignment for chunk %s on %s server %s" %
                      (chunk_id, service, instance_ip))
        return multihit_local_outfile
Example #30
    def chunk_input(self, input_files, chunksize):
        """Chunk input files into pieces for performance and parallelism."""
        part_lists = []  # Lists of partial files
        known_nlines = None
        part_suffix = ""
        chunk_nlines = chunksize * 2

        for input_file in input_files:
            # Count number of lines in the file
            cmd_output = command.execute_with_output(
                command_patterns.SingleCommand(cmd="wc",
                                               args=["-l", input_file]))
            nlines = int(cmd_output.strip().split()[0])
            # Number of lines should be the same in paired files
            if known_nlines is not None:
                msg = "Mismatched line counts in supposedly paired files: {}".format(
                    input_files)
                assert nlines == known_nlines, msg
            known_nlines = nlines

            # Set number of pieces and names
            numparts = (nlines + chunk_nlines - 1) // chunk_nlines
            ndigits = len(str(numparts - 1))
            part_suffix = "-chunksize-%d-numparts-%d-part-" % (chunksize,
                                                               numparts)
            out_prefix_base = os.path.basename(input_file) + part_suffix
            out_prefix = os.path.join(self.chunks_result_dir_local,
                                      out_prefix_base)

            # Split large file into smaller named pieces
            command.execute(
                command_patterns.SingleCommand(cmd="split",
                                               args=[
                                                   "-a", ndigits,
                                                   "--numeric-suffixes", "-l",
                                                   chunk_nlines, input_file,
                                                   out_prefix
                                               ]))
            command.execute_with_retries(
                command_patterns.SingleCommand(
                    cmd="aws",
                    args=[
                        "s3", "sync", "--only-show-errors",
                        os.path.join(self.chunks_result_dir_local, ""),
                        os.path.join(self.chunks_result_dir_s3, ""),
                        "--exclude", "*", "--include", out_prefix_base + "*"
                    ]))

            # Get the partial file names
            partial_files = []
            paths = command.glob(glob_pattern=out_prefix + "*",
                                 strip_folder_names=True)
            partial_files.extend(paths)

            # Check that the partial files match our expected chunking pattern
            pattern = "{:0%dd}" % ndigits
            expected_partial_files = [(out_prefix_base + pattern.format(i))
                                      for i in range(numparts)]
            msg = "something went wrong with chunking: {} != {}".format(
                partial_files, expected_partial_files)
            assert expected_partial_files == partial_files, msg
            part_lists.append(partial_files)

        # Ex: [["input_R1.fasta-part-1", "input_R2.fasta-part-1"],
        # ["input_R1.fasta-part-2", "input_R2.fasta-part-2"],
        # ["input_R1.fasta-part-3", "input_R2.fasta-part-3"],...]
        input_chunks = [list(part) for part in zip(*part_lists)]
        return part_suffix, input_chunks
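A sketch of how the two halves above might be driven by a caller: chunk the inputs, then dispatch each chunk with run_chunk from Example #29 (the remote host parameters are assumed from that signature, and the real pipeline may dispatch chunks concurrently rather than in a simple loop):

part_suffix, input_chunks = self.chunk_input(input_files, chunksize)
chunk_outputs = []
for chunk_files in input_chunks:
    chunk_outputs.append(
        self.run_chunk(part_suffix, remote_home_dir, remote_index_dir,
                       remote_work_dir, remote_username, chunk_files,
                       key_path, service="gsnap", lazy_run=True))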