Python execute_with_output示例，idseq_dag.util.command.execute_with_output Python示例

示例#1

0

显示文件

文件： generate_phylo_tree.py 项目： jonason91/idseq-workflows

 def fetch_ncbi(accession):
     """Search NCBI nuccore for an accession and fetch the matching GenBank XML.

     An esearch request (with history enabled) is issued first; the WebEnv and
     QueryKey elements of its XML response are then used to build the efetch
     request. Both requests are performed by shelling out to curl.

     Returns a dict with the keys 'search_url', 'fetch_url' and 'genbank_xml'
     (the raw output of the efetch call).
     """
     def run_curl(url):
         # Invoke curl with the URL as its single argument and return its output.
         return command.execute_with_output(
             command_patterns.SingleCommand(cmd="curl", args=[url]))

     base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
     search_url = f"{base}/esearch.fcgi?db=nuccore&term={accession}&usehistory=y"
     search_root = ET.fromstring(run_curl(search_url))
     # Both elements are expected in the esearch response; .text raises
     # AttributeError if either one is missing.
     web = search_root.find('WebEnv').text
     key = search_root.find('QueryKey').text
     fetch_url = f"{base}/efetch.fcgi?db=nuccore&query_key={key}&WebEnv={web}&rettype=gb&retmode=xml"
     result = {'search_url': search_url, 'fetch_url': fetch_url}
     result['genbank_xml'] = run_curl(fetch_url)
     return result

示例#2

0

显示文件

    def chunk_input(self, input_files, chunksize):
        """Split every input file into numbered pieces and sync the pieces to S3.

        Each file is cut into pieces of ``chunksize * 2`` lines with ``split``.
        The pieces are written under ``self.chunks_result_dir_local`` and synced
        to ``self.chunks_result_dir_s3``. All input files must have the same
        number of lines.

        Returns a tuple ``(part_suffix, input_chunks)`` where ``input_chunks``
        groups the i-th piece of every input file together.
        """
        lines_per_chunk = chunksize * 2
        part_suffix = ""
        expected_nlines = None
        part_lists = []  # one list of piece names per input file

        for input_file in input_files:
            # Line count of this file, taken from the first field of `wc -l`.
            wc_output = command.execute_with_output("wc -l %s" % input_file)
            nlines = int(wc_output.strip().split()[0])
            # Paired files are supposed to have identical line counts.
            if expected_nlines is not None:
                msg = "Mismatched line counts in supposedly paired files: {}".format(
                    input_files)
                assert nlines == expected_nlines, msg
            expected_nlines = nlines

            # Number of pieces (ceiling division) and width of the numeric suffix.
            numparts = (nlines + lines_per_chunk - 1) // lines_per_chunk
            ndigits = len(str(numparts - 1))
            part_suffix = "-chunksize-%d-numparts-%d-part-" % (chunksize,
                                                               numparts)
            out_prefix_base = os.path.basename(input_file) + part_suffix
            out_prefix = os.path.join(self.chunks_result_dir_local,
                                      out_prefix_base)

            # Cut the file into pieces, then upload only the pieces of this file.
            split_cmd = "split -a %d --numeric-suffixes -l %d %s %s" % (
                ndigits, lines_per_chunk, input_file, out_prefix)
            command.execute(split_cmd)
            command.execute_with_retries(
                f"aws s3 sync --only-show-errors {self.chunks_result_dir_local}/ {self.chunks_result_dir_s3}/ --exclude '*' --include '{out_prefix_base}*'"
            )

            # Names of the pieces that actually exist on disk.
            listing = command.execute_with_output("ls %s*" % out_prefix)
            partial_files = [
                os.path.basename(path)
                for path in listing.rstrip().split("\n")
            ]

            # The pieces on disk must be exactly the zero-padded sequence we expect.
            pattern = "{:0%dd}" % ndigits
            expected_partial_files = [
                out_prefix_base + pattern.format(index)
                for index in range(numparts)
            ]
            msg = "something went wrong with chunking: {} != {}".format(
                partial_files, expected_partial_files)
            assert expected_partial_files == partial_files, msg
            part_lists.append(partial_files)

        # Ex: [["input_R1.fasta-part-1", "input_R2.fasta-part-1"],
        # ["input_R1.fasta-part-2", "input_R2.fasta-part-2"],
        # ["input_R1.fasta-part-3", "input_R2.fasta-part-3"],...]
        input_chunks = [list(pieces) for pieces in zip(*part_lists)]
        return part_suffix, input_chunks

示例#3

0

显示文件

 def get_total_reads(self, is_zipped, is_fasta):
     """Count the reads in the sample directly from its fastq or fasta files.

     The files are taken from ``self.input_files_local[0]``. When ``is_zipped``
     is set, each file is first decompressed with ``gunzip -k`` unless the
     decompressed file already exists. Fasta reads are counted with
     ``grep -c '^>'``; fastq reads are the ``wc -l`` line total divided by 4.

     NOTE: the fastq branch uses true division and therefore returns a float.
     """
     # TODO: factor out into utility function, see nonhost_fastq
     input_filenames = self.input_files_local[0]
     if is_zipped:
         unzipped_filenames = []
         for zipped_name in input_filenames:
             # The decompressed name is the zipped name minus its last three
             # characters (presumably the ".gz" extension).
             plain_name = zipped_name[:len(zipped_name) - 3]
             if not os.path.exists(plain_name):
                 gunzip = command_patterns.SingleCommand(
                     cmd='gunzip', args=['-k', zipped_name])
                 command.execute(gunzip)
             unzipped_filenames.append(plain_name)
         input_filenames = unzipped_filenames

     if not is_fasta:
         # fastqs have 4 lines for every read, so we count lines and divide by 4
         wc_output = command.execute_with_output(
             " ".join(['wc', '-l', *input_filenames]))
         # The last non-empty line is the grand total for paired reads, or the
         # only line for unpaired reads; its first field is the line count.
         wc_lines = [line for line in wc_output.split("\n") if line != '']
         last_line_fields = [
             field for field in wc_lines[-1].split(" ") if field != ''
         ]
         return int(last_line_fields[0]) / 4

     # Number of lines per read can vary in fastas, so count header lines.
     count_headers = command_patterns.SingleCommand(
         cmd='grep',
         args=['-c', '^>', *input_filenames]  # fastas start reads with "^>".
     )
     grep_output = command.execute_with_output(count_headers)
     output_lines = [line for line in grep_output.split("\n") if line != '']
     if ":" not in output_lines[0]:
         # Single file: grep prints just the number.
         return int(output_lines[0])
     # Several files: grep prints "filename:count" per file; add up the counts.
     return sum(int(line.split(":")[1]) for line in output_lines)

示例#4

0

显示文件

文件： generate_phylo_tree.py 项目： rcs333/idseq-dag

 def get_genbank_genomes(self, reference_taxids, destination_dir, superkingdom_name, n=10):
     '''
     Download up to n GenBank reference genomes for the given reference_taxids.

     Assumes reference_taxids are species-level or below and all belong to the
     same superkingdom. The GenBank categories searched depend on
     superkingdom_name; the first category yielding any genome wins.
     References are saved under file names compatible with MakeKSNP3infile.

     Returns a dict mapping assembly accession to local fasta path, or an
     empty dict when nothing was requested or found.
     TODO: Retrieve the genomes from S3 rather than ftp.ncbi.nih.gov (JIRA/IDSEQ-334).
     '''
     if n == 0 or not reference_taxids:
         return {}
     n_per_taxid = max(n // len(reference_taxids), 1)
     # additional options in genbank that we probably don't need right now:
     # ["archaea", "plant",
     # "vertebrate_mammalian", "vertebrate_other", "invertebrate",
     # "other", "metagenomes"]
     genbank_categories_by_superkingdom = {
         "Viruses": ["viral"],
         "Bacteria": ["bacteria"],
         "Eukaryota": ["fungi", "protozoa"],
         None: ["bacteria", "viral", "fungi", "protozoa"]
     }
     for cat in genbank_categories_by_superkingdom[superkingdom_name]:
         # source: ftp://ftp.ncbi.nih.gov/genomes/genbank/{cat}/assembly_summary.txt
         genome_list_path_s3 = f"s3://idseq-database/genbank/{cat}/assembly_summary.txt"
         genome_list_local = s3.fetch_from_s3(genome_list_path_s3, destination_dir)
         genomes = []
         for taxid in reference_taxids:
             cmd = "".join([
                 # columns: 1 = assembly_accession; 6 = taxid; 7 = species_taxid, 8 = organism_name, 20 = ftp_path
                 f"cut -f1,6,7,8,20 {genome_list_local}",
                 # try to find taxid in the taxid column (2nd column of the piped input)
                 f" | awk -F '\t' '$2 == {taxid}'",
                 # take only top n_per_taxid results
                 f" | head -n {n_per_taxid}",
             ])
             taxid_genomes = [
                 row for row in command.execute_with_output(cmd).split("\n") if row
             ]
             # Only rows not already collected for an earlier taxid are added.
             genomes += [entry for entry in taxid_genomes if entry not in genomes]
         genomes = genomes[:n]
         command.execute_with_output(f"rm {genome_list_local}")
         if not genomes:
             continue
         genbank_fastas = {}
         for row in genomes:
             assembly_accession, _taxid, _species_taxid, _organism_name, ftp_path = row.split("\t")
             ftp_fasta_gz = f"{ftp_path}/{os.path.basename(ftp_path)}_genomic.fna.gz"
             tree_node_name = f"genbank_{self.clean_name_for_ksnp3(assembly_accession)}"
             local_fasta = f"{destination_dir}/{tree_node_name}.fasta"
             if os.path.isfile(local_fasta):
                 # Name already taken: fall back to an "__I" variant.
                 local_fasta = f"{local_fasta.split('.')[0]}__I.fasta"
             command.execute(f"wget -O {local_fasta}.gz {ftp_fasta_gz}")
             command.execute(f"gunzip {local_fasta}.gz")
             genbank_fastas[assembly_accession] = local_fasta
         return genbank_fastas
     return {}

示例#5

0

显示文件

 def __check_if_output_is_corrupt(
         self,
         service,
         key_path,
         remote_username,
         instance_ip,  # self unused
         multihit_remote_outfile,
         chunk_id,
         try_number):
     """Verify the column count of an alignment output file on a remote machine.

     The smallest number of whitespace-separated fields on any row of
     ``multihit_remote_outfile`` is computed remotely and compared with
     CORRECT_NUMBER_OF_OUTPUT_COLUMNS. For services other than "gsnap",
     lines starting with '#' are dropped first.

     Returns None when the file looks fine, otherwise an error message string.
     """
     quoted_outfile = shlex.quote(multihit_remote_outfile)
     if service == "gsnap":
         row_source = "cat %s" % quoted_outfile
     else:
         # For rapsearch, first remove header lines starting with '#'
         row_source = "grep -v '^#' %s" % quoted_outfile
     # Print the field count of every row and keep only the smallest one.
     verification_command = row_source + " | awk '{print NF}' | sort -nu | head -n 1"
     remote_command = command.remote(verification_command, key_path,
                                     remote_username, instance_ip)
     min_column_number_string = command.execute_with_output(remote_command)
     min_column_number = PipelineStepRunAlignmentRemotely.__interpret_min_column_number_string(
         min_column_number_string, CORRECT_NUMBER_OF_OUTPUT_COLUMNS,
         try_number)
     if min_column_number == CORRECT_NUMBER_OF_OUTPUT_COLUMNS:
         return None
     msg = ("Chunk %s output corrupt; not copying to S3. min_column_number = %d -> expected = %d."
            " Re-start pipeline to try again.")
     return msg % (chunk_id, min_column_number,
                   CORRECT_NUMBER_OF_OUTPUT_COLUMNS)

示例#6

0

显示文件

文件： run_subsample.py 项目： czbiohub/idseq-dag

    def subsample_fastas(input_fas, output_fas, max_fragments):
        '''
        Subsample fasta files in memory down to at most max_fragments records.

        The record count is half the line count of the first input file. When
        it does not exceed max_fragments the inputs are simply copied to the
        outputs. Otherwise a random set of record indices is drawn once and
        applied to every file; a third (merged) file gets indices 2r and 2r+1
        for every kept index r.
        '''
        # count lines
        count_cmd = "wc -l %s | cut -f1 -d ' '" % input_fas[0]
        total_records = int(command.execute_with_output(count_cmd)) // 2
        log.write("total reads: %d" % total_records)
        log.write("target reads: %d" % max_fragments)

        if total_records <= max_fragments:
            # Nothing to drop: copy each input to its output as is.
            for source, destination in zip(input_fas, output_fas):
                command.execute("cp %s %s" % (source, destination))
            return

        # total_records > max_fragments, sample
        # NOTE(review): the seed comes from hash() of the first file name.
        randgen = random.Random(x=hash(input_fas[0]))
        records_to_keep = randgen.sample(range(total_records), max_fragments)
        PipelineStepRunSubsample.subset(input_fas[0], output_fas[0],
                                        records_to_keep)
        if len(input_fas) < 2:
            return
        PipelineStepRunSubsample.subset(input_fas[1], output_fas[1],
                                        records_to_keep)
        if len(input_fas) == 3 and len(output_fas) == 3:
            # subset the merged fasta
            records_to_keep_merged = [
                index
                for record in records_to_keep
                for index in (2 * record, 2 * record + 1)
            ]
            PipelineStepRunSubsample.subset(input_fas[2], output_fas[2],
                                            records_to_keep_merged)

示例#7

0

显示文件

文件： generate_phylo_tree.py 项目： rcs333/idseq-dag

 def get_accession_metadata(accession):
     '''
     Retrieve metadata of an NCBI accession (e.g. name, country, collection date).

     Runs an esearch followed by an efetch against NCBI E-utilities through a
     single shell command line, then parses the returned GenBank XML.

     Returns a dict that may contain the keys 'name', 'country' and
     'collection_date'. The dict is empty (and a warning is logged) when the
     response has no usable GBSeq element.
     TODO: Put this data in S3 instead and get it from there.
     '''
     accession_metadata = {}
     # Raw strings are used wherever a backslash must reach the shell / sed
     # unchanged. The resulting command text is identical to what the previous
     # non-raw literals produced, without relying on invalid escape sequences
     # such as "\&" or "\(".
     efetch_command = ";".join([
         f"QUERY={accession}",
         "BASE=https://eutils.ncbi.nlm.nih.gov/entrez/eutils",
         r"SEARCH_URL=${BASE}/esearch.fcgi?db=nuccore\&term=${QUERY}\&usehistory=y",
         "OUTPUT=$(curl $SEARCH_URL)",
         r"WEB=$(echo $OUTPUT | sed -e 's/.*<WebEnv>\(.*\)<\/WebEnv>.*/\1/')",
         r"KEY=$(echo $OUTPUT | sed -e 's/.*<QueryKey>\(.*\)<\/QueryKey>.*/\1/')",
         r"FETCH_URL=${BASE}/efetch.fcgi?db=nuccore\&query_key=${KEY}\&WebEnv=${WEB}\&rettype=gb\&retmode=xml",
         "curl $FETCH_URL"
     ])
     genbank_xml = command.execute_with_output(efetch_command)
     root = ET.fromstring(genbank_xml).find('GBSeq')
     # Explicit test instead of `not root`: truth-testing an Element is
     # deprecated. A missing element and an element without children are both
     # treated as "no result", exactly as before.
     if root is None or len(root) == 0:
         log.write(f"WARNING: {efetch_command} did not give a result")
         return accession_metadata
     accession_metadata['name'] = root.find('GBSeq_definition').text
     qualifiers_needed = {'country', 'collection_date'}
     # Only the qualifiers of the first entry of the feature table are scanned.
     for entry in root.find('GBSeq_feature-table')[0].find('GBFeature_quals'):
         if all(key in accession_metadata for key in qualifiers_needed):
             break
         for key in qualifiers_needed - accession_metadata.keys():
             if entry.find('GBQualifier_name').text == key:
                 accession_metadata[key] = entry.find('GBQualifier_value').text
     return accession_metadata

示例#8

0

显示文件

def reads(local_file_path, max_reads=None):
    '''
    Return the number of reads in a local file, optionally capped at max_reads.

    The file format is inferred from the extension (the one before ".gz" for
    gzipped files). Lines are counted with a cat/zcat | head | wc -l shell
    pipeline and converted to reads with lines2reads.
    '''
    extensions = local_file_path.split(".")
    if local_file_path.endswith(".gz"):
        cmd = r'''zcat "${local_file_path}"'''
        file_format = extensions[-2]
    else:
        cmd = r'''cat "${local_file_path}"'''
        file_format = extensions[-1]

    named_args = {'local_file_path': local_file_path}

    if max_reads:
        # Cap the number of lines read so that at most max_reads are counted.
        max_lines = reads2lines(max_reads, file_format)
        assert max_lines is not None, "Could not convert max_reads to max_lines"
        cmd += r''' | head -n "${max_lines}"'''
        named_args['max_lines'] = max_lines

    cmd += " |  wc -l"

    count_script = command_patterns.ShellScriptCommand(
        script=cmd, named_args=named_args)
    cmd_output = command.execute_with_output(count_script)
    # The count is the first space-separated field of the wc output.
    line_count = int(cmd_output.strip().split(' ')[0])
    return lines2reads(line_count, file_format)

示例#9

0

显示文件

    def subsample_fastas(input_fas, output_fas, max_fragments):
        '''
        Subsample fasta files in memory down to at most max_fragments records.

        The record count is half the `wc -l` line count of the first input
        file. If it does not exceed max_fragments, inputs are copied to the
        outputs unchanged. Otherwise one random set of record indices is drawn
        and applied to every file; a third (merged) file gets indices 2r and
        2r+1 for every kept index r.
        '''
        # count lines
        wc_command = command_patterns.SingleCommand(
            cmd="wc", args=["-l", input_fas[0]])
        wc_output = command.execute_with_output(wc_command)
        total_records = int(wc_output.strip().split(' ')[0]) // 2
        log.write("total reads: %d" % total_records)
        log.write("target reads: %d" % max_fragments)

        if total_records <= max_fragments:
            # Nothing to drop: copy each input to its output as is.
            for source, destination in zip(input_fas, output_fas):
                command.copy_file(source, destination)
            return

        # total_records > max_fragments, sample
        # NOTE(review): the seed comes from hash() of the first file name.
        randgen = random.Random(x=hash(input_fas[0]))
        records_to_keep = randgen.sample(range(total_records), max_fragments)
        PipelineStepRunSubsample.subset(input_fas[0], output_fas[0],
                                        records_to_keep)
        if len(input_fas) < 2:
            return
        PipelineStepRunSubsample.subset(input_fas[1], output_fas[1],
                                        records_to_keep)
        if len(input_fas) == 3 and len(output_fas) == 3:
            # subset the merged fasta
            records_to_keep_merged = [
                index
                for record in records_to_keep
                for index in (2 * record, 2 * record + 1)
            ]
            PipelineStepRunSubsample.subset(input_fas[2], output_fas[2],
                                            records_to_keep_merged)

示例#10

0

显示文件

    def test_execute_python_cmd(self):
        '''WHEN a SingleCommand invokes a .py file, THEN the script's output is returned'''
        happy_script = command_patterns.SingleCommand(
            cmd=TESTSCRIPT_HAPPY_PY,
            args=["Hello!"],
        )

        output = command.execute_with_output(happy_script)

        self.assertEqual(output, "Python is happy to say: Hello!\n")

示例#11

0

显示文件

def check_s3_presence(s3_path):
    """Return True if `aws s3 ls` prints anything for s3_path, False otherwise.

    This is a best-effort check: any ordinary failure while running the
    listing command is treated as "not present". Only Exception subclasses
    are swallowed, so KeyboardInterrupt and SystemExit still propagate.
    """
    try:
        listing = command.execute_with_output("aws s3 ls %s" % s3_path)
    except Exception:  # deliberate best-effort: a failed listing means absent
        return False
    return bool(listing)

示例#12

0

显示文件

    def test_execute_legacy_format(self, _mock_log):
        '''WHEN the command is a plain string, THEN it runs and a legacy-format warning is logged'''
        output = command.execute_with_output("echo 123")

        self.assertEqual(output, "123\n")
        # The warning text must mention "legacy" and point at command_patterns.
        expected_message = MATCH_RE(".*legacy.*Use.*command_patterns")
        _mock_log.assert_any_call(warning=True,
                                  message=expected_message,
                                  obj_data=ANY)

示例#13

0

显示文件

    def test_execute_single_command_2(self):
        '''WHEN a SingleCommand argument contains a space, THEN it is still passed as one argument'''
        # Precondition: the fixture path really contains a space.
        assert " " in TESTFILE_ABC_TXT

        cat_command = command_patterns.SingleCommand(
            cmd="cat",
            args=[TESTFILE_ABC_TXT],
        )

        output = command.execute_with_output(cat_command)

        self.assertEqual(output, "abc")

示例#14

0

显示文件

文件： generate_phylo_tree.py 项目： rcs333/idseq-dag

 def name_samples_vcf(self, input_file, output_file):
     """Write a copy of a VCF whose header row names samples instead of run ids.

     The VCF has standard columns CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO,
     FORMAT, followed by one column for each of the pipeline_run_ids. The
     header row of output_file lists the corresponding sample names (taken
     from additional_attributes["sample_names_by_run_ids"], falling back to
     "pipeline_run_<id>") so that users can understand the file. Header
     columns that do not look like integers are dropped.
     """
     import shlex  # function-scope import: keeps this block self-contained

     sample_names_by_run_ids = self.additional_attributes["sample_names_by_run_ids"]
     vcf_columns = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"
     # Strip the output so the line terminator does not stay glued to the last
     # column, where it would end up in the lookup key and in the sed script.
     column_description_line = command.execute_with_output(
         f"awk /'^{vcf_columns}'/ {input_file}").strip()
     run_ids_in_order = [
         run_id
         for run_id in column_description_line.split("FORMAT\t")[1].split("\t")
         if convert.can_convert_to_int(run_id)
     ]
     sample_names_in_order = [
         sample_names_by_run_ids.get(run_id, f"pipeline_run_{run_id}")
         for run_id in run_ids_in_order
     ]
     new_column_description = '\t'.join([vcf_columns] + sample_names_in_order)
     # Sample names are free text: escape the characters that are special in a
     # sed replacement (backslash, the "/" delimiter and "&") ...
     sed_replacement = (new_column_description
                        .replace("\\", "\\\\")
                        .replace("/", "\\/")
                        .replace("&", "\\&"))
     # ... and shell-quote the whole sed program so quotes in a name cannot
     # terminate the argument early.
     sed_program = shlex.quote(f"s/^#CHROM.*/{sed_replacement}/")
     command.execute(f"sed {sed_program} {input_file} > {output_file}")

示例#15

0

显示文件

    def test_shellscript_with_param_array(self):
        '''WHEN a ShellScriptCommand named argument is a list, THEN it expands as a shell array'''
        paste_args = ["-d", r"\n", TESTFILE_ABC_TXT, TESTFILE_BCD_TXT]
        paste_command = command_patterns.ShellScriptCommand(
            script=r'''paste "${slice_outputs[@]}"''',
            named_args={'slice_outputs': paste_args},
        )

        output = command.execute_with_output(paste_command)

        self.assertEqual(output, "abc\nbcd\n")

示例#16

0

显示文件

    def test_specific_pattern_1(self):
        '''WHEN a ShellScriptCommand script spans several lines, THEN it runs as one script'''
        # Variable assignment, line continuation and a pipe in one script.
        multiline_script = r'''
                a=123;
                echo May $a the force be with you \
                | sed "s/a/Z/g"
            '''
        script_command = command_patterns.ShellScriptCommand(
            script=multiline_script, args=[])

        output = command.execute_with_output(script_command)

        self.assertEqual(output, "MZy 123 the force be with you\n")

示例#17

0

显示文件

文件： server.py 项目： rcs333/idseq-dag

 def poll_server(ip):
     """Record how many service processes are running on the machine at ip.

     Runs a `ps aux | grep` pipeline for service_name on the remote machine,
     bounded by MAX_POLLING_LATENCY. Unless the pipeline reported "error",
     the number of output lines minus one is stored in ip_nproc_dict[ip]
     (under dict_mutex, and only while dict_writable is set).
     """
     # Original note: ServerAliveInterval to fix issue with containers keeping
     # open an SSH connection even after worker machines had finished running.
     # NOTE(review): that option is presumably set inside command.remote.
     commands = "ps aux | grep %s | grep -v bash || echo error" % service_name
     remote_command = command.remote(commands, key_path, remote_username, ip)
     raw_output = command.execute_with_output(remote_command,
                                              timeout=MAX_POLLING_LATENCY)
     output = raw_output.rstrip().split("\n")
     if output == ["error"]:
         return
     with dict_mutex:
         if dict_writable:
             ip_nproc_dict[ip] = len(output) - 1

示例#18

0

显示文件

文件： server.py 项目： czbiohub/idseq-dag

def get_server_ips_work(service_name, environment):
    """Return the private IPs of running EC2 instances tagged for a service.

    Instances are selected with `aws ec2 describe-instances`, filtered on the
    tag service=<service_name>-<environment> and on the running state. The
    private IP of each instance's first network interface is collected.
    """
    tag = "service"
    value = "%s-%s" % (service_name, environment)
    describe_command = (
        "aws ec2 describe-instances --filters 'Name=tag:%s,Values=%s' 'Name=instance-state-name,Values=running'"
        % (tag, value))
    describe_json = json.loads(command.execute_with_output(describe_command))

    server_ips = []
    for reservation in describe_json["Reservations"]:
        for instance in reservation["Instances"]:
            first_interface = instance["NetworkInterfaces"][0]
            server_ips.append(first_interface["PrivateIpAddress"])
    return server_ips

示例#19

0

显示文件

 def get_genbank_genomes(self, taxid, destination_dir, n=10):
     '''
     Download up to n GenBank reference genomes for taxid.

     Assumes taxid is species-level. The GenBank categories are searched in
     order and the first one yielding any match wins. References are saved
     under file names compatible with MakeKSNP3infile.

     Returns the list of local fasta paths, or an empty list when nothing
     was requested or found.
     '''
     if n == 0:
         return []
     # additional options in genbank that probably don't need right now:
     # ["archaea", "plant",
     # "vertebrate_mammalian", "vertebrate_other", "invertebrate",
     # "other", "metagenomes"]
     for cat in ("bacteria", "viral", "fungi", "protozoa"):
         genome_list_path = f"ftp://ftp.ncbi.nih.gov/genomes/genbank/{cat}/assembly_summary.txt"
         genome_list_local = f"{destination_dir}/{os.path.basename(genome_list_path)}"
         cmd = "".join([
             f"wget -O {genome_list_local} {genome_list_path}; ",
             # columns: 6 = taxid; 7 = species_taxid, 8 = organism name, 20 = ftp_path
             f"cut -f6,7,8,20 {genome_list_local}",
             # try to find taxid in the species_taxids
             f" | grep -P '\\t{taxid}\\t'",
             # take only top n results, keep name and ftp_path
             f" | head -n {n} | cut -f1,3,4",
         ])
         genomes = [row for row in command.execute_with_output(cmd).split("\n") if row]
         command.execute_with_output(f"rm {genome_list_local}")
         if not genomes:
             continue
         local_genbank_fastas = []
         for row in genomes:
             genome_taxid, organism_name, ftp_path = row.split("\t")
             clean_organism_name = PipelineStepGeneratePhyloTree.clean_name_for_ksnp3(organism_name)
             ftp_fasta_gz = f"{ftp_path}/{os.path.basename(ftp_path)}_genomic.fna.gz"
             local_fasta = f"{destination_dir}/genbank__{clean_organism_name}__taxid-{genome_taxid}.fasta"
             if os.path.isfile(local_fasta):
                 # Name already taken: fall back to an "__I" variant.
                 local_fasta = f"{local_fasta.split('.')[0]}__I.fasta"
             command.execute(f"wget -O {local_fasta}.gz {ftp_fasta_gz}")
             command.execute(f"gunzip {local_fasta}.gz")
             local_genbank_fastas.append(local_fasta)
         return local_genbank_fastas
     return []

示例#20

0

显示文件

文件： server.py 项目： rcs333/idseq-dag

def get_server_ips_work(service_name, environment, draining_tag):
    '''
    Return a dict mapping private IP to instance ID for running service instances.

    Instances are selected with `aws ec2 describe-instances`, filtered on the
    tag service=<service_name>-<environment> and on the running state.
    Instances carrying a tag whose key equals draining_tag are left out.
    '''
    tag = "service"
    value = "%s-%s" % (service_name, environment)
    describe_command = (
        "aws ec2 describe-instances --filters 'Name=tag:%s,Values=%s' 'Name=instance-state-name,Values=running'"
        % (tag, value))
    describe_json = json.loads(command.execute_with_output(describe_command))

    server_ips = {}
    for reservation in describe_json["Reservations"]:
        for instance in reservation["Instances"]:
            tag_keys = [instance_tag["Key"] for instance_tag in instance["Tags"]]
            if draining_tag in tag_keys:
                continue
            private_ip = instance["NetworkInterfaces"][0]["PrivateIpAddress"]
            server_ips[private_ip] = instance["InstanceId"]
    return server_ips

示例#21

0

显示文件

 def _vcf_new_column_description(input_file, sample_names_by_run_ids):
     """Build a VCF header row with run-id columns replaced by sample names.

     The header row is located in input_file with awk. Every column after
     FORMAT that converts to an int is looked up in sample_names_by_run_ids
     (falling back to "pipeline_run_<id>"); other columns are kept unchanged.
     Returns the new tab-separated header row.
     """
     vcf_columns = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"
     find_header = command_patterns.SingleCommand(
         cmd='awk', args=[f"/^{vcf_columns}/", input_file])
     column_description_line = command.execute_with_output(find_header).strip()
     additional_columns = column_description_line.split("FORMAT\t")[1].split("\t")

     sample_names_in_order = []
     for column in additional_columns:
         if convert.can_convert_to_int(column):
             name = sample_names_by_run_ids.get(column, f"pipeline_run_{column}")
         else:
             name = column
         sample_names_in_order.append(name)

     return '\t'.join([vcf_columns] + sample_names_in_order)

示例#22

0

显示文件

    def test_execute_single_command_1(self):
        '''WHEN SingleCommand args contain special shell characters, THEN they are echoed literally and no subcommand runs'''
        tricky_args = [
            "1", 2, "$(pwd)", ";ls",
            "&& ls", "`pwd`", ">",
            "test.txt", "> test.txt",
            ">> test.txt", "&&", "ls",
            "$", "pwd",
        ]
        echo_command = command_patterns.SingleCommand(cmd="echo",
                                                      args=tricky_args)

        output = command.execute_with_output(echo_command)

        expected = "1 2 $(pwd) ;ls && ls `pwd` > test.txt > test.txt >> test.txt && ls $ pwd\n"
        self.assertEqual(output, expected)

示例#23

0

显示文件

    def test_execute_shell_script_command_2(self):
        '''WHEN ShellScriptCommand args contain special shell characters, THEN no subcommand is executed'''
        # All arguments but the last are echoed; the last one is the sed script.
        tricky_args = [
            "1", 2, "$(pwd)", ";ls", ";", "ls", "\n", "ls", "&& ls",
            "`pwd`", ">", "test.txt", "> test.txt", ">> test.txt", "&&",
            "ls", "$",
            # A single argument (two adjacent literals in the original test).
            "abc\nlspwd",
            "s/w/a/g",
        ]
        script_command = command_patterns.ShellScriptCommand(
            script=r'echo ${@: 1:$#-1} | sed ${@: $#}',
            args=tricky_args)

        output = command.execute_with_output(script_command)

        expected = "1 2 $(pad) ;ls ; ls ls && ls `pad` > test.txt > test.txt >> test.txt && ls $ abc lspad\n"
        self.assertEqual(output, expected)

示例#24

0

显示文件

文件： run_bowtie2.py 项目： rcs333/idseq-dag

    def run(self):
        """Align the input reads with bowtie2 and extract the unmapped reads.

        The bowtie2 index is fetched from S3, the first one or two input files
        are aligned into a SAM file (also registered for upload), and the
        unmapped reads are written to the step's output files.
        """
        input_fas = self.input_files_local[0][0:2]
        output_fas = self.output_files_local()
        genome_dir = fetch_from_s3(self.additional_files["bowtie2_genome"],
                                   self.ref_dir_local,
                                   allow_s3mi=True,
                                   auto_untar=True)
        sam_name = self.additional_attributes["output_sam_file"]
        output_sam_file = os.path.join(self.output_dir_local, sam_name)
        self.additional_files_to_upload.append(output_sam_file)

        # The file structure looks like
        # "bowtie2_genome/GRCh38.primary_assembly.genome.3.bt2"
        # Cutting six characters and then an optional trailing dot handles up
        # to "bowtie2_genome/GRCh38.primary_assembly.genome.99.bt2" but not 100.
        listing = command.execute_with_output(
            "ls {genome_dir}/*.bt2*".format(genome_dir=genome_dir))
        first_index_file = listing.split("\n")[0]
        genome_basename = first_index_file[:-6]
        if genome_basename[-1] == '.':
            genome_basename = genome_basename[:-1]

        bowtie2_params = [
            'bowtie2', '-q', '-x', genome_basename, '-f',
            '--very-sensitive-local', '-S', output_sam_file
        ]

        seed = self.additional_attributes.get("random_seed")
        if seed:
            bowtie2_params += ['--seed', str(seed)]
        else:
            # Seed option won't work with -p threading option.
            bowtie2_params += ['-p', str(multiprocessing.cpu_count())]

        paired = len(input_fas) == 2
        if paired:
            bowtie2_params += ['-1', input_fas[0], '-2', input_fas[1]]
        else:
            bowtie2_params += ['-U', input_fas[0]]
        command.execute(" ".join(bowtie2_params))
        log.write("Finished Bowtie alignment.")

        if paired:
            convert.generate_unmapped_pairs_from_sam(output_sam_file,
                                                     output_fas)
        else:
            convert.generate_unmapped_singles_from_sam(output_sam_file,
                                                       output_fas[0])

示例#25

0

显示文件

文件： generate_phylo_tree.py 项目： jonason91/idseq-workflows

 def get_taxid_genomes(genome_list_local, taxid, n_per_taxid):
     """Return up to n_per_taxid assembly-summary rows whose taxid column matches.

     Each returned row is a tab-separated string with the columns
     assembly_accession, taxid, species_taxid, organism_name and ftp_path,
     cut from the file at genome_list_local.
     """
     script = "".join([
         # columns: 1 = assembly_accession; 6 = taxid; 7 = species_taxid, 8 = organism_name, 20 = ftp_path
         r'''cut -f1,6,7,8,20 "${genome_list_local}" ''',
         # try to find taxid in the taxid column (2nd column of the piped input)
         r''' | awk -F '\t' "${awk_find_pattern}" ''',
         # take only top n_per_taxid results
         r''' | head -n "${n_per_taxid}";''',
     ])
     named_args = {
         'genome_list_local': genome_list_local,
         'awk_find_pattern': f'$2 == "{taxid}"',
         'n_per_taxid': n_per_taxid,
     }
     cmd = command_patterns.ShellScriptCommand(script=script,
                                               named_args=named_args)
     output = command.execute_with_output(cmd)
     # Drop the empty strings left over by the split (e.g. after a final newline).
     return [row for row in output.split("\n") if row]

示例#26

0

显示文件

def reads(local_file_path, max_reads=None):
    '''
    Return the number of reads in a local file, optionally capped at max_reads.

    The file format is inferred from the extension (the one before ".gz" for
    gzipped files). Lines are counted with a cat/zcat | head | wc -l shell
    pipeline and converted to reads with lines2reads.
    '''
    extensions = local_file_path.split(".")
    if local_file_path.endswith(".gz"):
        pipeline = ["zcat {}".format(local_file_path)]
        file_format = extensions[-2]
    else:
        pipeline = ["cat {}".format(local_file_path)]
        file_format = extensions[-1]

    if max_reads:
        # Cap the number of lines read so that at most max_reads are counted.
        max_lines = reads2lines(max_reads, file_format)
        assert max_lines is not None, "Could not convert max_reads to max_lines"
        pipeline.append(" | head -n {}".format(max_lines))

    pipeline.append(" |  wc -l")
    line_count = int(command.execute_with_output("".join(pipeline)))
    return lines2reads(line_count, file_format)

示例#27

0

显示文件

    def test_execute_shell_script_command_1(self):
        '''WHEN ShellScriptCommand args contain spaces or special characters, THEN each stays a single argument'''
        # $1 = scratch file, $2 = grep output file, remaining args go to grep.
        script = r'''
                echo May the force be with you >> "$1"
                echo The truth is out there > "$1"
                echo Live longer and prosper >> "$1"
                grep "${@:3}" "$1" > "$2"
                cat "$2"
                rm "$1" "$2"
            '''
        scratch_file = r'''/tmp/tmp file with spaces, 'quotes', "double-quotes" and other bizarre characters `~&>.txt'''
        grep_output_file = r'''/tmp/another tmp output file.txt'''
        script_command = command_patterns.ShellScriptCommand(
            script=script,
            args=[
                scratch_file, grep_output_file,
                "-e", "is out", "-e", "longer and prosper"
            ])

        output = command.execute_with_output(script_command)

        self.assertEqual(output,
                         "The truth is out there\nLive longer and prosper\n")

示例#28

0

显示文件

def get_server_ips_work(service_name, environment, draining_tag):
    """Return a dict mapping relevant instance private IPs to instance IDs.

    Runs ``aws ec2 describe-instances`` filtered to running instances whose
    ``service`` tag equals ``"<service_name>-<environment>"`` and keeps only
    the instances that do not carry a tag whose key is ``draining_tag``.

    Args:
        service_name: First half of the ``service`` tag value.
        environment: Second half of the ``service`` tag value.
        draining_tag: Tag key; instances having this key are left out.

    Returns:
        dict of ``PrivateIpAddress`` (of the instance's first network
        interface) to ``InstanceId``.
    """
    service_tag_value = f"{service_name}-{environment}"
    describe_command = command_patterns.SingleCommand(
        cmd="aws",
        args=[
            "ec2",
            "describe-instances",
            "--filters",
            f"Name=tag:service,Values={service_tag_value}",
            "Name=instance-state-name,Values=running",
        ],
    )
    raw_description = command.execute_with_output(describe_command)
    description = json.loads(raw_description)

    ip_to_instance_id = {}
    for reservation in description["Reservations"]:
        for instance in reservation["Instances"]:
            tag_keys = [tag["Key"] for tag in instance["Tags"]]
            if draining_tag in tag_keys:
                # Instance is marked as draining; do not hand it out.
                continue
            private_ip = instance["NetworkInterfaces"][0]["PrivateIpAddress"]
            ip_to_instance_id[private_ip] = instance["InstanceId"]
    return ip_to_instance_id

示例#29

0

显示文件

    def run_chunk(self, part_suffix, remote_home_dir, remote_index_dir,
                  remote_work_dir, remote_username, input_files, key_path,
                  service, lazy_run):
        """Dispatch a chunk to worker machines for distributed GSNAP or RAPSearch
        group machines and handle their execution.

        The method builds one remote shell command that creates
        ``remote_work_dir``, downloads every input file from
        ``self.chunks_result_dir_s3`` and runs the aligner. It then runs that
        command on a server obtained from ``server.wait_for_server_ip``,
        checks the column count of the remote output, copies the output to
        the local result directory and uploads it to S3.

        Args:
            part_suffix: Separator used to derive the chunk id; the chunk id is
                the text after the last occurrence of it in ``input_files[0]``.
            remote_home_dir: Remote directory containing ``bin/gsnapl``
                (only used when ``service == "gsnap"``).
            remote_index_dir: Remote directory of the alignment index.
            remote_work_dir: Remote directory that receives the inputs and the
                output file.
            remote_username: User name passed to the remote/scp helpers.
            input_files: File names of the chunk, relative to the S3 chunk
                directory and to ``remote_work_dir``.
            key_path: Key file path passed to the remote/scp helpers.
            service: Either ``"gsnap"`` or ``"rapsearch2"``.
            lazy_run: When truthy, first try ``fetch_from_s3`` for an existing
                result and skip the alignment if that call returns a truthy
                value.

        Returns:
            Local path of the multihit ``.m8`` output file for this chunk.

        Raises:
            AssertionError: if ``service`` is not one of the two supported
                values, or if the remote output still does not have 12
                columns in every row after the allowed number of tries.
        """
        assert service in ("gsnap", "rapsearch2")

        # The chunk id is whatever follows part_suffix in the first input name.
        chunk_id = input_files[0].split(part_suffix)[-1]
        # TODO: Switch to python 3.6 which supports interpolation in string
        # formatting, and we will halve the number of lines below.
        multihit_basename = "multihit-{service}-out{part_suffix}{chunk_id}.m8".format(
            service=service,
            part_suffix=part_suffix,
            chunk_id=chunk_id,
        )
        # Same basename in three places: local result dir, remote work dir, S3.
        multihit_local_outfile = os.path.join(self.chunks_result_dir_local,
                                              multihit_basename)
        multihit_remote_outfile = os.path.join(remote_work_dir,
                                               multihit_basename)
        multihit_s3_outfile = os.path.join(self.chunks_result_dir_s3,
                                           multihit_basename)

        # One "aws s3 cp" per input file, chained with " ; " so they run
        # sequentially on the remote machine.
        base_str = "aws s3 cp --only-show-errors {s3_path}/{input_fa} {remote_work_dir}/{input_fa} "
        download_input_from_s3 = " ; ".join(
            base_str.format(s3_path=self.chunks_result_dir_s3,
                            input_fa=input_fa,
                            remote_work_dir=remote_work_dir)
            for input_fa in input_files)

        # Common prefix of the remote command: make the work dir, fetch inputs.
        # The placeholders are filled in by the .format() call further down.
        base_str = "mkdir -p {remote_work_dir} ; {download_input_from_s3} ; "
        if service == "gsnap":
            # gsnapl writes its result to stdout, which is redirected to the
            # remote output file.
            commands = base_str + "{remote_home_dir}/bin/gsnapl -A m8 --batch=0 --use-shared-memory=0 --gmap-mode=none --npaths=100 --ordered -t 36 --maxsearch=1000 --max-mismatches=40 -D {remote_index_dir} -d nt_k16 {remote_input_files} > {multihit_remote_outfile}"
        else:
            # rapsearch is told its output name via -o.
            commands = base_str + "/usr/local/bin/rapsearch -d {remote_index_dir}/nr_rapsearch -e -6 -l 10 -a T -b 0 -v 50 -z 24 -q {remote_input_files} -o {multihit_remote_outfile}"

        commands = commands.format(
            remote_work_dir=remote_work_dir,
            download_input_from_s3=download_input_from_s3,
            remote_home_dir=remote_home_dir,
            remote_index_dir=remote_index_dir,
            remote_input_files=" ".join(remote_work_dir + "/" + input_fa
                                        for input_fa in input_files),
            multihit_remote_outfile=multihit_remote_outfile
            if service == "gsnap" else multihit_remote_outfile[:-3]
            # Strip the .m8 for RAPSearch as it adds that
        )

        # Run the alignment unless lazy_run is set and the result could be
        # fetched from S3 (presumably fetch_from_s3 returns a truthy value on
        # success -- confirm against its definition).
        if not lazy_run or not fetch_from_s3(multihit_s3_outfile,
                                             multihit_local_outfile):
            correct_number_of_output_columns = 12
            min_column_number = 0
            max_tries = 2
            try_number = 1
            instance_ip = ""

            def interpret_min_column_number_string(
                    min_column_number_string, correct_number_of_output_columns,
                    try_number):
                # Turn the verification command's output into a column count.
                # An empty string means the output file had no hit lines, which
                # is treated as a valid result (the correct count is returned).
                if min_column_number_string:
                    min_column_number = float(min_column_number_string)
                    log.write(
                        "Try no. %d: Smallest number of columns observed in any line was %d"
                        % (try_number, min_column_number))
                else:
                    log.write("Try no. %d: No hits" % try_number)
                    min_column_number = correct_number_of_output_columns
                return min_column_number

            # Check if every row has correct number of columns (12) in the output
            # file on the remote machine
            while min_column_number != correct_number_of_output_columns \
                    and try_number <= max_tries:
                log.write("waiting for {} server for chunk {}".format(
                    service, chunk_id))
                max_concurrent = self.additional_attributes["max_concurrent"]
                environment = self.additional_attributes["environment"]

                # A server is requested anew on every try, so a retry may land
                # on a different instance than the failed attempt.
                instance_ip = server.wait_for_server_ip(
                    service, key_path, remote_username, environment,
                    max_concurrent, chunk_id)
                log.write("starting alignment for chunk %s on %s server %s" %
                          (chunk_id, service, instance_ip))
                command.execute(
                    command.remote(commands, key_path, remote_username,
                                   instance_ip))

                if service == "gsnap":
                    verification_command = "cat %s" % multihit_remote_outfile
                else:
                    # For rapsearch, first remove header lines starting with '#'
                    verification_command = "grep -v '^#' %s" % multihit_remote_outfile
                # Print the field count of every line and keep the smallest one.
                verification_command += " | awk '{print NF}' | sort -nu | head -n 1"
                min_column_number_string = command.execute_with_output(
                    command.remote(verification_command, key_path,
                                   remote_username, instance_ip))
                min_column_number = interpret_min_column_number_string(
                    min_column_number_string, correct_number_of_output_columns,
                    try_number)
                try_number += 1

            # Move output from remote machine to local machine
            msg = "Chunk %s output corrupt; not copying to S3. Re-start pipeline " \
                  "to try again." % chunk_id
            assert min_column_number == correct_number_of_output_columns, msg

            with self.iostream_upload:  # Limit concurrent uploads so as not to stall the pipeline.
                # Copy remote -> local first, then local -> S3.
                command.execute(
                    command.scp(key_path, remote_username, instance_ip,
                                multihit_remote_outfile,
                                multihit_local_outfile))
                command.execute(
                    "aws s3 cp --only-show-errors %s %s/" %
                    (multihit_local_outfile, self.chunks_result_dir_s3))
            log.write("finished alignment for chunk %s on %s server %s" %
                      (chunk_id, service, instance_ip))
        return multihit_local_outfile

示例#30

0

显示文件

    def chunk_input(self, input_files, chunksize):
        """Split each input file into numbered pieces and sync them to S3.

        Every file is cut into pieces of ``chunksize * 2`` lines with
        ``split``; the pieces are written to ``self.chunks_result_dir_local``
        and synced to ``self.chunks_result_dir_s3``. All input files must have
        the same number of lines.

        Args:
            input_files: Paths of the (paired) files to split.
            chunksize: Chunk size; each piece holds twice this many lines
                (presumably two lines per record -- confirm with the caller).

        Returns:
            Tuple ``(part_suffix, input_chunks)``. ``part_suffix`` is the
            suffix built for the last file processed (``""`` when there are no
            input files) and ``input_chunks`` is a list holding, per piece
            index, the list of piece names across all input files.

        Raises:
            AssertionError: if the files differ in line count or the piece
                names found on disk differ from the expected ones.
        """
        lines_per_chunk = chunksize * 2
        pieces_per_file = []  # One list of piece names per input file
        expected_line_count = None
        part_suffix = ""

        for input_file in input_files:
            # Ask wc for the number of lines in this file
            wc_command = command_patterns.SingleCommand(
                cmd="wc", args=["-l", input_file])
            wc_stdout = command.execute_with_output(wc_command)
            nlines = int(wc_stdout.strip().split()[0])

            # Paired files have to agree on their line count
            if expected_line_count is not None:
                msg = "Mismatched line counts in supposedly paired files: {}".format(
                    input_files)
                assert nlines == expected_line_count, msg
            expected_line_count = nlines

            # Ceiling division gives the piece count; the numeric suffix must
            # be wide enough for the highest piece index
            numparts = (nlines + lines_per_chunk - 1) // lines_per_chunk
            ndigits = len(str(numparts - 1))
            part_suffix = "-chunksize-%d-numparts-%d-part-" % (chunksize,
                                                               numparts)
            out_prefix_base = os.path.basename(input_file) + part_suffix
            out_prefix = os.path.join(self.chunks_result_dir_local,
                                      out_prefix_base)

            # Cut the file into pieces named <out_prefix><zero-padded index>
            split_command = command_patterns.SingleCommand(
                cmd="split",
                args=[
                    "-a", ndigits, "--numeric-suffixes", "-l",
                    lines_per_chunk, input_file, out_prefix
                ])
            command.execute(split_command)

            # Upload only the pieces that belong to this input file
            sync_command = command_patterns.SingleCommand(
                cmd="aws",
                args=[
                    "s3", "sync", "--only-show-errors",
                    os.path.join(self.chunks_result_dir_local, ""),
                    os.path.join(self.chunks_result_dir_s3, ""),
                    "--exclude", "*", "--include", out_prefix_base + "*"
                ])
            command.execute_with_retries(sync_command)

            # Collect the piece names that actually exist
            partial_files = list(
                command.glob(glob_pattern=out_prefix + "*",
                             strip_folder_names=True))

            # Compare them with the names the chunking scheme should produce
            index_format = "{:0%dd}" % ndigits
            expected_partial_files = []
            for part_index in range(numparts):
                expected_partial_files.append(
                    out_prefix_base + index_format.format(part_index))
            msg = "something went wrong with chunking: {} != {}".format(
                partial_files, expected_partial_files)
            assert expected_partial_files == partial_files, msg
            pieces_per_file.append(partial_files)

        # Transpose from per-file lists to per-piece lists, e.g.
        # [["input_R1.fasta-part-1", "input_R2.fasta-part-1"],
        #  ["input_R1.fasta-part-2", "input_R2.fasta-part-2"], ...]
        input_chunks = [list(group) for group in zip(*pieces_per_file)]
        return part_suffix, input_chunks