def fetch_ncbi(accession):
    """Fetch the GenBank XML record for an NCBI nuccore search term.

    Issues two ``curl`` requests against NCBI E-utilities: an ``esearch``
    with ``usehistory=y``, then an ``efetch`` that reuses the ``WebEnv`` and
    ``QueryKey`` values parsed from the esearch XML response.

    Args:
        accession: Term to search for in the ``nuccore`` database.

    Returns:
        dict with the keys ``search_url``, ``fetch_url`` and ``genbank_xml``
        (the raw output of the efetch request).
    """
    from urllib.parse import quote

    base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
    # Percent-encode the term so characters such as spaces, '&', '#' or
    # square brackets cannot truncate or rewrite the query string. Letters,
    # digits and "_.-~" are left alone, so ordinary accessions yield the
    # same URL as before.
    query = quote(str(accession), safe="")
    search_url = f"{base}/esearch.fcgi?db=nuccore&term={query}&usehistory=y"
    output = command.execute_with_output(
        command_patterns.SingleCommand(
            cmd="curl",
            args=[search_url]
        )
    )
    # The history handles returned by esearch identify the result set that
    # efetch should download.
    # NOTE(review): if either element is missing, find() presumably returns
    # None and the .text access fails - confirm ET is xml.etree.ElementTree.
    root = ET.fromstring(output)
    web = root.find('WebEnv').text
    key = root.find('QueryKey').text
    fetch_url = f"{base}/efetch.fcgi?db=nuccore&query_key={key}&WebEnv={web}&rettype=gb&retmode=xml"
    genbank_xml = command.execute_with_output(
        command_patterns.SingleCommand(
            cmd="curl",
            args=[fetch_url]
        )
    )
    return {
        'search_url': search_url,
        'fetch_url': fetch_url,
        'genbank_xml': genbank_xml
    }
def chunk_input(self, input_files, chunksize):
    """Chunk input files into pieces for performance and parallelism.

    Every input file is cut with ``split`` into pieces of ``chunksize * 2``
    lines, the pieces are synced from ``self.chunks_result_dir_local`` to
    ``self.chunks_result_dir_s3``, and the resulting file names are checked
    against the expected zero-padded numeric suffixes.

    Args:
        input_files: Paths of the files to split; all must have the same
            number of lines.
        chunksize: Chunk size; each piece holds ``chunksize * 2`` lines.

    Returns:
        Tuple ``(part_suffix, input_chunks)``. ``part_suffix`` is the suffix
        computed for the last input file; ``input_chunks`` has one list per
        piece index holding that piece's file name for every input file.

    Raises:
        AssertionError: if the input files differ in line count, or the
            files produced by ``split`` do not match the expected names.
    """
    import shlex

    part_lists = []  # Lists of partial files
    known_nlines = None
    part_suffix = ""
    chunk_nlines = chunksize * 2

    for input_file in input_files:
        # These commands are handed to the shell as plain strings, so every
        # path is quoted to survive spaces and shell metacharacters.
        quoted_input = shlex.quote(input_file)

        # Count number of lines in the file
        nlines = int(
            command.execute_with_output("wc -l %s" % quoted_input).strip().split()[0])
        # Number of lines should be the same in paired files
        if known_nlines is not None:
            msg = "Mismatched line counts in supposedly paired files: {}".format(
                input_files)
            assert nlines == known_nlines, msg
        known_nlines = nlines

        # Set number of pieces and names
        numparts = (nlines + chunk_nlines - 1) // chunk_nlines
        ndigits = len(str(numparts - 1))
        part_suffix = "-chunksize-%d-numparts-%d-part-" % (chunksize, numparts)
        out_prefix_base = os.path.basename(input_file) + part_suffix
        out_prefix = os.path.join(self.chunks_result_dir_local, out_prefix_base)

        # Split large file into smaller named pieces
        command.execute("split -a %d --numeric-suffixes -l %d %s %s" %
                        (ndigits, chunk_nlines, quoted_input, shlex.quote(out_prefix)))
        local_dir = shlex.quote("%s/" % self.chunks_result_dir_local)
        s3_dir = shlex.quote("%s/" % self.chunks_result_dir_s3)
        include_pattern = shlex.quote(out_prefix_base + "*")
        command.execute_with_retries(
            f"aws s3 sync --only-show-errors {local_dir} {s3_dir} --exclude '*' --include {include_pattern}"
        )

        # Get the partial file names. The glob star stays outside the quoted
        # prefix so the shell still expands it.
        paths = command.execute_with_output(
            "ls %s*" % shlex.quote(out_prefix)).rstrip().split("\n")
        partial_files = [os.path.basename(pf) for pf in paths]

        # Check that the partial files match our expected chunking pattern
        pattern = "{:0%dd}" % ndigits
        expected_partial_files = [(out_prefix_base + pattern.format(i))
                                  for i in range(numparts)]
        msg = "something went wrong with chunking: {} != {}".format(
            partial_files, expected_partial_files)
        assert expected_partial_files == partial_files, msg
        part_lists.append(partial_files)

    # Ex: [["input_R1.fasta-part-1", "input_R2.fasta-part-1"],
    #      ["input_R1.fasta-part-2", "input_R2.fasta-part-2"],
    #      ["input_R1.fasta-part-3", "input_R2.fasta-part-3"],...]
    input_chunks = [list(part) for part in zip(*part_lists)]
    return part_suffix, input_chunks
def get_total_reads(self, is_zipped, is_fasta):
    """Gets the total number of reads in the sample by counting them directly
    from the fastq or fasta files.

    Args:
        is_zipped: When true, each input file is decompressed with
            ``gunzip -k`` (unless the decompressed file already exists) and
            the decompressed files are counted instead.
        is_fasta: When true, reads are counted as lines starting with ``>``;
            otherwise the total line count is divided by 4.

    Returns:
        The read count: an ``int`` for fasta input, and the result of a true
        division (a ``float``) for fastq input.
    """
    # TODO: factor out into utility function, see nonhost_fastq
    input_filenames = self.input_files_local[0]
    if is_zipped:
        unzipped_filenames = []
        for filename in input_filenames:
            # Drop the last three characters (the ".gz" suffix).
            unzipped_filename = filename[:-3]
            if not os.path.exists(unzipped_filename):
                command.execute(
                    command_patterns.SingleCommand(
                        cmd='gunzip',
                        args=[
                            '-k',
                            filename
                        ]
                    )
                )
            unzipped_filenames.append(unzipped_filename)
        input_filenames = unzipped_filenames

    if is_fasta:
        # Number of lines per read can vary, so we use grep
        grep_output = command.execute_with_output(
            command_patterns.SingleCommand(
                cmd='grep',
                args=[
                    '-c',
                    '^>',  # fastas start reads with "^>".
                    *input_filenames
                ]
            )
        )
        output_lines = [line for line in grep_output.split("\n") if line != '']
        if ":" in output_lines[0]:
            # for paired fastas - when run on just one file, grep outputs only
            # a number. But when this command is run on two files, grep outputs
            # a string formatted as filename:count for each file, with count being
            # what we want to add up. The count is taken from after the LAST
            # colon so that a ':' inside the file name cannot be mistaken for
            # the separator.
            return sum(int(line.rsplit(":", 1)[1]) for line in output_lines)
        return int(output_lines[0])

    # fastqs have 4 lines for every read, so we count lines and divide by 4.
    # wc is invoked through SingleCommand, like gunzip and grep above, so file
    # names with spaces or shell metacharacters are passed through intact.
    wc_output = command.execute_with_output(
        command_patterns.SingleCommand(
            cmd='wc',
            args=['-l', *input_filenames]
        )
    )
    # take the first field of the last line, which is the total number of lines
    # for paired reads or the only line for unpaired reads
    wc_lines = [line for line in wc_output.split("\n") if line != '']
    total_line_count = int(wc_lines[-1].split()[0])
    # NOTE(review): true division yields a float here while the fasta branch
    # returns an int; kept as-is because callers may depend on it - confirm.
    return total_line_count / 4
def get_genbank_genomes(self, reference_taxids, destination_dir, superkingdom_name, n=10):
    '''
    Retrieve up to n GenBank reference genomes under the reference_taxids.
    Assumes reference_taxids are species-level or below.
    Also assumes they are all in the same superkingdom, which is the only thing we need in our application.
    Saves the references under file names compatible with MakeKSNP3infile.
    TODO: Retrieve the genomes from S3 rather than ftp.ncbi.nih.gov (JIRA/IDSEQ-334).

    Returns a dict mapping assembly accession to the local fasta path. The
    dict is empty when n is 0, when reference_taxids is empty, or when no
    category lists a matching genome. Categories are searched in order and
    the search stops at the first one that yields any genome.

    Raises KeyError if superkingdom_name is not one of "Viruses", "Bacteria",
    "Eukaryota" or None.
    '''
    import shlex

    if n == 0 or not reference_taxids:
        return {}
    n_per_taxid = max(n // len(reference_taxids), 1)
    genbank_categories_by_superkingdom = {
        "Viruses": ["viral"],
        "Bacteria": ["bacteria"],
        "Eukaryota": ["fungi", "protozoa"],
        None: ["bacteria", "viral", "fungi", "protozoa"]
    }
    # additional options in genbank that we probably don't need right now:
    # ["archaea", "plant",
    # "vertebrate_mammalian", "vertebrate_other", "invertebrate",
    # "other", "metagenomes"]
    categories = genbank_categories_by_superkingdom[superkingdom_name]
    for cat in categories:
        genome_list_path_s3 = f"s3://idseq-database/genbank/{cat}/assembly_summary.txt"
        # source: ftp://ftp.ncbi.nih.gov/genomes/genbank/{cat}/assembly_summary.txt
        genome_list_local = s3.fetch_from_s3(genome_list_path_s3, destination_dir)
        genomes = []
        for taxid in reference_taxids:
            # columns: 1 = assembly_accession; 6 = taxid; 7 = species_taxid, 8 = organism_name, 20 = ftp_path
            cmd = f"cut -f1,6,7,8,20 {genome_list_local}"
            # try to find taxid in the taxid column (2nd column of the piped input)
            cmd += f" | awk -F '\t' '$2 == {taxid}'"
            # take only top n_per_taxid results
            cmd += f" | head -n {n_per_taxid}"
            taxid_genomes = list(filter(None, command.execute_with_output(cmd).split("\n")))
            genomes += [entry for entry in taxid_genomes if entry not in genomes]
        genomes = genomes[:n]
        command.execute_with_output(f"rm {genome_list_local}")
        if genomes:
            genbank_fastas = {}
            for line in genomes:
                assembly_accession, taxid, species_taxid, organism_name, ftp_path = line.split("\t")
                ftp_fasta_gz = f"{ftp_path}/{os.path.basename(ftp_path)}_genomic.fna.gz"
                tree_node_name = f"genbank_{self.clean_name_for_ksnp3(assembly_accession)}"
                local_fasta = f"{destination_dir}/{tree_node_name}.fasta"
                if os.path.isfile(local_fasta):
                    # Strip only the trailing ".fasta". Splitting on the first
                    # '.' of the whole path would truncate it whenever
                    # destination_dir itself contains a dot.
                    local_fasta = f"{os.path.splitext(local_fasta)[0]}__I.fasta"
                # ftp_path comes from the downloaded assembly summary, so it is
                # quoted before being placed in a shell command line.
                local_gz = shlex.quote(f"{local_fasta}.gz")
                command.execute(f"wget -O {local_gz} {shlex.quote(ftp_fasta_gz)}")
                command.execute(f"gunzip {local_gz}")
                genbank_fastas[assembly_accession] = local_fasta
            return genbank_fastas
    return {}
def __check_if_output_is_corrupt(
        self, service, key_path, remote_username, instance_ip,  # self unused
        multihit_remote_outfile, chunk_id, try_number):
    """Return an error message when the remote output file looks corrupt, else None.

    Runs a command on the remote machine that prints the smallest number of
    whitespace-separated fields found on any line of the output file (for
    services other than gsnap, lines starting with '#' are dropped first),
    and compares the interpreted result with CORRECT_NUMBER_OF_OUTPUT_COLUMNS.
    """
    quoted_outfile = shlex.quote(multihit_remote_outfile)
    if service == "gsnap":
        reader = "cat %s" % quoted_outfile
    else:
        # For rapsearch, first remove header lines starting with '#'
        reader = "grep -v '^#' %s" % quoted_outfile
    # awk prints the field count of every line; sort -nu | head keeps the minimum.
    verification_command = reader + " | awk '{print NF}' | sort -nu | head -n 1"

    remote_command = command.remote(
        verification_command, key_path, remote_username, instance_ip)
    min_column_number_string = command.execute_with_output(remote_command)
    min_column_number = PipelineStepRunAlignmentRemotely.__interpret_min_column_number_string(
        min_column_number_string, CORRECT_NUMBER_OF_OUTPUT_COLUMNS, try_number)

    if min_column_number == CORRECT_NUMBER_OF_OUTPUT_COLUMNS:
        return None
    template = (
        "Chunk %s output corrupt; not copying to S3. min_column_number = %d -> expected = %d."
        " Re-start pipeline to try again."
    )
    return template % (chunk_id, min_column_number, CORRECT_NUMBER_OF_OUTPUT_COLUMNS)
def subsample_fastas(input_fas, output_fas, max_fragments):
    ''' In memory subsampling

    Keeps at most max_fragments records of each input fasta and writes them
    to the matching output path. The record count is taken as half the line
    count of the first input. If that count does not exceed max_fragments,
    the inputs are copied unchanged.

    input_fas / output_fas: parallel lists of 1 to 3 paths. The same sampled
    record indices are applied to the first two files; a third file is
    subset with indices 2*r and 2*r+1 for every sampled record r.
    '''
    import shlex

    paired = len(input_fas) >= 2
    # count lines; paths are quoted because the command is a shell string, and
    # the count is parsed in Python so padded wc output is handled too.
    cmd = "wc -l %s" % shlex.quote(input_fas[0])
    total_records = int(command.execute_with_output(cmd).split()[0]) // 2
    log.write("total reads: %d" % total_records)
    log.write("target reads: %d" % max_fragments)
    if total_records <= max_fragments:
        for infile, outfile in zip(input_fas, output_fas):
            command.execute("cp %s %s" % (shlex.quote(infile), shlex.quote(outfile)))
        return

    # total_records > max_fragments, sample
    # NOTE(review): hash() of a str varies between interpreter runs unless
    # PYTHONHASHSEED is fixed, so this seed is presumably not reproducible
    # across processes - confirm whether that matters to callers.
    randgen = random.Random(x=hash(input_fas[0]))
    records_to_keep = randgen.sample(range(total_records), max_fragments)
    PipelineStepRunSubsample.subset(input_fas[0], output_fas[0], records_to_keep)
    if paired:
        PipelineStepRunSubsample.subset(input_fas[1], output_fas[1], records_to_keep)
    if len(input_fas) == 3 and len(output_fas) == 3:
        # subset the merged fasta
        records_to_keep_merged = [
            index for r in records_to_keep for index in (2 * r, 2 * r + 1)
        ]
        PipelineStepRunSubsample.subset(input_fas[2], output_fas[2], records_to_keep_merged)
def get_accession_metadata(accession):
    '''
    Retrieve metadata of an NCBI accession (e.g. name, country, collection date)
    TODO: Put this data in S3 instead and get it from there.

    Returns a dict that may contain the keys 'name', 'country' and
    'collection_date'. The dict is empty (and a warning is logged) when the
    fetched XML has no usable GBSeq element.
    '''
    import shlex

    accession_metadata = {}
    # Raw strings keep the backslashes meant for the shell and for sed without
    # relying on Python passing unknown escape sequences through unchanged.
    efetch_command = ";".join([
        f"QUERY={shlex.quote(str(accession))}",
        "BASE=https://eutils.ncbi.nlm.nih.gov/entrez/eutils",
        r"SEARCH_URL=${BASE}/esearch.fcgi?db=nuccore\&term=${QUERY}\&usehistory=y",
        "OUTPUT=$(curl $SEARCH_URL)",
        r"WEB=$(echo $OUTPUT | sed -e 's/.*<WebEnv>\(.*\)<\/WebEnv>.*/\1/')",
        r"KEY=$(echo $OUTPUT | sed -e 's/.*<QueryKey>\(.*\)<\/QueryKey>.*/\1/')",
        r"FETCH_URL=${BASE}/efetch.fcgi?db=nuccore\&query_key=${KEY}\&WebEnv=${WEB}\&rettype=gb\&retmode=xml",
        "curl $FETCH_URL"
    ])
    genbank_xml = command.execute_with_output(efetch_command)
    root = ET.fromstring(genbank_xml).find('GBSeq')
    # Explicit checks instead of truth-testing the element: a missing GBSeq
    # (None) and a GBSeq without children both count as "no result".
    if root is None or len(root) == 0:
        log.write(f"WARNING: {efetch_command} did not give a result")
        return accession_metadata
    accession_metadata['name'] = root.find('GBSeq_definition').text
    qualifiers_needed = {'country', 'collection_date'}
    # Only the qualifiers of the first entry in the feature table are scanned.
    for entry in root.find('GBSeq_feature-table')[0].find('GBFeature_quals'):
        if all(key in accession_metadata for key in qualifiers_needed):
            break
        qualifier_name = entry.find('GBQualifier_name').text
        # The first occurrence of each needed qualifier wins.
        if qualifier_name in qualifiers_needed and qualifier_name not in accession_metadata:
            accession_metadata[qualifier_name] = entry.find('GBQualifier_value').text
    return accession_metadata
def reads(local_file_path, max_reads=None):
    '''
    Return the read count of a local file, looking at no more than max_reads
    reads when max_reads is given. The file format is taken from the file
    extension (the one before ".gz" for gzipped files), and the line count
    is converted with reads2lines / lines2reads.
    '''
    path_pieces = local_file_path.split(".")
    if local_file_path.endswith(".gz"):
        script = r'''zcat "${local_file_path}"'''
        file_format = path_pieces[-2]
    else:
        script = r'''cat "${local_file_path}"'''
        file_format = path_pieces[-1]

    script_args = {'local_file_path': local_file_path}
    if max_reads:
        max_lines = reads2lines(max_reads, file_format)
        assert max_lines is not None, "Could not convert max_reads to max_lines"
        # Cap the number of lines that reach wc.
        script += r''' | head -n "${max_lines}"'''
        script_args['max_lines'] = max_lines
    script += " | wc -l"

    counting_command = command_patterns.ShellScriptCommand(
        script=script,
        named_args=script_args,
    )
    wc_output = command.execute_with_output(counting_command)
    first_field = wc_output.strip().split(' ')[0]
    return lines2reads(int(first_field), file_format)
def subsample_fastas(input_fas, output_fas, max_fragments):
    ''' In memory subsampling

    Takes the record count as half the line count of the first input. When
    that count is within max_fragments, every input is copied to its output;
    otherwise max_fragments record indices are sampled and applied to the
    first two files, and to a third (merged) file as index pairs 2*r, 2*r+1.
    '''
    wc_output = command.execute_with_output(
        command_patterns.SingleCommand(cmd="wc", args=["-l", input_fas[0]]))
    total_records = int(wc_output.strip().split(' ')[0]) // 2
    log.write("total reads: %d" % total_records)
    log.write("target reads: %d" % max_fragments)

    # Nothing to drop: copy every file through unchanged.
    if total_records <= max_fragments:
        for source_path, target_path in zip(input_fas, output_fas):
            command.copy_file(source_path, target_path)
        return

    # More records than wanted: sample indices, seeded from the first path.
    sampler = random.Random(x=hash(input_fas[0]))
    records_to_keep = sampler.sample(range(total_records), max_fragments)
    PipelineStepRunSubsample.subset(input_fas[0], output_fas[0], records_to_keep)
    if len(input_fas) >= 2:
        PipelineStepRunSubsample.subset(input_fas[1], output_fas[1], records_to_keep)
    if len(input_fas) == 3 and len(output_fas) == 3:
        # The merged fasta gets two consecutive indices per sampled record.
        merged_indices = []
        for record in records_to_keep:
            merged_indices.extend((2 * record, 2 * record + 1))
        PipelineStepRunSubsample.subset(input_fas[2], output_fas[2], merged_indices)
def test_execute_python_cmd(self):
    '''WHEN a SingleCommand points at a .py file, THEN its output is returned'''
    happy_script = command_patterns.SingleCommand(
        cmd=TESTSCRIPT_HAPPY_PY,
        args=["Hello!"],
    )

    output = command.execute_with_output(happy_script)

    # The script's stdout, including the trailing newline, is returned verbatim.
    self.assertEqual(output, "Python is happy to say: Hello!\n")
def check_s3_presence(s3_path):
    """True if s3_path exists. False otherwise.

    Existence is decided by running ``aws s3 ls`` on the path: a non-empty
    listing means True; an empty listing or any failure of the command
    means False.
    """
    import shlex

    try:
        # The path is quoted because the command is passed as a shell string.
        listing = command.execute_with_output("aws s3 ls %s" % shlex.quote(s3_path))
    except Exception:
        # Deliberate best effort: any failure is reported as "not present".
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt and SystemExit propagate.
        return False
    return bool(listing)
def test_execute_legacy_format(self, _mock_log):
    '''WHEN the command is a plain string, THEN it runs and a legacy warning is logged'''
    output = command.execute_with_output("echo 123")

    self.assertEqual(output, "123\n")
    # Some log call must carry the warning that points users at command_patterns.
    expected_message = MATCH_RE(".*legacy.*Use.*command_patterns")
    _mock_log.assert_any_call(
        warning=True,
        message=expected_message,
        obj_data=ANY,
    )
def test_execute_single_command_2(self):
    '''WHEN using SingleCommand with a file path that contains a space, THEN the path is passed as one argument'''
    # Precondition: the fixture path really does contain a space.
    assert " " in TESTFILE_ABC_TXT

    cat_command = command_patterns.SingleCommand(
        cmd="cat",
        args=[TESTFILE_ABC_TXT],
    )
    output = command.execute_with_output(cat_command)

    self.assertEqual(output, "abc")
def name_samples_vcf(self, input_file, output_file):
    """Write a copy of a VCF whose per-run columns are labelled with sample names.

    The VCF has standard columns CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO,
    FORMAT, followed by 1 column for each of the pipeline_run_ids. This
    function replaces the pipeline_run_ids by the corresponding sample names
    so that users can understand the file. Run ids missing from
    ``self.additional_attributes["sample_names_by_run_ids"]`` become
    ``pipeline_run_<id>``; columns that are not integer-like are dropped from
    the new header.

    Args:
        input_file: Path of the VCF to read.
        output_file: Path the relabelled VCF is written to.
    """
    import shlex

    sample_names_by_run_ids = self.additional_attributes["sample_names_by_run_ids"]
    vcf_columns = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"
    # strip() removes the trailing newline of the awk output; without it the
    # last run id never matches its key and a newline ends up inside the sed
    # expression below.
    column_description_line = command.execute_with_output(
        f"awk /'^{vcf_columns}'/ {shlex.quote(input_file)}").strip()
    run_ids_in_order = [
        run_id
        for run_id in column_description_line.split("FORMAT\t")[1].split("\t")
        if convert.can_convert_to_int(run_id)
    ]
    sample_names_in_order = [
        sample_names_by_run_ids.get(run_id, f"pipeline_run_{run_id}")
        for run_id in run_ids_in_order
    ]
    new_column_description = '\t'.join([vcf_columns] + sample_names_in_order)
    # Sample names are free text: escape the characters that are special in a
    # sed replacement (backslash first, then the delimiter and '&').
    sed_replacement = (new_column_description
                       .replace("\\", "\\\\")
                       .replace("/", "\\/")
                       .replace("&", "\\&"))
    sed_script = "s/^#CHROM.*/%s/" % sed_replacement
    command.execute(
        f"sed {shlex.quote(sed_script)} {shlex.quote(input_file)} > {shlex.quote(output_file)}")
def test_shellscript_with_param_array(self):
    '''WHEN a ShellScriptCommand receives a list as a named argument, THEN the script can expand it as an array'''
    paste_arguments = ["-d", r"\n", TESTFILE_ABC_TXT, TESTFILE_BCD_TXT]
    paste_command = command_patterns.ShellScriptCommand(
        script=r'''paste "${slice_outputs[@]}"''',
        named_args={'slice_outputs': paste_arguments},
    )

    output = command.execute_with_output(paste_command)

    self.assertEqual(output, "abc\nbcd\n")
def test_specific_pattern_1(self):
    '''WHEN using ShellScriptCommand with multiline script, THEN it works as expected'''
    # The script assigns a shell variable, echoes a sentence that expands it,
    # and pipes the result through sed. The trailing backslash continues the
    # pipeline onto the following line of the script.
    cp1 = command_patterns.ShellScriptCommand(script=r'''
        a=123; echo May $a the force be with you \
            | sed "s/a/Z/g"
    ''', args=[])
    result = command.execute_with_output(cp1)
    # sed turns every "a" into "Z" ("May" -> "MZy"); the expanded value 123 has
    # no "a" and is left as it is.
    self.assertEqual(result, "MZy 123 the force be with you\n")
def poll_server(ip):
    # Count the processes matching service_name on one server and, when the
    # shared dict is still writable, record the count for that ip.
    #
    # ServerAliveInterval to fix issue with containers keeping open
    # an SSH connection even after worker machines had finished
    # running.
    process_listing = "ps aux | grep %s | grep -v bash || echo error" % service_name
    remote_listing = command.remote(process_listing, key_path, remote_username, ip)
    raw_output = command.execute_with_output(
        remote_listing, timeout=MAX_POLLING_LATENCY)
    output_lines = raw_output.rstrip().split("\n")

    # The remote pipeline prints just "error" when it fails; record nothing then.
    if output_lines == ["error"]:
        return
    with dict_mutex:
        if dict_writable:
            ip_nproc_dict[ip] = len(output_lines) - 1
def get_server_ips_work(service_name, environment):
    """Return the private IPs of running instances tagged service=<service_name>-<environment>.

    The instances are listed with ``aws ec2 describe-instances``; the IP is
    taken from the first network interface of each instance.
    """
    service_tag_value = "%s-%s" % (service_name, environment)
    describe_command = (
        "aws ec2 describe-instances --filters 'Name=tag:%s,Values=%s' 'Name=instance-state-name,Values=running'"
        % ("service", service_tag_value)
    )
    describe_json = json.loads(command.execute_with_output(describe_command))

    server_ips = []
    for reservation in describe_json["Reservations"]:
        for instance in reservation["Instances"]:
            first_interface = instance["NetworkInterfaces"][0]
            server_ips.append(first_interface["PrivateIpAddress"])
    return server_ips
def get_genbank_genomes(self, taxid, destination_dir, n=10):
    '''
    Retrieve up to n GenBank reference genomes under taxid.
    Assumes taxid is species-level.
    Saves the references under file names compatible with MakeKSNP3infile.

    Returns the list of local fasta paths that were downloaded. The list is
    empty when n is 0 or when no category lists a matching genome. Categories
    are searched in order and the search stops at the first one that yields
    any genome.
    '''
    import shlex

    if n == 0:
        return []
    categories = ["bacteria", "viral", "fungi", "protozoa"]
    # additional options in genbank that probably don't need right now:
    # ["archaea", "plant",
    # "vertebrate_mammalian", "vertebrate_other", "invertebrate",
    # "other", "metagenomes"]
    for cat in categories:
        genome_list_path = f"ftp://ftp.ncbi.nih.gov/genomes/genbank/{cat}/assembly_summary.txt"
        genome_list_local = f"{destination_dir}/{os.path.basename(genome_list_path)}"
        cmd = f"wget -O {genome_list_local} {genome_list_path}; "
        # columns: 6 = taxid; 7 = species_taxid, 8 = organism name, 20 = ftp_path
        cmd += f"cut -f6,7,8,20 {genome_list_local}"
        # try to find taxid in the species_taxids
        cmd += f" | grep -P '\\t{taxid}\\t'"
        # take only top n results, keep name and ftp_path
        cmd += f" | head -n {n} | cut -f1,3,4"
        genomes = list(filter(None, command.execute_with_output(cmd).split("\n")))
        command.execute_with_output(f"rm {genome_list_local}")
        if genomes:
            local_genbank_fastas = []
            for line in genomes:
                # A separate name keeps the taxid parameter from being
                # overwritten by the taxid of each listed genome.
                genome_taxid, organism_name, ftp_path = line.split("\t")
                clean_organism_name = PipelineStepGeneratePhyloTree.clean_name_for_ksnp3(organism_name)
                ftp_fasta_gz = f"{ftp_path}/{os.path.basename(ftp_path)}_genomic.fna.gz"
                local_fasta = f"{destination_dir}/genbank__{clean_organism_name}__taxid-{genome_taxid}.fasta"
                if os.path.isfile(local_fasta):
                    # Strip only the trailing ".fasta". Splitting on the first
                    # '.' of the whole path would truncate it whenever
                    # destination_dir itself contains a dot.
                    local_fasta = f"{os.path.splitext(local_fasta)[0]}__I.fasta"
                # ftp_path comes from the downloaded assembly summary, so it is
                # quoted before being placed in a shell command line.
                local_gz = shlex.quote(f"{local_fasta}.gz")
                command.execute(f"wget -O {local_gz} {shlex.quote(ftp_fasta_gz)}")
                command.execute(f"gunzip {local_gz}")
                local_genbank_fastas.append(local_fasta)
            return local_genbank_fastas
    return []
def get_server_ips_work(service_name, environment, draining_tag):
    ''' Map private IP -> instance ID for running service instances that do not carry draining_tag. '''
    service_tag_value = "%s-%s" % (service_name, environment)
    describe_command = (
        "aws ec2 describe-instances --filters 'Name=tag:%s,Values=%s' 'Name=instance-state-name,Values=running'"
        % ("service", service_tag_value)
    )
    describe_json = json.loads(command.execute_with_output(describe_command))

    server_ips = {}
    for reservation in describe_json["Reservations"]:
        for instance in reservation["Instances"]:
            tag_keys = [instance_tag["Key"] for instance_tag in instance["Tags"]]
            # Instances tagged as draining are left out.
            if draining_tag in tag_keys:
                continue
            private_ip = instance["NetworkInterfaces"][0]["PrivateIpAddress"]
            server_ips[private_ip] = instance["InstanceId"]
    return server_ips
def _vcf_new_column_description(input_file, sample_names_by_run_ids):
    """Build the VCF header line with run-id columns replaced by sample names.

    The header line is read from input_file with awk. Every column after
    FORMAT that is integer-like is looked up in sample_names_by_run_ids
    (falling back to ``pipeline_run_<id>``); other columns are kept as-is.
    """
    vcf_columns = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"
    header_lookup = command_patterns.SingleCommand(
        cmd='awk',
        args=[f"/^{vcf_columns}/", input_file],
    )
    header_line = command.execute_with_output(header_lookup).strip()
    additional_columns = header_line.split("FORMAT\t")[1].split("\t")

    relabelled_columns = []
    for column in additional_columns:
        if convert.can_convert_to_int(column):
            relabelled_columns.append(
                sample_names_by_run_ids.get(column, f"pipeline_run_{column}"))
        else:
            relabelled_columns.append(column)
    return '\t'.join([vcf_columns] + relabelled_columns)
def test_execute_single_command_1(self):
    '''WHEN SingleCommand args contain special shell characters, THEN they are echoed literally and no subcommand runs'''
    tricky_arguments = [
        "1",
        2,
        "$(pwd)",
        ";ls",
        "&& ls",
        "`pwd`",
        ">",
        "test.txt",
        "> test.txt",
        ">> test.txt",
        "&&",
        "ls",
        "$",
        "pwd",
    ]
    echo_command = command_patterns.SingleCommand(cmd="echo", args=tricky_arguments)

    output = command.execute_with_output(echo_command)

    # Every argument comes back verbatim, separated by single spaces.
    self.assertEqual(
        output,
        "1 2 $(pwd) ;ls && ls `pwd` > test.txt > test.txt >> test.txt && ls $ pwd\n"
    )
def test_execute_shell_script_command_2(self):
    '''WHEN ShellScriptCommand args contain special shell characters, THEN no subcommand is executed'''
    # The script echoes all arguments but the last and pipes them through sed,
    # using the last argument as the sed expression.
    script_arguments = [
        "1",
        2,
        "$(pwd)",
        ";ls",
        ";",
        "ls",
        "\n",
        "ls",
        "&& ls",
        "`pwd`",
        ">",
        "test.txt",
        "> test.txt",
        ">> test.txt",
        "&&",
        "ls",
        "$",
        "abc\nlspwd",  # one argument; shows up as "abc lspad" in the expected output
        "s/w/a/g",
    ]
    echo_through_sed = command_patterns.ShellScriptCommand(
        script=r'echo ${@: 1:$#-1} | sed ${@: $#}',
        args=script_arguments,
    )

    output = command.execute_with_output(echo_through_sed)

    self.assertEqual(
        output,
        "1 2 $(pad) ;ls ; ls ls && ls `pad` > test.txt > test.txt >> test.txt && ls $ abc lspad\n"
    )
def run(self):
    """Run bowtie2 on the first one or two input files and convert the SAM output.

    The bowtie2 index is fetched from S3, the alignment is written to the SAM
    file named by the "output_sam_file" attribute (also registered for
    upload), and the SAM file is then passed to the convert helper that
    generates unmapped pairs or singles into the step's output files.
    """
    input_fas = self.input_files_local[0][0:2]
    output_fas = self.output_files_local()
    genome_dir = fetch_from_s3(
        self.additional_files["bowtie2_genome"],
        self.ref_dir_local,
        allow_s3mi=True,
        auto_untar=True)
    output_sam_file = os.path.join(
        self.output_dir_local,
        self.additional_attributes["output_sam_file"])
    self.additional_files_to_upload.append(output_sam_file)

    # The file structure looks like
    # "bowtie2_genome/GRCh38.primary_assembly.genome.3.bt2"
    # The code below will handle up to "bowtie2_genome/GRCh38.primary_assembly.
    # genome.99.bt2" but not 100.
    listing = command.execute_with_output(f"ls {genome_dir}/*.bt2*")
    first_index_file = listing.split("\n")[0]
    genome_basename = first_index_file[:-6]
    if genome_basename[-1] == '.':
        genome_basename = genome_basename[:-1]

    is_paired = len(input_fas) == 2

    bowtie2_params = [
        'bowtie2', '-q', '-x', genome_basename, '-f',
        '--very-sensitive-local', '-S', output_sam_file
    ]
    seed = self.additional_attributes.get("random_seed")
    if seed:
        bowtie2_params += ['--seed', str(seed)]
    else:
        # Seed option won't work with -p threading option.
        bowtie2_params += ['-p', str(multiprocessing.cpu_count())]
    if is_paired:
        bowtie2_params += ['-1', input_fas[0], '-2', input_fas[1]]
    else:
        bowtie2_params += ['-U', input_fas[0]]

    command.execute(" ".join(bowtie2_params))
    log.write("Finished Bowtie alignment.")

    if is_paired:
        convert.generate_unmapped_pairs_from_sam(output_sam_file, output_fas)
    else:
        convert.generate_unmapped_singles_from_sam(output_sam_file, output_fas[0])
def get_taxid_genomes(genome_list_local, taxid, n_per_taxid):
    """Return up to n_per_taxid non-empty lines of the genome list whose taxid column equals taxid.

    Each returned line holds the tab-separated columns selected by ``cut``
    from genome_list_local.
    """
    pipeline_stages = (
        # columns: 1 = assembly_accession; 6 = taxid; 7 = species_taxid, 8 = organism_name, 20 = ftp_path
        r'''cut -f1,6,7,8,20 "${genome_list_local}" ''',
        # try to find taxid in the taxid column (2nd column of the piped input)
        r''' | awk -F '\t' "${awk_find_pattern}" ''',
        # take only top n_per_taxid results
        r''' | head -n "${n_per_taxid}";''',
    )
    lookup = command_patterns.ShellScriptCommand(
        script="".join(pipeline_stages),
        named_args={
            'genome_list_local': genome_list_local,
            'awk_find_pattern': f'$2 == "{taxid}"',
            'n_per_taxid': n_per_taxid
        }
    )
    output = command.execute_with_output(lookup)
    return [line for line in output.split("\n") if line]
def reads(local_file_path, max_reads=None):
    '''
    Count reads in a local file based on file format inferred from extension,
    up to a maximum of max_reads.

    The lines are counted with cat (or zcat for ".gz" files) piped into wc,
    optionally capped with head, and converted to reads with lines2reads.
    Raises AssertionError when max_reads cannot be converted to a line count.
    '''
    import shlex

    # The pipeline is handed to the shell as one string, so the path is quoted
    # to survive spaces and shell metacharacters.
    quoted_path = shlex.quote(local_file_path)
    if local_file_path.endswith(".gz"):
        cmd = "zcat {}".format(quoted_path)
        file_format = local_file_path.split(".")[-2]
    else:
        cmd = "cat {}".format(quoted_path)
        file_format = local_file_path.split(".")[-1]
    if max_reads:
        max_lines = reads2lines(max_reads, file_format)
        assert max_lines is not None, "Could not convert max_reads to max_lines"
        cmd += " | head -n {}".format(max_lines)
    cmd += " | wc -l"
    line_count = int(command.execute_with_output(cmd))
    return lines2reads(line_count, file_format)
def test_execute_shell_script_command_1(self):
    '''WHEN using ShellScriptCommand with args that contain spaces or special characters, THEN it doesn't split them into separate arguments'''
    # Script outline: append one line to "$1", overwrite "$1" with a second
    # line, append a third, grep "$1" with the remaining arguments into "$2",
    # print "$2" and finally delete both files.
    cp1 = command_patterns.ShellScriptCommand(
        script=r'''
            echo May the force be with you >> "$1"
            echo The truth is out there > "$1"
            echo Live longer and prosper >> "$1"
            grep "${@:3}" "$1" > "$2"
            cat "$2"
            rm "$1" "$2"
        ''',
        args=[
            # $1: a path full of spaces, quotes and shell metacharacters
            r'''/tmp/tmp file with spaces, 'quotes', "double-quotes" and other bizarre characters `~&>.txt''',
            # $2: output path containing spaces
            r'''/tmp/another tmp output file.txt''',
            # $3 onwards: grep options; both patterns contain spaces
            "-e",
            "is out",
            "-e",
            "longer and prosper"
        ])
    result = command.execute_with_output(cp1)
    # The first echo is overwritten by the ">" redirect, so only the second
    # and third lines exist in "$1"; each matches one of the two patterns.
    self.assertEqual(result, "The truth is out there\nLive longer and prosper\n")
def get_server_ips_work(service_name, environment, draining_tag):
    ''' Map private IP -> instance ID for running service instances that do not carry draining_tag. '''
    service_filter = "Name=tag:service,Values=%s-%s" % (service_name, environment)
    describe_instances = command_patterns.SingleCommand(
        cmd="aws",
        args=[
            "ec2",
            "describe-instances",
            "--filters",
            service_filter,
            "Name=instance-state-name,Values=running"
        ]
    )
    describe_json = json.loads(command.execute_with_output(describe_instances))

    server_ips = {}
    for reservation in describe_json["Reservations"]:
        for instance in reservation["Instances"]:
            tag_keys = [instance_tag["Key"] for instance_tag in instance["Tags"]]
            # Instances tagged as draining are left out.
            if draining_tag in tag_keys:
                continue
            private_ip = instance["NetworkInterfaces"][0]["PrivateIpAddress"]
            server_ips[private_ip] = instance["InstanceId"]
    return server_ips
def run_chunk(self, part_suffix, remote_home_dir, remote_index_dir,
              remote_work_dir, remote_username, input_files, key_path,
              service, lazy_run):
    """Dispatch a chunk to worker machines for distributed GSNAP or RAPSearch
    group machines and handle their execution.

    Builds the remote command line (create the work dir, download the chunk's
    input files from S3, run gsnapl or rapsearch), runs it on a server
    obtained from ``server.wait_for_server_ip``, verifies the column count of
    the remote output, then copies the output to the local chunk directory
    and to S3.

    Args:
        part_suffix: Suffix separating the input file name from the chunk id.
        remote_home_dir: Remote directory that contains ``bin/gsnapl``.
        remote_index_dir: Remote directory holding the alignment index.
        remote_work_dir: Remote directory used for inputs and outputs.
        remote_username: User name for the remote commands and scp.
        input_files: File names of this chunk (one per input file).
        key_path: Key file passed to ``command.remote`` / ``command.scp``.
        service: Either "gsnap" or "rapsearch2".
        lazy_run: When true, an output that can be fetched from S3 is reused
            and the alignment is skipped.

    Returns:
        Local path of the multihit output file for this chunk.

    Raises:
        AssertionError: if ``service`` is not supported, or if the output is
            still malformed after ``max_tries`` attempts.
    """
    assert service in ("gsnap", "rapsearch2")

    # The chunk id is whatever follows part_suffix in the first input name.
    chunk_id = input_files[0].split(part_suffix)[-1]
    # TODO: Switch to python 3.6 which supports interpolation in string
    # formatting, and we will half the number of lines below.
    multihit_basename = "multihit-{service}-out{part_suffix}{chunk_id}.m8".format(
        service=service,
        part_suffix=part_suffix,
        chunk_id=chunk_id,
    )
    multihit_local_outfile = os.path.join(self.chunks_result_dir_local,
                                          multihit_basename)
    multihit_remote_outfile = os.path.join(remote_work_dir, multihit_basename)
    multihit_s3_outfile = os.path.join(self.chunks_result_dir_s3,
                                       multihit_basename)

    # One "aws s3 cp" per input file, chained with " ; ".
    base_str = "aws s3 cp --only-show-errors {s3_path}/{input_fa} {remote_work_dir}/{input_fa} "
    download_input_from_s3 = " ; ".join(
        base_str.format(s3_path=self.chunks_result_dir_s3,
                        input_fa=input_fa,
                        remote_work_dir=remote_work_dir)
        for input_fa in input_files)

    # base_str is reused here for the common prefix of the remote command.
    base_str = "mkdir -p {remote_work_dir} ; {download_input_from_s3} ; "
    if service == "gsnap":
        commands = base_str + "{remote_home_dir}/bin/gsnapl -A m8 --batch=0 --use-shared-memory=0 --gmap-mode=none --npaths=100 --ordered -t 36 --maxsearch=1000 --max-mismatches=40 -D {remote_index_dir} -d nt_k16 {remote_input_files} > {multihit_remote_outfile}"
    else:
        commands = base_str + "/usr/local/bin/rapsearch -d {remote_index_dir}/nr_rapsearch -e -6 -l 10 -a T -b 0 -v 50 -z 24 -q {remote_input_files} -o {multihit_remote_outfile}"

    commands = commands.format(
        remote_work_dir=remote_work_dir,
        download_input_from_s3=download_input_from_s3,
        remote_home_dir=remote_home_dir,
        remote_index_dir=remote_index_dir,
        remote_input_files=" ".join(remote_work_dir + "/" + input_fa
                                    for input_fa in input_files),
        multihit_remote_outfile=multihit_remote_outfile if service == "gsnap" else
        multihit_remote_outfile[:-3]  # Strip the .m8 for RAPSearch as it adds that
    )

    # The alignment is skipped only when lazy_run is set AND the output could
    # be fetched from S3.
    if not lazy_run or not fetch_from_s3(multihit_s3_outfile,
                                         multihit_local_outfile):
        correct_number_of_output_columns = 12
        min_column_number = 0
        max_tries = 2
        try_number = 1
        instance_ip = ""

        def interpret_min_column_number_string(
                min_column_number_string, correct_number_of_output_columns,
                try_number):
            # Convert the verification output to a number. An empty string
            # (no lines in the output, i.e. no hits) counts as correct.
            if min_column_number_string:
                min_column_number = float(min_column_number_string)
                log.write(
                    "Try no. %d: Smallest number of columns observed in any line was %d"
                    % (try_number, min_column_number))
            else:
                log.write("Try no. %d: No hits" % try_number)
                min_column_number = correct_number_of_output_columns
            return min_column_number

        # Check if every row has correct number of columns (12) in the output
        # file on the remote machine
        while min_column_number != correct_number_of_output_columns \
                and try_number <= max_tries:
            log.write("waiting for {} server for chunk {}".format(
                service, chunk_id))
            max_concurrent = self.additional_attributes["max_concurrent"]
            environment = self.additional_attributes["environment"]

            # A server is requested anew for every attempt.
            instance_ip = server.wait_for_server_ip(
                service, key_path, remote_username, environment,
                max_concurrent, chunk_id)
            log.write("starting alignment for chunk %s on %s server %s" %
                      (chunk_id, service, instance_ip))
            command.execute(
                command.remote(commands, key_path, remote_username,
                               instance_ip))

            if service == "gsnap":
                verification_command = "cat %s" % multihit_remote_outfile
            else:
                # For rapsearch, first remove header lines starting with '#'
                verification_command = "grep -v '^#' %s" % multihit_remote_outfile
            # Print the field count of every line and keep the smallest one.
            verification_command += " | awk '{print NF}' | sort -nu | head -n 1"
            min_column_number_string = command.execute_with_output(
                command.remote(verification_command, key_path,
                               remote_username, instance_ip))
            min_column_number = interpret_min_column_number_string(
                min_column_number_string, correct_number_of_output_columns,
                try_number)
            try_number += 1

        # Move output from remote machine to local machine
        msg = "Chunk %s output corrupt; not copying to S3. Re-start pipeline " \
              "to try again." % chunk_id
        # NOTE(review): this check disappears under "python -O" - confirm that
        # is acceptable for a data-integrity guard.
        assert min_column_number == correct_number_of_output_columns, msg

        with self.iostream_upload:  # Limit concurrent uploads so as not to stall the pipeline.
            command.execute(
                command.scp(key_path, remote_username, instance_ip,
                            multihit_remote_outfile, multihit_local_outfile))
            command.execute(
                "aws s3 cp --only-show-errors %s %s/" %
                (multihit_local_outfile, self.chunks_result_dir_s3))
        log.write("finished alignment for chunk %s on %s server %s" %
                  (chunk_id, service, instance_ip))
    return multihit_local_outfile
def chunk_input(self, input_files, chunksize):
    """Split every input file into pieces of chunksize * 2 lines.

    The pieces are synced from self.chunks_result_dir_local to
    self.chunks_result_dir_s3 and their names are checked against the
    expected zero-padded numeric suffixes. All input files must have the
    same number of lines.

    Returns (part_suffix, input_chunks): the suffix computed for the last
    input file, and one list per piece index holding that piece's file name
    for every input file.
    """
    lines_per_chunk = chunksize * 2
    part_suffix = ""
    previous_nlines = None
    part_lists = []  # one list of piece names per input file

    for input_file in input_files:
        # Line count of this file; paired files must agree on it.
        wc_output = command.execute_with_output(
            command_patterns.SingleCommand(cmd="wc", args=["-l", input_file]))
        nlines = int(wc_output.strip().split()[0])
        if previous_nlines is not None:
            assert nlines == previous_nlines, \
                "Mismatched line counts in supposedly paired files: {}".format(input_files)
        previous_nlines = nlines

        # Number of pieces (rounded up) and the names derived from it.
        numparts = (nlines + lines_per_chunk - 1) // lines_per_chunk
        ndigits = len(str(numparts - 1))
        part_suffix = "-chunksize-%d-numparts-%d-part-" % (chunksize, numparts)
        out_prefix_base = os.path.basename(input_file) + part_suffix
        out_prefix = os.path.join(self.chunks_result_dir_local, out_prefix_base)

        # Cut the file into numbered pieces, then push them to S3.
        split_command = command_patterns.SingleCommand(
            cmd="split",
            args=[
                "-a", ndigits,
                "--numeric-suffixes",
                "-l", lines_per_chunk,
                input_file,
                out_prefix
            ])
        command.execute(split_command)

        sync_command = command_patterns.SingleCommand(
            cmd="aws",
            args=[
                "s3", "sync", "--only-show-errors",
                os.path.join(self.chunks_result_dir_local, ""),
                os.path.join(self.chunks_result_dir_s3, ""),
                "--exclude", "*",
                "--include", out_prefix_base + "*"
            ])
        command.execute_with_retries(sync_command)

        # Names of the pieces that actually exist locally.
        partial_files = []
        partial_files.extend(
            command.glob(glob_pattern=out_prefix + "*", strip_folder_names=True))

        # They must be exactly the zero-padded sequence 0 .. numparts - 1.
        expected_partial_files = [
            out_prefix_base + f"{index:0{ndigits}d}" for index in range(numparts)
        ]
        assert expected_partial_files == partial_files, \
            "something went wrong with chunking: {} != {}".format(
                partial_files, expected_partial_files)
        part_lists.append(partial_files)

    # Ex: [["input_R1.fasta-part-1", "input_R2.fasta-part-1"],
    #      ["input_R1.fasta-part-2", "input_R2.fasta-part-2"],
    #      ["input_R1.fasta-part-3", "input_R2.fasta-part-3"],...]
    input_chunks = [list(pieces) for pieces in zip(*part_lists)]
    return part_suffix, input_chunks