def filter_similar_plasmids(plasmid_scores, output_dir):
    """
    Fairly frequently the plasmids recovered end up being close to identical. This method clusters plasmids that
    are very similar and picks the best-scoring one from each cluster.
    :param plasmid_scores: Dictionary generated by find_plasmid_kmer_scores.
    :param output_dir: Directory to put temporary files.
    :return: List of plasmids to use, one per cluster.
    """
    # If only one plasmid, just return that.
    if len(plasmid_scores) == 1:
        return list(plasmid_scores.keys())
    # Otherwise, we create a distance matrix using mash.
    mash_results = list()
    i = 0
    for query_plasmid in plasmid_scores:
        mash_results.append(list())
        for reference_plasmid in plasmid_scores:
            mash.dist(query_plasmid, reference_plasmid,
                      output_file=os.path.join(output_dir, 'distances.tab'))
            result = mash.read_mash_output(os.path.join(output_dir, 'distances.tab'))
            mash_results[i].append(result)
        i += 1
    # Keep only the upper triangle of the all-vs-all distances, giving the condensed distance matrix SciPy expects.
    matrix = list()
    iteration = 1
    for result in mash_results:
        j = 1
        for item in result:
            if j > iteration:
                matrix.append(item[0].distance)
            j += 1
        iteration += 1
    # Once the distance matrix has been made, feed it into SciPy to do clustering.
    z = cluster.hierarchy.linkage(matrix, method='average')
    clustering = cluster.hierarchy.fcluster(z, 0.05, criterion='distance')
    num_clusters = max(clustering)
    clusters = list()
    # Create our clusters.
    plasmids_to_use = list()
    for i in range(num_clusters):
        clusters.append(list())
    plasmid_names = list(plasmid_scores.keys())
    for i in range(len(clustering)):
        clusters[clustering[i] - 1].append(plasmid_names[i])
    # Iterate through clusters, and use the highest scoring plasmid from each cluster for further analysis.
    for group in clusters:
        max_score = 0
        best_hit = ''
        for strain in group:
            if plasmid_scores[strain] > max_score:
                best_hit = strain
                max_score = plasmid_scores[strain]
        plasmids_to_use.append(best_hit)
    return plasmids_to_use

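# A minimal, self-contained sketch of the clustering step above, assuming the pairwise mash distances have already
# been computed into a dict rather than generated by live mash.dist() calls. The names pick_best_per_cluster and
# pairwise_distance are hypothetical; the point is the condensed (upper-triangle) distance vector that
# scipy.cluster.hierarchy.linkage() expects, built here with itertools.combinations instead of manual counters.
from itertools import combinations

from scipy.cluster import hierarchy


def pick_best_per_cluster(scores, pairwise_distance, cutoff=0.05):
    """
    :param scores: Dictionary of {plasmid_name: score}.
    :param pairwise_distance: Dictionary of {(name_a, name_b): mash distance} for every pair of plasmids.
    :param cutoff: Distance below which plasmids are treated as belonging to the same cluster.
    :return: List with the best-scoring plasmid from each cluster.
    """
    names = list(scores.keys())
    # Condensed distance matrix: one entry per unordered pair, in the row-major upper-triangle order
    # that linkage() expects.
    condensed = [pairwise_distance[(a, b)] if (a, b) in pairwise_distance else pairwise_distance[(b, a)]
                 for a, b in combinations(names, 2)]
    z = hierarchy.linkage(condensed, method='average')
    labels = hierarchy.fcluster(z, cutoff, criterion='distance')
    best_per_cluster = dict()
    for name, label in zip(names, labels):
        if label not in best_per_cluster or scores[name] > scores[best_per_cluster[label]]:
            best_per_cluster[label] = name
    return list(best_per_cluster.values())
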
def check_distances(ref_fasta, fastq_folder, work_dir):
    """
    Sketches all R1 FASTQ files in fastq_folder and flags any whose mash distance to ref_fasta exceeds 0.15.
    :return: List of FASTQ files that are too far from the reference.
    """
    bad_fastqs = list()
    # fastqs = glob.glob(os.path.join(fastq_folder, '*R1*'))
    mash.sketch(os.path.join(fastq_folder, '*R1*'),
                output_sketch=os.path.join(work_dir, 'sketch.msh'),
                threads=5)
    mash.dist(os.path.join(work_dir, 'sketch.msh'), ref_fasta,
              threads=48,
              output_file=os.path.join(work_dir, 'distances.tab'))
    mash_output = mash.read_mash_output(os.path.join(work_dir, 'distances.tab'))
    for item in mash_output:
        if item.distance > 0.15:  # Moved value from 0.06 to 0.15 - was definitely too conservative before.
            bad_fastqs.append(item.reference)
    return bad_fastqs

def check_distances(ref_fasta, fasta_folder):
    """
    Sketches all FASTA files in fasta_folder and flags any whose mash distance to ref_fasta exceeds 0.06.
    :return: List of FASTA files that are too far from the reference.
    """
    bad_fastqs = list()
    # fastqs = glob.glob(os.path.join(fastq_folder, '*R1*'))
    mash.sketch(os.path.join(fasta_folder, '*.fasta'),
                output_sketch=os.path.join(fasta_folder, 'sketch.msh'),
                threads=56)
    mash.dist(os.path.join(fasta_folder, 'sketch.msh'), ref_fasta,
              threads=56,
              output_file=os.path.join(fasta_folder, 'distances.tab'))
    mash_output = mash.read_mash_output(os.path.join(fasta_folder, 'distances.tab'))
    for item in mash_output:
        print(item.reference, item.query, str(item.distance))
        if item.distance > 0.06:  # May need to adjust this value.
            bad_fastqs.append(item.reference)
    return bad_fastqs

def find_closest_refseq_genome(forward_reads, refseq_sketch, outdir):
    """
    Runs mash dist between forward_reads and a pre-built RefSeq sketch, and returns the name of the closest
    genome (with the '_genomic.fna' suffix stripped).
    """
    print('FINDING CLOSEST REFSEQ GENOME')
    out_file = os.path.join(outdir, 'distances.tab')
    mash.dist(forward_reads, refseq_sketch, output_file=out_file)
    closest_genome = 'NA'
    closest_distance = 10000
    mash_results = mash.read_mash_output(result_file=out_file)
    for result in mash_results:
        if result.distance < closest_distance:
            print(result.distance)
            print(result.query)
            closest_genome = result.query
            closest_distance = result.distance
    return closest_genome.replace('_genomic.fna', '')

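# Hypothetical usage of find_closest_refseq_genome, assuming a RefSeq mash sketch has already been built.
# The file names below are placeholders, not real paths.
closest = find_closest_refseq_genome(forward_reads='sample_R1.fastq.gz',
                                     refseq_sketch='refseq.msh',
                                     outdir='mash_tmp')
print('Closest RefSeq genome: {}'.format(closest))
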
def test_mash_dist_call_kwargs():
    out, err, cmd = mash.dist('tests/dummy_fastq/*fastq',
                              output_file='tests/distances.tab',
                              returncmd=True,
                              s='34')
    assert cmd == 'mash dist tests/dummy_fastq/*fastq -p 1 -s 34 > tests/distances.tab'
    os.remove('tests/distances.tab')

def test_mash_dist_call_multithreaded():
    out, err, cmd = mash.dist('tests/dummy_fastq/*fastq',
                              output_file='tests/distances.tab',
                              returncmd=True,
                              threads=4)
    assert cmd == 'mash dist tests/dummy_fastq/*fastq -p 4 > tests/distances.tab'
    os.remove('tests/distances.tab')

def test_read_mash_dist():
    out, err, cmd = mash.dist('tests/dummy_fastq/*fastq',
                              output_file='tests/distances.tab',
                              returncmd=True)
    results = mash.read_mash_output('tests/distances.tab')
    assert results[1].reference == 'tests/dummy_fastq/single.fastq' \
        and results[1].query == 'tests/dummy_fastq/test_R2.fastq' \
        and results[1].distance == 0.00763536
    os.remove('tests/distances.tab')

def closerelatives_redmine(redmine_instance, issue, work_dir, description):
    sentry_sdk.init(SENTRY_DSN, before_send=before_send)
    # Unpickle Redmine objects
    redmine_instance = pickle.load(open(redmine_instance, 'rb'))
    issue = pickle.load(open(issue, 'rb'))
    description = pickle.load(open(description, 'rb'))

    try:
        # First line of description should be number of close relatives desired.
        try:
            num_close_relatives = int(description[0])
        except ValueError:
            redmine_instance.issue.update(resource_id=issue.id,
                                          notes='Error! The first line of the description must be the number'
                                                ' of strains you want to find. The first line of your '
                                                'description was: {}'.format(description[0]),
                                          status_id=4)
            return
        # Second line of description should be the SEQID of what you want to find a close reference for.
        seqid = description[1]

        # Try to extract FASTA files for the specified SEQID.
        retrieve_nas_files(seqids=[seqid],
                           outdir=os.path.join(work_dir, 'fasta'),
                           filetype='fasta',
                           copyflag=False)
        if len(glob.glob(os.path.join(work_dir, 'fasta', '*.fasta'))) != 1:
            redmine_instance.issue.update(resource_id=issue.id,
                                          notes='Error! Could not find FASTA file for the specified SEQID. The SEQID'
                                                ' that you specified was: {}'.format(seqid),
                                          status_id=4)
            return

        # Run mash dist with the FASTA file specified against the sketch of all our stuff.
        query_fasta = glob.glob(os.path.join(work_dir, 'fasta', '*.fasta'))[0]
        mash.dist(query_fasta, '/mnt/nas2/redmine/bio_requests/14674/all_sequences.msh',
                  threads=8,
                  output_file=os.path.join(work_dir, 'distances.tab'))
        mash_results = mash.read_mash_output(os.path.join(work_dir, 'distances.tab'))
        result_dict = dict()
        # Put all the results into a dictionary, where the key is the sequence file and the value is the mash distance
        # between the query and that sequence.
        for item in mash_results:
            seq_name = os.path.split(item.query)[-1].split('_')[0]
            result_dict[seq_name] = item.distance
        # Sort the results, store the sorted dictionary keys in a list.
        sorted_distance_results = sorted(result_dict, key=result_dict.get)
        # Prepare a string that lists the top hit SEQIDs to be posted to redmine.
        upload_string = ''
        for i in range(num_close_relatives):
            upload_string = upload_string + sorted_distance_results[i].replace('.fasta', '') + \
                ' (' + str(result_dict[sorted_distance_results[i]]) + ')\n'
        # Also make a CSV file of all results, in case someone wants to take a closer look.
        with open(os.path.join(work_dir, 'close_relatives_results.csv'), 'w') as f:
            f.write('Strain,MashDistance\n')
            for seq in sorted_distance_results:
                f.write('{},{}\n'.format(seq.replace('.fasta', ''), result_dict[seq]))

        output_list = [
            {
                'path': os.path.join(work_dir, 'close_relatives_results.csv'),
                'filename': 'close_relatives_results.csv'
            }
        ]
        # Post the list of closely related SEQIDs to redmine, as well as the CSV result file.
        redmine_instance.issue.update(resource_id=issue.id,
                                      notes='Process complete! Here is the list of the {num_relatives} closest strains '
                                            'to {query_strain} (mash distance between query and result in brackets):'
                                            '\n{upload_string}'.format(num_relatives=str(num_close_relatives),
                                                                       query_strain=seqid,
                                                                       upload_string=upload_string),
                                      status_id=4,
                                      uploads=output_list)
    except Exception as e:
        sentry_sdk.capture_exception(e)
        redmine_instance.issue.update(resource_id=issue.id,
                                      notes='Something went wrong! We log this automatically and will look into the '
                                            'problem and get back to you with a fix soon.')

def test_dist_no_input_files():
    with pytest.raises(ValueError):
        mash.dist()