def process_ref(ref_fpaths, organism, downloaded_dirpath, max_organism_name_len, downloaded_organisms,
                not_founded_organisms, total_downloaded, total_scored_left):
    """Obtain the reference FASTA for one organism and update the running counters.

    A file named ``correct_name(organism) + '.fasta'`` inside *downloaded_dirpath*
    is reused when it already exists; otherwise ``download_ref`` is called unless
    the organism is already listed in *not_founded_organisms*.

    Side effects: appends to *ref_fpaths* and *downloaded_organisms*, adds to
    *not_founded_organisms*, and logs one status line through ``logger``.

    Returns:
        tuple ``(new_ref_fpath, total_downloaded, total_scored_left)`` where
        ``new_ref_fpath`` is falsy when no reference could be obtained.
    """
    ref_fpath = os.path.join(downloaded_dirpath, correct_name(organism) + '.fasta')
    # Padding that aligns the '|' column of the log lines.
    spaces = ' ' * (max_organism_name_len - len(organism))
    shown_name = organism.replace('+', ' ')

    was_downloaded = False
    new_ref_fpath = None
    if os.path.exists(ref_fpath):
        was_downloaded = True
        new_ref_fpath = ref_fpath
    elif organism not in not_founded_organisms:
        new_ref_fpath = download_ref(organism, ref_fpath)

    # One organism has been handled, whatever the outcome.
    total_scored_left -= 1

    if not new_ref_fpath:
        logger.main_info(" %s%s | not found in the NCBI database" % (shown_name, spaces))
        not_founded_organisms.add(organism)
        return new_ref_fpath, total_downloaded, total_scored_left

    total_downloaded += 1
    if was_downloaded:
        logger.main_info(" %s%s | was downloaded previously (total %d, %d more to go)" %
                         (shown_name, spaces, total_downloaded, total_scored_left))
        if new_ref_fpath not in ref_fpaths:
            ref_fpaths.append(new_ref_fpath)
    else:
        logger.main_info(" %s%s | successfully downloaded (total %d, %d more to go)" %
                         (shown_name, spaces, total_downloaded, total_scored_left))
        ref_fpaths.append(new_ref_fpath)
        downloaded_organisms.append(organism)
    return new_ref_fpath, total_downloaded, total_scored_left
def parse_gff(file, feature):
    """Collect ``Gene`` objects for every GFF line whose feature equals *feature*.

    Args:
        file: iterable of text lines.
        feature: feature name compared against the lower-cased feature column.

    Returns:
        list of genes in input order; each gene's ``number`` is its 0-based
        position in that list.
    """
    genes = []
    for line in file:
        match = gff_pattern.match(line.rstrip())
        if not match or match.group('feature').lower() != feature:
            continue

        gene = Gene(seqname=qutils.correct_name(match.group('seqname')),
                    start=int(match.group('start')),
                    end=int(match.group('end')))

        for attr in match.group('attributes').split(';'):
            # Only 'key=value' items are meaningful; the value keeps any further '='.
            if '=' not in attr:
                continue
            key, _, val = attr.partition('=')
            lowered_key = key.lower()
            if lowered_key == 'id':
                gene.id = val
            if lowered_key == 'name':
                gene.name = val
            gene.attributes[lowered_key] = val

        gene.number = len(genes)
        genes.append(gene)
    return genes
def parse_ncbi(ncbi_file):
    """Parse genes from an NCBI-style text listing.

    A record starts at a line matching ``ncbi_start_pattern`` (which supplies the
    gene number and name) and runs until the next such line or end of file.
    Within a record, ``Chromosome:``, ``Annotation:`` and ``ID:`` lines are
    examined. Blank lines and lines starting with ``##`` between records are
    skipped, and so is any other line that is not a record header.

    Args:
        ncbi_file: file-like object providing ``readline()``.

    Returns:
        list of ``Gene`` objects; genes for which no start/end could be parsed
        are left out.
    """
    annotation_pattern = re.compile(
        r'Annotation: (?P<seqname>.+) \((?P<start>\d+)\.\.(?P<end>\d+)(, complement)?\)', re.I)
    chromosome_pattern = re.compile(r'Chromosome: (?P<chromosome>\S+);', re.I)
    id_pattern = re.compile(r'ID: (?P<id>\d+)', re.I)

    genes = []
    line = ncbi_file.readline()
    while line != '':
        # Skip blank lines and '##' comment lines.
        while line != '' and (line.rstrip() == '' or line.startswith('##')):
            line = ncbi_file.readline()
        if line == '':
            # Only blank/comment lines were left: stop instead of spinning forever.
            break

        m = ncbi_start_pattern.match(line.rstrip())
        if not m:
            # Not a record header: move on to the next line. The previous code
            # retried the very same line in an endless loop.
            line = ncbi_file.readline()
            continue

        gene = Gene(number=int(m.group('number')), name=qutils.correct_name(m.group('name')))

        # Gather the body of the record: everything up to the next header or EOF.
        the_rest_lines = []
        line = ncbi_file.readline()
        while line != '' and not ncbi_start_pattern.match(line.rstrip()):
            the_rest_lines.append(line.rstrip())
            line = ncbi_file.readline()

        for info_line in the_rest_lines:
            if info_line.startswith('Chromosome:'):
                m = chromosome_pattern.match(info_line)
                if m:
                    gene.chromosome = m.group('chromosome')

            if info_line.startswith('Annotation:'):
                m = annotation_pattern.match(info_line)
                if m:
                    gene.seqname = m.group('seqname')
                    gene.start = int(m.group('start'))
                    gene.end = int(m.group('end'))
                    to_trim = 'Chromosome' + ' ' + str(gene.chromosome)
                    if gene.chromosome and gene.seqname.startswith(to_trim):
                        # str.lstrip returns a new string, so the result must be stored.
                        gene.seqname = gene.seqname[len(to_trim):].lstrip(' ,')
                else:
                    logger.warning('Wrong NCBI annotation for gene ' + str(gene.number) + '. ' + gene.name + '. Skipping this gene.')

            if info_line.startswith('ID:'):
                m = id_pattern.match(info_line)
                if m:
                    gene.id = m.group('id')
                else:
                    logger.warning('Can\'t parse gene\'s ID in NCBI format. Gene is ' + str(gene.number) + '. ' + gene.name + '. Skipping it.')

        # Genes without coordinates are dropped rather than reported as an error.
        if gene.start is not None and gene.end is not None:
            genes.append(gene)
    return genes
def find_all_sv(bed_fpath):
    """Read structural variations from a tab-separated BED-like file.

    Every non-comment line with more than six tab-separated fields describes two
    breakpoints (fields 0-2 and 3-5) and an SV type (field 6). Pairs on different
    references are stored as translocations, ``INV`` entries as inversions and
    ``DEL`` entries as relocations; other types are ignored.

    Args:
        bed_fpath: path to the file; a falsy value means "no file".

    Returns:
        ``None`` when *bed_fpath* is falsy, otherwise a ``StructuralVariations``
        instance filled from the file.
    """
    if not bed_fpath:
        return None

    region_struct_variations = StructuralVariations()
    # The context manager closes the file; it used to be left open.
    with open(bed_fpath) as f:
        for line in f:
            fields = line.split('\t')
            if len(fields) <= 6 or line.startswith('#'):
                continue
            try:
                align1 = Mapping(s1=int(fields[1]), e1=int(fields[2]), ref=correct_name(fields[0]),
                                 s2=None, e2=None, len1=None, len2=None, idy=None, contig=None)
                align2 = Mapping(s1=int(fields[4]), e1=int(fields[5]), ref=correct_name(fields[3]),
                                 s2=None, e2=None, len1=None, len2=None, idy=None, contig=None)
                if align1.ref != align2.ref:
                    region_struct_variations.translocations.append((align1, align2))
                elif 'INV' in fields[6]:
                    region_struct_variations.inversions.append((align1, align2))
                elif 'DEL' in fields[6]:
                    region_struct_variations.relocations.append((align1, align2))
                # other SV types are not supported yet
            except ValueError:
                pass  # incorrect line format (non-numeric coordinates)
    return region_struct_variations
def find_all_sv(bed_fpath):
    """Read structural variations from a tab-separated BED-like file.

    Every non-comment line describes two breakpoints (fields 0-2 and 3-5) and an
    SV type (field 6). Pairs on different references are stored as
    translocations, ``INV`` entries as inversions, and ``DEL``/``INS``/``BND``
    entries as relocations; other types are ignored.

    Args:
        bed_fpath: path to the file; a falsy value means "no file".

    Returns:
        ``None`` when *bed_fpath* is falsy, otherwise a ``StructuralVariations``
        instance filled from the file.
    """
    if not bed_fpath:
        return None

    region_struct_variations = StructuralVariations()
    with open(bed_fpath) as f:
        for line in f:
            fs = line.split('\t')
            # Lines with fewer than seven fields (e.g. blank lines) cannot be
            # parsed; skipping them avoids an uncaught IndexError on fs[6].
            if line.startswith('#') or len(fs) <= 6:
                continue
            try:
                align1 = Mapping(s1=int(fs[1]), e1=int(fs[2]), ref=correct_name(fs[0]), sv_type=fs[6])
                align2 = Mapping(s1=int(fs[4]), e1=int(fs[5]), ref=correct_name(fs[3]), sv_type=fs[6])
                if align1.ref != align2.ref:
                    region_struct_variations.translocations.append((align1, align2))
                elif 'INV' in fs[6]:
                    region_struct_variations.inversions.append((align1, align2))
                elif 'DEL' in fs[6] or 'INS' in fs[6] or 'BND' in fs[6]:
                    region_struct_variations.relocations.append((align1, align2))
                # other SV types are not supported yet
            except ValueError:
                pass  # incorrect line format (non-numeric coordinates)
    return region_struct_variations
def get_correct_names_for_chroms(output_dirpath, fasta_fpath, sam_fpath, err_path, reads_fpaths, logger, is_reference=False):
    """Map chromosome names used in a SAM header to the names used in a FASTA file.

    The SAM header is extracted with ``sambamba view -H -S`` into
    ``<dirname(output_dirpath)>/<basename(sam_fpath)>.header`` (an existing
    header file is reused when the SAM file itself is missing). ``@SQ`` entries
    are then compared pairwise, in iteration order, with the FASTA sequences.

    Returns:
        dict ``{sam_name: fasta_name}`` when counts, lengths and names agree;
        ``None`` when neither the SAM file nor a header file exists, or when an
        inconsistency is found. In the latter case a warning is logged if
        *reads_fpaths* is truthy, otherwise an error.
    """
    correct_chr_names = dict()
    fasta_chr_lengths = get_chr_lengths_from_fastafile(fasta_fpath)
    sam_chr_lengths = dict()
    sam_header_fpath = join(dirname(output_dirpath), basename(sam_fpath) + '.header')
    if not isfile(sam_fpath) and not isfile(sam_header_fpath):
        return None

    if isfile(sam_fpath):
        # Close both handles before the header is read back below, so its
        # content is flushed and no descriptors are leaked.
        with open(sam_header_fpath, 'w') as header_out, open(err_path, 'a') as err_out:
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-H', '-S', sam_fpath],
                                   stdout=header_out, stderr=err_out, logger=logger)

    # Raw strings: '\S' and '\d' are regex classes, not string escapes.
    chr_name_pattern = r'SN:(\S+)'
    chr_len_pattern = r'LN:(\d+)'

    with open(sam_header_fpath) as sam_in:
        for l in sam_in:
            if l.startswith('@SQ'):
                chr_name = re.findall(chr_name_pattern, l)[0]
                chr_len = re.findall(chr_len_pattern, l)[0]
                sam_chr_lengths[chr_name] = int(chr_len)

    inconsistency = ''
    if len(fasta_chr_lengths) != len(sam_chr_lengths):
        inconsistency = 'Number of chromosomes'
    else:
        # NOTE(review): the pairing relies on both mappings iterating in file
        # order - confirm for interpreters where plain dicts are unordered.
        for fasta_chr, sam_chr in zip(fasta_chr_lengths.keys(), sam_chr_lengths.keys()):
            same_length = sam_chr_lengths[sam_chr] == fasta_chr_lengths[fasta_chr]
            if correct_name(sam_chr) == fasta_chr[:len(sam_chr)] and same_length:
                correct_chr_names[sam_chr] = fasta_chr
            elif not same_length:
                inconsistency = 'Chromosome lengths'
                break
            else:
                inconsistency = 'Chromosome names'
                break

    if inconsistency:
        if reads_fpaths:
            logger.warning(inconsistency + ' in ' + fasta_fpath + ' and corresponding SAM file ' + sam_fpath + ' do not match. ' +
                           'QUAST will try to realign reads to ' + ('the reference genome' if is_reference else fasta_fpath))
        else:
            logger.error(inconsistency + ' in ' + fasta_fpath + ' and corresponding SAM file ' + sam_fpath + ' do not match. ' +
                         'Use SAM file obtained by aligning reads to ' + ('the reference genome' if is_reference else fasta_fpath))
        return None
    return correct_chr_names
def parse_txt(file):
    """Build ``Gene`` objects from lines matching ``txt_pattern_gi`` or ``txt_pattern``.

    Lines matching neither pattern are ignored. Coordinates are normalised so
    that ``start <= end``; the ``gene_id`` group becomes the gene's ``id`` and
    ``number`` is the 0-based index among the accepted lines.
    """
    genes = []
    for raw_line in file:
        stripped = raw_line.rstrip()
        match = txt_pattern_gi.match(stripped) or txt_pattern.match(stripped)
        if not match:
            continue

        gene = Gene(number=len(genes), seqname=qutils.correct_name(match.group('seqname')))
        first = int(match.group('start'))
        second = int(match.group('end'))
        gene.start, gene.end = min(first, second), max(first, second)
        gene.id = match.group('gene_id')
        genes.append(gene)
    return genes
def parse_bed(file):
    """Build ``Gene`` objects from whitespace-separated BED-like lines.

    Columns are: sequence name, first coordinate, second coordinate and an
    optional id. Lines containing only whitespace are skipped. ``start``/``end``
    are stored in ascending order; ``strand`` is ``'+'`` when the first
    coordinate is smaller than the second and ``'-'`` otherwise.
    """
    genes = []
    for line in file:
        fields = line.rstrip().split()
        if not fields:
            continue

        seqname = fields[0]
        first = int(fields[1])
        second = int(fields[2])

        gene = Gene(number=len(genes), seqname=qutils.correct_name(seqname))
        gene.start = min(first, second)
        gene.end = max(first, second)
        gene.id = None if len(fields) <= 3 else fields[3]
        gene.strand = '+' if first < second else '-'
        genes.append(gene)
    return genes
def correct_meta_references(ref_fpaths, corrected_dirpath, downloaded_refs=False):
    """Write corrected copies of all references and append them to the combined reference.

    Each reference is rewritten, sequence by sequence, into its own file in
    *corrected_dirpath*, with sequence names prefixed by the corrected file's
    name, and is then appended to the combined reference file. A reference that
    is empty or fails ``correct_seq`` is skipped with a warning when
    *downloaded_refs* is true; otherwise an error is logged with
    ``exit_with_code=1``.

    Side effects: updates ``contigs_analyzer.ref_labels_by_chromosomes`` and
    removes skipped paths from the *ref_fpaths* list that was passed in.

    Returns:
        tuple ``(corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs,
        ref_fpaths)``.
    """
    corrected_ref_fpaths = []
    combined_ref_fpath = os.path.join(corrected_dirpath, qconfig.combined_ref_name)
    chromosomes_by_refs = {}

    def _proceed_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
        """Append one sequence to the reference's corrected file; (None, None) on a bad sequence."""
        seq_fname = ref_name + ref_fasta_ext
        if total_references > 1:
            # Later sequences go to the file created for the first one.
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            corr_seq_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, seq_fname))
            corrected_ref_fpaths.append(corr_seq_fpath)
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath) + '_' + seq_name
        if not qconfig.no_check:
            # NOTE(review): the value returned by correct_seq() is used only as a
            # pass/fail flag; the original `seq` is what gets written - confirm.
            corr_seq = correct_seq(seq, ref_fpath)
            if not corr_seq:
                return None, None
        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')
        contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))
        return corr_seq_name, corr_seq_fpath

    excluded_ref_fpaths = []
    ref_names = qutils.process_labels(ref_fpaths)
    for ref_fpath, ref_name in zip(ref_fpaths, ref_names):
        total_references = 0
        ref_fname = os.path.basename(ref_fpath)
        _, ref_fasta_ext = qutils.splitext_for_fasta_file(ref_fname)

        chromosomes_by_refs[ref_name] = []
        used_seq_names = defaultdict(int)

        corr_seq_fpath = None
        for seq_name, seq in fastaparser.read_fasta(ref_fpath):
            total_references += 1
            seq_name = correct_name(seq_name, qutils.MAX_CONTIG_NAME - len(ref_name) - 1)
            uniq_seq_name = get_uniq_name(seq_name, used_seq_names)
            used_seq_names[seq_name] += 1
            corr_seq_name, corr_seq_fpath = _proceed_seq(uniq_seq_name, seq, ref_name, ref_fasta_ext,
                                                         total_references, ref_fpath)
            if not corr_seq_name:
                break

        if corr_seq_fpath:
            logger.main_info(' ' + ref_fpath + ' ==> ' + qutils.name_from_fpath(corr_seq_fpath) + '')
            fastaparser.write_fasta(combined_ref_fpath, fastaparser.read_fasta(corr_seq_fpath), 'a')
        elif downloaded_refs:
            logger.warning('Skipping ' + ref_fpath + ' because it'
                           ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!')
            # cleaning
            for corr_seq_name, _ in chromosomes_by_refs[ref_name]:
                del contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name]
            del chromosomes_by_refs[ref_name]
            # A path is registered only once the first sequence was seen. For a
            # reference without sequences there is nothing to undo, and popping
            # would discard the previous reference (or fail on an empty list).
            if total_references > 0:
                corrected_ref_fpaths.pop()
            excluded_ref_fpaths.append(ref_fpath)
        else:
            logger.error('Reference file ' + ref_fpath +
                         ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!',
                         exit_with_code=1)

    for excluded in excluded_ref_fpaths:
        ref_fpaths.remove(excluded)

    if len(chromosomes_by_refs) > 0:
        logger.main_info(' All references were combined in ' + qconfig.combined_ref_name)
    else:
        logger.warning('All references were skipped!')

    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths
def correct_meta_references(ref_fpaths, corrected_dirpath, downloaded_refs=False):
    """Write corrected copies of all references and append them to the combined reference.

    Each reference is rewritten, sequence by sequence, into its own file in
    *corrected_dirpath*, with sequence names prefixed by the corrected file's
    name, and is then appended to the combined reference file. A reference that
    is empty or fails ``correct_seq`` is skipped with a warning when
    *downloaded_refs* is true; otherwise an error is logged with
    ``exit_with_code=1``.

    Side effects: updates ``contigs_analyzer.ref_labels_by_chromosomes`` and
    removes skipped paths from the *ref_fpaths* list that was passed in.

    Returns:
        tuple ``(corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs,
        ref_fpaths)``.
    """
    corrected_ref_fpaths = []
    combined_ref_fpath = os.path.join(corrected_dirpath, qconfig.combined_ref_name)
    chromosomes_by_refs = {}

    def _proceed_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
        """Append one sequence to the reference's corrected file; (None, None) on a bad sequence."""
        seq_fname = ref_name + ref_fasta_ext
        if total_references > 1:
            # Later sequences go to the file created for the first one.
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            corr_seq_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, seq_fname))
            corrected_ref_fpaths.append(corr_seq_fpath)
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath) + '_' + seq_name
        if not qconfig.no_check:
            # NOTE(review): the value returned by correct_seq() is used only as a
            # pass/fail flag; the original `seq` is what gets written - confirm.
            corr_seq = correct_seq(seq, ref_fpath)
            if not corr_seq:
                return None, None
        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')
        contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))
        return corr_seq_name, corr_seq_fpath

    excluded_ref_fpaths = []
    ref_names = qutils.process_labels(ref_fpaths)
    for ref_fpath, ref_name in zip(ref_fpaths, ref_names):
        total_references = 0
        ref_fname = os.path.basename(ref_fpath)
        _, ref_fasta_ext = qutils.splitext_for_fasta_file(ref_fname)

        chromosomes_by_refs[ref_name] = []
        used_seq_names = defaultdict(int)

        corr_seq_fpath = None
        for seq_name, seq in fastaparser.read_fasta(ref_fpath):
            total_references += 1
            seq_name = correct_name(seq_name, qutils.MAX_CONTIG_NAME - len(ref_name) - 1)
            uniq_seq_name = get_uniq_name(seq_name, used_seq_names)
            used_seq_names[seq_name] += 1
            corr_seq_name, corr_seq_fpath = _proceed_seq(uniq_seq_name, seq, ref_name, ref_fasta_ext,
                                                         total_references, ref_fpath)
            if not corr_seq_name:
                break

        if corr_seq_fpath:
            logger.main_info(' ' + ref_fpath + ' ==> ' + qutils.name_from_fpath(corr_seq_fpath) + '')
            fastaparser.write_fasta(combined_ref_fpath, fastaparser.read_fasta(corr_seq_fpath), 'a')
        elif downloaded_refs:
            logger.warning('Skipping ' + ref_fpath + ' because it'
                           ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!')
            # cleaning
            for corr_seq_name, _ in chromosomes_by_refs[ref_name]:
                del contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name]
            del chromosomes_by_refs[ref_name]
            # A path is registered only once the first sequence was seen. For a
            # reference without sequences there is nothing to undo, and popping
            # would discard the previous reference (or fail on an empty list).
            if total_references > 0:
                corrected_ref_fpaths.pop()
            excluded_ref_fpaths.append(ref_fpath)
        else:
            logger.error('Reference file ' + ref_fpath +
                         ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!',
                         exit_with_code=1)

    for excluded in excluded_ref_fpaths:
        ref_fpaths.remove(excluded)

    if len(chromosomes_by_refs) > 0:
        logger.main_info(' All references were combined in ' + qconfig.combined_ref_name)
    else:
        logger.warning('All references were skipped!')

    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths
def get_corr_name(name):
    """Return *name* as processed by ``qutils.correct_name``."""
    corrected = qutils.correct_name(name)
    return corrected
def process_blast(blast_assemblies, downloaded_dirpath, corrected_dirpath, labels, blast_check_fpath, err_fpath):
    """Run BlastN on the assemblies and select candidate reference species from the hits.

    For every label the BLAST result file is scanned. Hits passing the identity,
    length and bit-score thresholds from ``qconfig`` are grouped by species (the
    first two ``_``-separated tokens of the sequence name). The first accepted
    hit of a query competes for the species' slot; further hits of that query
    are remembered as replacement candidates.

    Returns:
        ``(species_scores, species_by_assembly, replacement_dict)`` where
        ``species_scores`` is a list of ``(seqname, query_id, score)`` tuples.
        ``(None, None, None)`` is returned when the BLAST binaries or database
        are unavailable or no species was found, so callers can always unpack
        three values.
    """
    global db_fpath

    if not download_blast_binaries(filenames=blast_filenames):
        return None, None, None

    if qconfig.custom_blast_db_fpath:
        db_fpath = qconfig.custom_blast_db_fpath
        if isdir(db_fpath):
            db_aux_files = [f for f in os.listdir(db_fpath) if f.endswith('.nsq')]
            if db_aux_files:
                db_fpath = join(qconfig.custom_blast_db_fpath, db_aux_files[0].replace('.nsq', ''))
        elif isfile(db_fpath) and db_fpath.endswith('.nsq'):
            db_fpath = db_fpath[:-len('.nsq')]
        if not os.path.isfile(db_fpath + '.nsq'):
            logger.error('You should specify path to BLAST database obtained by running makeblastdb command: '
                         'either path to directory containing <dbname>.nsq file or path to <dbname>.nsq file itself.'
                         ' Also you can rerun MetaQUAST without --blast-db option. MetaQUAST uses SILVA 16S RNA database by default.',
                         exit_with_code=2)
    elif not download_blastdb():
        return None, None, None

    blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res')

    if len(blast_assemblies) > 0:
        logger.main_info('Running BlastN..')
        n_jobs = min(qconfig.max_threads, len(blast_assemblies))
        blast_threads = max(1, qconfig.max_threads // n_jobs)
        if is_python2():
            from joblib import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        Parallel(n_jobs=n_jobs)(delayed(parallel_blast)(assembly.fpath, assembly.label, corrected_dirpath,
                                                         err_fpath, blast_res_fpath, blast_check_fpath, blast_threads)
                                for assembly in blast_assemblies)
        logger.main_info()

    species_scores = []
    species_by_assembly = dict()
    max_entries = 4
    replacement_dict = defaultdict(list)
    for label in labels:
        assembly_scores = []
        assembly_species = []
        res_fpath = get_blast_output_fpath(blast_res_fpath, label)
        if os.path.exists(res_fpath):
            refs_for_query = 0
            with open(res_fpath) as res_file:
                # Standard tabular column positions, used until a 'Fields' header
                # says otherwise. Starting from None broke rows seen before it.
                query_id_col, subj_id_col, idy_col, len_col, score_col = 0, 1, 2, 3, 11
                for line in res_file:
                    fs = line.split()
                    if line.startswith('#'):
                        # A comment line starts the block of the next query.
                        refs_for_query = 0
                        # Fields: query id, subject id, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
                        if 'Fields' in line:
                            header = line.strip().split('Fields: ')[-1].split(', ')
                            query_id_col = header.index('query id') if 'query id' in header else 0
                            subj_id_col = header.index('subject id') if 'subject id' in header else 1
                            idy_col = header.index('% identity') if '% identity' in header else 2
                            len_col = header.index('alignment length') if 'alignment length' in header else 3
                            score_col = header.index('bit score') if 'bit score' in header else 11
                    elif refs_for_query < max_entries and len(fs) > score_col:
                        query_id = fs[query_id_col]
                        organism_id = fs[subj_id_col]
                        idy = float(fs[idy_col])
                        length = int(fs[len_col])
                        score = float(fs[score_col])
                        if idy >= qconfig.identity_threshold and length >= qconfig.min_length and score >= qconfig.min_bitscore:
                            seqname, taxons = parse_organism_id(organism_id)
                            if not seqname:
                                continue
                            species_name = seqname.split('_')
                            if len(species_name) > 1 and 'uncultured' not in seqname:
                                species_name = species_name[0] + '_' + species_name[1]
                                if refs_for_query == 0:
                                    if species_name not in assembly_species:
                                        assembly_scores.append((seqname, query_id, score))
                                        if taxons:
                                            taxons_for_krona[correct_name(seqname)] = taxons
                                        assembly_species.append(species_name)
                                        refs_for_query += 1
                                    else:
                                        # Own loop variable: on Python 2 comprehension names
                                        # leak and used to overwrite the current hit's
                                        # seqname/query_id/score.
                                        seq_scores = [entry for entry in assembly_scores if species_name in entry[0]]
                                        if seq_scores and score > seq_scores[0][2]:
                                            assembly_scores.remove(seq_scores[0])
                                            assembly_scores.append((seqname, query_id, score))
                                            if taxons:
                                                taxons_for_krona[correct_name(seqname)] = taxons
                                            refs_for_query += 1
                                else:
                                    if seqname not in replacement_dict[query_id]:
                                        replacement_dict[query_id].append(seqname)
                                        refs_for_query += 1
        # Keep the best-scoring references. The tuples start with the sequence
        # name, so an unkeyed sort would rank by name rather than by bit score.
        assembly_scores = sorted(assembly_scores, key=lambda entry: (entry[2], entry[0]), reverse=True)
        assembly_scores = assembly_scores[:qconfig.max_references]
        for seqname, query_id, score in assembly_scores:
            if not species_by_assembly or not any(seqname in species_list for species_list in species_by_assembly.values()):
                species_scores.append((seqname, query_id, score))
        species_by_assembly[label] = [entry[0] for entry in assembly_scores]
    if not species_scores:
        return None, None, None
    return species_scores, species_by_assembly, replacement_dict
def process_refs(organisms, assemblies, labels, downloaded_dirpath, not_founded_organisms, downloaded_ref_fpaths,
                 blast_check_fpath, err_fpath, organisms_assemblies=None):
    """Download (or reuse) references for *organisms* and refresh the per-assembly check files.

    For each organism a file ``correct_name(organism) + '.fasta'`` in
    *downloaded_dirpath* is reused if present; otherwise ``download_refs`` is
    called unless the organism is already in *not_founded_organisms*. Afterwards
    each assembly's check file is rewritten with its first line (or a fresh md5
    line) followed by the ``Downloaded`` and ``Not_founded`` organism lists.

    Returns:
        list of reference file paths. With no organisms an empty list is
        returned right away, after removing *err_fpath* when it exists and
        ``qconfig.debug`` is off.
    """
    ref_fpaths = []
    downloaded_organisms = []
    total_downloaded = 0
    total_scored_left = len(organisms)

    if total_scored_left == 0:
        if not qconfig.debug and os.path.exists(err_fpath):
            os.remove(err_fpath)
        return ref_fpaths

    # Width of the name column in the log output.
    max_organism_name_len = max(len(name) for name in organisms)

    logger.print_timestamp()
    logger.main_info('Trying to download found references from NCBI. '
                     'Totally ' + str(total_scored_left) + ' organisms to try.')
    if len(downloaded_ref_fpaths) > 0:
        logger.main_info('MetaQUAST will attempt to use previously downloaded references...')

    for organism in organisms:
        ref_fpath = os.path.join(downloaded_dirpath, correct_name(organism) + '.fasta')
        spaces = ' ' * (max_organism_name_len - len(organism))
        shown_name = organism.replace('+', ' ')

        was_downloaded = False
        new_ref_fpath = None
        if os.path.exists(ref_fpath):
            was_downloaded = True
            new_ref_fpath = ref_fpath
        elif organism not in not_founded_organisms:
            new_ref_fpath = download_refs(organism, ref_fpath)

        total_scored_left -= 1
        if not new_ref_fpath:
            logger.main_info(" %s%s | not found in the NCBI database" % (shown_name, spaces))
            not_founded_organisms.add(organism)
            continue

        total_downloaded += 1
        if was_downloaded:
            logger.main_info(" %s%s | was downloaded previously (total %d, %d more to go)" %
                             (shown_name, spaces, total_downloaded, total_scored_left))
            if new_ref_fpath not in ref_fpaths:
                ref_fpaths.append(new_ref_fpath)
        else:
            logger.main_info(" %s%s | successfully downloaded (total %d, %d more to go)" %
                             (shown_name, spaces, total_downloaded, total_scored_left))
            ref_fpaths.append(new_ref_fpath)
            downloaded_organisms.append(organism)

    for assembly, label in zip(assemblies, labels):
        check_fpath = get_blast_output_fpath(blast_check_fpath, label)
        if not os.path.exists(check_fpath):
            text = 'Assembly: %s md5 checksum: %s\n' % (assembly.fpath, md5(assembly.fpath))
        else:
            with open(check_fpath) as check_file:
                text = check_file.read()
            # Keep only what precedes the first newline.
            text = text[:text.find('\n')]

        with open(check_fpath, 'w') as check_file:
            check_file.write(text)
            check_file.write('\n---\n')
            if not organisms_assemblies:
                cur_downloaded_organisms = list(downloaded_organisms)
                cur_not_founded_organisms = list(not_founded_organisms)
            else:
                cur_downloaded_organisms = [name for name in downloaded_organisms
                                            if name in organisms_assemblies[label]]
                cur_not_founded_organisms = [name for name in not_founded_organisms
                                             if name in organisms_assemblies[label]]
            check_file.write('Downloaded: %s\n' % ','.join(cur_downloaded_organisms))
            check_file.write('Not_founded: %s\n' % ','.join(cur_not_founded_organisms))
    return ref_fpaths
def process_blast(blast_assemblies, downloaded_dirpath, corrected_dirpath, labels, blast_check_fpath, err_fpath):
    """Run BlastN on the assemblies and select candidate reference organisms from the hits.

    For every label the BLAST result file is scanned. Only the first accepted
    hit of each query (a ``#`` line starts a new query) is considered, and only
    if it passes the identity, length and bit-score thresholds from ``qconfig``.
    Hits are grouped by species (the first two ``_``-separated tokens of the
    sequence name) and the best score per species is kept.

    Returns:
        ``(scores_organisms, organisms_assemblies)``: a list of
        ``(score, seqname)`` tuples and a dict mapping label to sequence names.
        ``(None, None)`` when the BLAST binaries or database are unavailable or
        nothing was found.
    """
    global db_fpath

    if not download_all_blast_binaries():
        return None, None

    if qconfig.custom_blast_db_fpath:
        db_fpath = qconfig.custom_blast_db_fpath
        if isdir(db_fpath):
            db_aux_files = [f for f in os.listdir(db_fpath) if f.endswith('.nsq')]
            if db_aux_files:
                db_fpath = join(qconfig.custom_blast_db_fpath, db_aux_files[0].replace('.nsq', ''))
        elif isfile(db_fpath) and db_fpath.endswith('.nsq'):
            db_fpath = db_fpath[:-len('.nsq')]
        if not os.path.isfile(db_fpath + '.nsq'):
            logger.error('You should specify path to BLAST database obtained by running makeblastdb command: '
                         'either path to directory containing <dbname>.nsq file or path to <dbname>.nsq file itself.'
                         ' Also you can rerun MetaQUAST without --blast-db option. MetaQUAST uses SILVA 16S RNA database by default.',
                         exit_with_code=2)
    elif not download_blastdb():
        return None, None

    blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res')

    if len(blast_assemblies) > 0:
        logger.main_info('Running BlastN..')
        n_jobs = min(qconfig.max_threads, len(blast_assemblies))
        blast_threads = max(1, qconfig.max_threads // n_jobs)
        if is_python2():
            from joblib import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        Parallel(n_jobs=n_jobs)(delayed(parallel_blast)(assembly.fpath, assembly.label, corrected_dirpath,
                                                         err_fpath, blast_res_fpath, blast_check_fpath, blast_threads)
                                for assembly in blast_assemblies)
        logger.main_info('')

    scores_organisms = []
    organisms_assemblies = {}
    for label in labels:
        all_scores = []
        organisms = []
        res_fpath = get_blast_output_fpath(blast_res_fpath, label)
        if os.path.exists(res_fpath):
            refs_for_query = 0
            with open(res_fpath) as res_file:
                for line in res_file:
                    if line.startswith('#'):
                        # A comment line starts the block of the next query.
                        refs_for_query = 0
                        continue
                    if refs_for_query != 0:
                        continue
                    # Expected columns: query id, subject id, % identity, alignment length,
                    # mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
                    fields = line.split()
                    # The bit score is column 11, so at least 12 columns are needed;
                    # the previous '> 10' test let 11-column rows through.
                    if len(fields) <= 11:
                        continue
                    organism_id = fields[1]
                    idy = float(fields[2])
                    length = int(fields[3])
                    score = float(fields[11])
                    if idy >= qconfig.identity_threshold and length >= qconfig.min_length and score >= qconfig.min_bitscore:
                        seqname, taxons = parse_organism_id(organism_id)
                        if not seqname:
                            continue
                        specie = seqname.split('_')
                        if len(specie) > 1 and 'uncultured' not in seqname:
                            specie = specie[0] + '_' + specie[1]
                            if specie not in organisms:
                                all_scores.append((score, seqname))
                                if taxons:
                                    taxons_for_krona[correct_name(seqname)] = taxons
                                organisms.append(specie)
                                refs_for_query += 1
                            else:
                                # Replace the stored hit of this species if this one scores higher.
                                tuple_scores = [x for x in all_scores if specie in x[1]]
                                if tuple_scores and score > tuple_scores[0][0]:
                                    all_scores.remove(tuple_scores[0])
                                    all_scores.append((score, seqname))
                                    if taxons:
                                        taxons_for_krona[correct_name(seqname)] = taxons
                                    refs_for_query += 1
        all_scores = sorted(all_scores, reverse=True)
        all_scores = all_scores[:qconfig.max_references]
        for score in all_scores:
            # Report a sequence only once, even if several assemblies hit it.
            if not any(score[1] in names for names in organisms_assemblies.values()):
                scores_organisms.append(score)
        organisms_assemblies[label] = [score[1] for score in all_scores]
    if not scores_organisms:
        return None, None
    return scores_organisms, organisms_assemblies
def process_blast(blast_assemblies, downloaded_dirpath, corrected_dirpath, labels, blast_check_fpath, err_fpath):
    """Run BlastN on the assemblies and select candidate reference species from the hits.

    For every label the BLAST result file is scanned. Hits passing the identity,
    length and bit-score thresholds from ``qconfig`` are grouped by the species
    returned by ``get_species_name``. The first accepted hit of a query competes
    for the species' slot; further hits of that query are remembered as
    replacement candidates.

    Returns:
        ``(species_scores, species_by_assembly, replacement_dict)`` where
        ``species_scores`` is a list of ``(seqname, query_id, score)`` tuples.
        ``(None, None, None)`` when the BLAST binaries or database are
        unavailable or no species was found.
    """
    global db_fpath

    if not download_blast_binaries(filenames=blast_filenames):
        return None, None, None

    if qconfig.custom_blast_db_fpath:
        db_fpath = qconfig.custom_blast_db_fpath
        if isdir(db_fpath):
            db_aux_files = [f for f in os.listdir(db_fpath) if f.endswith('.nsq')]
            if db_aux_files:
                db_fpath = join(qconfig.custom_blast_db_fpath, db_aux_files[0].replace('.nsq', ''))
        elif isfile(db_fpath) and db_fpath.endswith('.nsq'):
            db_fpath = db_fpath[:-len('.nsq')]
        if not os.path.isfile(db_fpath + '.nsq'):
            logger.error(
                'You should specify path to BLAST database obtained by running makeblastdb command: '
                'either path to directory containing <dbname>.nsq file or path to <dbname>.nsq file itself.'
                ' Also you can rerun MetaQUAST without --blast-db option. MetaQUAST uses SILVA 16S RNA database by default.',
                exit_with_code=2)
    elif not download_blastdb():
        return None, None, None

    blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res')

    if len(blast_assemblies) > 0:
        logger.main_info('Running BlastN..')
        n_jobs = min(qconfig.max_threads, len(blast_assemblies))
        blast_threads = max(1, qconfig.max_threads // n_jobs)
        if is_python2():
            from joblib2 import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        Parallel(n_jobs=n_jobs)(delayed(parallel_blast)(assembly.fpath, assembly.label, corrected_dirpath,
                                                         err_fpath, blast_res_fpath, blast_check_fpath, blast_threads)
                                for assembly in blast_assemblies)
        logger.main_info()

    species_scores = []
    species_by_assembly = dict()
    max_entries = 4
    replacement_dict = defaultdict(list)
    for label in labels:
        assembly_scores = []
        assembly_species = []
        res_fpath = get_blast_output_fpath(blast_res_fpath, label)
        if os.path.exists(res_fpath):
            refs_for_query = 0
            with open(res_fpath) as res_file:
                # Start from the same fallback positions the 'Fields' parsing uses.
                # With None, a data row seen before any header failed on the
                # 'len(fs) > score_col' comparison or on the indexing.
                query_id_col, subj_id_col, idy_col, len_col, score_col = 0, 1, 2, 3, 11
                for line in res_file:
                    fs = line.split()
                    if line.startswith('#'):
                        # A comment line starts the block of the next query.
                        refs_for_query = 0
                        # Fields: query id, subject id, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
                        if 'Fields' in line:
                            header = line.strip().split('Fields: ')[-1].split(', ')
                            query_id_col = header.index('query id') if 'query id' in header else 0
                            subj_id_col = header.index('subject id') if 'subject id' in header else 1
                            idy_col = header.index('% identity') if '% identity' in header else 2
                            len_col = header.index('alignment length') if 'alignment length' in header else 3
                            score_col = header.index('bit score') if 'bit score' in header else 11
                    elif refs_for_query < max_entries and len(fs) > score_col:
                        query_id = fs[query_id_col]
                        organism_id = fs[subj_id_col]
                        idy = float(fs[idy_col])
                        length = int(fs[len_col])
                        score = float(fs[score_col])
                        if idy >= qconfig.identity_threshold and length >= qconfig.min_length and score >= qconfig.min_bitscore:
                            seqname, taxons = parse_organism_id(organism_id)
                            if not seqname:
                                continue
                            species_name = get_species_name(seqname)
                            if species_name and 'uncultured' not in seqname:
                                if refs_for_query == 0:
                                    if species_name not in assembly_species:
                                        assembly_scores.append((seqname, query_id, score))
                                        if taxons:
                                            taxons_for_krona[correct_name(seqname)] = taxons
                                        assembly_species.append(species_name)
                                        refs_for_query += 1
                                    else:
                                        # Replace the stored hit of this species if this one scores higher.
                                        seq_scores = [entry for entry in assembly_scores
                                                      if get_species_name(entry[0]) == species_name]
                                        if seq_scores and score > seq_scores[0][2]:
                                            assembly_scores.remove(seq_scores[0])
                                            assembly_scores.append((seqname, query_id, score))
                                            if taxons:
                                                taxons_for_krona[correct_name(seqname)] = taxons
                                            refs_for_query += 1
                                else:
                                    if seqname not in replacement_dict[query_id]:
                                        replacement_dict[query_id].append(seqname)
                                        refs_for_query += 1
        # Keep the best-scoring references. The tuples start with the sequence
        # name, so an unkeyed sort would rank by name rather than by bit score.
        assembly_scores = sorted(assembly_scores, key=lambda entry: (entry[2], entry[0]), reverse=True)
        assembly_scores = assembly_scores[:qconfig.max_references]
        for seqname, query_id, score in assembly_scores:
            if not species_by_assembly or not any(seqname in species_list for species_list in species_by_assembly.values()):
                species_scores.append((seqname, query_id, score))
        species_by_assembly[label] = [entry[0] for entry in assembly_scores]
    if not species_scores:
        return None, None, None
    return species_scores, species_by_assembly, replacement_dict
def process_blast(blast_assemblies, downloaded_dirpath, corrected_dirpath, labels, blast_check_fpath, err_fpath):
    """Run BlastN on the assemblies and select candidate reference organisms from the hits.

    ``blastdb_dirpath`` is created if missing. With no custom database
    configured, the default one is downloaded when its ``.nsq`` file is absent or
    smaller than ``db_nsq_fsize``. For every label the BLAST result file is then
    scanned: only the first accepted hit of each query (a ``#`` line starts a
    new query) is considered, hits are grouped by species (the first two
    ``_``-separated tokens of the sequence name) and the best score per species
    is kept.

    Returns:
        ``(scores_organisms, organisms_assemblies)``: a list of
        ``(score, seqname)`` tuples and a dict mapping label to sequence names.
        ``(None, None)`` when the BLAST binaries or database are unavailable or
        nothing was found.
    """
    global db_fpath

    if not os.path.isdir(blastdb_dirpath):
        os.makedirs(blastdb_dirpath)

    if not download_all_blast_binaries():
        return None, None

    if qconfig.custom_blast_db_fpath:
        db_fpath = qconfig.custom_blast_db_fpath
        if isdir(db_fpath):
            db_aux_files = [f for f in os.listdir(db_fpath) if f.endswith('.nsq')]
            if db_aux_files:
                db_fpath = join(qconfig.custom_blast_db_fpath, db_aux_files[0].replace('.nsq', ''))
        elif isfile(db_fpath) and db_fpath.endswith('.nsq'):
            db_fpath = db_fpath[:-len('.nsq')]
        if not os.path.isfile(db_fpath + '.nsq'):
            logger.error('You should specify path to BLAST database obtained by running makeblastdb command: '
                         'either path to directory containing <dbname>.nsq file or path to <dbname>.nsq file itself.'
                         ' Also you can rerun MetaQUAST without --blast-db option. MetaQUAST uses SILVA 16S RNA database by default.',
                         exit_with_code=2)
    elif not os.path.isfile(db_fpath + '.nsq') or os.path.getsize(db_fpath + '.nsq') < db_nsq_fsize:
        # The default database is missing or looks truncated: fetch it again.
        if not download_blastdb():
            return None, None
        logger.info()

    blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res')

    if len(blast_assemblies) > 0:
        logger.main_info('Running BlastN..')
        n_jobs = min(qconfig.max_threads, len(blast_assemblies))
        blast_threads = max(1, qconfig.max_threads // n_jobs)
        if is_python2():
            from joblib import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        Parallel(n_jobs=n_jobs)(delayed(parallel_blast)(assembly.fpath, assembly.label, corrected_dirpath,
                                                         err_fpath, blast_res_fpath, blast_check_fpath, blast_threads)
                                for assembly in blast_assemblies)
        logger.main_info('')

    scores_organisms = []
    organisms_assemblies = {}
    for label in labels:
        all_scores = []
        organisms = []
        res_fpath = get_blast_output_fpath(blast_res_fpath, label)
        if os.path.exists(res_fpath):
            refs_for_query = 0
            # The context manager closes the result file; the bare open() in the
            # loop header used to leak the handle.
            with open(res_fpath) as res_file:
                for line in res_file:
                    if line.startswith('#'):
                        # A comment line starts the block of the next query.
                        refs_for_query = 0
                        continue
                    if refs_for_query != 0:
                        continue
                    # Expected columns: query id, subject id, % identity, alignment length,
                    # mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
                    fields = line.split()
                    # The bit score is column 11, so at least 12 columns are needed;
                    # the previous '> 10' test let 11-column rows through.
                    if len(fields) <= 11:
                        continue
                    organism_id = fields[1]
                    idy = float(fields[2])
                    length = int(fields[3])
                    score = float(fields[11])
                    if idy >= qconfig.identity_threshold and length >= qconfig.min_length and score >= qconfig.min_bitscore:
                        seqname, taxons = parse_organism_id(organism_id)
                        if not seqname:
                            continue
                        specie = seqname.split('_')
                        if len(specie) > 1 and 'uncultured' not in seqname:
                            specie = specie[0] + '_' + specie[1]
                            if specie not in organisms:
                                all_scores.append((score, seqname))
                                if taxons:
                                    taxons_for_krona[correct_name(seqname)] = taxons
                                organisms.append(specie)
                                refs_for_query += 1
                            else:
                                # Replace the stored hit of this species if this one scores higher.
                                tuple_scores = [x for x in all_scores if specie in x[1]]
                                if tuple_scores and score > tuple_scores[0][0]:
                                    all_scores.remove(tuple_scores[0])
                                    all_scores.append((score, seqname))
                                    if taxons:
                                        taxons_for_krona[correct_name(seqname)] = taxons
                                    refs_for_query += 1
        all_scores = sorted(all_scores, reverse=True)
        all_scores = all_scores[:qconfig.max_references]
        for score in all_scores:
            # Report a sequence only once, even if several assemblies hit it.
            if not any(score[1] in names for names in organisms_assemblies.values()):
                scores_organisms.append(score)
        organisms_assemblies[label] = [score[1] for score in all_scores]
    if not scores_organisms:
        return None, None
    return scores_organisms, organisms_assemblies
def parse_ncbi(ncbi_file):
    """Parse gene records from an open NCBI-format annotation file.

    A record starts at a line matching the module-level ``ncbi_start_pattern``
    (which must provide the ``number`` and ``name`` groups) and extends up to
    the next such line. Inside a record the ``Chromosome:``, ``Annotation:``
    and ``ID:`` lines are interpreted; everything else is ignored.

    Args:
        ncbi_file: file-like object supporting ``readline()``.

    Returns:
        List of ``Gene`` objects, in file order, restricted to the genes for
        which both ``start`` and ``end`` could be parsed.
    """
    annotation_pattern = re.compile(
        r'Annotation: (?P<seqname>.+) \((?P<start>\d+)\.\.(?P<end>\d+)(, complement)?\)', re.I)
    chromosome_pattern = re.compile(r'Chromosome: (?P<chromosome>\S+);', re.I)
    id_pattern = re.compile(r'ID: (?P<id>\d+)', re.I)

    genes = []

    line = ncbi_file.readline()
    while line != '':
        # Advance to the next record header. Blank lines, '##' comments and
        # unrecognised lines are skipped by reading on, so neither garbage
        # nor end of file can make this loop spin forever.
        m = None
        while line != '':
            stripped = line.rstrip()
            if stripped != '' and not line.startswith('##'):
                m = ncbi_start_pattern.match(stripped)
                if m:
                    break
            line = ncbi_file.readline()
        if not m:
            break  # end of file reached without another record

        gene = Gene(number=int(m.group('number')), name=qutils.correct_name(m.group('name')))

        # Collect the body of the record: everything up to the next header.
        the_rest_lines = []
        line = ncbi_file.readline()
        while line != '' and not ncbi_start_pattern.match(line.rstrip()):
            the_rest_lines.append(line.rstrip())
            line = ncbi_file.readline()

        for info_line in the_rest_lines:
            if info_line.startswith('Chromosome:'):
                m = chromosome_pattern.match(info_line)
                if m:
                    gene.chromosome = m.group('chromosome')

            if info_line.startswith('Annotation:'):
                m = annotation_pattern.match(info_line)
                if m:
                    gene.seqname = m.group('seqname')
                    gene.start = int(m.group('start'))
                    gene.end = int(m.group('end'))

                    to_trim = 'Chromosome' + ' ' + str(gene.chromosome)
                    if gene.chromosome and gene.seqname.startswith(to_trim):
                        # Strings are immutable: the stripped value has to be
                        # assigned back for the separator to be removed.
                        gene.seqname = gene.seqname[len(to_trim):].lstrip(' ,')
                else:
                    logger.warning('Wrong NCBI annotation for gene ' + str(gene.number) + '. ' + gene.name + '. Skipping this gene.')

            if info_line.startswith('ID:'):
                m = id_pattern.match(info_line)
                if m:
                    gene.id = m.group('id')
                else:
                    logger.warning(
                        'Can\'t parse gene\'s ID in NCBI format. Gene is ' + str(gene.number) + '. ' + gene.name + '. Skipping it.')

        # Genes without coordinates are silently dropped.
        if gene.start is not None and gene.end is not None:
            genes.append(gene)

    return genes
def process_refs(organisms, assemblies, labels, downloaded_dirpath, not_founded_organisms, downloaded_ref_fpaths,
                 blast_check_fpath, err_fpath, organisms_assemblies=None):
    """Obtain a reference FASTA for every organism and refresh the check files.

    For each organism the expected file ``<downloaded_dirpath>/<name>.fasta``
    is reused when it already exists; otherwise ``download_refs`` is called
    unless the organism is already listed in ``not_founded_organisms``.
    Organisms for which no path was obtained are added to
    ``not_founded_organisms`` (mutated in place; ``.add`` is used on it).

    Afterwards one check file per ``(assembly, label)`` pair is rewritten. It
    keeps the first line of the previous check file (or a fresh
    ``Assembly: ... size: ...`` line) followed by the ``Downloaded`` and
    ``Not_founded`` organism lists, filtered by ``organisms_assemblies[label]``
    when that mapping is given.

    Returns:
        List of reference file paths that are available. When ``organisms``
        is empty the list is empty and, outside debug mode, an existing
        ``err_fpath`` is removed.
    """
    ref_fpaths = []
    organisms_left = len(organisms)
    if organisms_left == 0:
        if not qconfig.debug and os.path.exists(err_fpath):
            os.remove(err_fpath)
        return ref_fpaths

    fetched_organisms = []
    n_available = 0
    # Width used to pad organism names so the log columns line up.
    name_width = max(len(name) for name in organisms)

    logger.print_timestamp()
    logger.main_info('Trying to download found references from NCBI. '
                     'Totally %s organisms to try.' % str(organisms_left))
    if len(downloaded_ref_fpaths) > 0:
        logger.main_info('MetaQUAST will attempt to use previously downloaded references...')

    for organism in organisms:
        expected_fpath = os.path.join(downloaded_dirpath, correct_name(organism) + '.fasta')
        padding = ' ' * (name_width - len(organism))
        shown_name = organism.replace('+', ' ')

        already_on_disk = os.path.exists(expected_fpath)
        if already_on_disk:
            new_ref_fpath = expected_fpath
        elif organism not in not_founded_organisms:
            new_ref_fpath = download_refs(organism, expected_fpath)
        else:
            new_ref_fpath = None

        organisms_left -= 1
        if not new_ref_fpath:
            logger.main_info(" %s%s | not found in the NCBI database" % (shown_name, padding))
            not_founded_organisms.add(organism)
            continue

        n_available += 1
        if already_on_disk:
            logger.main_info(" %s%s | was downloaded previously (total %d, %d more to go)" %
                             (shown_name, padding, n_available, organisms_left))
            if new_ref_fpath not in ref_fpaths:
                ref_fpaths.append(new_ref_fpath)
        else:
            logger.main_info(" %s%s | successfully downloaded (total %d, %d more to go)" %
                             (shown_name, padding, n_available, organisms_left))
            ref_fpaths.append(new_ref_fpath)
            fetched_organisms.append(organism)

    for assembly, label in zip(assemblies, labels):
        check_fpath = get_blast_output_fpath(blast_check_fpath, label)
        if os.path.exists(check_fpath):
            # Keep only the first line of the previous check file.
            with open(check_fpath) as check_file:
                header = check_file.read()
            header = header[:header.find('\n')]
        else:
            header = 'Assembly: %s size: %d\n' % (assembly.fpath, os.path.getsize(assembly.fpath))

        with open(check_fpath, 'w') as check_file:
            check_file.write(header)
            check_file.write('\n---\n')
            if organisms_assemblies:
                cur_downloaded = [name for name in fetched_organisms
                                  if name in organisms_assemblies[label]]
                cur_not_found = [name for name in not_founded_organisms
                                 if name in organisms_assemblies[label]]
            else:
                cur_downloaded = list(fetched_organisms)
                cur_not_found = list(not_founded_organisms)
            check_file.write('Downloaded: %s\n' % ','.join(cur_downloaded))
            check_file.write('Not_founded: %s\n' % ','.join(cur_not_found))

    return ref_fpaths
def get_correct_names_for_chroms(output_dirpath, fasta_fpath, sam_fpath, err_path, reads_fpaths, logger,
                                 is_reference=False):
    """Map chromosome names from a SAM header onto the names used in a FASTA file.

    The SAM header is extracted with ``sambamba view -H -S`` into
    ``<dirname(output_dirpath)>/<basename(sam_fpath)>.header`` when the SAM
    file exists; otherwise a previously written header file is reused. The
    ``@SQ`` entries are then compared pairwise, in order, with the chromosome
    lengths returned by ``get_chr_lengths_from_fastafile``.

    Args:
        output_dirpath: path whose parent directory receives the header file.
        fasta_fpath: FASTA file the SAM file is expected to correspond to.
        sam_fpath: SAM file to inspect.
        err_path: file that receives (in append mode) the stderr of sambamba.
        reads_fpaths: when truthy a mismatch is only a warning, otherwise it
            is reported through ``logger.error``.
        logger: object providing ``warning`` and ``error``; it is also passed
            to ``qutils.call_subprocess``.
        is_reference: only changes the wording of the mismatch message.

    Returns:
        Dict mapping SAM chromosome name to FASTA chromosome name, or ``None``
        when neither the SAM file nor its header file exists, or when the
        number, lengths or names of the chromosomes do not match.
    """
    correct_chr_names = dict()
    fasta_chr_lengths = get_chr_lengths_from_fastafile(fasta_fpath)
    sam_chr_lengths = OrderedDict()
    sam_header_fpath = join(dirname(output_dirpath), basename(sam_fpath) + '.header')
    if not isfile(sam_fpath) and not isfile(sam_header_fpath):
        return None

    if isfile(sam_fpath):
        # Open both streams in a context manager so the handles are closed
        # once the subprocess call returns.
        with open(sam_header_fpath, 'w') as header_out, open(err_path, 'a') as err_out:
            qutils.call_subprocess(
                [sambamba_fpath('sambamba'), 'view', '-H', '-S', sam_fpath],
                stdout=header_out, stderr=err_out, logger=logger)

    # Raw strings: '\S' and '\d' are regex escapes, not string escapes.
    chr_name_pattern = r'SN:(\S+)'
    chr_len_pattern = r'LN:(\d+)'

    with open(sam_header_fpath) as sam_in:
        for header_line in sam_in:
            if header_line.startswith('@SQ'):
                chr_name = re.findall(chr_name_pattern, header_line)[0]
                chr_len = re.findall(chr_len_pattern, header_line)[0]
                sam_chr_lengths[chr_name] = int(chr_len)

    inconsistency = ''
    if len(fasta_chr_lengths) != len(sam_chr_lengths):
        inconsistency = 'Number of chromosomes'
    else:
        for fasta_chr, sam_chr in zip(fasta_chr_lengths.keys(), sam_chr_lengths.keys()):
            same_length = sam_chr_lengths[sam_chr] == fasta_chr_lengths[fasta_chr]
            # The FASTA name only has to start with the corrected SAM name.
            if same_length and correct_name(sam_chr) == fasta_chr[:len(sam_chr)]:
                correct_chr_names[sam_chr] = fasta_chr
            elif not same_length:
                inconsistency = 'Chromosome lengths'
                break
            else:
                inconsistency = 'Chromosome names'
                break

    if inconsistency:
        target = 'the reference genome' if is_reference else fasta_fpath
        mismatch = inconsistency + ' in ' + fasta_fpath + ' and corresponding SAM file ' + sam_fpath + \
            ' do not match. '
        if reads_fpaths:
            logger.warning(mismatch + 'QUAST will try to realign reads to ' + target)
        else:
            logger.error(mismatch + 'Use SAM file obtained by aligning reads to ' + target)
        return None
    return correct_chr_names
def correct_meta_references(ref_fpaths, corrected_dirpath):
    """Copy every reference into ``corrected_dirpath`` and build the combined reference.

    Each sequence of each input FASTA is appended both to a per-reference
    file (created through ``qutils.unique_corrected_fpath``) and to the
    combined file named ``qconfig.combined_ref_name``. Sequence names are
    prefixed with the name of the per-reference file. References sharing a
    base name are labelled with ``qutils.get_label_from_par_dir_and_fname``.
    Processing of a reference stops at the first sequence rejected by
    ``correct_seq`` (the check is skipped when ``qconfig.no_check`` is set).

    Returns:
        Tuple ``(corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs,
        ref_fpaths)`` where ``chromosomes_by_refs`` maps a reference name to a
        list of ``(sequence name, sequence length)`` pairs and ``ref_fpaths``
        is the argument, returned unchanged.
    """
    corrected_ref_fpaths = []
    chromosomes_by_refs = {}
    combined_ref_fpath = os.path.join(corrected_dirpath, qconfig.combined_ref_name)

    def _store_sequence(chrom_name, sequence, label, fasta_ext, seq_index, src_fpath):
        # The first sequence of a reference opens a new output file; the
        # following ones are appended to that same file.
        if seq_index > 1:
            target_fpath = corrected_ref_fpaths[-1]
        else:
            target_fpath = qutils.unique_corrected_fpath(
                os.path.join(corrected_dirpath, label + fasta_ext))
            corrected_ref_fpaths.append(target_fpath)
        target_name = qutils.name_from_fpath(target_fpath)
        full_name = target_name + '_' + chrom_name

        # NOTE(review): the value returned by correct_seq is only used as a
        # pass/fail check; the unmodified sequence is what gets written.
        # Confirm this is intended.
        if not qconfig.no_check and not correct_seq(sequence, src_fpath):
            return None, None

        for out_fpath in (target_fpath, combined_ref_fpath):
            fastaparser.write_fasta(out_fpath, [(full_name, sequence)], 'a')

        contigs_analyzer.ref_labels_by_chromosomes[full_name] = qutils.name_from_fpath(target_fpath)
        chromosomes_by_refs[label].append((full_name, len(sequence)))
        return full_name, target_fpath

    all_ref_names = [qutils.splitext_for_fasta_file(os.path.basename(path))[0] for path in ref_fpaths]
    duplicated_names = [name for name in all_ref_names if all_ref_names.count(name) > 1]

    for ref_fpath in ref_fpaths:
        ref_name, ref_fasta_ext = qutils.splitext_for_fasta_file(os.path.basename(ref_fpath))
        if ref_name in duplicated_names:
            ref_name = qutils.get_label_from_par_dir_and_fname(ref_fpath)

        chromosomes_by_refs[ref_name] = []
        seen_seq_names = defaultdict(int)
        max_name_len = qutils.MAX_CONTIG_NAME - len(ref_name) - 1

        corr_seq_fpath = None
        for seq_index, (raw_name, seq) in enumerate(fastaparser.read_fasta(ref_fpath), 1):
            seq_name = correct_name(raw_name, max_name_len)
            uniq_seq_name = get_uniq_name(seq_name, seen_seq_names)
            seen_seq_names[seq_name] += 1
            corr_seq_name, corr_seq_fpath = _store_sequence(
                uniq_seq_name, seq, ref_name, ref_fasta_ext, seq_index, ref_fpath)
            if not corr_seq_name:
                break

        if corr_seq_fpath:
            logger.main_info(' ' + ref_fpath + ' ==> ' + qutils.name_from_fpath(corr_seq_fpath) + '')

    logger.main_info(' All references combined in ' + qconfig.combined_ref_name)

    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths