def do(assemblies, labels, downloaded_dirpath, corrected_dirpath, ref_txt_fpath=None):
    """Find reference genomes for the given assemblies and return their paths.

    The list of species to look for comes either from ``ref_txt_fpath``
    (parsed with ``parse_refs_list``) or, when no such file is given, from
    the result of ``process_blast``.

    Args:
        assemblies: iterable of assembly objects; each must expose ``fpath``.
        labels: passed through unchanged to ``check_blast``,
            ``process_blast`` and ``search_references``.
        downloaded_dirpath: directory holding ``blast.err``, ``blast.check``
            and ``blast.res`` and scanned for already present FASTA files.
        corrected_dirpath: passed through to ``process_blast``.
        ref_txt_fpath: optional path to a file listing references; when
            truthy, ``process_blast`` is skipped.

    Returns:
        The value returned by ``search_references`` after an in-place
        ``sort()``.
    """
    logger.print_timestamp()
    err_fpath = os.path.join(downloaded_dirpath, 'blast.err')
    blast_check_fpath = os.path.join(downloaded_dirpath, 'blast.check')
    blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res')

    # Both maps are keyed by the assembly file path.
    files_md5 = dict((assembly.fpath, md5(assembly.fpath)) for assembly in assemblies)
    assemblies_fpaths = dict((assembly.fpath, assembly) for assembly in assemblies)
    # downloaded_organisms is returned by check_blast but not used here.
    blast_assemblies, downloaded_organisms, not_founded_organisms = \
        check_blast(blast_check_fpath, blast_res_fpath, files_md5, assemblies_fpaths, assemblies, labels)

    species_list = []
    replacement_list = None
    if ref_txt_fpath:
        species_list = parse_refs_list(ref_txt_fpath)
        species_by_assembly = None
    else:
        species_scores, species_by_assembly, replacement_dict = process_blast(
            blast_assemblies, downloaded_dirpath, corrected_dirpath, labels, blast_check_fpath, err_fpath)
        if species_scores:
            # NOTE(review): plain tuple ordering is used; with
            # (species, query_id, score) tuples this orders by species
            # first, not by score - confirm that this is intended.
            species_scores = sorted(species_scores, reverse=True)
            species_list = [species for (species, query_id, score) in species_scores]
            replacement_list = [replacement_dict[query_id] for (species, query_id, score) in species_scores]

    # Join each file name with the directory os.walk actually found it in;
    # joining with downloaded_dirpath would yield non-existent paths for
    # FASTA files located in subdirectories.
    downloaded_ref_fpaths = [os.path.join(path, fname)
                             for (path, _dirs, fnames) in os.walk(downloaded_dirpath)
                             for fname in fnames
                             if qutils.check_is_fasta_file(fname)]

    ref_fpaths = search_references(species_list, assemblies, labels, downloaded_dirpath, not_founded_organisms,
                                   downloaded_ref_fpaths, blast_check_fpath, err_fpath, species_by_assembly,
                                   replacement_list)

    if not ref_fpaths:
        logger.main_info('Reference genomes are not found.')
    # The error log is kept only in debug mode.
    if not qconfig.debug and os.path.exists(err_fpath):
        os.remove(err_fpath)
    ref_fpaths.sort()
    return ref_fpaths
def do(assemblies, labels, downloaded_dirpath, corrected_dirpath, ref_txt_fpath=None):
    """Find reference genomes for the given assemblies and return their paths.

    The list of organisms to look for comes either from ``ref_txt_fpath``
    (parsed with ``parse_refs_list``) or, when no such file is given, from
    the result of ``process_blast``.

    Args:
        assemblies: iterable of assembly objects; each must expose ``fpath``
            pointing at an existing file (its size is read).
        labels: passed through unchanged to ``check_blast``,
            ``process_blast`` and ``process_refs``.
        downloaded_dirpath: directory holding ``blast.err``, ``blast.check``
            and ``blast.res`` and scanned for already present FASTA files.
        corrected_dirpath: passed through to ``process_blast``.
        ref_txt_fpath: optional path to a file listing references; when
            truthy, ``process_blast`` is skipped.

    Returns:
        The value returned by ``process_refs`` after an in-place ``sort()``.
    """
    logger.print_timestamp()
    err_fpath = os.path.join(downloaded_dirpath, 'blast.err')
    blast_check_fpath = os.path.join(downloaded_dirpath, 'blast.check')
    blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res')

    # Both maps are keyed by the assembly file path.
    files_sizes = dict((assembly.fpath, os.path.getsize(assembly.fpath)) for assembly in assemblies)
    assemblies_fpaths = dict((assembly.fpath, assembly) for assembly in assemblies)
    # downloaded_organisms is returned by check_blast but not used here.
    blast_assemblies, downloaded_organisms, not_founded_organisms = \
        check_blast(blast_check_fpath, blast_res_fpath, files_sizes, assemblies_fpaths, assemblies, labels)

    organisms = []
    if ref_txt_fpath:
        organisms = parse_refs_list(ref_txt_fpath)
        organisms_assemblies = None
    else:
        scores_organisms, organisms_assemblies = process_blast(
            blast_assemblies, downloaded_dirpath, corrected_dirpath, labels, blast_check_fpath, err_fpath)
        if scores_organisms:
            # Tuples are (score, organism), so reverse ordering puts the
            # largest scores first.
            scores_organisms = sorted(scores_organisms, reverse=True)
            organisms = [organism for (score, organism) in scores_organisms]

    # Join each file name with the directory os.walk actually found it in;
    # joining with downloaded_dirpath would yield non-existent paths for
    # FASTA files located in subdirectories.
    downloaded_ref_fpaths = [os.path.join(path, fname)
                             for (path, _dirs, fnames) in os.walk(downloaded_dirpath)
                             for fname in fnames
                             if qutils.check_is_fasta_file(fname)]

    ref_fpaths = process_refs(organisms, assemblies, labels, downloaded_dirpath, not_founded_organisms,
                              downloaded_ref_fpaths, blast_check_fpath, err_fpath, organisms_assemblies)

    if not ref_fpaths:
        logger.main_info('Reference genomes are not found.')
    # The error log is kept only in debug mode.
    if not qconfig.debug and os.path.exists(err_fpath):
        os.remove(err_fpath)
    ref_fpaths.sort()
    return ref_fpaths
def do(assemblies, labels, downloaded_dirpath, corrected_dirpath, ref_txt_fpath=None):
    """Find reference genomes for the given assemblies and return their paths.

    The list of organisms to look for comes either from ``ref_txt_fpath``
    (parsed with ``parse_refs_list``) or, when no such file is given, from
    the result of ``process_blast``.

    Args:
        assemblies: iterable of assembly objects; each must expose ``fpath``.
        labels: passed through unchanged to ``check_blast``,
            ``process_blast`` and ``process_refs``.
        downloaded_dirpath: directory holding ``blast.err``, ``blast.check``
            and ``blast.res`` and scanned for already present FASTA files.
        corrected_dirpath: passed through to ``process_blast``.
        ref_txt_fpath: optional path to a file listing references; when
            truthy, ``process_blast`` is skipped.

    Returns:
        The value returned by ``process_refs`` after an in-place ``sort()``.
    """
    logger.print_timestamp()
    err_fpath = os.path.join(downloaded_dirpath, 'blast.err')
    blast_check_fpath = os.path.join(downloaded_dirpath, 'blast.check')
    blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res')

    # Both maps are keyed by the assembly file path.
    files_md5 = dict((assembly.fpath, md5(assembly.fpath)) for assembly in assemblies)
    assemblies_fpaths = dict((assembly.fpath, assembly) for assembly in assemblies)
    # downloaded_organisms is returned by check_blast but not used here.
    blast_assemblies, downloaded_organisms, not_founded_organisms = \
        check_blast(blast_check_fpath, blast_res_fpath, files_md5, assemblies_fpaths, assemblies, labels)

    organisms = []
    if ref_txt_fpath:
        organisms = parse_refs_list(ref_txt_fpath)
        organisms_assemblies = None
    else:
        scores_organisms, organisms_assemblies = process_blast(
            blast_assemblies, downloaded_dirpath, corrected_dirpath, labels, blast_check_fpath, err_fpath)
        if scores_organisms:
            # Tuples are (score, organism), so reverse ordering puts the
            # largest scores first.
            scores_organisms = sorted(scores_organisms, reverse=True)
            organisms = [organism for (score, organism) in scores_organisms]

    # Join each file name with the directory os.walk actually found it in;
    # joining with downloaded_dirpath would yield non-existent paths for
    # FASTA files located in subdirectories.
    downloaded_ref_fpaths = [os.path.join(path, fname)
                             for (path, _dirs, fnames) in os.walk(downloaded_dirpath)
                             for fname in fnames
                             if qutils.check_is_fasta_file(fname)]

    ref_fpaths = process_refs(organisms, assemblies, labels, downloaded_dirpath, not_founded_organisms,
                              downloaded_ref_fpaths, blast_check_fpath, err_fpath, organisms_assemblies)

    if not ref_fpaths:
        logger.main_info('Reference genomes are not found.')
    # The error log is kept only in debug mode.
    if not qconfig.debug and os.path.exists(err_fpath):
        os.remove(err_fpath)
    ref_fpaths.sort()
    return ref_fpaths
def do(assemblies, labels, downloaded_dirpath, corrected_dirpath, ref_txt_fpath=None):
    """Find reference genomes for the given assemblies and return their paths.

    The list of species to look for comes either from ``ref_txt_fpath``
    (parsed with ``parse_refs_list``) or, when no such file is given, from
    the result of ``process_blast``.

    Args:
        assemblies: iterable of assembly objects; each must expose ``fpath``
            pointing at an existing file (its size is read).
        labels: passed through unchanged to ``check_blast``,
            ``process_blast`` and ``search_references``.
        downloaded_dirpath: directory holding ``blast.err``, ``blast.check``
            and ``blast.res`` and scanned for already present FASTA files.
        corrected_dirpath: passed through to ``process_blast``.
        ref_txt_fpath: optional path to a file listing references; when
            truthy, ``process_blast`` is skipped.

    Returns:
        The value returned by ``search_references`` after an in-place
        ``sort()``.
    """
    logger.print_timestamp()
    err_fpath = os.path.join(downloaded_dirpath, 'blast.err')
    blast_check_fpath = os.path.join(downloaded_dirpath, 'blast.check')
    blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res')

    # Both maps are keyed by the assembly file path.
    files_sizes = dict((assembly.fpath, os.path.getsize(assembly.fpath)) for assembly in assemblies)
    assemblies_fpaths = dict((assembly.fpath, assembly) for assembly in assemblies)
    # downloaded_organisms is returned by check_blast but not used here.
    blast_assemblies, downloaded_organisms, not_founded_organisms = \
        check_blast(blast_check_fpath, blast_res_fpath, files_sizes, assemblies_fpaths, assemblies, labels)

    species_list = []
    replacement_list = None
    if ref_txt_fpath:
        species_list = parse_refs_list(ref_txt_fpath)
        species_by_assembly = None
    else:
        species_scores, species_by_assembly, replacement_dict = process_blast(
            blast_assemblies, downloaded_dirpath, corrected_dirpath, labels, blast_check_fpath, err_fpath)
        if species_scores:
            # NOTE(review): plain tuple ordering is used; with
            # (species, query_id, score) tuples this orders by species
            # first, not by score - confirm that this is intended.
            species_scores = sorted(species_scores, reverse=True)
            species_list = [species for (species, query_id, score) in species_scores]
            replacement_list = [replacement_dict[query_id] for (species, query_id, score) in species_scores]

    # Join each file name with the directory os.walk actually found it in;
    # joining with downloaded_dirpath would yield non-existent paths for
    # FASTA files located in subdirectories.
    downloaded_ref_fpaths = [os.path.join(path, fname)
                             for (path, _dirs, fnames) in os.walk(downloaded_dirpath)
                             for fname in fnames
                             if qutils.check_is_fasta_file(fname)]

    ref_fpaths = search_references(species_list, assemblies, labels, downloaded_dirpath, not_founded_organisms,
                                   downloaded_ref_fpaths, blast_check_fpath, err_fpath, species_by_assembly,
                                   replacement_list)

    if not ref_fpaths:
        logger.main_info('Reference genomes are not found.')
    # The error log is kept only in debug mode.
    if not qconfig.debug and os.path.exists(err_fpath):
        os.remove(err_fpath)
    ref_fpaths.sort()
    return ref_fpaths
def parse_meta_references(option, opt_str, value, parser, logger):
    """Collect reference paths from a comma-separated option value.

    Each comma-separated entry of ``value`` is handled as follows:

    * a directory is walked recursively and every file accepted by
      ``qutils.check_is_fasta_file`` is added, sorted per directory entry;
    * anything else is passed to ``assert_file_exists`` and then added
      as is.

    The collected paths are appended to the list stored on ``qconfig``
    under ``option.dest`` (obtained via ``ensure_value`` with an empty list
    as the default).

    ``opt_str`` and ``parser`` are not used; the signature presumably
    follows the option-parser callback convention - confirm against the
    place where this callback is registered.
    """
    collected = []
    for entry in value.split(','):
        if not os.path.isdir(entry):
            # Plain path: must exist, and is kept exactly as given.
            assert_file_exists(entry, 'reference')
            collected.append(entry)
            continue

        # Directory: gather every FASTA file below it.
        found = []
        for dirpath, _subdirs, fnames in os.walk(entry):
            for fname in fnames:
                if qutils.check_is_fasta_file(fname, logger=logger):
                    found.append(join(dirpath, fname))
        found.sort()
        collected.extend(found)

    ensure_value(qconfig, option.dest, []).extend(collected)