def fasta_records(fa_path):
    # Generator yields records retrieved from FASTA files.
    #
    # :param fa_path: path to FASTA file to retrieve sequences from;
    # :type fa_path: str;
    #
    # Yields dictionaries of the following structure:
    # {
    #     "seq_id": ID_of_sequence,
    #     "seq": sequence_itself
    # }

    how_to_open = OPEN_FUNCS[is_gzipped(fa_path)]
    fmt_func = FORMATTING_FUNCS[is_gzipped(fa_path)]

    with how_to_open(fa_path) as fa_file:

        line = fmt_func(fa_file.readline())
        seq_id = line
        seq = ""

        line = fmt_func(fa_file.readline())
        while line != "":
            seq += line
            line = fmt_func(fa_file.readline())
            if line.startswith('>') or line == "":
                yield {"seq_id": seq_id, "seq": seq}
                seq_id = line
                seq = ""
                line = fmt_func(fa_file.readline())
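# The three helpers used above (`is_gzipped`, `OPEN_FUNCS`, `FORMATTING_FUNCS`)
# are defined elsewhere in barapost. The sketch below shows a plausible shape
# for them, assuming gzip detection by `.gz` extension -- it illustrates the
# dispatch pattern used throughout this section, not the project's actual
# definitions.
import gzip

is_gzipped = lambda fpath: fpath.endswith(".gz")  # assumption: extension-based check

# Indexed by the boolean returned by `is_gzipped` (False == 0, True == 1):
OPEN_FUNCS = (open, gzip.open)

# Plain files yield `str` lines, gzipped files (opened in binary mode) yield
# `bytes`; both are normalized to stripped `str`:
FORMATTING_FUNCS = (
    lambda line: line.strip(),
    lambda line: line.decode("utf-8").strip(),
)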
def fastq_records(fq_path):
    # Generator yields records retrieved from FASTQ files.
    #
    # :param fq_path: path to FASTQ file to retrieve sequences from;
    # :type fq_path: str;
    #
    # Yields dictionaries of the following structure:
    # {
    #     "seq_id": ID_of_sequence,
    #     "seq": sequence_itself,
    #     "opt_id": the_third_line,
    #     "qual_line": quality_line
    # }

    how_to_open = OPEN_FUNCS[is_gzipped(fq_path)]
    fmt_func = FORMATTING_FUNCS[is_gzipped(fq_path)]

    with how_to_open(fq_path) as fq_file:
        eof = False
        while not eof:
            seq_id = fmt_func(fq_file.readline())
            if seq_id != "":
                yield {
                    "seq_id": seq_id,
                    "seq": fmt_func(fq_file.readline()),
                    "opt_id": fmt_func(fq_file.readline()),
                    "qual_line": fmt_func(fq_file.readline())
                }
            else:
                eof = True
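# Usage sketch for the two record generators above. The file names are
# hypothetical; gzipped and plain files are handled transparently.
def _demo_record_iteration(fa_path="example.fasta.gz", fq_path="example.fastq"):
    for record in fasta_records(fa_path):
        print(record["seq_id"], len(record["seq"]))
    # end for
    for record in fastq_records(fq_path):
        print(record["seq_id"], record["qual_line"])
    # end for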
def recover_taxonomy(acc, hit_def, taxonomy_path):
    # Function recovers missing taxonomy by given accession.
    #
    # :param acc: accession of taxonomy entry to recover;
    # :type acc: str;
    # :param hit_def: name of this sequence;
    # :type hit_def: str;
    # :param taxonomy_path: path to TSV file with taxonomy;
    # :type taxonomy_path: str;

    if acc == "LAMBDA":
        # If we are missing lambda phage taxonomy -- just add it
        save_taxonomy_directly(taxonomy_path, acc, "Lambda-phage-nanopore-control")
    elif acc.startswith("OWN_SEQ_"):
        # If the sequence is an "own seq" -- check the fasta file.
        # Get the necessary title line from `local_seq_set.fasta`.
        # First, find the fasta file (it may be compressed).
        classif_dir = os.path.dirname(os.path.dirname(taxonomy_path))
        db_dir = os.path.join(classif_dir, "local_database")
        db_files = glob.glob("{}{}*".format(db_dir, os.sep))
        try:
            local_fasta = next(iter(filter(is_fasta, db_files)))
        except StopIteration:
            printlog_error_time("Error: cannot recover taxonomy for the following sequence:")
            printlog_error(" `{} - {}`.".format(acc, hit_def))
            printlog_error("You can solve this problem by yourself (it's pretty simple).")
            printlog_error("Just add a taxonomy line for {} to file `{}`".format(acc, taxonomy_path))
            printlog_error(" and run the program again.")
            platf_depend_exit(1)
        # end try

        # Find the line starting with `acc`
        how_to_open = OPEN_FUNCS[is_gzipped(local_fasta)]
        fmt_func = FORMATTING_FUNCS[is_gzipped(local_fasta)]
        if is_gzipped(local_fasta):
            search_for = b">" + bytes(acc, 'ascii') + b" "
        else:
            search_for = ">{} ".format(acc)
        # end if

        with how_to_open(local_fasta) as fasta_file:
            for line in fasta_file:
                if line.startswith(search_for):
                    seq_name = fmt_func(line).partition(' ')[2]  # get name of the sequence
                    save_taxonomy_directly(taxonomy_path, acc, seq_name)
                    break
                # end if
            # end for
        # end with
    else:
        # Try to find taxonomy in NCBI
        download_taxonomy(acc, hit_def, taxonomy_path)
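# `save_taxonomy_directly` is imported from barapost's taxonomy module. Judging
# by its use above, the taxonomy file is a TSV of `accession<TAB>name` lines,
# so a minimal stand-in -- an assumption for illustration, not the project's
# actual implementation -- could look like this:
def _save_taxonomy_directly_sketch(taxonomy_path, acc, tax_name):
    # Append one `accession<TAB>taxonomy-name` line to the TSV file.
    with open(taxonomy_path, 'a') as tax_file:
        tax_file.write("{}\t{}\n".format(acc, tax_name))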
def process_paral(fq_fa_list, packet_size, tax_annot_res_dir, blast_algorithm,
                  use_index, db_path, nfiles):
    # Function performs the 'many_files' parallel mode of barapost-local.py.
    #
    # :param fq_fa_list: list of paths to FASTA and FASTQ files meant to be processed;
    # :type fq_fa_list: list<str>;
    # :param packet_size: number of sequences processed by blast in a single launching;
    # :type packet_size: int;
    # :param tax_annot_res_dir: path to output directory that contains taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param blast_algorithm: blast algorithm to use;
    # :type blast_algorithm: str;
    # :param use_index: logical value indicating whether to use index;
    # :type use_index: bool;
    # :param db_path: path to database;
    # :type db_path: str;
    # :param nfiles: total number of files;
    # :type nfiles: int;

    queries_tmp_dir = os.path.join(tax_annot_res_dir, "queries-tmp")

    # Iterate over source FASTQ and FASTA files
    for fq_fa_path in fq_fa_list:

        # Create the result directory with the name of the FASTQ or FASTA file being processed:
        new_dpath = create_result_directory(fq_fa_path, tax_annot_res_dir)

        # "hname" means human readable name (i.e. without file path and extension)
        infile_hname = os.path.basename(fq_fa_path)
        infile_hname = re.search(r"(.+)\.(m)?f(ast)?(a|q)(\.gz)?$", infile_hname).group(1)

        # Look around and check if there are results of previous runs of this script.
        # If 'look_around' returns None -- there is no data from a previous run.
        previous_data = look_around(new_dpath, fq_fa_path)

        if previous_data is None:  # if there is no data from a previous run
            num_done_seqs = 0  # number of successfully processed sequences
            tsv_res_path = os.path.join(new_dpath, "classification.tsv")  # form result tsv file path
        else:  # if there is data from a previous run
            num_done_seqs = previous_data["n_done_reads"]  # get number of successfully processed sequences
            tsv_res_path = previous_data["tsv_respath"]  # result tsv file should be the same as during the previous run
        # end if

        how_to_open = OPEN_FUNCS[is_gzipped(fq_fa_path)]
        fmt_func = FORMATTING_FUNCS[is_gzipped(fq_fa_path)]

        if is_fastq(fq_fa_path):
            packet_generator = fastq_packets
            num_seqs = sum(1 for line in how_to_open(fq_fa_path)) // 4  # 4 lines per record
        else:
            packet_generator = fasta_packets
            try:
                num_seqs = len(tuple(filter(
                    lambda l: l.startswith('>'),
                    map(fmt_func, how_to_open(fq_fa_path).readlines()))))
            except UnicodeDecodeError as err:
                with print_lock:
                    print()
                    printlog_warning("Warning: current file is broken: {}.".format(str(err)))
                    printlog_warning("File: `{}`".format(os.path.abspath(fq_fa_path)))
                    printlog_warning("This file will not be processed.")
                    continue
                # end with
            # end try
        # end if

        if num_seqs == num_done_seqs:
            with counter_lock:
                file_counter.value += 1
                i = file_counter.value  # save to local var and release the lock
            # end with
            with print_lock:
                sys.stdout.write('\r')
                printlog_info_time("File #{}/{} (`{}`) has already been completely processed.".format(
                    i, nfiles, fq_fa_path))
                printlog_info("Omitting it.")
                printn("Working...")
            # end with
            continue
        # end if

        for packet in packet_generator(fq_fa_path, packet_size, num_done_seqs):
            # Blast the packet
            align_xml_text = launch_blastn(packet["fasta"], blast_algorithm,
                                           use_index, queries_tmp_dir, db_path)
            # Configure result TSV lines
            result_tsv_lines = parse_align_results_xml(align_xml_text, packet["qual"])
            # Write the result to tsv
            write_classification(result_tsv_lines, tsv_res_path)
        # end for

        with counter_lock:
            file_counter.value += 1
            i = file_counter.value  # save to local var and release the lock
        # end with
        with print_lock:
            sys.stdout.write('\r')
            printlog_info_time("File #{}/{} (`{}`) is processed.".format(
                i, nfiles, os.path.basename(fq_fa_path)))
            printn("Working...")
        # end with
    # end for

    query_fpath = os.path.join(queries_tmp_dir, "query{}_tmp.fasta".format(os.getpid()))
    remove_tmp_files(query_fpath)
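# Hedged driver sketch: the 'many_files' mode runs `process_paral` in several
# processes, each over its own slice of the input files. The globals
# `print_lock`, `counter_lock` and `file_counter` used above are assumed to be
# installed by a Pool initializer; the names below are illustrative, not
# barapost's actual entry point.
import multiprocessing as mp

def _init_paral_globals(plock, clock, counter):
    # Runs once in each worker process and publishes the shared objects.
    global print_lock, counter_lock, file_counter
    print_lock, counter_lock, file_counter = plock, clock, counter
# end def

def _demo_many_files(fq_fa_list, n_proc, packet_size, tax_annot_res_dir,
                     blast_algorithm, use_index, db_path):
    nfiles = len(fq_fa_list)
    chunks = [fq_fa_list[i::n_proc] for i in range(n_proc)]  # round-robin split
    counter = mp.Value('i', 0)  # shared "files done" counter
    with mp.Pool(n_proc, initializer=_init_paral_globals,
                 initargs=(mp.Lock(), mp.Lock(), counter)) as pool:
        pool.starmap(process_paral,
                     [(chunk, packet_size, tax_annot_res_dir, blast_algorithm,
                       use_index, db_path, nfiles) for chunk in chunks])
    # end with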
def fasta_packets(fasta, packet_size, num_done_seqs, packet_mode=0,
                  saved_packet_size=None, saved_packet_mode=None,
                  max_seq_len=float("inf"), probing_batch_size=float("inf")):
    # Generator yields FASTA-formatted packets of records from a fasta file.
    # This function passes 'num_done_seqs' sequences (i.e. they will not be processed)
    # to 'pass_processed_seqs'.
    #
    # :param fasta: path to fasta file;
    # :type fasta: str;
    # :param packet_size: number of sequences to align in one request ('blastn' launching);
    # :type packet_size: int;
    # :param num_done_seqs: number of sequences in the current file that have already been processed;
    # :type num_done_seqs: int;
    # :param packet_mode: packet mode (see -c option);
    # :type packet_mode: int;
    # :param saved_packet_size: size of the last sent packet from a tmp file. Necessary for resumption.
    #     It will be None if no tmp file was in the classification directory;
    # :type saved_packet_size: int;
    # :param saved_packet_mode: mode used while forming the last sent packet from a tmp file.
    #     Necessary for resumption. It will be None if no tmp file was in the classification directory;
    # :type saved_packet_mode: int;
    # :param max_seq_len: maximum length of a sequence processed;
    # :type max_seq_len: int (float("inf") if pruning is disabled);

    how_to_open = OPEN_FUNCS[is_gzipped(fasta)]
    fmt_func = FORMATTING_FUNCS[is_gzipped(fasta)]

    with how_to_open(fasta) as fasta_file:
        # Next line retrieving is implemented as simple line-from-file reading.
        get_next_line = lambda: fmt_func(fasta_file.readline())

        # Variable that contains the ID of the next sequence in the current FASTA file.
        # If no (or all) sequences in the current FASTA file have already been processed,
        # this variable is None.
        # There is no way to count sequences in a multi-FASTA file except by counting sequence IDs.
        # Therefore 'next_id_line' should be saved in memory just after the moment a packet is formed.
        next_id_line = pass_processed_seqs(fasta_file, num_done_seqs, fmt_func)

        if next_id_line == "":
            yield {"fasta": "", "qual": dict()}
        # end if

        packet = ""

        # We are resuming; the nucleotide sequence will be saved in the 'line' variable here:
        try:
            line = get_next_line()
        except UnicodeDecodeError as err:
            print()
            printlog_warning("Warning: current file is broken: {}.".format(str(err)))
            printlog_warning("File: `{}`".format(fasta))
            printlog_warning("Ceasing reading sequences from this file.")
            return
        # end try

        if line.startswith('>'):
            line = fmt_read_id(line)  # format sequence ID
        # end if

        # If some sequences have been passed, this if-statement will be executed.
        # A new packet should start with a sequence ID line.
        if not next_id_line is None:
            packet += next_id_line + '\n'
        # end if
        packet += line + '\n'  # add recently read line

        # Here goes the check for saved packet size and mode:
        if not saved_packet_size is None:
            wrk_pack_size = saved_packet_size
        else:
            wrk_pack_size = packet_size
        # end if

        if not saved_packet_mode is None:
            wrk_pack_mode = saved_packet_mode
        else:
            wrk_pack_mode = packet_mode
        # end if

        eof = False
        while not eof:  # till the end of file

            counter = 0  # variable for counting sequences within a packet
            seqlen = 0

            while counter < wrk_pack_size:

                try:
                    line = get_next_line()
                except UnicodeDecodeError as err:
                    print()
                    printlog_warning("Warning: current file is broken: {}.".format(str(err)))
                    printlog_warning("File: `{}`".format(fasta))
                    printlog_warning("Ceasing reading sequences from this file.")
                    line = ""
                    break
                # end try

                if line.startswith('>'):
                    line = fmt_read_id(line)
                    if packet_mode == 0:
                        counter += 1
                    else:
                        counter += min(seqlen, max_seq_len)
                        seqlen = 0
                    # end if
                # end if

                if line == "":  # if end of file (data) is reached
                    break
                # end if

                if not line.startswith('>'):
                    seqlen += len(line.strip())
                # end if

                packet += line + '\n'  # add line to packet
            # end while

            if line != "":
                next_id_line = packet.splitlines()[-1]  # save sequence ID the next packet will start with
                packet = '\n'.join(packet.splitlines()[:-1])  # exclude 'next_id_line' from the packet
            else:
                eof = True
                next_id_line = None
            # end if

            # Get list of sequence IDs:
            names = filter(lambda l: l.startswith('>'), packet.splitlines())
            names = map(lambda l: l.replace('>', ''), names)

            # {<seq_id>: '-'}, as soon as it is a fasta file
            qual_dict = {name: '-' for name in names}

            if max_seq_len < float("inf"):  # prune sequences
                packet = prune_seqs(packet, max_seq_len)
            # end if

            if packet != "":
                yield {"fasta": packet, "qual": qual_dict}

                if packet_mode == 0:
                    probing_batch_size -= wrk_pack_size
                    wrk_pack_size = min(packet_size, probing_batch_size)
                else:
                    probing_batch_size -= len(qual_dict)
                # end if

                # Switch back to standard packet size.
                # As Vorotos said, repeated assignment is the best check:
                if wrk_pack_mode != packet_mode:
                    wrk_pack_mode = packet_mode
                # end if

                if not next_id_line is None:
                    packet = next_id_line + '\n'
                # end if
            else:
                return
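# Usage sketch: each yielded packet couples FASTA text with a stub quality
# dict ('-' for every sequence), so downstream code can treat FASTA and FASTQ
# packets uniformly. File name and packet size are hypothetical.
def _demo_fasta_packets(fasta_path="example.fasta", packet_size=100):
    for packet in fasta_packets(fasta_path, packet_size, num_done_seqs=0):
        n_seqs = packet["fasta"].count('>')
        print("packet of {} sequences; qualities: {}".format(
            n_seqs, set(packet["qual"].values())))
    # end for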
def process(fq_fa_list, n_thr, packet_size, tax_annot_res_dir,
            blast_algorithm, use_index, db_path):
    # Function performs the "few_files" parallel mode.
    #
    # :param fq_fa_list: list of paths to files meant to be processed;
    # :type fq_fa_list: list<str>;
    # :param n_thr: number of threads to launch;
    # :type n_thr: int;
    # :param packet_size: number of sequences processed by blast in a single launching;
    # :type packet_size: int;
    # :param tax_annot_res_dir: path to output directory that contains taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param blast_algorithm: blast algorithm to use;
    # :type blast_algorithm: str;
    # :param use_index: logical value indicating whether to use index;
    # :type use_index: bool;
    # :param db_path: path to database;
    # :type db_path: str;

    nfiles = len(fq_fa_list)

    for i, fq_fa_path in enumerate(fq_fa_list):

        # Create the result directory with the name of the FASTQ or FASTA file being processed:
        new_dpath = create_result_directory(fq_fa_path, tax_annot_res_dir)

        # "hname" means human readable name (i.e. without file path and extension)
        infile_hname = os.path.basename(fq_fa_path)
        infile_hname = re.search(r"(.+)\.(m)?f(ast)?(a|q)(\.gz)?$", infile_hname).group(1)

        # Look around and check if there are results of previous runs of this script.
        # If 'look_around' returns None -- there is no data from a previous run.
        previous_data = look_around(new_dpath, fq_fa_path)

        if previous_data is None:  # if there is no data from a previous run
            num_done_seqs = 0  # number of successfully processed sequences
            tsv_res_path = os.path.join(new_dpath, "classification.tsv")  # form result tsv file path
        else:  # if there is data from a previous run
            num_done_seqs = previous_data["n_done_reads"]  # get number of successfully processed sequences
            tsv_res_path = previous_data["tsv_respath"]  # result tsv file should be the same as during the previous run
        # end if

        how_to_open = OPEN_FUNCS[is_gzipped(fq_fa_path)]
        fmt_func = FORMATTING_FUNCS[is_gzipped(fq_fa_path)]

        if is_fastq(fq_fa_path):
            packet_generator = fastq_packets
            num_seqs = sum(1 for line in how_to_open(fq_fa_path)) // 4  # 4 lines per record
        else:
            packet_generator = fasta_packets
            try:
                num_seqs = len(tuple(filter(
                    lambda l: l.startswith('>'),
                    map(fmt_func, how_to_open(fq_fa_path).readlines()))))
            except UnicodeDecodeError as err:
                print()
                printlog_warning("Warning: current file is broken: {}.".format(str(err)))
                printlog_warning("File: `{}`".format(os.path.abspath(fq_fa_path)))
                printlog_warning("This file will not be processed.")
                continue
            # end try
        # end if

        packet_size = min(packet_size, num_seqs // n_thr)

        if num_seqs == num_done_seqs:
            sys.stdout.write('\r')
            printlog_info_time("File #{}/{} (`{}`) has already been completely processed.".format(
                i + 1, nfiles, fq_fa_path))
            printlog_info("Omitting it.")
            printn("Working...")
            continue  # move on to the next file ('return' here would silently skip the remaining files)
        # end if

        # Get the number of sequences to pass to each thread
        file_part_size = num_seqs // n_thr
        if num_seqs % n_thr != 0:
            file_part_size += 1
        # end if

        pool = mp.Pool(n_thr, initializer=init_proc_single_file_in_paral,
                       initargs=(mp.Lock(), mp.Lock(),))

        pool.starmap(process_part_of_file,
                     [(file_part, tsv_res_path, packet_size, tax_annot_res_dir,
                       blast_algorithm, use_index, db_path)
                      for file_part in packet_generator(fq_fa_path, file_part_size, num_done_seqs)])

        # Reaping zombies
        pool.close()
        pool.join()

        sys.stdout.write('\r')
        printlog_info_time("File #{}/{} (`{}`) is processed.".format(
            i + 1, nfiles, os.path.basename(fq_fa_path)))
        printn("Working...")
    # end for
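# Note on `file_part_size` above: the two-step computation is integer ceil
# division, i.e. ceil(num_seqs / n_thr). An equivalent one-liner, shown only
# for clarity:
def _ceil_div(num_seqs, n_thr):
    return -(-num_seqs // n_thr)  # e.g. _ceil_div(10, 4) == 3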
def fastq_packets(fastq, packet_size, num_done_seqs, packet_mode=0,
                  saved_packet_size=None, saved_packet_mode=None,
                  max_seq_len=float("inf"), probing_batch_size=float("inf")):
    # Generator yields FASTA-formatted packets of records from a fastq file.
    # This function skips the first 'num_done_seqs' sequences
    # (i.e. they will not be processed).
    #
    # :param fastq: path to fastq file;
    # :type fastq: str;
    # :param packet_size: number of sequences to align in one request ('blastn' launching);
    # :type packet_size: int;
    # :param num_done_seqs: number of sequences in the current file that have already been processed;
    # :type num_done_seqs: int;
    # :param packet_mode: packet mode (see -c option);
    # :type packet_mode: int;
    # :param saved_packet_size: size of the last sent packet from a tmp file. Necessary for resumption.
    #     It will be None if no tmp file was in the classification directory;
    # :type saved_packet_size: int;
    # :param saved_packet_mode: mode used while forming the last sent packet from a tmp file.
    #     Necessary for resumption. It will be None if no tmp file was in the classification directory;
    # :type saved_packet_mode: int;
    # :param max_seq_len: maximum length of a sequence processed;
    # :type max_seq_len: int (float("inf") if pruning is disabled);

    how_to_open = OPEN_FUNCS[is_gzipped(fastq)]
    fmt_func = FORMATTING_FUNCS[is_gzipped(fastq)]

    with how_to_open(fastq) as fastq_file:
        # Pass reads which have already been processed:
        for _ in range(int(num_done_seqs * FASTQ_LINES_PER_READ)):
            fastq_file.readline()
        # end for

        # End of file
        eof = False

        # Here goes the check for saved packet size and mode:
        if not saved_packet_size is None:
            wrk_pack_size = saved_packet_size
        else:
            wrk_pack_size = packet_size
        # end if

        if not saved_packet_mode is None:
            wrk_pack_mode = saved_packet_mode
        else:
            wrk_pack_mode = packet_mode
        # end if

        if wrk_pack_mode == 0:
            form_packet = form_packet_numseqs
        else:
            form_packet = form_packet_totalbp
        # end if

        # Process all remaining sequences with standard packet size:
        while not eof:
            packet, eof = form_packet(fastq_file, wrk_pack_size, fmt_func, max_seq_len)

            if eof and packet["fasta"] == "":
                return
            # end if

            yield packet

            if packet_mode == 0:
                probing_batch_size -= wrk_pack_size
                wrk_pack_size = min(packet_size, probing_batch_size)
            else:
                probing_batch_size -= len(packet['qual'])
            # end if

            # Switch back to standard packet size.
            # As Vorotos said, repeated assignment is the best check:
            if wrk_pack_mode != packet_mode:
                wrk_pack_mode = packet_mode
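# `form_packet_numseqs` and `form_packet_totalbp` are defined elsewhere in
# barapost. The sketch below is an assumption-labelled stand-in for the
# numseqs variant only: read up to `packet_size` FASTQ records, convert each
# to FASTA, and report EOF. The real implementation (read-ID formatting,
# quality handling, pruning) may well differ.
def _form_packet_numseqs_sketch(fastq_file, packet_size, fmt_func, max_seq_len):
    fasta_lines = list()
    qual_dict = dict()
    eof = False
    for _ in range(packet_size):
        read_id = fmt_func(fastq_file.readline())
        if read_id == "":
            eof = True
            break
        # end if
        seq = fmt_func(fastq_file.readline())
        fmt_func(fastq_file.readline())  # skip the '+' (optional ID) line
        qual_line = fmt_func(fastq_file.readline())
        if max_seq_len < float("inf"):
            seq = seq[:int(max_seq_len)]  # naive pruning, for illustration only
        # end if
        name = read_id[1:]  # drop the leading '@'
        fasta_lines.append(">" + name + "\n" + seq)
        qual_dict[name] = qual_line
    # end for
    return {"fasta": "\n".join(fasta_lines), "qual": qual_dict}, eof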
def build_local_db(tax_annot_res_dir, acc_fpath, your_own_fasta_lst,
                   accs_to_download, use_index):
    # Function creates a database with utilities from the 'blast+' toolkit
    # according to acc_dict and your_own_fasta_lst.
    #
    # :param tax_annot_res_dir: path to current result directory
    #     (each processed file has its own result directory);
    # :type tax_annot_res_dir: str;
    # :param acc_fpath: path to file "hits_to_download.tsv";
    # :type acc_fpath: str;
    # :param your_own_fasta_lst: list of user's fasta files to be included in the database;
    # :type your_own_fasta_lst: list<str>;
    # :param accs_to_download: list of accessions from command line ('-s');
    # :type accs_to_download: list<str>;
    # :param use_index: whether to use index;
    # :type use_index: str;
    #
    # Returns path to the created database.

    # Path to the directory in which the database will be placed
    db_dir = os.path.join(tax_annot_res_dir, "local_database")
    # Path to the TSV taxonomy file
    taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")

    try:
        os.makedirs(db_dir)
    except OSError:
        # If this directory exists
        while True:
            if len(os.listdir(db_dir)) == 0:
                # If the db directory is empty -- break and build a database
                break
            else:
                print()
                printlog_info("Database directory is not empty:")
                printlog_info("  `{}`".format(os.path.abspath(db_dir)))
                printlog_info("Here is its content:")
                for i, fname in enumerate(os.listdir(os.path.abspath(db_dir))):
                    printlog_info(" {}. `{}`".format(i + 1, fname))
                # end for

                reply = input("""\nPress ENTER to start classification using the existing database.
Enter 'r' to remove all files in this directory and create the database from the beginning:>>""")

                if reply == "":
                    # Do not build a database, just return the path to it.
                    printlog_info("You have chosen to use the existing database.")
                    # Return path to the DB located in this directory
                    dbpath = next(iter(os.listdir(db_dir)))
                    dbpath = dbpath.partition(".fasta")[0] + dbpath.partition(".fasta")[1]  # remove everything after '.fasta'
                    return os.path.join(db_dir, dbpath)
                elif reply == 'r':
                    printlog_info("You have chosen to rebuild the database.")
                    # Rename old classification files and write actual data to new ones:
                    old_classif_dirs = filter(
                        lambda d: os.path.exists(os.path.join(d, "classification.tsv")),
                        glob(os.path.join(tax_annot_res_dir, "*")))
                    old_classif_files = tuple(map(
                        lambda f: os.path.join(f, "classification.tsv"),
                        old_classif_dirs))

                    if len(old_classif_files) > 0:
                        print()
                        printlog_info("Renaming old classification files:")
                        for classif_file in old_classif_files:
                            rename_file_verbosely(classif_file)
                        # end for
                    # end if

                    # Empty the database directory
                    for file in glob("{}{}*".format(db_dir, os.sep)):
                        os.unlink(file)
                    # end for

                    # Break from the loop in order to build a database
                    break
                else:
                    print("Invalid reply: `{}`\n".format(reply))
                    continue
                # end if
            # end if
        # end while
    # end try

    # It is a dictionary of accessions and record names.
    # Accessions are keys, record names are values.
    acc_dict = configure_acc_dict(acc_fpath, your_own_fasta_lst, accs_to_download)

    if len(accs_to_download) != 0:
        verify_cl_accessions(accs_to_download, acc_dict)
    # end if

    # Retrieve already existing taxonomy data from the taxonomy file
    tax_exist_accs = taxonomy.get_tax_keys(taxonomy_path)

    # If the accession file does not exist and execution has reached here -- everything is OK --
    # we are building a database from the user's files only.
    if len(acc_dict) != 0:
        print()
        printlog_info("Following sequences (and all replicons related to them) "
                      "will be downloaded from GenBank for further taxonomic classification "
                      "on your local machine:")
        for i, acc in enumerate(acc_dict.keys()):
            printlog_info(" {}. {} - `{}`".format(i + 1, acc, acc_dict[acc]))
        # end for

        search_for_related_replicons(acc_dict)

        printlog_info_time("Completing taxonomy file...")
        for i, acc in enumerate(acc_dict.keys()):
            if not acc in tax_exist_accs:
                taxonomy.find_taxonomy(acc, acc_dict[acc][1], taxonomy_path)
            # end if
            # Accessions can be of different length
            printn("\r{} - {}: {}/{}".format(getwt(), acc, i + 1, len(acc_dict)) + " " * 10 + "\b" * 10)
        # end for
        print()
        printlog_info_time("Taxonomy file is consistent.")
    # end if

    local_fasta = os.path.join(db_dir, "local_seq_set.fasta")  # path to downloaded FASTA file

    add_lambda_phage(local_fasta, taxonomy_path)  # add lambda phage control sequence

    retrieve_fastas_by_acc(acc_dict, db_dir, local_fasta)  # download main fasta data from GenBank

    # Add 'your own' fasta files to the database
    if not len(your_own_fasta_lst) == 0:

        # This variable counts sequences from local files.
        # It is necessary to prevent duplicated accessions.
        own_seq_counter = 0

        # Check if these files are assemblies made by SPAdes or a5
        spades_patt = r">NODE_[0-9]+"  # this pattern will match sequence IDs generated by SPAdes
        a5_patt = r">scaffold_[0-9]+"  # this pattern will match sequence IDs generated by a5
        assemblies = list()  # this list will contain paths to assembly files (SPAdes or a5)

        for own_fasta_path in reversed(your_own_fasta_lst):
            how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)]
            fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)]

            with how_to_open(own_fasta_path) as fasta_file:
                first_seq_id = fmt_func(fasta_file.readline())  # get the first line in the file (the first seq ID)
            # end with

            # if we've got a SPAdes assembly
            if not re.search(spades_patt, first_seq_id) is None:
                assemblies.append(own_fasta_path)
                # Remove this file from the list -- it will be processed in a specific way
                your_own_fasta_lst.remove(own_fasta_path)
                continue
            # end if

            # if we've got an a5 assembly
            if not re.search(a5_patt, first_seq_id) is None:
                assemblies.append(own_fasta_path)
                your_own_fasta_lst.remove(own_fasta_path)
                continue
            # end if
        # end for

        # Include assembly files in the multi-fasta file.
        # Find the common prefix of all assembly paths and remove it from assembly names:
        if len(assemblies) > 1:
            assemblies_formatted = tuple(map(
                lambda f: os.path.abspath(f).replace(os.sep, '-'), assemblies))
            common_prefix = find_common_prefix(assemblies_formatted)
            assemblies_formatted = tuple(map(
                lambda f: f.replace(common_prefix, ''), assemblies_formatted))
        elif len(assemblies) > 0:
            common_prefix = ''
            assemblies_formatted = tuple(map(os.path.basename, assemblies))
        # end if

        # Add assembled sequences to the database
        with open(local_fasta, 'a') as fasta_db:
            for i, assm_path in enumerate(assemblies):
                printlog_info("Adding `{}` to the database...".format(os.path.basename(assm_path)))
                assm_name_fmt = assemblies_formatted[i]

                how_to_open = OPEN_FUNCS[is_gzipped(assm_path)]
                fmt_func = FORMATTING_FUNCS[is_gzipped(assm_path)]
                with how_to_open(assm_path) as fasta_file:
                    for line in fasta_file:
                        line = fmt_func(line)
                        # You can find comments to "OWN_SEQ..." below.
                        # Paths will be written to seq IDs in the following way:
                        #   some-happy-path.fastq--
                        # in order to retrieve them securely with a regex later.
                        if line.startswith('>'):
                            own_seq_counter += 1
                            own_acc = "OWN_SEQ_{}".format(own_seq_counter)
                            own_def = "{}--".format(assm_name_fmt.replace(common_prefix, '')) + line[1:]
                            own_def = remove_bad_chars(own_def)
                            taxonomy.save_taxonomy_directly(taxonomy_path, own_acc, own_def)
                            line = ">" + "{} {}".format(own_acc, own_def)
                        # end if
                        fasta_db.write(line + '\n')
                    # end for
                # end with
            # end for
        # end with

        with open(local_fasta, 'a') as fasta_db:
            for own_fasta_path in your_own_fasta_lst:
                printlog_info("Adding `{}` to the database...".format(os.path.basename(own_fasta_path)))

                how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)]
                fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)]
                with how_to_open(own_fasta_path) as fasta_file:
                    for line in fasta_file:
                        line = fmt_func(line)
                        # 'makeblastdb' considers the first word (the separator is a space)
                        # to be the sequence ID and throws an error if there are duplicated IDs.
                        # To prevent this duplication we'll create our own sequence IDs:
                        # 'OWN_SEQ_<NUMBER>', written at the beginning of the FASTA record name.
                        if line.startswith('>'):
                            own_seq_counter += 1
                            own_acc = "OWN_SEQ_{}".format(own_seq_counter)
                            taxonomy.save_taxonomy_directly(taxonomy_path, own_acc, line[1:])
                            line = ">" + own_acc + ' ' + remove_bad_chars(line[1:])
                        # end if
                        fasta_db.write(line + '\n')
                    # end for
                # end with
            # end for
        # end with
    # end if

    # 'lcl|ACCESSION...' entries can be given with a '.1'
    # (or '.2', whatever) terminus by blastn.
    # There is no '.1' terminus in the taxonomy file.
    # Therefore we will prune the accessions in advance.
    print()
    printn("{} - Formatting accessions...".format(getwt()))
    log_info("Formatting accessions...")
    corrected_path = os.path.join(db_dir, "corrected_seqs.fasta")
    with open(local_fasta, 'r') as source_file, open(corrected_path, 'w') as dest_file:
        for line in source_file:
            if line.startswith('>'):
                line = line.strip()
                acc, seq_name = (line.partition(' ')[0], line.partition(' ')[2])
                acc = acc.partition('.')[0]
                seq_name = remove_bad_chars(seq_name)
                seq_name = re.sub(r'[^\x00-\x7F]+', '_', seq_name)  # remove non-ascii chars
                line = ' '.join((acc, seq_name)) + '\n'
            # end if
            dest_file.write(line)
        # end for
    # end with
    os.unlink(local_fasta)
    os.rename(corrected_path, local_fasta)
    sys.stdout.write("\r{} - Formatting accessions... ok".format(getwt()))
    log_info("Formatting accessions done.")

    # Configure command line
    make_db_cmd = "makeblastdb -in {} -parse_seqids -dbtype nucl".format(local_fasta)
    exit_code = os.system(make_db_cmd)  # make a blast-format database
    if exit_code != 0:
        printlog_error_time("Error occurred while making the database")
        platf_depend_exit(exit_code)
    # end if

    print("\033[1A{} - Database is successfully created: `{}`\n".format(getwt(), local_fasta))
    log_info("Database is successfully created: `{}`".format(local_fasta))

    if use_index == "true":
        printlog_info_time("Database index creating started")
        # Configure command line
        make_index_cmd = "makembindex -input {} -iformat blastdb -verbosity verbose".format(local_fasta)
        exit_code = os.system(make_index_cmd)  # create an index for the database
        if exit_code != 0:
            printlog_info_time("Error occurred while creating the database index")
            platf_depend_exit(exit_code)
        # end if
        printlog_info_time("Database index has been successfully created")
    # end if

    # Gzip the downloaded FASTA file
    printlog_info_time("Gzipping FASTA file: `{}`".format(local_fasta))
    if gzip_util_found:
        os.system("{} -v {}".format(gzip_util, local_fasta))
    else:
        # form the .fasta.gz file 'by hand'
        with open(local_fasta, 'rb') as fasta_file, open_as_gzip(local_fasta + ".gz", "wb") as fagz_file:
            shutil_copyfileobj(fasta_file, fagz_file)
        # end with
        os.unlink(local_fasta)  # remove the source FASTA file, not the database
    # end if

    return local_fasta
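# Hedged usage sketch: building the database from local FASTA files only, with
# no GenBank downloads. All paths are hypothetical, and passing None for
# `acc_fpath` assumes `configure_acc_dict` accepts it when there is nothing
# to download.
def _demo_build_db(tax_annot_res_dir="barapost_result"):
    db_path = build_local_db(
        tax_annot_res_dir=tax_annot_res_dir,
        acc_fpath=None,  # no "hits_to_download.tsv" (assumption)
        your_own_fasta_lst=["my_genomes.fasta"],
        accs_to_download=[],
        use_index="false")
    print("BLAST database created at:", db_path)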