def get_qmatrix_data(qmatrix_path):
    """Split the lines of a q-matrix file into comment lines and data lines.

    Each line is stripped of surrounding whitespace and split on tabs.  A
    line whose first column is '*' is a data line; every other line
    (blank lines included) is kept as a comment line.

    Returns the tuple (qmatrix_comments_list, qmatrix_data_list).  When the
    path check fails, an error is written to stderr and utils.die() is
    called (presumably terminating the program); if it returns, the result
    is None.
    """
    # check path exist
    if not utils.check_path_exist(qmatrix_path):
        sys.stderr.write("\n** Cannot open %s.\n" % qmatrix_path)
        utils.die("** Program exit!")
        return None

    # initialize
    qmatrix_comments_list = []
    qmatrix_data_list = []

    # open and read qmatrix; the with-block guarantees the handle is closed
    with open(qmatrix_path) as qmatrix_file:
        for qmatrix_line in qmatrix_file:
            qmatrix_line = qmatrix_line.strip()
            line_columns = qmatrix_line.split('\t')
            # read data
            if line_columns[0] == '*':
                qmatrix_data_list.append(qmatrix_line)
            else:
                qmatrix_comments_list.append(qmatrix_line)

    return (qmatrix_comments_list, qmatrix_data_list)
def _count_fastq_records(read_lines):
    """Count the records in an iterable of FASTQ lines.

    Each record is walked as header ('@...'), sequence line(s), a '+'
    separator line, then quality line(s) covering the sequence length.
    Walking the record prevents a quality string that happens to begin
    with '@' from being counted as another read.
    """
    record_count = 0
    line_iter = iter(read_lines)
    for read_line in line_iter:
        # fastq read id starts with @
        if not read_line.strip().startswith('@'):
            continue
        record_count += 1

        # sequence lines run until the '+' separator line
        sequence_length = 0
        separator_found = False
        for read_line in line_iter:
            read_line = read_line.strip()
            if read_line.startswith('+'):
                separator_found = True
                break
            sequence_length += len(read_line)
        if not separator_found:
            # truncated record at end of input
            break

        # consume quality lines until they cover the whole sequence
        quality_length = 0
        while quality_length < sequence_length:
            read_line = next(line_iter, None)
            if read_line is None:
                break
            quality_length += len(read_line.strip())
    return record_count


def count_number_of_reads(reads_filepath, fasta_reads_flag):
    """Count the reads stored in a FASTA or FASTQ file.

    reads_filepath   -- path of the reads file
    fasta_reads_flag -- truthy for FASTA input (headers start with '>'),
                        falsy for FASTQ input (headers start with '@')

    Returns the number of reads.  When the path check fails, an error is
    written to stderr and utils.die() is called (presumably terminating
    the program); if it returns, the count is 0.
    """
    # number of reads
    number_of_reads = 0

    # check path exist
    if not utils.check_path_exist(reads_filepath):
        sys.stderr.write("\n** Cannot open %s.\n" % reads_filepath)
        utils.die("** Program exit!")
        return number_of_reads

    with open(reads_filepath) as reads_file:
        if fasta_reads_flag:
            # fasta format: read id starts with >
            for read_line in reads_file:
                if read_line.strip().startswith('>'):
                    number_of_reads += 1
        else:
            # fastq format: count whole records, not every '@' line
            number_of_reads = _count_fastq_records(reads_file)

    return number_of_reads
def get_bootstrap_gvector(bootstrap_gvector_path):
    """Read the per-genome chance values from a bootstrap g-vector file.

    Only lines whose first tab-separated column is '*' are used.  For those
    lines column 1 is parsed as the integer genome index and column 3 as
    the float chance value (the scaled percentage chance, according to the
    original comment; column 2 is ignored).

    Returns a dict mapping genome index -> chance.  When the path check
    fails, an error is written to stderr and utils.die() is called
    (presumably terminating the program); if it returns, the result is None.
    """
    # check path exist
    if not utils.check_path_exist(bootstrap_gvector_path):
        sys.stderr.write("\n** Cannot open %s!\n" % bootstrap_gvector_path)
        utils.die("** Program exit!")
        return None

    # initialize
    bootstrap_gvector_map = {}

    # open and read bootstrap_gvector_path; the handle is closed on exit
    with open(bootstrap_gvector_path) as gvector_file:
        for gvector_line in gvector_file:
            gvector_columns = gvector_line.strip().split('\t')
            # read data
            if gvector_columns[0] == '*':
                genome_index = int(gvector_columns[1])
                # ScaledPercentageChance is considered now.
                genome_chance = float(gvector_columns[3])
                bootstrap_gvector_map[genome_index] = genome_chance

    return bootstrap_gvector_map
def check_aligned_reads(filename):
    """Confirm that the aligned-reads file is present.

    Returns True when utils.check_path_exist() accepts the path.  Otherwise
    an error is written to stderr and utils.die() is called (presumably
    terminating the program); if it returns, the result is None.
    """
    if not utils.check_path_exist(filename):
        sys.stderr.write("\n** Cannot open %s.\n" %(filename))
        utils.die("** Program exit!")
        return None
    return True
def get_target_genome(gvector_path, reconstruction_selection, reconstruction_cutoff_abundance, reconstruction_genome_name):
    """Select the genomes to reconstruct from a SIGMA g-vector file.

    Lines are tab-separated.  '@' lines carry (genome index, genome name);
    '*' lines carry a genome index in column 1 and the chance value used
    here in column 3 (the scaled percentage chance, per the original
    comment).

    reconstruction_selection == 1 selects every genome whose chance is
    >= reconstruction_cutoff_abundance; any other value selects only the
    genome(s) named reconstruction_genome_name.

    Returns (genome_name_map, genome_chance_map, target_genome_index_list).
    utils.die() is called (presumably terminating the program) when the
    file is missing, when the requested genome name is not found in
    single-genome mode, or when nothing was selected.
    """
    # genome_name_map(key:genome_index, val:genome_name)
    genome_name_map = {}
    # genome_chance_map(key:genome_index, val:genome_chance)
    genome_chance_map = {}
    # target_genome_index_list
    target_genome_index_list = []
    # to check the reconstruction_genome_name
    reconstruction_genome_name_exist_tag = False
    select_by_cutoff = (reconstruction_selection == 1)

    # check g-vector file
    if not utils.check_path_exist(gvector_path):
        utils.die("** Error: No such file or directory: " + gvector_path)

    # get target genome list
    with open(gvector_path, 'r') as f:
        for gvector_line in f:
            gvector_columns = gvector_line.strip().split('\t')

            # first column is '@': @ Genome_Index Genome_Name Alignment_Rate
            if gvector_columns[0] == '@':
                genome_index = int(gvector_columns[1])
                genome_name_map[genome_index] = gvector_columns[2]

            # first column is '*': * Genome_Index Percentage_Chance ...
            elif gvector_columns[0] == '*':
                genome_index = int(gvector_columns[1])
                # ScaledPercentageChance is considered now.
                genome_chance = float(gvector_columns[3])
                genome_chance_map[genome_index] = genome_chance

                if select_by_cutoff:
                    # reconstruct all genomes >= cut-off
                    if genome_chance >= reconstruction_cutoff_abundance:
                        target_genome_index_list.append(genome_index)
                # reconstruct one target genome; an index without a '@'
                # line has no name and therefore cannot match
                elif genome_name_map.get(genome_index) == reconstruction_genome_name:
                    reconstruction_genome_name_exist_tag = True
                    target_genome_index_list.append(genome_index)

    # The name check only makes sense in single-genome mode: in cut-off
    # mode the tag is never set, so checking it there would always abort.
    if not select_by_cutoff and not reconstruction_genome_name_exist_tag:
        utils.die("** Check the config file! \nReconstruction_Genome_Name value does not match to the SIGMA gvector results!")

    if not target_genome_index_list:
        utils.die("** No target genomes exist for reconstruction!")

    return (genome_name_map, genome_chance_map, target_genome_index_list)
def get_filtering_target_genome(gvector_path, filtering_genome_name):
    """Locate the genome to filter in a SIGMA g-vector file.

    Lines are tab-separated.  '@' lines supply (genome index, genome name);
    '*' lines supply a genome index in column 1 and the chance value in
    column 2 (the unscaled percentage chance, per the original comment).
    A '*' line whose genome name equals filtering_genome_name is selected.

    Returns (genome_name_map, genome_chance_map, target_genome_index_list).
    utils.die() is called (presumably terminating the program) when the
    file is missing, when the name never matches, or when the selection
    is empty.
    """
    names_by_index = {}      # genome_index -> genome_name
    chance_by_index = {}     # genome_index -> genome_chance
    matched_indexes = []     # indexes whose name matched
    name_found = False

    # check g-vector file
    if not utils.check_path_exist(gvector_path):
        utils.die("** Error: No such file or directory: " + gvector_path)

    with open(gvector_path, 'r') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t')
            marker = fields[0]

            # '@' line: @ Genome_Index Genome_Name Alignment_Rate
            if marker == '@':
                index = int(fields[1])
                name = fields[2]
                names_by_index[index] = name
                continue

            # anything other than a '*' line is ignored
            if marker != '*':
                continue

            # '*' line: * Genome_Index Percentage_Chance
            index = int(fields[1])
            chance = float(fields[2])
            chance_by_index[index] = chance
            if names_by_index[index] == filtering_genome_name:
                name_found = True
                matched_indexes.append(index)

    if not name_found:
        utils.die("** Check the config file! Filtering_Genome_Name value does not match to the SIGMA gvector results!")

    if len(matched_indexes) == 0:
        utils.die("** No target genomes exist for filtering!")

    return (names_by_index, chance_by_index, matched_indexes)
def write_bootstrap_qmatrix(bootstrap_qmatrix_path, qmatrix_comments_list, qmatrix_data_list):
    """Write a bootstrap-resampled q-matrix file.

    All comment lines are written first, unchanged and in order.  Then
    len(qmatrix_data_list) data lines are written, each drawn uniformly at
    random (with replacement) from qmatrix_data_list.  Every item is
    written as str(item) followed by a newline.

    An existing file at bootstrap_qmatrix_path is removed before writing.
    """
    # check path exist: drop a stale output file first
    if utils.check_path_exist(bootstrap_qmatrix_path):
        check_call(["rm", "-f", bootstrap_qmatrix_path], stdout = PIPE, stderr = sys.stderr)

    # get qmatrix reads count
    qmatrix_data_count = len(qmatrix_data_list)

    # The with-block closes (and therefore flushes) the output file; the
    # open mode is kept as 'wb' exactly as before.
    with open(bootstrap_qmatrix_path, 'wb') as bootstrap_qmatrix_out:
        # loop for qmatrix_comments_list
        for qmatrix_comments_list_item in qmatrix_comments_list:
            bootstrap_qmatrix_out.write(str(qmatrix_comments_list_item) + '\n')

        # draw one random data line per original data line
        for _ in qmatrix_data_list:
            random_index = random.randint(0, qmatrix_data_count - 1)
            bootstrap_qmatrix_out.write(str(qmatrix_data_list[random_index]) + '\n')
def get_qmatrix_reads_count(qmatrix_path):
    """Count the data lines of a q-matrix file.

    A data line is one whose first tab-separated column, after stripping
    surrounding whitespace, is '*'.

    Returns the count.  When the path check fails, an error is written to
    stderr and utils.die() is called (presumably terminating the program);
    if it returns, the result is None.
    """
    # check path exist
    if not utils.check_path_exist(qmatrix_path):
        sys.stderr.write("\n** Cannot open %s.\n" % qmatrix_path)
        utils.die("** Program exit!")
        return None

    # initialize
    qmatrix_reads_count = 0

    # open and read qmatrix; the with-block guarantees the handle is closed
    with open(qmatrix_path) as qmatrix_file:
        for qmatrix_line in qmatrix_file:
            line_columns = qmatrix_line.strip().split('\t')
            # read data
            if line_columns[0] == '*':
                qmatrix_reads_count += 1

    return qmatrix_reads_count
def get_gvector_genome_list(gvector_path):
    """Collect the genome description lines of a g-vector file.

    Returns a list of the stripped lines whose first tab-separated column
    is '@' (layout per the original comment: @ Genome_Index Genome_Name
    Alignment_Rate).

    When the path check fails, an error is written to stderr and
    utils.die() is called (presumably terminating the program); if it
    returns, the result is None.
    """
    # check file exist
    if not utils.check_path_exist(gvector_path):
        # report the path that was actually checked (the previous message
        # formatted an undefined name and raised NameError instead)
        sys.stderr.write("\n** Cannot open %s.\n" % gvector_path)
        utils.die("** Program exit!")
        return None

    # initialize
    gvector_genome_list = []

    # open file; the with-block guarantees the handle is closed
    with open(gvector_path, 'r') as gvector_file:
        for gvector_line in gvector_file:
            gvector_line = gvector_line.strip()
            gvector_columns = gvector_line.split('\t')
            # consider first column is '@'
            if gvector_columns[0] == '@':
                gvector_genome_list.append(gvector_line)

    return gvector_genome_list
def check_bowtie_index_built(genome_index_base, genome_fasta_path_sublist):
    """Tell whether the bowtie index is present and not older than its FASTA.

    Only the first index file (genome_index_base + ".1.bt2") and the first
    entry of genome_fasta_path_sublist are examined.  The comparison uses
    os.path.getctime() of both files.

    Returns True when the index file exists and its ctime is >= the FASTA
    file's ctime; returns False otherwise.
    """
    first_index_file = genome_index_base + ".1.bt2"
    first_fasta_file = genome_fasta_path_sublist[0]

    # no index file at all: nothing has been built yet
    if not utils.check_path_exist(first_index_file):
        return False

    index_ctime = os.path.getctime(first_index_file)
    fasta_ctime = os.path.getctime(first_fasta_file)
    # the index counts as built only if it is at least as new as the fasta
    return index_ctime >= fasta_ctime
def save(self, path=None, name='dqn_net.pkl'):
    """Save the state dict of self.eval_net with torch.save().

    path -- destination prefix; self.save_path is used when path is falsy
    name -- file name appended directly to the prefix (plain string
            concatenation, so the prefix is expected to end with a path
            separator -- NOTE(review): confirm against callers)
    """
    target_dir = path if path else self.save_path
    # presumably ensures the directory exists; its return value is ignored
    utils.check_path_exist(target_dir)
    weights = self.eval_net.state_dict()
    torch.save(weights, target_dir + name)