def __merge_chains_output(self):
    """Call the chain output merger."""
    eprint("Merging chain output...")
    # the merged intermediate table is saved to self.chain_results_df
    merge_chains_output(self.ref_bed, self.isoforms,
                        self.chain_class_results, self.chain_results_df)
    self.temp_files.append(self.chain_results_df)
def __merge_cesar_output(self):
    """Merge CESAR output, save final fasta and bed."""
    eprint("Merging CESAR output to make final fasta and pre-final bed files.")
    merge_c_stage_skipped = os.path.join(self.rejected_dir, "CESAR_MERGE.txt")
    self.temp_files.append(self.intermediate_bed)
    all_ok = merge_cesar_output(self.cesar_results, self.intermediate_bed,
                                self.nucl_fasta, self.meta_data,
                                merge_c_stage_skipped, self.prot_fasta,
                                self.trash_exons)
    if all_ok:
        # there are no empty output files
        print("CESAR results merged")
        self.cesar_ok_merged = True
    else:
        # there are some empty output files
        # MAYBE everything is fine, but we need to notify the user anyway
        print("WARNING!\nSOME CESAR JOBS LIKELY CRASHED!")
        print("RESULTS ARE LIKELY INCOMPLETE")
        print("PLEASE SEE LOGS FOR DETAILS")
        self.cesar_ok_merged = False
def __gene_loss_summary(self):
    """Call gene loss summary."""
    eprint("Calling gene loss summary")
    gene_losses_summary(self.gene_loss_data, self.ref_bed,
                        self.intermediate_bed, self.query_annotation,
                        self.loss_summ, iforms=self.isoforms,
                        paral=self.paralogs_log)
def __make_indexed_chain(self):
    """Make chain index file."""
    # make the .bst index file (plus a text index)
    eprint("make_indexed in progress...")
    chain_bst_index(self.chain_file, self.chain_index_file,
                    txt_index=self.chain_index_txt_file)
    self.temp_files.append(self.chain_index_file)
    self.temp_files.append(self.chain_file)
    self.temp_files.append(self.chain_index_txt_file)
    eprint("Indexed")
def read_fasta(fasta_line, v=False):
    """Read fasta, return a sequence dict and the header order."""
    fasta_data = fasta_line.split(">")
    if v:
        eprint(f"fasta_data[0] is:\n{fasta_data[0]}")
        eprint(f"fasta_data[1] is:\n{fasta_data[1]}")
    if fasta_data[0] != "":
        # this is a bug
        eprint("ERROR! CESAR output is corrupted")
        eprint(f"Issue detected in the following string:\n{fasta_line}")
        die("Abort")
    del fasta_data[0]  # remove the leading "" element, we don't need it
    sequences = {}  # accumulate sequences here
    order = []  # keep an ordered list of headers;
    # there is no guarantee that the dict will keep elements in the
    # same order as they were added
    for elem in fasta_data:
        raw_lines = elem.split("\n")
        # the header comes first, e.g. ['capHir1', 'ATGCCGCGCCAATTCCCCAAGCTGA...']
        header = raw_lines[0]
        # the remaining lines contain the nucleotide sequence
        lines = [x for x in raw_lines[1:] if x != "" and not x.startswith("!")]
        if len(lines) == 0:
            # empty sequence -> get rid of it
            continue
        fasta_content = "".join(lines)
        sequences[header] = fasta_content
        order.append(header)
    return sequences, order
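# A minimal usage sketch for read_fasta; the headers and sequences below are
# made up for illustration (real input is a chunk of CESAR output):
#
#   >>> seqs, order = read_fasta(">exon1\nATGCCG\n>exon2\nATGCCA\n")
#   >>> seqs
#   {'exon1': 'ATGCCG', 'exon2': 'ATGCCA'}
#   >>> order
#   ['exon1', 'exon2']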
def load_results(results_dir):
    """Load and sort the chain feature extractor results."""
    verbose("Loading the results...")
    results_files = os.listdir(results_dir)
    verbose(f"There are {len(results_files)} result files to combine")
    # to hold data from the "genes" fields:
    chain_genes_data = defaultdict(list)
    # to hold data from the "chain" fields:
    chain_raw_data = {}
    # read file-by-file, otherwise it takes too much memory
    genes_counter, chain_counter = 0, 0  # count gene and chain lines
    for results_file in results_files:
        # there are N files: read them one-by-one
        path = os.path.join(results_dir, results_file)
        f = open(path, "r")
        for line in f:
            # read the file line-by-line; all fields are tab-separated
            line_data = line.rstrip().split("\t")
            # a line is either gene- or chain-related
            if line_data[0] == "genes":
                # process as a gene line
                chain, genes = process_gene_line(line_data)
                chain_genes_data[chain].extend(genes)
                genes_counter += 1
            elif line_data[0] == "chain":
                # chain-related data
                the_chain_related = process_chain_line(line_data)
                # add this chain-related dict to the global one
                chain_raw_data.update(the_chain_related)
                chain_counter += 1
        f.close()
    verbose(f"Got {len(chain_genes_data)} keys in chain_genes_data")
    verbose(f"Got {len(chain_raw_data)} keys in chain_raw_data")
    verbose(f"There were {genes_counter} gene lines and {chain_counter} chain lines")
    # these values must be equal; just a sanity check
    if genes_counter != chain_counter:
        eprint(f"WARNING! genes_counter and chain_counter hold different "
               f"values:\n{genes_counter} and {chain_counter} respectively")
        die("Some feature extraction jobs died!")
    return chain_genes_data, chain_raw_data
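# The unit result files read above are tab-separated, and the first field
# tags the line class; schematically (the fields after the class tag are
# hypothetical here -- their exact layout is defined by the jobs that wrote
# them and by process_gene_line/process_chain_line):
#
#   genes <TAB> <chain_id> <TAB> ...   -> goes to process_gene_line()
#   chain <TAB> <chain_id> <TAB> ...   -> goes to process_chain_line()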
def __check_dependencies(self):
    """Check all dependencies."""
    eprint("check if binaries are compiled and libs are installed...")
    c_not_compiled = any(
        not os.path.isfile(f)
        for f in [
            self.CHAIN_SCORE_FILTER,
            self.CHAIN_COORDS_CONVERT_LIB,
            self.CHAIN_FILTER_BY_ID,
            self.EXTRACT_SUBCHAIN_LIB,
            self.CHAIN_INDEX_SLIB,
        ]
    )
    if c_not_compiled:
        eprint("Warning! C code is not compiled, trying to compile...")
    imports_not_found = False
    try:
        import twobitreader
        import networkx
        import pandas
        import xgboost
        import joblib
        import h5py
    except ImportError:
        eprint("Warning! Some of the required packages are not installed.")
        imports_not_found = True
    not_all_found = any([c_not_compiled, imports_not_found])
    if not_all_found:
        self.__call_proc(self.CONFIGURE, "Could not call configure.sh!")
    else:
        eprint("All dependencies found")
def __orthology_type_map(self):
    """Call orthology_type_map.py."""
    # need to combine projections into genes
    query_isoforms_file = os.path.join(self.wd, "query_isoforms.tsv")
    query_gene_spans = os.path.join(self.wd, "query_gene_spans.bed")
    get_query_isoforms_data(self.query_annotation, query_isoforms_file,
                            save_genes_track=query_gene_spans)
    eprint("Calling orthology_type_map...")
    skipped_ref_trans = os.path.join(self.wd, "ref_orphan_transcripts.txt")
    orthology_type_map(self.ref_bed, self.query_annotation,
                       self.orthology_type, ref_iso=self.isoforms,
                       que_iso=query_isoforms_file,
                       paralogs_arg=self.paralogs_log,
                       loss_data=self.loss_summ,
                       save_skipped=skipped_ref_trans)
def __check_isoforms_file(self, t_in_bed):
    """Sanity checks for the isoforms file."""
    if not self.isoforms_arg:
        return  # not provided: nothing to check
    # isoforms file provided: need to check correctness and completeness
    f = open(self.isoforms_arg, "r")
    self.isoforms = os.path.join(self.wd, "isoforms.tsv")
    header = next(f)  # first line is the header
    filt_isoforms_lines = [header, ]
    # remove isoforms that don't appear in the bed file;
    # also catch transcripts that are in the bed but not in the isoforms file
    t_in_i = []
    for num, line in enumerate(f, 2):
        line_data = line.rstrip().split("\t")
        if len(line_data) != ISOFORMS_FILE_COLS:
            err_msg = f"Error! Isoforms file {self.isoforms} line {num}: " \
                      f"Expected {ISOFORMS_FILE_COLS} fields, got {len(line_data)}"
            self.die(err_msg)
        transcript = line_data[1]
        if transcript in t_in_bed:
            # this isoform appears in the bed file: keep it
            filt_isoforms_lines.append(line)
            t_in_i.append(transcript)
        else:
            # this isoform doesn't appear in the bed: we can skip it
            continue
    f.close()
    # this set contains transcripts found in the isoforms file
    t_in_i = set(t_in_i)
    # transcripts that appear in the bed but not in the isoforms file;
    # if this set is non-empty: raise an error
    u_in_b = t_in_bed.difference(t_in_i)
    if len(u_in_b) != 0:
        # isoforms file is incomplete
        extra_t_list = "\n".join(list(u_in_b)[:100])  # show the first 100
        err_msg = f"Error! There are {len(u_in_b)} transcripts in the bed file " \
                  f"absent in the isoforms file! " \
                  f"These are the transcripts (first 100):\n{extra_t_list}"
        self.die(err_msg)
    # write the filtered isoforms file
    with open(self.isoforms, "w") as f:
        f.write("".join(filt_isoforms_lines))
    eprint("Isoforms file is OK")
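# Schematically, the isoforms file validated above is a tab-separated table
# with a header line and ISOFORMS_FILE_COLS columns, where the transcript ID
# sits in the second column (line_data[1]); a hypothetical two-column example:
#
#   GeneID  TranscriptID
#   GENE_A  TRANS_A_1
#   GENE_A  TRANS_A_2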
def __check_u12_file(self, t_in_bed):
    """Sanity checks for the U12 file."""
    if not self.u12_arg:
        # just not provided: nothing to check
        return
    # U12 file provided
    self.u12 = os.path.join(self.wd, "u12_data.txt")
    filt_lines = []
    f = open(self.u12_arg, "r")
    for num, line in enumerate(f, 1):
        line_data = line.rstrip().split("\t")
        if len(line_data) != U12_FILE_COLS:
            err_msg = f"Error! U12 file {self.u12} line {num} is corrupted: " \
                      f"{U12_FILE_COLS} fields expected, got {len(line_data)}; " \
                      f"please note that a tab-separated file is expected"
            self.die(err_msg)
        trans_id = line_data[0]
        if trans_id not in t_in_bed:
            # transcript doesn't appear in the bed file: skip it
            continue
        exon_num = line_data[1]
        if not exon_num.isnumeric():
            err_msg = f"Error! U12 file {self.u12} line {num} is corrupted: " \
                      f"field 2 value is {exon_num}; " \
                      f"this field must contain a numeric value (exon number)."
            self.die(err_msg)
        acc_don = line_data[2]
        if acc_don not in U12_AD_FIELD:
            err_msg = f"Error! U12 file {self.u12} line {num} is corrupted: " \
                      f"field 3 value is {acc_don}; " \
                      f"this field may hold either an A or a D value."
            self.die(err_msg)
        filt_lines.append(line)  # save this line
    f.close()
    # another check: what if no lines are left after filtering?
    if len(filt_lines) == 0:
        err_msg = f"Error! No lines left in the {self.u12_arg} file after filtering. " \
                  f"Please check that transcript IDs in this file and in the bed " \
                  f"file {self.ref_bed} are consistent"
        self.die(err_msg)
    with open(self.u12, "w") as f:
        f.write("".join(filt_lines))
    eprint("U12 file is correct")
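# Based on the checks above, a valid U12 file is tab-separated with three
# fields per line: a transcript ID that also occurs in the reference bed,
# a numeric exon number, and an acceptor/donor mark (A or D); the IDs in
# this example are hypothetical:
#
#   TRANS_X    4    A
#   TRANS_X    5    D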
def __call_proc(self, cmd, extra_msg=None):
    """Call a subprocess and catch errors."""
    eprint(f"{cmd} in progress...")
    rc = subprocess.call(cmd, shell=True)
    if rc != 0:
        if extra_msg:
            eprint(extra_msg)
        self.die(f"Error! Process {cmd} died! Abort.")
    eprint(f"{cmd} done with code 0")
def merge_chains_output(bed_file, isoforms, results_dir, output,
                        exon_cov_chains=False):
    """Chain output merger core function."""
    # read the bed file, get gene features
    bed_data = read_bed_data(bed_file)
    # load isoforms data if provided
    isoforms = read_isoforms(isoforms) if isoforms else None
    # read the result files produced by the units
    chain_genes_data, chain_raw_data = load_results(results_dir)
    # revert the dict: not chain-to-genes but gene-to-chains
    genes_data = revert_dict(chain_genes_data)
    # combine all the data into one gene-oriented dictionary
    combined_data = combine(bed_data, chain_raw_data, genes_data,
                            exon_cov_chains, isoforms)
    # save the combined data and finish the program
    save(combined_data, output)
    eprint(f"Estimated time: {format(dt.now() - t0)}")
def __classify_chains(self):
    """Run the decision tree."""
    # define input and output
    eprint("Decision tree in progress...")
    self.orthologs = os.path.join(self.wd, "trans_to_chain_classes.tsv")
    self.pred_scores = os.path.join(self.wd, "orthology_scores.tsv")
    self.se_model = os.path.join(self.LOCATION, "models", "se_model.dat")
    self.me_model = os.path.join(self.LOCATION, "models", "me_model.dat")
    cl_rej_log = os.path.join(self.rejected_dir, "classify_chains_rejected.txt")
    if not os.path.isfile(self.se_model) or not os.path.isfile(self.me_model):
        self.__call_proc(self.MODEL_TRAINER, "Models not found, training...")
    classify_chains(self.chain_results_df, self.orthologs, self.se_model,
                    self.me_model, rejected=cl_rej_log,
                    raw_out=self.pred_scores)
    if self.stop_at_chain_class:
        self.die("User requested to halt after chain features extraction", rc=0)
def check_args(chain_id, genes, chain_file, chain_dict, bed_file,
               verbose_level, work_data, result):
    """Check if arguments are correct, extract initial data if so."""
    global VERBOSE  # set verbosity level
    VERBOSE = True if verbose_level else False
    verbose(f"# unit.py called for chain {chain_id} and genes {genes}")
    verbose(f"Using {bed_file} and {chain_file}")
    work_data["chain_id"] = chain_id
    # check genes
    raw_genes = [x for x in genes.split(",") if x != ""]
    bed_lines = bed_extract_id(bed_file, raw_genes)
    work_data["bed"] = bed_lines  # save it
    work_data["genes"] = [x.split("\t")[3] for x in bed_lines.split("\n")[:-1]]
    # check if the numbers of genes are equal
    if len(raw_genes) != len(bed_lines.split("\n")[:-1]):
        eprint("Warning. Not all the genes you set were found!\n")
        need_ = len(raw_genes)
        extracted_ = len(bed_lines.split("\n")[:-1])
        eprint(f"You set {need_} genes, extracted {extracted_}")
        missing_genes = ",".join([x for x in raw_genes
                                  if x not in work_data["genes"]])
        eprint(f"Missing genes:\n{missing_genes}")
    # extract the chain body from the file
    work_data["chain"] = extract_chain(chain_file, chain_dict, chain_id)
    # parse the chain header
    chain_header = work_data["chain"].split("\n")[0].split()
    verbose(f"Chain header is:\n{chain_header}")
    q_start = int(chain_header[10])
    q_end = int(chain_header[11])
    q_len = abs(q_end - q_start)
    work_data["chain_QLen"] = q_len
    work_data["chain_Tstarts"] = int(chain_header[5])
    work_data["chain_Tends"] = int(chain_header[6])
    result["chain_global_score"] = int(chain_header[1])
    result["chain_len"] = work_data["chain_Tends"] - work_data["chain_Tstarts"]
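# For reference, a UCSC chain header has the following space-separated layout:
#
#   chain score tName tSize tStrand tStart tEnd qName qSize qStrand qStart qEnd id
#
# which is why the code above reads chain_header[1] as the global score,
# [5]/[6] as tStart/tEnd, and [10]/[11] as qStart/qEnd.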
def merge_cesar_output(input_dir, output_bed, output_fasta,
                       meta_data_arg, skipped_arg, prot_arg, output_trash):
    """Merge multiple CESAR output files."""
    # check that the input dir is correct
    if not os.path.isdir(input_dir):
        die(f"Error! {input_dir} is not a dir!")
    # get the list of bdb files (output of the CESAR part)
    bdbs = [x for x in os.listdir(input_dir) if x.endswith(".bdb")]
    # initiate lists for the different types of output:
    bed_summary = []
    fasta_summary = []
    trash_summary = []
    meta_summary = []
    prot_summary = []
    skipped = []
    all_ok = True
    task_size = len(bdbs)
    # extract data from all the files
    for num, bdb_file in enumerate(bdbs):
        # parse bdb files one by one
        bdb_path = os.path.join(input_dir, bdb_file)
        try:  # try to parse the data
            parsed_data = parse_cesar_bdb(bdb_path)
        except AssertionError:
            # if this happened: some assertion was violated,
            # probably the CESAR output data is corrupted
            sys.exit(f"Error! Failed reading file {bdb_file}")
        # unpack the parsed data tuple:
        bed_lines = parsed_data[0]
        trash_exons = parsed_data[1]
        fasta_lines = parsed_data[2]
        meta_data = parsed_data[3]
        prot_fasta = parsed_data[4]
        skip = parsed_data[5]
        if len(bed_lines) == 0:
            # should not actually happen, but can
            eprint(f"Warning! {bdb_file} is empty")
            all_ok = False
            continue  # it is empty
        # append the data to the lists
        bed_summary.append("\n".join(bed_lines) + "\n")
        fasta_summary.append(fasta_lines)
        trash_summary.append("".join(trash_exons))
        meta_summary.append(meta_data)
        skipped.append(skip)
        prot_summary.append(prot_fasta)
        eprint(f"Reading file {num + 1}/{task_size}", end="\r")
    # save the output
    eprint("Saving the output")
    if len(bed_summary) == 0:
        # if so, no need to continue
        eprint("! merge_cesar_output.py:")
        die("No projections found! Abort.")
    # save bed, fasta, and the rest
    with open(output_bed, "w") as f:
        f.write("".join(bed_summary))
    with open(output_fasta, "w") as f:
        f.write("".join(fasta_summary))
    with open(meta_data_arg, "w") as f:
        f.write("\n".join(meta_summary))
    with open(skipped_arg, "w") as f:
        f.write("\n".join(skipped))
    with open(prot_arg, "w") as f:
        f.write("\n".join(prot_summary))
    if output_trash:
        # if requested: also provide the trash annotation
        with open(output_trash, "w") as f:
            f.write("".join(trash_summary))
    return all_ok
def parse_cesar_bdb(arg_input, v=False):
    """Parse CESAR bdb file core function."""
    in_ = open(arg_input, "r")  # read the CESAR bdb file
    # a double newline divides the units of information
    content = [x for x in in_.read().split("\n\n") if x]
    in_.close()
    # GLP-related data is already filtered out by cesar_runner
    # initiate collectors
    bed_lines = []  # save bed lines here
    skipped = []  # save skipped projections here
    pred_seq_chain = {}  # nucleotide sequences for the fasta output
    t_exon_seqs = defaultdict(dict)  # reference exon sequences
    wrong_exons = []  # exons that are predicted but actually deleted/missing
    all_meta_data = [META_HEADER]  # collect exon meta data here
    prot_data = []  # protein sequences

    for elem in content:
        # one elem: one CESAR call (one ref transcript and >= 1 chains)
        # now loop gene-by-gene
        gene = elem.split("\n")[0][1:]
        if v:
            eprint(f"Reading gene {gene}")
        cesar_out = "\n".join(elem.split("\n")[1:])
        # basically this is a fasta file with headers
        # saturated with different information
        sequences, order = read_fasta(cesar_out, v=v)
        # initiate dicts to fill later
        ranges_chain, chain_dir = defaultdict(dict), {}
        pred_seq_chain[gene] = defaultdict(dict)
        # split fasta headers into different classes:
        # query, ref, and prot sequence headers are explicitly marked
        query_headers = [h for h in order if h.endswith("query_exon")]
        ref_headers = [h for h in order if h.endswith("reference_exon")]
        prot_ids = [h for h in order if "PROT" in h]

        # parse reference exons, quite simple
        for header in ref_headers:
            # one header per exon; the fields look like this:
            # FIELD_1 | FIELD_2 | FIELD_3\n
            header_fields = [s.replace(" ", "") for s in header.split("|")]
            exon_num = int(header_fields[1])  # 0-based!
            # the header is also a key for the sequences dict
            exon_seq = sequences[header].replace("-", "")
            t_exon_seqs[gene][exon_num] = exon_seq

        # save protein data
        for prot_id in prot_ids:
            prot_seq = sequences[prot_id]
            prot_line = f">{prot_id}\n{prot_seq}\n"
            prot_data.append(prot_line)

        # gene: exons dict to trace deleted exons
        gene_chain_exon_status = defaultdict(dict)

        # parse query headers
        for header in query_headers:
            header_fields = [s.replace(" ", "") for s in header.split("|")]
            if len(header_fields) != Q_HEADER_FIELDS_NUM:
                continue  # ref exon?
            # extract metadata, parse the query header
            trans = header_fields[0]
            exon_num = int(header_fields[1])
            chain_id = int(header_fields[2])
            exon_region = read_region(header_fields[3])
            pid = float(header_fields[4])  # nucleotide %ID
            blosum = float(header_fields[5])
            is_gap = header_fields[6]  # asm gap in the expected region
            exon_class = header_fields[7]  # how it aligns to the chain
            exp_region_str = header_fields[8]  # expected region
            in_exp = header_fields[9]  # detected in the expected region or not
            in_exp_b = True if in_exp == "INC" else False
            # mark whether it is a paralogous projection:
            para_annot = True if header_fields[10] == "True" else False
            stat_key = (trans, chain_id)  # projection ID
            # classify the exon, check whether it is deleted/missing
            exon_decision, q_mark = classify_exon(exon_class, in_exp_b,
                                                  pid, blosum)
            if exon_decision is False:
                # exon is deleted/missing
                wrong_exons.append(header)  # save this data
                gene_chain_exon_status[stat_key][exon_num] = False
            else:
                # exon is not deleted: get/write the necessary info
                gene_chain_exon_status[stat_key][exon_num] = True
                chain_dir[chain_id] = exon_region["end"] > exon_region["start"]
                ranges_chain[chain_id][exon_num] = exon_region
                pred_seq_chain[gene][chain_id][exon_num] = sequences[header]
            # collect exon meta data -> write to file later
            meta_data = "\t".join([gene, header_fields[1], header_fields[2],
                                   header_fields[3], exp_region_str, in_exp,
                                   header_fields[4], header_fields[5], is_gap,
                                   exon_class, str(para_annot), q_mark])
            all_meta_data.append(meta_data)

        # check whether any exons are left per projection
        for name, stat in gene_chain_exon_status.items():
            any_exons_left = any(stat.values())
            if any_exons_left:
                continue
            # the projection has no exons: log it
            name_ = f"{name[0]}.{name[1]}"
            skipped.append(f"{name_}\tall exons are deleted.")

        # make bed tracks
        for chain_id in chain_dir.keys():
            # go projection-by-projection: fixed gene, loop over chains
            block_starts = []
            block_sizes = []
            ranges = ranges_chain[chain_id]
            name = f"{gene}.{chain_id}"  # projection name for the bed file
            if len(ranges) == 0:
                # this projection is completely missing
                skipped.append(f"{name}\tall exons are deleted.")
                continue
            direct = chain_dir[chain_id]
            exon_nums = sorted(ranges.keys()) if direct \
                else sorted(ranges.keys(), reverse=True)
            # get basic coordinates
            chrom = ranges[exon_nums[0]]["chrom"]
            chrom_start = ranges[exon_nums[0]]["start"] if direct \
                else ranges[exon_nums[0]]["end"]
            chrom_end = ranges[exon_nums[-1]]["end"] if direct \
                else ranges[exon_nums[-1]]["start"]
            # we do not predict UTRs: thickStart/thickEnd = chromStart/chromEnd
            thick_start = chrom_start
            thick_end = chrom_end
            strand = "+" if direct else "-"
            block_count = len(exon_nums)
            # need to convert to the "block starts" / "block sizes" format
            for exon_num in exon_nums:
                ex_range = ranges[exon_num]
                block_sizes.append(abs(ex_range["end"] - ex_range["start"]))
                block_start = ex_range["start"] - chrom_start if direct \
                    else ex_range["end"] - chrom_start
                block_starts.append(block_start)
            # need these as strings to save them in a text file
            block_starts_str = ",".join(map(str, block_starts)) + ","
            block_sizes_str = ",".join(map(str, block_sizes)) + ","
            # join into a bed line
            bed_list = map(str, [chrom, chrom_start, chrom_end, name,
                                 DEFAULT_SCORE, strand, thick_start, thick_end,
                                 BLACK, block_count, block_sizes_str,
                                 block_starts_str])
            bed_line = "\t".join(bed_list)
            bed_lines.append(bed_line)

    # arrange the fasta content
    fasta_lines_lst = []
    for gene, chain_exon_seq in pred_seq_chain.items():
        # write the reference gene info
        t_gene_seq_dct = t_exon_seqs.get(gene)
        if t_gene_seq_dct is None:
            # no sequence data for this transcript?
            eprint(f"Warning! Missing data for {gene}")
            skipped.append(f"{gene}\tmissing data after cesar stage")
            continue
        # the sequence fragments are split between different exons
        t_exon_nums = sorted(t_gene_seq_dct.keys())
        t_header = f">ref_{gene}\n"
        t_seq = "".join([t_gene_seq_dct[num] for num in t_exon_nums]) + "\n"
        # append the data to the fasta strings
        fasta_lines_lst.append(t_header)
        fasta_lines_lst.append(t_seq)
        # and the query info
        for chain_id, exon_seq in chain_exon_seq.items():
            track_header = f">{gene}.{chain_id}\n"
            exon_nums = sorted(exon_seq.keys())
            # also need to assemble the different exon sequences
            seq = "".join([exon_seq[num] for num in exon_nums]) + "\n"
            fasta_lines_lst.append(track_header)
            fasta_lines_lst.append(seq)

    # save corrupted exons as a bed-6 track,
    # to make it possible to save and visualize them in the browser
    trash_exons = []
    for elem in wrong_exons:
        elem_fields = [s.replace(" ", "") for s in elem.split("|")]
        # need to fill the following:
        # chrom, start, end, name, score, strand
        gene_name = elem_fields[0]
        exon_num = elem_fields[1]
        chain_id = elem_fields[2]
        label = ".".join([gene_name, exon_num, chain_id])
        grange = elem_fields[3].split(":")
        chrom, (start, end) = grange[0], grange[1].split("-")
        strand = "+"
        score = str(int(float(elem_fields[4]) * 10))
        bed_6 = "\t".join([chrom, start, end, label, score, strand]) + "\n"
        trash_exons.append(bed_6)

    # join the output strings
    meta_str = "\n".join(all_meta_data) + "\n"
    skipped_str = "\n".join(skipped) + "\n"
    prot_fasta = "".join(prot_data)
    fasta_lines = "".join(fasta_lines_lst)
    return bed_lines, trash_exons, fasta_lines, meta_str, prot_fasta, skipped_str
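# A small worked example of the bed12 block conversion above (coordinates are
# made up): two exons at 100-150 and 200-260 on the "+" strand give
# chrom_start = 100, chrom_end = 260, block_sizes_str = "50,60," and
# block_starts_str = "0,100," (exon starts relative to chrom_start).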
def prepare_bed_file(bed_file, output, ouf=False, save_rejected=None,
                     only_chrom=None):
    """Filter the given bed file and save the updated version."""
    new_lines = []  # keep the updated lines
    rejected = []  # keep IDs of skipped transcripts + the reason why
    names = Counter()  # we need to make sure that all names are unique

    f = open(bed_file, "r")
    for num, line in enumerate(f, 1):
        # parse the bed file according to the specification
        line_data = line.rstrip().split("\t")
        if len(line_data) != 12:
            f.close()
            # this is for sure an error: it is possible
            # only if something except a bed12 file was provided
            die(f"Error! A bed12 file is required! Got a file with "
                f"{len(line_data)} fields instead")
        chrom = line_data[0]
        if only_chrom and chrom != only_chrom:
            # TOGA allows to perform the analysis on a specific chromosome only;
            # if so, we can skip all transcripts located on other chromosomes
            continue
        chromStart = int(line_data[1])
        chromEnd = int(line_data[2])
        name = line_data[3]  # gene name usually
        # line_data[4] (score) and line_data[5] (strand) are never used here
        thickStart = int(line_data[6])
        thickEnd = int(line_data[7])
        # line_data[8] (itemRgb) is never used
        blockCount = int(line_data[9])
        blockSizes = [int(x) for x in line_data[10].split(",") if x != ""]
        blockStarts = [int(x) for x in line_data[11].split(",") if x != ""]
        blockEnds = [blockStarts[i] + blockSizes[i] for i in range(blockCount)]
        blockAbsStarts = [blockStarts[i] + chromStart for i in range(blockCount)]
        blockAbsEnds = [blockEnds[i] + chromStart for i in range(blockCount)]
        blockNewStarts, blockNewEnds = [], []
        names[name] += 1

        if thickStart > thickEnd:
            # according to the bed12 specification this should never happen
            f.close()
            sys.stderr.write(f"Problem occurred at line {num}, gene {name}\n")
            die("Error! Bed file is corrupted, thickEnd MUST be >= thickStart")
        elif thickStart == thickEnd:
            # this means that this is a non-coding transcript;
            # TOGA cannot process them: we can skip it
            rejected.append((name, "No CDS"))
            continue
        if thickStart < chromStart or thickEnd > chromEnd:
            # a very strange (but still possible) case:
            # for sure an error with the input data
            f.close()
            sys.stderr.write(f"Problem occurred at line {num}, gene {name}\n")
            die("Error! Bed file is corrupted, thickRange is outside chromRange!")

        # now select the CDS only;
        # we keep UTRs in the filtered file,
        # however, we need the CDS to check whether it is correct (% 3 == 0)
        for block_num in range(blockCount):
            blockStart = blockAbsStarts[block_num]
            blockEnd = blockAbsEnds[block_num]
            # skip the block if it is entirely UTR
            if blockEnd <= thickStart:
                continue
            elif blockStart >= thickEnd:
                continue
            # if we are here: this is not an entirely-UTR exon;
            # it might intersect a CDS border or lie in the CDS entirely
            # remove UTRs: the block start must be >= CDS start (thickStart)
            # and the block end must be <= CDS end (thickEnd)
            blockNewStart = blockStart if blockStart >= thickStart else thickStart
            blockNewEnd = blockEnd if blockEnd <= thickEnd else thickEnd
            blockNewStarts.append(blockNewStart - thickStart)
            blockNewEnds.append(blockNewEnd - thickStart)

        if len(blockNewStarts) == 0:
            # even if thickStart != thickEnd, this transcript can still be
            # non-coding; if there are no blocks in the CDS -> we catch it here
            rejected.append((name, "No CDS"))
            continue
        block_new_count = len(blockNewStarts)
        blockNewSizes = [blockNewEnds[i] - blockNewStarts[i]
                         for i in range(block_new_count)]
        if sum(blockNewSizes) % 3 != 0 and not ouf:
            # this is an out-of-frame (or incomplete) transcript;
            # ideally, the CDS length should be divisible by 3;
            # "ouf" means that we want to keep such transcripts anyway
            rejected.append((name, "Out-of-frame gene"))
            continue
        # if there are non-unique transcript IDs: die;
        # the check is made here, not earlier, to show them together
        if any(v > 1 for v in names.values()):
            eprint("Error! There are non-unique transcript IDs:")
            for k, v in names.items():
                if v > 1:
                    eprint(k)
            die("Abort")
        # we keep this transcript: add it to the list
        new_line = "\t".join([str(x) for x in line_data])
        new_lines.append(new_line)
    f.close()

    if len(new_lines) == 0:
        # no transcripts passed the filter: probably an input data mistake
        sys.exit("Error! No reference annotation tracks left "
                 "after the filtering procedure! Abort")
    # write the transcripts that passed the filter to the output file
    f = open(output, "w") if output != "stdout" else sys.stdout
    f.write("\n".join(new_lines) + "\n")
    if output != "stdout":
        f.close()
    if save_rejected:
        # save transcripts that didn't pass the filter + the reason why
        with open(save_rejected, "w") as f:
            for elem in rejected:
                f.write(f"{elem[0]}\t{elem[1]}\n")
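# A small worked example of the CDS trimming above (made-up coordinates):
# with chromStart = 0, thickStart = 100, thickEnd = 190, an exon spanning
# 40-160 intersects the CDS start, so blockNewStart = max(40, 100) = 100 and
# blockNewEnd = min(160, 190) = 160; relative to thickStart this stores the
# pair (0, 60), i.e. 60 coding bases, which also passes the % 3 == 0 check.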
def verbose(msg):
    """Eprint for verbose messages."""
    if VERBOSE:
        eprint(msg + "\n")
def main():
    """Entry point."""
    t0 = dt.now()
    args = parse_args()
    os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"  # otherwise it could crash

    # by default we create CESAR jobs for chains with the "orth" or "trans"
    # class, but the user could select another set of chain classes
    fields = "ORTH,TRANS" if args.fields is None else args.fields

    # read U12 introns to create a list of U12-containing genes;
    # we need it to make the subsequent commands
    u12_data = read_u12_data(args.u12)

    # get lists of orthologous chains per gene;
    # skipped_1: no chains found -> log them
    batch, chain_gene_field, skipped_1 = read_orthologs(args.orthologs_file,
                                                        fields,
                                                        only_o2o=args.o2o_only)
    # split cesar jobs into different buckets (if the user requested so),
    # e.g. put all jobs that require < 5Gb into bucket 1,
    # jobs requiring 5 to 15Gb into bucket 2, and so on;
    # CESAR might be very memory-consuming -> so we care about this
    mem_limit, buckets = define_buckets(args.mem_limit, args.buckets)

    # load reference bed file data: coordinates and exon sizes
    bed_data = read_bed(args.bed_file)
    # check that the cesar binary exists
    if not os.path.isfile(args.cesar_binary):
        die(f"Error! Cannot find cesar executable at {args.cesar_binary}!")

    # pre-compute chain : gene : region data;
    # collect the second list of skipped genes:
    # skipped_2 -> too long corresponding regions in the query
    regions, skipped_2 = precompute_regions(batch, bed_data,
                                            args.bdb_chain_file,
                                            chain_gene_field,
                                            args.chains_limit)
    # start making the jobs
    all_jobs = {}
    skipped_3 = []

    for gene in batch.keys():
        u12_this_gene = u12_data.get(gene)
        block_sizes = bed_data[gene][3]

        # proceed to memory estimation;
        # the same procedure as inside the CESAR2.0 code
        num_states, r_length = 0, 0
        # the required memory depends on numerous params;
        # first we need the reference transcript-related parameters,
        # the query-related parameters come later
        for block_size in block_sizes:
            # num_states += 6 + 6 * reference->num_codons + 1 + 2 + 2 + 22 + 6;
            # /* 22 and 6 for acc and donor states */
            num_codons = block_size // 3
            num_states += 6 + 6 * num_codons + 1 + 2 + 2 + 22 + 6
            # r_length += 11 + 6 * fasta.references[i]->length
            # + donors[i]->length + acceptors[i]->length;
            r_length += block_size

        gene_chains_data = regions.get(gene)
        # check that there is something for this gene
        if not gene_chains_data:
            continue
        elif len(gene_chains_data) == 0:
            continue
        chains = gene_chains_data.keys()
        chains_arg = ",".join(chains)  # chain ids -> one of the cmd args

        # now compute the query sequence-related parameters
        query_lens = [v for v in gene_chains_data.values()]
        q_length_max = max(query_lens)
        # and now compute the amount of required memory
        memory = (num_states * 4 * 8) + \
                 (num_states * q_length_max * 4) + \
                 (num_states * 304) + \
                 (2 * q_length_max + r_length) * 8 + \
                 (q_length_max + r_length) * 2 * 1 + EXTRA_MEM
        # convert to gigs + 0.25 extra gig
        gig = math.ceil(memory / 1000000000) + 0.25
        if gig > mem_limit:
            # it is going to consume TOO much memory:
            # skip this gene -> save to the log
            skipped_3.append((gene, ",".join(chains),
                              f"memory limit ({mem_limit} gig) exceeded (needs {gig})"))
            continue

        # template positions: 0 gene; 1 chains; 2 bed file; 3 chain file;
        # 4 tDB; 5 qDB; 6 memory (gig); 7 cesar binary; 8 uhq flank
        job = WRAPPER_TEMPLATE.format(gene, chains_arg,
                                      os.path.abspath(args.bdb_bed_file),
                                      os.path.abspath(args.bdb_chain_file),
                                      os.path.abspath(args.tDB),
                                      os.path.abspath(args.qDB),
                                      gig,
                                      os.path.abspath(args.cesar_binary),
                                      args.uhq_flank)
        # add some flags if required
        job = job + " --mask_stops" if args.mask_stops else job
        job = job + " --check_loss" if args.check_loss else job
        job = job + " --no_fpi" if args.no_fpi else job
        # add U12 introns data if this gene has them:
        job = job + f" --u12 {os.path.abspath(args.u12)}" if u12_this_gene else job
        all_jobs[job] = gig

    eprint(f"\nThere are {len(all_jobs.keys())} jobs in total.")
    eprint("Splitting the jobs.")
    # split jobs into buckets | compute proportions
    filled_buckets = fill_buckets(buckets, all_jobs)
    prop_sum = sum([k * len(v) for k, v in filled_buckets.items()])
    # estimate the proportion of each bucket in the overall runtime
    buckets_prop = {k: (k * len(v)) / prop_sum for k, v in filled_buckets.items()} \
        if 0 not in filled_buckets.keys() else {0: 1.0}
    eprint("Bucket proportions are:")
    eprint("\n".join([f"{k} -> {v}" for k, v in buckets_prop.items()]))
    # get the number of jobs for each bucket
    bucket_jobs_num = {k: math.ceil(args.jobs_num * v)
                       for k, v in buckets_prop.items()}
    # save jobs, get the lines to combine
    to_combine = save_jobs(filled_buckets, bucket_jobs_num, args.jobs_dir)
    # save combined jobs; "combined" is a file containing paths to separate jobs
    os.mkdir(args.results) if not os.path.isdir(args.results) else None
    os.mkdir(args.check_loss) if args.check_loss \
        and not os.path.isdir(args.check_loss) else None

    f = open(args.combined, "w")
    for num, comb in enumerate(to_combine, 1):
        basename = os.path.basename(comb).split(".")[0]
        results_path = os.path.abspath(os.path.join(args.results,
                                                    basename + ".bdb"))
        combined_command = f"{CESAR_RUNNER} {comb} {results_path}"
        if args.check_loss:
            loss_data_path = os.path.join(args.check_loss,
                                          f"{basename}.inact_mut.txt")
            combined_command += f" --check_loss {loss_data_path}"
        if args.rejected_log:
            log_path = os.path.join(args.rejected_log, f"{num}.txt")
            combined_command += f" --rejected_log {log_path}"
        f.write(combined_command + "\n")
    f.close()

    # save skipped genes if required
    if args.skipped_genes:
        skipped = skipped_1 + skipped_2 + skipped_3
        f = open(args.skipped_genes, "w")
        # usually we have the gene + the reason why it was skipped;
        # we separate them with a tab
        f.write("\n".join(["\t".join(x) for x in skipped]) + "\n")
        f.close()

    f = open(args.paralogs_log, "w")
    # save IDs of paralogous projections
    for k, v in chain_gene_field.items():
        if v != "PARALOG":
            continue
        gene_ = f"{k[1]}.{k[0]}\n"
        f.write(gene_)
    f.close()
    eprint(f"Estimated time: {dt.now() - t0}")
    sys.exit(0)
def verbose(msg):
    """Eprint for verbose messages."""
    eprint(msg + "\n")
def __get_proc_pseudogenes_track(self):
    """Create an annotation of processed pseudogenes in the query."""
    eprint("Creating processed pseudogenes track.")
    proc_pgenes_track = os.path.join(self.wd, "proc_pseudogenes.bed")
    create_ppgene_track(self.orthologs, self.chain_file, self.index_bed_file,
                        proc_pgenes_track)
def precompute_regions(batch, bed_data, bdb_chain_file, chain_gene_field,
                       limit):
    """Precompute the query region for each chain: gene pair."""
    eprint("Precompute regions for each gene:chain pair...")
    chain_to_genes, skipped = defaultdict(list), []
    # revert the dict: from gene-to-chains to chain-to-genes
    for gene, chains in batch.items():
        if len(chains) == 0:
            skipped.append((gene, ",".join(chains), "no orthologous chains"))
            continue
        chains_ = sorted(chains, key=lambda x: int(x))
        if len(chains_) > limit:
            # limit the number of chains per gene; log the chains we drop
            skipped.append((gene, ",".join(chains_[limit:]),
                            f"number of chains ({limit} chains) limit exceeded"))
            chains_ = chains_[:limit]
        for chain in chains_:
            chain_to_genes[chain].append(gene)

    # read the regions themselves
    gene_chain_grange = defaultdict(dict)
    chains_num, iter_num = len(chain_to_genes.keys()), 0
    for chain_id, genes in chain_to_genes.items():
        # extract the chain itself
        chain_body = chain_extract_id(bdb_chain_file, chain_id).encode()
        all_gene_ranges = []
        for gene in genes:
            # get genomic coordinates for each gene
            gene_data = bed_data.get(gene)
            grange = f"{gene_data[0]}:{gene_data[1]}-{gene_data[2]}"
            all_gene_ranges.append(grange)

        # we need to get the corresponding regions in the query;
        # for now we have chain block coordinates and gene regions
        # in the reference genome;
        # use the chain_coords_converter shared library to convert
        # target -> query coordinates via the chain
        # first, convert the arguments to C types
        c_chain = ctypes.c_char_p(chain_body)
        c_shift = ctypes.c_int(2)
        granges_bytes = [s.encode("utf-8") for s in all_gene_ranges]
        granges_num = len(all_gene_ranges)
        c_granges_num = ctypes.c_int(granges_num)
        granges_arr = (ctypes.c_char_p * (granges_num + 1))()
        granges_arr[:-1] = granges_bytes
        granges_arr[granges_num] = None
        # then call the function
        raw_ch_conv_out = ch_lib.chain_coords_converter(c_chain, c_shift,
                                                        c_granges_num,
                                                        granges_arr)
        chain_coords_conv_out = []  # keep the lines here
        # convert the C output to a Python-readable type
        for i in range(granges_num + 1):
            chain_coords_conv_out.append(raw_ch_conv_out[i].decode("utf-8"))

        for line in chain_coords_conv_out[1:]:
            # parse the output;
            # line info is: region num, region in reference, region in query,
            # one line per gene, in the same order
            line_info = line.rstrip().split()
            num = int(line_info[0])
            # the region format is chrom:start-end
            q_grange = line_info[1].split(":")[1].split("-")
            q_start, q_end = int(q_grange[0]), int(q_grange[1])
            que_len = q_end - q_start
            t_grange = line_info[2].split(":")[1].split("-")
            t_start, t_end = int(t_grange[0]), int(t_grange[1])
            tar_len = t_end - t_start
            len_delta = abs(tar_len - que_len)
            delta_gene_times = len_delta / tar_len
            gene = genes[num]
            field = chain_gene_field.get((chain_id, gene))
            # check that the corresponding region in the query
            # is not too long; if it is: skip it
            high_rel_len = delta_gene_times > REL_LENGTH_THR
            high_abs_len = len_delta > ABS_LENGTH_TRH
            long_loci_field = field in LONG_LOCI_FIELDS
            if (high_rel_len or high_abs_len) and long_loci_field:
                skipped.append((gene, chain_id, "too long query locus"))
                continue
            # for each chain-gene pair, save the query region length;
            # we need this for the required memory estimation
            gene_chain_grange[gene][chain_id] = que_len

        del raw_ch_conv_out  # not sure if necessary but...
        iter_num += 1  # verbosity
        eprint(f"Chain {iter_num} / {chains_num}", end="\r")
    return gene_chain_grange, skipped
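# For illustration (made-up numbers): a reference locus of 10 kb (tar_len)
# whose projection spans 45 kb in the query (que_len) gives len_delta = 35000
# and delta_gene_times = 3.5; whether that chain: gene pair is skipped then
# depends on the REL_LENGTH_THR and ABS_LENGTH_TRH constants defined
# elsewhere in this module, and on the chain class (LONG_LOCI_FIELDS).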
def verbose(msg, end="\n"):
    """Eprint for verbose messages."""
    eprint(msg + end)
def __init__(self, args):
    """Initiate toga class."""
    self.t0 = dt.now()
    # check if all files TOGA needs are here
    self.temp_files = []  # remove at the end, list of temp files
    self.__modules_addr()
    self.__check_dependencies()
    self.__check_completeness()
    self.nextflow_dir = self.__get_nf_dir(args.nextflow_dir)
    self.nextflow_config_dir = args.nextflow_config_dir
    self.__check_nf_config()
    # to avoid a crash on a filesystem without locks:
    os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"

    eprint("mkdir_and_move_chain in progress...")
    chain_basename = os.path.basename(args.chain_input)
    # create the project dir
    self.project_name = chain_basename.split(".")[1] if not args.project_name \
        else args.project_name
    self.wd = args.project_folder if args.project_folder \
        else os.path.join(os.getcwd(), self.project_name)
    # for safety; need this to make paths later
    self.project_name = self.project_name.replace("/", "")
    os.mkdir(self.wd) if not os.path.isdir(self.wd) else None

    # dir to collect log files with rejected reference genes:
    self.rejected_dir = os.path.join(self.wd, "rejected")
    os.mkdir(self.rejected_dir) if not os.path.isdir(self.rejected_dir) else None

    # filter the chain into this folder
    g_ali_basename = "genome_alignment"
    self.chain_file = os.path.join(self.wd, f"{g_ali_basename}.chain")
    # there is an assumption that the chain file has a .chain extension;
    # chain indexing was a bit problematic: (i) bsddb3 fits perfectly but is very
    # painful to install, (ii) sqlite is also fine but might be dysfunctional on
    # some cluster file systems, so we create a chain_ID: (start_byte, offset)
    # dictionary for instant extraction of a particular chain from the chain file;
    # we save these dictionaries into two files: a text file (tsv) and a binary
    # file with BST; depending on the case we will use both (for maximal
    # performance)
    self.chain_index_file = os.path.join(self.wd, f"{g_ali_basename}.bst")
    self.chain_index_txt_file = os.path.join(
        self.wd, f"{g_ali_basename}.chain_ID_position")

    # make the command, prepare the chain file
    if not os.path.isfile(args.chain_input):
        chain_filter_cmd = None
        self.die(f"Error! File {args.chain_input} doesn't exist!")
    elif chain_basename.endswith(".gz"):
        # version for gz
        chain_filter_cmd = f"gzip -dc {args.chain_input} | " \
                           f"{self.CHAIN_SCORE_FILTER} stdin " \
                           f"{args.min_score} > {self.chain_file}"
    elif args.no_chain_filter:
        # it is .chain and the score filter is not required
        chain_filter_cmd = f"rsync -a {args.chain_input} {self.chain_file}"
    else:
        # it is .chain | score filter required
        chain_filter_cmd = f"{self.CHAIN_SCORE_FILTER} {args.chain_input} " \
                           f"{args.min_score} > {self.chain_file}"
    # filter chains with score < threshold
    self.__call_proc(chain_filter_cmd,
                     "Please check if you use a proper chain file.")

    # define the bed file paths
    self.ref_bed = os.path.join(self.wd, "toga_filt_ref_annot.bed")
    self.index_bed_file = os.path.join(self.wd, "toga_filt_ref_annot.hdf5")
    # filter the bed file; keeping UTRs!
    bed_filt_rejected_file = "BED_FILTER_REJECTED.txt"
    bed_filt_rejected = os.path.join(self.rejected_dir,
                                     bed_filt_rejected_file)
    prepare_bed_file(args.bed_input, self.ref_bed,
                     save_rejected=bed_filt_rejected,
                     only_chrom=args.limit_to_ref_chrom)

    # misc things
    self.isoforms_arg = args.isoforms if args.isoforms else None
    self.isoforms = None  # will be assigned after the completeness check
    self.chain_jobs = args.chain_jobs_num
    self.cesar_binary = self.DEFAULT_CESAR if not args.cesar_binary \
        else args.cesar_binary
    self.time_log = args.time_marks
    self.stop_at_chain_class = args.stop_at_chain_class
    self.keep_temp = True if args.keep_temp else False
    # define whether to call CESAR or not
    self.t_2bit = self.__find_two_bit(args.tDB)
    self.q_2bit = self.__find_two_bit(args.qDB)
    self.hq_orth_threshold = 0.95
    self.cesar_jobs_num = args.cesar_jobs_num
    self.cesar_buckets = args.cesar_buckets
    self.cesar_mem_limit = args.cesar_mem_limit
    self.cesar_chain_limit = args.cesar_chain_limit
    self.uhq_flank = args.uhq_flank
    self.cesar_fields = args.homology_types
    self.mask_stops = args.mask_stops
    self.no_fpi = args.no_fpi
    self.o2o_only = args.o2o_only
    self.keep_nf_logs = args.do_not_del_nf_logs
    self.cesar_ok_merged = None

    self.chain_results_df = os.path.join(self.wd, "chain_results_df.tsv")
    self.nucl_fasta = os.path.join(self.wd, "nucleotide.fasta")
    self.prot_fasta = os.path.join(self.wd, "prot.fasta")
    self.final_bed = os.path.join(self.wd, "query_annotation.bed")
    self.low_conf_bed = os.path.join(self.wd, "low_confidence.bed")
    self.meta_data = os.path.join(self.wd, "exons_meta_data.tsv")
    self.intermediate_bed = os.path.join(self.wd, "intermediate.bed")
    self.orthology_type = os.path.join(self.wd, "orthology_classification.tsv")
    self.classification_log = os.path.join(self.wd, "o_class.log")
    self.trash_exons = os.path.join(self.wd, "trash_exons.bed")
    self.gene_loss_data = os.path.join(self.wd, "inact_mut_data")
    self.query_annotation = os.path.join(self.wd, "query_annotation.bed")
    self.loss_summ = os.path.join(self.wd, "loss_summ_data.tsv")
    self.u12_arg = args.u12
    self.u12 = None  # assign after the U12 file check
    self.__check_param_files()

    # dump the input parameters and the object state
    self.toga_params_file = os.path.join(self.wd, "toga_init_state.json")
    self.toga_args_file = os.path.join(self.wd, "project_args.json")
    with open(self.toga_params_file, "w") as f:
        # default=str is a workaround to serialize the datetime object
        json.dump(self.__dict__, f, default=str)
    with open(self.toga_args_file, "w") as f:
        json.dump(vars(args), f, default=str)
    print("TOGA initiated successfully!")
def check_args(args):
    """Check if args are correct, fill the global dict."""
    global VERBOSE  # set verbosity level
    VERBOSE = True if args.verbose else False
    WORK_DATA["vv"] = True if args.vv else False

    try:  # check the directories, create them if necessary
        os.mkdir(args.jobs) if not os.path.isdir(args.jobs) else None
        os.mkdir(args.results_dir) if not os.path.isdir(args.results_dir) else None
        os.mkdir(args.errors_dir) \
            if args.errors_dir and not os.path.isdir(args.errors_dir) \
            else None
        WORK_DATA["jobs"] = args.jobs
        WORK_DATA["results_dir"] = args.results_dir
        WORK_DATA["errors_dir"] = args.errors_dir
        verbose(f"Directories in usage: {args.jobs} {args.results_dir} "
                f"{args.errors_dir}")
    except FileNotFoundError as grepexc:
        # one of those tasks failed
        eprint(f"Arguments are corrupted!\n{str(grepexc)}")
        die("Cannot create one of the directories requested.")

    # define the chain and bed files
    WORK_DATA["chain_file"] = args.chain_file if os.path.isfile(args.chain_file) \
        else die(f"Error! Chain file {args.chain_file} is wrong!")
    WORK_DATA["bed_file"] = args.bed_file if os.path.isfile(args.bed_file) \
        else die(f"Error! Bed file {args.bed_file} is wrong!")
    verbose(f"Use bed file {args.bed_file} and chain file {args.chain_file}")

    # look for the chain_ID_position index file
    index_file = args.index_file if args.index_file \
        else args.chain_file.replace(".chain", ".chain_ID_position")
    if os.path.isfile(index_file):
        # the index file is already here
        WORK_DATA["index_file"] = index_file
        verbose(f"And {index_file} as an index file")
    elif args.make_index:
        # create the index if it doesn't exist
        eprint("make_indexed in progress...")
        idbb_cmd = f"/modules/chain_bdb_index.py {args.chain_file} {index_file}"
        call_proc(idbb_cmd)
        WORK_DATA["index_file"] = index_file
    else:  # die
        die(f"Error! Cannot find index file at {index_file}\n"
            "Please define it manually")

    # define the number of jobs
    if args.job_size:  # easy:
        WORK_DATA["job_size"] = args.job_size
        WORK_DATA["jobs_num"] = None
    else:
        # we must compute how many jobs to put into one cluster job
        WORK_DATA["job_size"] = None
        WORK_DATA["jobs_num"] = args.jobs_num
    WORK_DATA["bed_index"] = args.bed_index

    # some defaults
    WORK_DATA["jobs_file"] = args.jobs_file
    WORK_DATA["ref"] = args.ref
    # check if we are on a cluster
    WORK_DATA["on_cluster"] = True
    verbose("Program-wide dictionary looks like:\n")
    for k, v in WORK_DATA.items():
        verbose(f"{k}: {v}")
def __run_cesar_jobs(self):
    """Run CESAR jobs using nextflow.

    First -> push the joblists; there might be a few of them.
    Second -> monitor the joblists, wait until all are done.
    """
    # for each bucket we create a separate joblist and config file;
    # different config files because of different memory limits
    project_paths = []  # dirs with logs
    processes = []  # keep subprocess objects here
    timestamp = str(time.time()).split(".")[1]  # for project names

    # get the list of buckets
    if self.cesar_buckets == "0":
        buckets = [0, ]  # a single bucket
    else:
        # several buckets, each int -> memory limit in gb
        buckets = [int(x) for x in self.cesar_buckets.split(",") if x != ""]
    print(f"Pushing {len(buckets)} joblists")
    # cmd template to grep bucket-related commands
    grep_bucket_template = "cat {0} | grep _{1}.bdb"

    for b in buckets:
        # create a config file;
        # 0 means that the buckets were not split
        mem_lim = b if b != 0 else self.cesar_mem_limit
        if not self.local_executor:
            # running on a cluster: need to create a config file
            # for this bucket's memory requirement
            config_string = self.cesar_config_template.replace(
                "${_MEMORY_}", f"{mem_lim}")
            config_file_path = os.path.join(self.wd,
                                            f"cesar_config_{b}_queue.nf")
            config_file_abspath = os.path.abspath(config_file_path)
            with open(config_file_path, "w") as f:
                f.write(config_string)
            self.temp_files.append(config_file_path)
        else:
            # no config dir given: use the local executor;
            # OK if there is a single bucket
            config_file_abspath = None

        # extract the jobs related to this bucket (if it's not 0)
        if b != 0:
            grep_bucket_cmd = grep_bucket_template.format(
                self.cesar_combined, b)
            try:
                bucket_tasks = subprocess.check_output(
                    grep_bucket_cmd, shell=True).decode("utf-8")
            except subprocess.CalledProcessError:
                eprint(f"There are no jobs in the {b} bucket")
                continue
            joblist_name = f"cesar_joblist_queue_{b}.txt"
            joblist_path = os.path.join(self.wd, joblist_name)
            with open(joblist_path, "w") as f:
                f.write(bucket_tasks)
            joblist_abspath = os.path.abspath(joblist_path)
            self.temp_files.append(joblist_path)
        else:
            # nothing to extract: there is a single joblist
            joblist_abspath = os.path.abspath(self.cesar_combined)

        # create a project directory for the logs
        nf_project_name = f"{self.project_name}_cesar_at_{timestamp}_q_{b}"
        nf_project_path = os.path.join(self.nextflow_dir, nf_project_name)
        project_paths.append(nf_project_path)
        os.mkdir(nf_project_path) if not os.path.isdir(nf_project_path) else None

        # create the subprocess object
        nf_cmd = f"nextflow {self.NF_EXECUTE} --joblist {joblist_abspath}"
        if config_file_abspath:
            nf_cmd += f" -c {config_file_abspath}"
        p = subprocess.Popen(nf_cmd, shell=True, cwd=nf_project_path)
        sys.stderr.write(f"Pushed cluster jobs with {nf_cmd}\n")
        processes.append(p)
        time.sleep(CESAR_PUSH_INTERVAL)

    # monitor the jobs
    iter_num = 0
    while True:
        # run until all jobs are done (or crashed)
        all_done = True  # default value; re-define if something is not done
        for p in processes:
            # check if each process is still running
            running = p.poll() is None
            if running:
                all_done = False
        if all_done:
            print("CESAR jobs done")
            break
        else:
            print(f"Iter {iter_num}: {ITER_DURATION * iter_num} seconds "
                  f"elapsed, not done yet")
            time.sleep(ITER_DURATION)
            iter_num += 1

    if not self.keep_nf_logs:
        # remove nextflow intermediate files
        for path in project_paths:
            shutil.rmtree(path)
def __make_indexed_bed(self):
    """Create a gene_ID: bed line hdf5-indexed file."""
    eprint("index_bed in progress...")
    bed_hdf5_index(self.ref_bed, self.index_bed_file)
    self.temp_files.append(self.index_bed_file)
    eprint("Bed file indexed")