def parse_psl(psl_file, min_id=0.90, cover_thres=0.96):
    """
    Calculate a truth table by considering the accumulated coverage of query sequences onto
    the references. The coverage is treated as a binary mask and the total extent a contig
    covers each reference determines the degree of ownership.

    Note: ideally the PSL file should be sorted for descending alignment score.

    :param psl_file: PSL format alignment file
    :param min_id: ignore alignments whose identity is less than this threshold (fraction)
    :param cover_thres: query mean coverage threshold required for assignment to a ref to be accepted
    :return: a TruthTable of query-to-reference assignments and weights
    """
    with open(psl_file, 'r') as h_in:
        all_hits = 0
        rejected = 0
        aln_masks = {}

        # traverse alignment file, build up the masks for each query to reference[s] assocs.
        for aln in Psl.parse(h_in):
            all_hits += 1

            # percent_id is assumed to be on a 0-100 scale (as in parse_psl2 below),
            # so the fractional min_id threshold is rescaled before comparison.
            if aln.percent_id < 100.0 * min_id:
                rejected += 1
                continue

            if aln.q_name not in aln_masks:
                aln_masks[aln.q_name] = {}
            if aln.t_name not in aln_masks[aln.q_name]:
                aln_masks[aln.q_name][aln.t_name] = np.zeros(int(aln.q_size))

            # record the best identity (as a fraction) seen at each covered query position
            per_id = 0.01 * aln.percent_id
            mask_slice = aln_masks[aln.q_name][aln.t_name][aln.q_start:aln.q_end + 1]
            mask_slice[np.where(mask_slice < per_id)] = per_id

        # build dictionary of assignments and weights
        truth = {}
        weights = {}
        for ti in aln_masks:
            masks = np.vstack(aln_masks[ti].values())
            names = np.array(aln_masks[ti].keys())
            covers = np.mean(masks, 1)
            idx = np.where(covers > cover_thres)
            if idx[0].shape[0] > 0:
                truth[ti] = {}
                for i in np.nditer(idx):
                    truth[ti][str(names[i])] = float(covers[i])
            weights[ti] = masks.shape[1]

        # initialize truth table
        ttable = tt.TruthTable()
        ttable.update(truth, weights)
        return ttable
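# Illustrative sketch only (not part of the original script): the mean-coverage test used
# by parse_psl, shown on fabricated per-position identity masks for one query against two
# hypothetical references. No Psl parsing or TruthTable object is involved here.
def _coverage_assignment_demo():
    import numpy as np
    q_size = 10
    # 'refA' covers query positions 0..8 at 98% identity, 'refB' only 0..4 at 95%
    masks = {'refA': np.zeros(q_size), 'refB': np.zeros(q_size)}
    masks['refA'][0:9] = 0.98
    masks['refB'][0:5] = 0.95
    cover_thres = 0.8
    for ref_name, mask in masks.items():
        mean_cover = np.mean(mask)
        accepted = mean_cover > cover_thres
        print('{0}: mean coverage {1:.2f}, accepted={2}'.format(ref_name, mean_cover, accepted))
    # refA (mean 0.88) exceeds the threshold and would be recorded in the truth table;
    # refB (mean below 0.5) would not.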
def parse_psl(psl_file): """ Parse a PSL converted from MAF :param psl_file: PSL format alignment file :return: dictionary of Alignment objects """ all_hits = 0 rejected = 0 alignment_repo = OrderedDict() with open(psl_file, 'r') as h_in: for aln in Psl.parse(h_in): all_hits += 1 # ignore alignment records which fall below mincov or minid # wrt the length of the alignment vs query sequence. if aln.coverage < args.mincov or aln.percent_id < args.minid: rejected += 1 continue ai = Alignment(aln.q_name, aln.t_name, aln.length, aln.q_size, aln.percent_id) if ai in alignment_repo: alignment_repo[ai].add_bases(aln.length) else: alignment_repo[ai] = ai print 'Rejected {0}/{1} alignments due to constraints on ID {2} and Coverage {3}'.format( rejected, all_hits, args.minid, args.mincov) return alignment_repo
def parse_psl2(psl_file):
    """
    Calculate a truth table by considering the accumulated coverage of query sequences onto
    the references. The coverage is treated as a binary mask and the total extent a contig
    covers each reference determines the degree of ownership. Writes out a full truth table
    to the user-specified output path.

    Note: ideally the PSL file should be sorted for descending alignment score.

    :param psl_file: PSL format alignment file
    :return: None -- this method presently breaks the logical flow of this script.
    """
    with open(psl_file, 'r') as h_in:
        all_hits = 0
        rejected = 0
        aln_masks = {}

        for aln in Psl.parse(h_in):
            all_hits += 1

            # hard-coded minimum identity of 90%
            if aln.percent_id < 90:
                rejected += 1
                continue

            if aln.q_name not in aln_masks:
                aln_masks[aln.q_name] = {}
            if aln.t_name not in aln_masks[aln.q_name]:
                aln_masks[aln.q_name][aln.t_name] = np.zeros(int(aln.q_size))

            # record the best identity (as a fraction) seen at each covered query position
            per_id = 0.01 * aln.percent_id
            mask_slice = aln_masks[aln.q_name][aln.t_name][aln.q_start:aln.q_end + 1]
            mask_slice[np.where(mask_slice < per_id)] = per_id

        # build dictionary of assignments and weights
        truth = {}
        weights = {}
        for ti in aln_masks:
            masks = np.vstack(aln_masks[ti].values())
            names = np.array(aln_masks[ti].keys())
            covers = np.mean(masks, 1)
            # hard-coded mean-coverage threshold of 0.96
            idx = np.where(covers > 0.96)
            if idx[0].shape[0] > 0:
                truth[ti] = {}
                for i in np.nditer(idx):
                    truth[ti][str(names[i])] = float(covers[i])
            weights[ti] = masks.shape[1]

        t = tt.TruthTable()
        t.update(truth, weights)
        t.write(args.output_file[0])
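# Assumed argument parsing only: the functions above reference a module-level `args`
# object exposing `mincov`, `minid` and `output_file` attributes. The script's real CLI
# may differ; this sketch (with placeholder defaults) simply shows one way those
# attributes could be supplied.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Derive a truth table from a PSL alignment file')
    parser.add_argument('--mincov', type=float, default=0.5, help='minimum alignment coverage of query')
    parser.add_argument('--minid', type=float, default=90.0, help='minimum alignment percentage identity')
    parser.add_argument('psl_file', help='PSL format alignment file')
    parser.add_argument('output_file', nargs=1, help='output truth table path')
    args = parser.parse_args()
    parse_psl2(args.psl_file)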