def build_list_no_assoc(self, to):
    """
    Build list of queries that match with None target, or the opposite

    :param to: query or target
    :return: content of the file
    """
    # Pick the index and PAF column matching the requested side:
    idx_file = self.idx_q if to == "query" else self.idx_t
    col = 0 if to == "query" else 5
    name, order, contigs, reversed, abs_start, c_len = Index.load(idx_file)
    # Start from every contig of that side; drop each one seen in the PAF:
    remaining = set(order)
    with open(self.paf, "r") as paf:
        for record in paf:
            remaining.discard(record.strip("\n").split("\t")[col])
    # Whatever was never matched ends up in the file, one name per line:
    return "\n".join(remaining) + "\n"
def _check_filter(self):
    """
    Load index of fasta file, and determine contigs which must be removed. Remove them only in the index

    :return: list of contigs which must be removed
    :rtype: list
    """
    # Load contigs (contigs maps name -> length, c_len is total assembly length):
    name, order, contigs, reversed_c, abs_start, c_len = Index.load(
        index_file=self.index_file,
        merge_splits=self.split)

    # Sort contigs by decreasing length:
    contigs_order = sorted(order, key=lambda x: -contigs[x])

    # Find the N95 contig: the contig at which the cumulative length of the
    # size-sorted contigs reaches 95% of the total assembly length.
    sum_l = 0
    n95_contig = None
    n95_value = 0.95 * c_len
    pos = -1
    len_small_contigs = 0
    len_1_pct = 0.01 * c_len
    for contig in contigs_order:
        pos += 1
        sum_l += contigs[contig]
        # Track cumulated length of "small" contigs (< 1% of the assembly):
        if contigs[contig] < len_1_pct:
            len_small_contigs += contigs[contig]
        if sum_l >= n95_value:
            n95_contig = contig
            # BUGFIX: stop at the N95 contig. Without this break, the loop
            # always ran to the last (smallest) contig, so min_length below
            # was computed from the smallest contig and the filter could
            # never remove anything.
            break

    # Query made mostly of small contigs: flag the fasta for sorting.
    if self.type_f == "query" and len_small_contigs >= 0.7 * 0.95 * c_len:
        Path(os.path.join(os.path.dirname(self.fasta), ".do-sort")).touch()

    # Min length of contigs to keep: 5% of the N95 contig length.
    min_length = 0.05 * contigs[n95_contig]

    # Scan the remaining (smaller) contigs for the first one below min_length;
    # everything from that position on is filtered out.
    f_outs = []
    cut_pos = None
    for contig in contigs_order[pos:]:
        if contigs[contig] < min_length:
            cut_pos = pos
            break
        pos += 1

    if cut_pos is not None:
        f_outs = contigs_order[cut_pos:]
        # Only filter when enough contigs are concerned:
        if len(f_outs) > self.min_filtered:
            # Persist the list of removed contigs next to the fasta:
            with open(
                    os.path.join(os.path.dirname(self.fasta), ".filter-" + self.type_f),
                    "w") as list_f:
                list_f.write("\n".join(f_outs) + "\n")
            kept = contigs_order[:cut_pos]
            if self.split:
                # Reload the unmerged index and keep each split chunk whose
                # parent contig (name before the _###_N suffix) was kept:
                f_outs = []
                name, contigs_order_split, contigs, reversed_c, abs_start_split, c_len_split = \
                    Index.load(index_file=self.index_file, merge_splits=False)
                kept_set = set(kept)  # O(1) membership instead of list scan
                kept_s = []
                for contig in contigs_order_split:
                    match = re.match(r"(.+)_###_\d+", contig)
                    contig_name = contig
                    if match is not None:
                        contig_name = match.group(1)
                    if contig_name in kept_set:
                        kept_s.append(contig)
                    else:
                        f_outs.append(contig)
                kept = kept_s
            else:
                # Restore the original file order of the kept contigs
                # (dict lookup instead of O(n) order.index per key):
                order_pos = {c: i for i, c in enumerate(order)}
                kept.sort(key=lambda k: order_pos[k])
            Index.save(index_file=self.index_file,
                       name=name,
                       contigs=contigs,
                       order=kept,
                       reversed_c=reversed_c)
        else:
            f_outs = []
    return f_outs
def parse_paf(self, merge_index=True, noise=True):
    """
    Parse PAF file

    :param merge_index: if True, merge too small contigs in index
    :type merge_index: bool
    :param noise: if True, remove noise
    :type noise: bool
    """
    # Extreme sentinels so the first real identity value replaces them:
    min_idy = 10000000000
    max_idy = -10000000000
    # Matches bucketed by identity class (keys are the class labels):
    lines = {
        "0": [],  # idy < 0.25
        "1": [],  # idy < 0.5
        "2": [],  # idy < 0.75
        "3": []  # idy > 0.75
    }
    # Load query index (name, contig order, sizes, reversed flags,
    # absolute start offset of each contig on the concatenated axis, total length):
    try:
        name_q, q_order, q_contigs, q_reversed, q_abs_start, len_q = Index.load(self.idx_q)
        self.q_abs_start = q_abs_start
        if merge_index:
            q_contigs, q_order = self.parse_index(q_order, q_contigs, len_q)
    except IOError:
        self.error = "Index file does not exist for query!"
        return False
    # Load target index, same structure:
    try:
        name_t, t_order, t_contigs, t_reversed, t_abs_start, len_t = Index.load(self.idx_t)
        self.t_abs_start = t_abs_start
        if merge_index:
            t_contigs, t_order = self.parse_index(t_order, t_contigs, len_t)
    except IOError:
        self.error = "Index file does not exist for target!"
        return False
    lines_lens = []
    try:
        with open(self.paf, "r") as paf_file:
            nb_lines = 0
            for line in paf_file:
                nb_lines += 1
                # Stop after max_nb_lines matches and flag the result as sampled:
                if nb_lines > self.max_nb_lines:
                    self.sampled = True
                    break
                parts = line.strip("\n").split("\t")
                v1 = parts[0]  # PAF col 1: query sequence name
                v6 = parts[5]  # PAF col 6: target sequence name
                strand = 1 if parts[4] == "+" else -1
                # Identity = matching bases / alignment block length (PAF cols 10, 11):
                idy = int(parts[9]) / int(parts[10])
                min_idy = min(min_idy, idy)
                max_idy = max(max_idy, idy)
                # x1, x2, y1, y2, idy
                # Query coords shifted by the contig's absolute start offset:
                try:
                    y1 = int(parts[2]) + q_abs_start[v1]
                except KeyError as e:
                    self.error = self.keyerror_message(e, "query")
                    return False
                try:
                    y2 = int(parts[3]) + q_abs_start[v1]
                except KeyError as e:
                    self.error = self.keyerror_message(e, "query")
                    return False
                # Target coords, swapped when the match is on the minus strand:
                try:
                    x1 = int(parts[7 if strand == 1 else 8]) + t_abs_start[v6]
                    x2 = int(parts[8 if strand == 1 else 7]) + t_abs_start[v6]
                except KeyError as e:
                    self.error = self.keyerror_message(e, "target")
                    return False
                # Euclidean length of the match segment in dot-plot space:
                len_m = sqrt(pow(x2 - x1, 2) + pow(y2 - y1, 2))
                lines_lens.append(len_m)
                # Classify by identity against the configured thresholds:
                if idy < self.limit_idy[0]:
                    class_idy = "0"
                elif idy < self.limit_idy[1]:
                    class_idy = "1"
                elif idy < self.limit_idy[2]:
                    class_idy = "2"
                else:
                    class_idy = "3"
                lines[class_idy].append([x1, x2, y1, y2, idy, v1, v6])
    except IOError:
        self.error = "PAF file does not exist!"
        return False
    if not noise and nb_lines > 1000:
        # Histogram of match lengths; the mode is assumed to be the noise class:
        counts, bins, bars = plt.hist(lines_lens, bins=nb_lines//10)
        counts = list(counts)
        max_value = max(counts)
        max_index = counts.index(max_value)
        limit_index = -1
        # First bin after the mode whose count drops below 2% of the mode:
        for i in range(max_index, len(counts)):
            if counts[i] < max_value / 50:
                limit_index = i
                break
        if limit_index > -1:
            # Drop matches shorter than that bin edge:
            lines = self.remove_noise(lines, bins[limit_index])
    # Publish parsed data on the instance:
    self.parsed = True
    self.len_q = len_q
    self.len_t = len_t
    self.min_idy = min_idy
    self.max_idy = max_idy
    self.lines = lines
    self.q_contigs = q_contigs
    self.q_order = q_order
    self.q_reversed = q_reversed
    self.t_contigs = t_contigs
    self.t_order = t_order
    # NOTE(review): t_reversed is loaded but never stored on self, and no
    # final `return True` is emitted on success — confirm callers rely only
    # on self.parsed / self.error.
    self.name_q = name_q
    self.name_t = name_t
# Number of pixels (dots) per axis of the plot; used to size matrix windows.
plotdots = snakemake.params["pixels"]

# Index both fasta inputs; remember where each .idx file lives.
paths = []
idxs = []
for i in (snakemake.input["query"], snakemake.input["reference"]):
    fasta_dir = os.path.dirname(i)   # renamed: `dir` shadowed the builtin
    fasta_name = os.path.basename(i)
    print(f'Indexing {fasta_dir}/{fasta_name}...')
    (success, numctgs, err) = index_file(fasta_dir, fasta_name, fasta_name + ".idx")
    if not success:
        print(err)
        sys.exit(-1)
    else:
        paths.append(fasta_dir)
        idxs.append(fasta_name)

# Load data structures
paf_file = snakemake.input["paf"]
idx1 = os.path.join(paths[0], idxs[0])
idx2 = os.path.join(paths[1], idxs[1])
paf = Paf(paf_file, idx1, idx2, False)
paf.sort()

# Calculate values for matrix
# BUGFIX: the previous code did `map(sum, Index.load(idx)[2])`, which maps
# sum() over the contig *names* (dict keys, strings) and leaves a map object
# that cannot be divided by plotdots. The intended value is the total
# assembly size: Index.load(...)[2] maps contig name -> length, so sum the
# values.
asize1 = sum(Index.load(idx1)[2].values())
asize2 = sum(Index.load(idx2)[2].values())
# Window size in bases per plotted dot, for each axis:
awinsize1 = asize1 / plotdots
awinsize2 = asize2 / plotdots