Example #1
    def build_list_no_assoc(self, to):
        """
        Build list of queries that match with None target, or the opposite

        :param to: query or target
        :return: content of the file
        """
        index = self.idx_q if to == "query" else self.idx_t
        name, contigs_list, contigs, reversed, abs_start, c_len = Index.load(index)
        contigs_list = set(contigs_list)
        with open(self.paf, "r") as paf:
            for line in paf:
                c_name = line.strip("\n").split("\t")[0 if to == "query" else 5]
                if c_name in contigs_list:
                    contigs_list.remove(c_name)
        return "\n".join(contigs_list) + "\n"
Example #2
    def _check_filter(self):
        """
        Load index of fasta file, and determine contigs which must be removed. Remove them only in the index

        :return: list of contigs which must be removed
        :rtype: list
        """
        # Load contigs:
        name, order, contigs, reversed_c, abs_start, c_len = Index.load(
            index_file=self.index_file, merge_splits=self.split)

        # Sort contigs:
        contigs_order = sorted(order, key=lambda x: -contigs[x])

        # Find the N95 contig (the one at which 95% of the total length is reached):
        sum_l = 0
        n95_contig = None
        n95_value = 0.95 * c_len
        pos = -1
        len_small_contigs = 0
        len_1_pct = 0.01 * c_len
        for contig in contigs_order:
            pos += 1
            sum_l += contigs[contig]
            if contigs[contig] < len_1_pct:
                len_small_contigs += contigs[contig]
            if sum_l >= n95_value:
                n95_contig = contig
                break

        if self.type_f == "query" and len_small_contigs >= 0.7 * 0.95 * c_len:
            Path(os.path.join(os.path.dirname(self.fasta), ".do-sort")).touch()

        # Minimum length for a contig to be kept: 5% of the N95 contig length
        min_length = 0.05 * contigs[n95_contig]

        f_outs = []

        # Starting from the N95 contig, find the first one shorter than min_length
        breakpoint = None

        for contig in contigs_order[pos:]:
            if contigs[contig] < min_length:
                breakpoint = pos
                break
            pos += 1

        if breakpoint is not None:
            f_outs = contigs_order[breakpoint:]
            if len(f_outs) > self.min_filtered:
                with open(
                        os.path.join(os.path.dirname(self.fasta),
                                     ".filter-" + self.type_f), "w") as list_f:
                    list_f.write("\n".join(f_outs) + "\n")
                kept = contigs_order[:breakpoint]
                if self.split:
                    # Splitting enabled: reload the unmerged index and map each
                    # split chunk (<name>_###_<n>) back to its parent contig
                    f_outs = []
                    name, contigs_order_split, contigs, reversed_c, abs_start_split, c_len_split = \
                        Index.load(index_file=self.index_file, merge_splits=False)
                    kept_s = []
                    for contig in contigs_order_split:
                        match = re.match(r"(.+)_###_\d+", contig)
                        contig_name = contig
                        if match is not None:
                            contig_name = match.group(1)
                        if contig_name in kept:
                            kept_s.append(contig)
                        else:
                            f_outs.append(contig)
                    kept = kept_s
                else:
                    kept.sort(key=lambda k: order.index(k))
                Index.save(index_file=self.index_file,
                           name=name,
                           contigs=contigs,
                           order=kept,
                           reversed_c=reversed_c)
            else:
                f_outs = []

        return f_outs
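
The filtering rule in Example #2 can be illustrated on its own: contigs are sorted by decreasing length, the N95 contig is located, and anything shorter than 5% of its length becomes a candidate for removal. A standalone sketch of that computation with toy data (no Index involved):

# Standalone sketch of the N95 / minimum-length rule (toy contig lengths).
contigs = {"chr1": 5_000_000, "chr2": 3_000_000, "chr3": 1_500_000,
           "scaf1": 40_000, "scaf2": 8_000}
total = sum(contigs.values())
order = sorted(contigs, key=lambda c: -contigs[c])  # longest first

# Walk the sorted list until 95% of the assembly length is covered.
cumulated, n95_contig = 0, None
for contig in order:
    cumulated += contigs[contig]
    if cumulated >= 0.95 * total:
        n95_contig = contig
        break

# Contigs shorter than 5% of the N95 contig are candidates for filtering.
min_length = 0.05 * contigs[n95_contig]
filtered = [c for c in order if contigs[c] < min_length]
print(n95_contig, filtered)  # chr3 ['scaf1', 'scaf2']
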
Example #3
    def parse_paf(self, merge_index=True, noise=True):
        """
        Parse PAF file

        :param merge_index: if True, merge too small contigs in index
        :type merge_index: bool
        :param noise: if True, remove noise
        :type noise: bool
        """
        min_idy = 10000000000
        max_idy = -10000000000
        lines = {
            "0": [],  # idy < 0.25
            "1": [],  # idy < 0.5
            "2": [],  # idy < 0.75
            "3": []  # idy > 0.75
        }
        try:
            name_q, q_order, q_contigs, q_reversed, q_abs_start, len_q = Index.load(self.idx_q)
            self.q_abs_start = q_abs_start
            if merge_index:
                q_contigs, q_order = self.parse_index(q_order, q_contigs, len_q)
        except IOError:
            self.error = "Index file does not exist for query!"
            return False

        try:
            name_t, t_order, t_contigs, t_reversed, t_abs_start, len_t = Index.load(self.idx_t)
            self.t_abs_start = t_abs_start
            if merge_index:
                t_contigs, t_order = self.parse_index(t_order, t_contigs, len_t)
        except IOError:
            self.error = "Index file does not exist for target!"
            return False

        lines_lens = []

        try:
            with open(self.paf, "r") as paf_file:
                nb_lines = 0
                for line in paf_file:
                    nb_lines += 1
                    # Too many matches: keep only the first max_nb_lines and
                    # flag the result as sampled
                    if nb_lines > self.max_nb_lines:
                        self.sampled = True
                        break
                    parts = line.strip("\n").split("\t")
                    v1 = parts[0]
                    v6 = parts[5]
                    strand = 1 if parts[4] == "+" else -1
                    # Identity: number of matching bases / alignment block length
                    idy = int(parts[9]) / int(parts[10])
                    min_idy = min(min_idy, idy)
                    max_idy = max(max_idy, idy)
                    # x1, x2, y1, y2, idy
                    try:
                        y1 = int(parts[2]) + q_abs_start[v1]
                        y2 = int(parts[3]) + q_abs_start[v1]
                    except KeyError as e:
                        self.error = self.keyerror_message(e, "query")
                        return False
                    try:
                        x1 = int(parts[7 if strand == 1 else 8]) + t_abs_start[v6]
                        x2 = int(parts[8 if strand == 1 else 7]) + t_abs_start[v6]
                    except KeyError as e:
                        self.error = self.keyerror_message(e, "target")
                        return False
                    len_m = sqrt(pow(x2 - x1, 2) + pow(y2 - y1, 2))
                    lines_lens.append(len_m)
                    if idy < self.limit_idy[0]:
                        class_idy = "0"
                    elif idy < self.limit_idy[1]:
                        class_idy = "1"
                    elif idy < self.limit_idy[2]:
                        class_idy = "2"
                    else:
                        class_idy = "3"
                    lines[class_idy].append([x1, x2, y1, y2, idy, v1, v6])
        except IOError:
            self.error = "PAF file does not exist!"
            return False

        if not noise and nb_lines > 1000:
            # Noise filtering: build a histogram of match lengths and drop matches
            # shorter than the point where counts fall below 1/50 of the peak
            counts, bins, bars = plt.hist(lines_lens, bins=nb_lines // 10)
            counts = list(counts)
            max_value = max(counts)
            max_index = counts.index(max_value)
            limit_index = -1
            for i in range(max_index, len(counts)):
                if counts[i] < max_value / 50:
                    limit_index = i
                    break
            if limit_index > -1:
                lines = self.remove_noise(lines, bins[limit_index])

        self.parsed = True
        self.len_q = len_q
        self.len_t = len_t
        self.min_idy = min_idy
        self.max_idy = max_idy
        self.lines = lines
        self.q_contigs = q_contigs
        self.q_order = q_order
        self.q_reversed = q_reversed
        self.t_contigs = t_contigs
        self.t_order = t_order
        self.name_q = name_q
        self.name_t = name_t
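
The trickiest step in Example #3 is the coordinate translation: per-contig PAF positions become absolute plot coordinates by adding the contig's offset (abs_start), and the two target coordinates are swapped for reverse-strand matches. A minimal sketch with made-up offsets and a single PAF line:

# Toy offsets and one PAF line split into columns (made-up values).
t_abs_start = {"chr1": 0, "chr2": 5_000_000}
parts = ["q1", "900", "100", "800", "-", "chr2", "3000000",
         "10000", "250000", "600", "700"]
strand = 1 if parts[4] == "+" else -1
# Reverse strand: start and end are swapped so that x1 > x2 encodes the orientation.
x1 = int(parts[7 if strand == 1 else 8]) + t_abs_start[parts[5]]
x2 = int(parts[8 if strand == 1 else 7]) + t_abs_start[parts[5]]
print(x1, x2)  # 5250000 5010000
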
Example #4
# Number of pixels along each axis of the dot matrix
plotdots = snakemake.params["pixels"]

paths = []
idxs = []
for i in (snakemake.input["query"], snakemake.input["reference"]):
    dirname = os.path.dirname(i)
    filename = os.path.basename(i)
    print(f'Indexing {dirname}/{filename}...')
    (success, numctgs, err) = index_file(dirname, filename, filename + ".idx")
    if not success:
        print(err)
        sys.exit(-1)
    else:
        paths.append(dirname)
        idxs.append(filename)

# Load data structures

paf_file = snakemake.input["paf"]
idx1 = os.path.join(paths[0], idxs[0])
idx2 = os.path.join(paths[1], idxs[1])
paf = Paf(paf_file, idx1, idx2, False)
paf.sort()

# Calculate values for matrix
# Index.load returns (name, order, contigs, reversed, abs_start, total_len);
# element 2 maps each contig name to its length, so the sum is the assembly size.
asize1 = sum(Index.load(idx1)[2].values())
asize2 = sum(Index.load(idx2)[2].values())

# Bases of sequence represented by one pixel (window) of the dot matrix
awinsize1 = asize1 / plotdots
awinsize2 = asize2 / plotdots
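
The last two lines of Example #4 turn each assembly size into a per-pixel window size. A toy illustration of that arithmetic (values are made up):

# Made-up values: a 120 Mb assembly drawn on a 1000-pixel axis.
plotdots = 1000
assembly_size = 120_000_000
window_size = assembly_size / plotdots
print(window_size)  # 120000.0 bases of sequence per pixel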