Exemplo n.º 1
0
def parse_psl(psl_file, min_id=0.90, cover_thres=0.96):
    """
    Build a truth table from the accumulated coverage of query sequences on references.

    For every query/reference pair a per-base mask over the query is kept. Each accepted
    alignment raises the mask positions it spans to its identity wherever that identity is
    higher than the value already recorded. A reference is assigned to a query when the
    mean of its mask exceeds ``cover_thres``; that mean is stored as the assignment value
    and the mask length is stored as the query's weight.

    Note: ideally the PSL file should be sorted for descending alignment score.

    :param psl_file: path of the PSL alignment file to read
    :param min_id: ignore alignments whose identity is less than this threshold; it is
        compared directly against ``aln.percent_id`` and must be on the same scale
    :param cover_thres: query mean coverage threshold required for assignment to a ref
        to be accepted (strictly greater than)
    :return: the populated truth table object
    """
    # query name -> {reference name -> per-base identity mask over the query}
    aln_masks = {}

    # traverse alignment file, build up the masks for each query to reference[s] assocs.
    with open(psl_file, 'r') as h_in:
        for aln in Psl.parse(h_in):
            per_id = aln.percent_id
            if per_id < min_id:
                continue

            ref_masks = aln_masks.setdefault(aln.q_name, {})
            if aln.t_name not in ref_masks:
                ref_masks[aln.t_name] = np.zeros(int(aln.q_size))

            # NOTE(review): the slice ends at q_end + 1; if Psl exposes the raw half-open
            # PSL qEnd this covers one base too many -- confirm against the parser.
            mask_slice = ref_masks[aln.t_name][aln.q_start:aln.q_end + 1]
            # basic slicing yields a view, so this updates the stored mask in place
            mask_slice[mask_slice < per_id] = per_id

    # build dictionary of assignments and weights
    truth = {}
    weights = {}
    for q_name, ref_masks in aln_masks.items():
        # materialise names and rows explicitly: dict views are not sequences on
        # Python 3, and this keeps each name aligned with its row in the stack
        ref_names = list(ref_masks)
        masks = np.vstack([ref_masks[name] for name in ref_names])
        covers = masks.mean(axis=1)

        accepted = {str(name): float(cov)
                    for name, cov in zip(ref_names, covers) if cov > cover_thres}
        if accepted:
            truth[q_name] = accepted
        weights[q_name] = masks.shape[1]

    # initialize truthtable
    ttable = tt.TruthTable()
    ttable.update(truth, weights)

    return ttable
Exemplo n.º 2
0
def parse_psl(psl_file):
    """
    Parse a PSL converted from MAF

    Alignment records are filtered using the module-level ``args`` object: a record is
    rejected when its coverage is below ``args.mincov`` or its identity is below
    ``args.minid``. A one-line summary of the rejected count is printed after parsing.

    :param psl_file: PSL format alignment file
    :return: OrderedDict mapping each retained Alignment object to itself, in order of
        first appearance
    """
    all_hits = 0
    rejected = 0
    alignment_repo = OrderedDict()

    with open(psl_file, 'r') as h_in:

        for aln in Psl.parse(h_in):

            all_hits += 1

            # ignore alignment records which fall below mincov or minid
            # wrt the length of the alignment vs query sequence.
            if aln.coverage < args.mincov or aln.percent_id < args.minid:
                rejected += 1
                continue

            ai = Alignment(aln.q_name, aln.t_name, aln.length, aln.q_size, aln.percent_id)
            if ai in alignment_repo:
                # an equal Alignment is already stored: fold this record's length into it
                alignment_repo[ai].add_bases(aln.length)
            else:
                alignment_repo[ai] = ai

    # single-argument parenthesised print behaves the same on Python 2 and Python 3
    print('Rejected {0}/{1} alignments due to constraints on ID {2} and Coverage {3}'.format(
        rejected, all_hits, args.minid, args.mincov))

    return alignment_repo
def parse_psl(psl_file):
    """
    Parse a PSL converted from MAF

    Alignment records are filtered using the module-level ``args`` object: a record is
    rejected when its coverage is below ``args.mincov`` or its identity is below
    ``args.minid``. A one-line summary of the rejected count is printed after parsing.

    :param psl_file: PSL format alignment file
    :return: OrderedDict mapping each retained Alignment object to itself, in order of
        first appearance
    """
    all_hits = 0
    rejected = 0
    alignment_repo = OrderedDict()

    with open(psl_file, 'r') as h_in:

        for aln in Psl.parse(h_in):

            all_hits += 1

            # ignore alignment records which fall below mincov or minid
            # wrt the length of the alignment vs query sequence.
            if aln.coverage < args.mincov or aln.percent_id < args.minid:
                rejected += 1
                continue

            ai = Alignment(aln.q_name, aln.t_name, aln.length, aln.q_size,
                           aln.percent_id)
            if ai in alignment_repo:
                # an equal Alignment is already stored: fold this record's length into it
                alignment_repo[ai].add_bases(aln.length)
            else:
                alignment_repo[ai] = ai

    # single-argument parenthesised print behaves the same on Python 2 and Python 3
    print('Rejected {0}/{1} alignments due to constraints on ID {2} and Coverage {3}'.format(
        rejected, all_hits, args.minid, args.mincov))

    return alignment_repo
def parse_psl2(psl_file, min_id=90, cover_thres=0.96):
    """
    Calculate a truth table by considering the accumulated coverage of query sequences onto
    the references.

    For every query/reference pair a per-base mask over the query is kept. Each accepted
    alignment raises the mask positions it spans to ``0.01 * percent_id`` wherever that is
    higher than the value already recorded. A reference is assigned to a query when the
    mean of its mask exceeds ``cover_thres``; that mean is stored as the assignment value
    and the mask length is stored as the query's weight.

    Writes out a full truth table to user specified output path (``args.output_file[0]``,
    read from the module-level ``args``).

    Note: ideally the PSL file should be sorted for descending alignment score.

    :param psl_file: path of the PSL alignment file to read
    :param min_id: ignore alignments whose ``percent_id`` is less than this threshold; it
        is compared directly against ``aln.percent_id`` (the default of 90 and the 0.01
        scaling below imply a percentage)
    :param cover_thres: query mean coverage threshold required for assignment to a ref
        to be accepted (strictly greater than)
    :return: None -- this method presently breaks the logical flow of this script.
    """
    # query name -> {reference name -> per-base identity mask over the query}
    aln_masks = {}

    with open(psl_file, 'r') as h_in:
        for aln in Psl.parse(h_in):
            if aln.percent_id < min_id:
                continue

            ref_masks = aln_masks.setdefault(aln.q_name, {})
            if aln.t_name not in ref_masks:
                ref_masks[aln.t_name] = np.zeros(int(aln.q_size))

            # masks hold identity as a fraction so their mean is comparable to cover_thres
            per_id = 0.01 * aln.percent_id
            # NOTE(review): the slice ends at q_end + 1; if Psl exposes the raw half-open
            # PSL qEnd this covers one base too many -- confirm against the parser.
            mask_slice = ref_masks[aln.t_name][aln.q_start:aln.q_end + 1]
            # basic slicing yields a view, so this updates the stored mask in place
            mask_slice[mask_slice < per_id] = per_id

    truth = {}
    weights = {}
    for q_name, ref_masks in aln_masks.items():
        # materialise names and rows explicitly: dict views are not sequences on
        # Python 3, and this keeps each name aligned with its row in the stack
        ref_names = list(ref_masks)
        masks = np.vstack([ref_masks[name] for name in ref_names])
        covers = masks.mean(axis=1)

        accepted = {str(name): float(cov)
                    for name, cov in zip(ref_names, covers) if cov > cover_thres}
        if accepted:
            truth[q_name] = accepted
        weights[q_name] = masks.shape[1]

    t = tt.TruthTable()
    t.update(truth, weights)

    t.write(args.output_file[0])
Exemplo n.º 5
0
def parse_psl2(psl_file, min_id=90, cover_thres=0.96):
    """
    Calculate a truth table by considering the accumulated coverage of query sequences onto
    the references.

    For every query/reference pair a per-base mask over the query is kept. Each accepted
    alignment raises the mask positions it spans to ``0.01 * percent_id`` wherever that is
    higher than the value already recorded. A reference is assigned to a query when the
    mean of its mask exceeds ``cover_thres``; that mean is stored as the assignment value
    and the mask length is stored as the query's weight.

    Writes out a full truth table to user specified output path (``args.output_file[0]``,
    read from the module-level ``args``).

    Note: ideally the PSL file should be sorted for descending alignment score.

    :param psl_file: path of the PSL alignment file to read
    :param min_id: ignore alignments whose ``percent_id`` is less than this threshold; it
        is compared directly against ``aln.percent_id`` (the default of 90 and the 0.01
        scaling below imply a percentage)
    :param cover_thres: query mean coverage threshold required for assignment to a ref
        to be accepted (strictly greater than)
    :return: None -- this method presently breaks the logical flow of this script.
    """
    # query name -> {reference name -> per-base identity mask over the query}
    aln_masks = {}

    with open(psl_file, 'r') as h_in:
        for aln in Psl.parse(h_in):
            if aln.percent_id < min_id:
                continue

            ref_masks = aln_masks.setdefault(aln.q_name, {})
            if aln.t_name not in ref_masks:
                ref_masks[aln.t_name] = np.zeros(int(aln.q_size))

            # masks hold identity as a fraction so their mean is comparable to cover_thres
            per_id = 0.01 * aln.percent_id
            # NOTE(review): the slice ends at q_end + 1; if Psl exposes the raw half-open
            # PSL qEnd this covers one base too many -- confirm against the parser.
            mask_slice = ref_masks[aln.t_name][aln.q_start:aln.q_end + 1]
            # basic slicing yields a view, so this updates the stored mask in place
            mask_slice[mask_slice < per_id] = per_id

    truth = {}
    weights = {}
    for q_name, ref_masks in aln_masks.items():
        # materialise names and rows explicitly: dict views are not sequences on
        # Python 3, and this keeps each name aligned with its row in the stack
        ref_names = list(ref_masks)
        masks = np.vstack([ref_masks[name] for name in ref_names])
        covers = masks.mean(axis=1)

        accepted = {str(name): float(cov)
                    for name, cov in zip(ref_names, covers) if cov > cover_thres}
        if accepted:
            truth[q_name] = accepted
        weights[q_name] = masks.shape[1]

    t = tt.TruthTable()
    t.update(truth, weights)

    t.write(args.output_file[0])