Пример #1
0
def ssw_check_parasail(query, target):
    """Align each query sequence to each target sequence on both strands.

    Every sequence yielded by ``read(query)`` and its reverse complement are
    aligned against every sequence yielded by ``read(target)`` using
    ``parasail.ssw`` (match 2, mismatch -2, gap open 3, gap extend 1). The
    forward alignment is kept only when its score is strictly greater than
    the reverse-complement score; otherwise the reverse one is kept.

    Returns:
        dict mapping target id to
        ``[score, qstart, qend, tstart, tend, strand]``. The key is the
        target id alone, so results from a later query replace those of an
        earlier query for the same target.
    """
    scoring = parasail.matrix_create("ACGT", 2, -2)
    gap_open, gap_extend = 3, 1

    best_hits = {}
    for _query_id, query_seq, _query_qual in read(query):
        query_rc = reverse_complement(query_seq)

        # ``read(target)`` is invoked again for every query, as the targets
        # are consumed by the inner loop.
        for target_id, target_seq, _target_qual in read(target):
            # NOTE(review): the results are used without checking for None;
            # confirm parasail.ssw cannot fail for these inputs.
            fwd = parasail.ssw(query_seq, target_seq, gap_open, gap_extend,
                               scoring)
            rev = parasail.ssw(query_rc, target_seq, gap_open, gap_extend,
                               scoring)

            forward_wins = fwd.score1 > rev.score1
            if forward_wins:
                hit = fwd
                # Reported positions shifted by one.
                q_from = hit.read_begin1 + 1
                q_to = hit.read_end1 + 1
            else:
                hit = rev
                # Positions on the reverse complement are mapped back onto
                # the original query by subtracting from its length.
                q_from = len(query_seq) - hit.read_end1
                q_to = len(query_seq) - hit.read_begin1

            best_hits[target_id] = [
                hit.score1,
                int(q_from),
                int(q_to),
                int(hit.ref_begin1 + 1),
                int(hit.ref_end1 + 1),
                '+' if forward_wins else '-',
            ]

    return best_hits
Пример #2
0
def get_refined_bp_sbnd(tconsensus, fasta_file_h, tchr, tstart, tend, tdir, hout_log, margin = 200):
    """Refine a breakpoint position by aligning reference sequence to a consensus.

    The window ``tstart``..``tend`` on ``tchr`` is clamped to
    ``1..reference length`` and fetched from ``fasta_file_h`` with ``margin``
    extra bases: before the window when ``tdir`` is ``'+'``, after it
    otherwise (in which case the fetched sequence is reverse-complemented).
    That sequence is passed as the first (read) argument to ``parasail.ssw``
    and ``tconsensus`` as the second (reference) argument, scored with
    match 1, mismatch -2, gap open 3, gap extend 1.

    Args:
        tconsensus: consensus sequence to align against.
        fasta_file_h: reference handle providing ``get_reference_length``
            and ``fetch`` (presumably a pysam FastaFile; confirm with caller).
        tchr: reference sequence name.
        tstart, tend: window coordinates on ``tchr``.
        tdir: ``'+'`` or any other value (treated as the minus direction).
        hout_log: not used by this function.
        margin: number of extra reference bases fetched beside the window.

    Returns:
        ``None`` when ``parasail.ssw`` returns ``None``; otherwise a tuple
        ``(bp_pos_reference, tconsensus_after)`` where ``tconsensus_after``
        is the part of ``tconsensus`` following the last aligned consensus
        position.
    """
    # NOTE(review): the full consensus is aligned. A 1000-base prefix of it
    # used to be computed here but was never passed to the aligner; confirm
    # whether truncating the consensus was intended.

    ref_len = fasta_file_h.get_reference_length(tchr)
    tstart = max(tstart, 1)
    tend = min(tend, ref_len)

    if tdir == '+':
        qseq = fasta_file_h.fetch(tchr, max(int(tstart) - margin, 0), int(tend))
    else:
        qseq = fasta_file_h.fetch(tchr, max(int(tstart) - 1, 0), int(tend) + margin)
        qseq = reverse_complement(qseq)

    user_matrix = parasail.matrix_create("ACGT", 1, -2)
    res = parasail.ssw(qseq, tconsensus, 3, 1, user_matrix)
    if res is None:
        logger.debug("Alignment for breakpoint localization failed for %s,%s,%s,%s",
                     tchr, tstart, tend, tdir)
        return None

    # Number of fetched bases lying beyond the last aligned base of qseq.
    unaligned_tail = len(qseq) - res.read_end1 - 1
    if tdir == '+':
        bp_pos_reference = tend - unaligned_tail
    else:
        bp_pos_reference = tstart + unaligned_tail

    tconsensus_after = tconsensus[(res.ref_end1 + 1):]

    return (bp_pos_reference, tconsensus_after)
Пример #3
0
def generate_paf_file(query_fasta, target_fasta, output_file):
    """Align query sequences to one target and write PAF-style records.

    The target file is scanned completely; only the last header (truncated
    at the first space, leading '>' removed) and the last non-header line
    are kept as the target id and sequence. Each non-header line of the
    query file is then aligned to that target with ``parasail.ssw``
    (match 2, mismatch -2, gap open 3, gap extend 1) under the most recent
    query header, and one tab-separated line is written to ``output_file``
    per successful alignment. A ``None`` result is logged as a warning.
    """
    scoring = parasail.matrix_create("ACGT", 2, -2)

    with open(target_fasta, 'r') as target_handle:
        for raw in target_handle:
            text = raw.rstrip('\n')
            if raw.startswith('>'):
                tid = text.split(' ')[0].lstrip('>')
            else:
                tseq = text

    with open(query_fasta, 'r') as query_handle, \
            open(output_file, 'w') as paf_handle:
        for raw in query_handle:
            text = raw.rstrip('\n')
            if raw.startswith('>'):
                qid = text.lstrip('>')
                continue

            qseq = text
            res = parasail.ssw(qseq, tseq, 3, 1, scoring)
            if res is None:
                logger.warning(
                    f'Error occured in the alignment of {qid} and {tid} via parasail'
                )
                continue

            fields = (qid, len(qseq), res.read_begin1, res.read_end1, tid,
                      len(tseq), res.ref_begin1, res.ref_end1)
            print("%s\t%d\t%d\t%d\t+\t%s\t%d\t%d\t%d\t*\t*\t60" % fields,
                  file=paf_handle)
Пример #4
0
    def generate_paf_file(self, query_fasta, target_fasta, output_file):
        """Align query sequences to one target, write PAF-style records, count them.

        The target file is scanned completely; only the last header
        (truncated at the first space, leading '>' removed) and the last
        non-header line are kept as target id and sequence. Each non-header
        line of the query file is aligned to that target with
        ``parasail.ssw`` (match 2, mismatch -2, gap open 3, gap extend 1)
        under the most recent query header.

        Returns:
            The number of records written. Pairs whose alignment returned
            ``None`` are appended to ``self.parasail_error`` as
            ``(qid, tid)`` instead of being written.
        """
        scoring = parasail.matrix_create("ACGT", 2, -2)
        written = 0

        with open(target_fasta, 'r') as target_handle:
            for raw in target_handle:
                text = raw.rstrip('\n')
                if raw.startswith('>'):
                    tid = text.split(' ')[0].lstrip('>')
                else:
                    tseq = text

        with open(query_fasta, 'r') as query_handle, \
                open(output_file, 'w') as paf_handle:
            for raw in query_handle:
                text = raw.rstrip('\n')
                if raw.startswith('>'):
                    qid = text.lstrip('>')
                    continue

                qseq = text
                res = parasail.ssw(qseq, tseq, 3, 1, scoring)
                if res is None:
                    self.parasail_error.append((qid, tid))
                    continue

                paf_line = (f"{qid}\t{len(qseq)}\t{res.read_begin1}\t{res.read_end1}\t+\t" +
                            f"{tid}\t{len(tseq)}\t{res.ref_begin1}\t{res.ref_end1}\t*\t*\t60")
                print(paf_line, file=paf_handle)
                written += 1

        return written
Пример #5
0
def ssw_alignment(s1,
                  s2,
                  match_score=2,
                  mismatch_penalty=-2,
                  opening_penalty=3,
                  gap_ext=1):
    """Locally align ``s1`` (read) to ``s2`` (reference) with ``parasail.ssw``.

    Args:
        s1: query/read sequence over the alphabet ACGT.
        s2: reference sequence over the alphabet ACGT.
        match_score: score for a matching base.
        mismatch_penalty: score for a mismatching base (negative).
        opening_penalty: gap opening penalty.
        gap_ext: gap extension penalty.

    Returns:
        ``(s1_alignment, s2_alignment, cigar_string, cigar_tuples, score)``
        where the two alignment strings are the aligned parts of ``s1`` and
        ``s2`` with ``-`` in gap columns, ``cigar_string`` is the textual
        CIGAR, ``cigar_tuples`` is a list of ``(operation, length)`` pairs
        with the operation as a one-letter string, and ``score`` is the
        result's ``score1``.

    Raises:
        RuntimeError: if ``parasail.ssw`` returns ``None``.
    """
    user_matrix = parasail.matrix_create("ACGT", match_score, mismatch_penalty)
    result = parasail.ssw(s1, s2, opening_penalty, gap_ext, user_matrix)
    if result is None:
        raise RuntimeError("parasail.ssw did not return an alignment")

    # NOTE(review): assumes result.cigar holds BAM-style encoded operations
    # (length << 4 | op index into "MIDNSHP=X"), as in the SAM specification
    # -- confirm against the installed parasail version.
    bam_ops = "MIDNSHP=XB"
    cigar_tuples = [(bam_ops[int(packed) & 0xF], int(packed) >> 4)
                    for packed in result.cigar]
    cigar_string = "".join(f"{length}{op}" for op, length in cigar_tuples)

    # Walk the CIGAR from the reported start positions, emitting one column
    # per aligned base or gap.
    s1_pos, s2_pos = result.read_begin1, result.ref_begin1
    s1_parts, s2_parts = [], []
    for op, length in cigar_tuples:
        if op in "M=X":
            # Aligned columns consume both sequences.
            s1_parts.append(s1[s1_pos:s1_pos + length])
            s2_parts.append(s2[s2_pos:s2_pos + length])
            s1_pos += length
            s2_pos += length
        elif op == "I":
            # Bases present only in the read: gap in the reference row.
            s1_parts.append(s1[s1_pos:s1_pos + length])
            s2_parts.append("-" * length)
            s1_pos += length
        elif op == "D":
            # Bases present only in the reference: gap in the read row.
            s1_parts.append("-" * length)
            s2_parts.append(s2[s2_pos:s2_pos + length])
            s2_pos += length
        # Any other operation (clipping, padding) adds no alignment column.

    s1_alignment = "".join(s1_parts)
    s2_alignment = "".join(s2_parts)
    return s1_alignment, s2_alignment, cigar_string, cigar_tuples, result.score1
Пример #6
0
def generate_paf_file(query_fasta, target_fasta, output_file):
    """Align query sequences to one target and write PAF-style records.

    The target file is scanned completely; only the last header (leading
    '>' removed) and the last non-header line are kept as the target id and
    sequence. Each non-header line of the query file is aligned to that
    target with ``parasail.ssw`` (match 2, mismatch -2, gap open 3, gap
    extend 1) under the most recent query header, and one tab-separated
    line is written to ``output_file`` per alignment.

    Alignments for which ``parasail.ssw`` returns ``None`` are skipped with
    a logged warning instead of raising ``AttributeError``.
    """
    import logging

    user_matrix = parasail.matrix_create("ACGT", 2, -2)

    with open(target_fasta, 'r') as hin:
        for line in hin:
            if line.startswith('>'):
                tid = line.rstrip('\n').lstrip('>')
            else:
                tseq = line.rstrip('\n')

    with open(query_fasta, 'r') as hin, open(output_file, 'w') as hout:
        for line in hin:
            if line.startswith('>'):
                qid = line.rstrip('\n').lstrip('>')
                continue

            qseq = line.rstrip('\n')
            res = parasail.ssw(qseq, tseq, 3, 1, user_matrix)
            if res is None:
                # No result object means there are no coordinates to report.
                logging.getLogger(__name__).warning(
                    "parasail alignment of %s and %s returned no result",
                    qid, tid)
                continue

            print("%s\t%d\t%d\t%d\t+\t%s\t%d\t%d\t%d\t*\t*\t60" %
                  (qid, len(qseq), res.read_begin1, res.read_end1, tid,
                   len(tseq), res.ref_begin1, res.ref_end1),
                  file=hout)
                """ 
# Command-line script: align every insert read against every target sequence
# and print one tab-separated row per (read, target) pair.
parser = argparse.ArgumentParser(description='Align inserts to a series of target sequences')
for option in ('--inserts', '--targets'):
    parser.add_argument(option, required=True)
args = parser.parse_args()

# Read the control sequences: odd-numbered lines are names, even-numbered
# lines are the sequence belonging to the preceding name.
control_seqs = []
with open(args.targets) as control_fh:
    pending_name = ""
    for line_number, raw_line in enumerate(control_fh, start=1):
        stripped = raw_line.strip()
        if line_number % 2:
            pending_name = stripped
        else:
            control_seqs.append([pending_name, stripped])

scoring_matrix = parasail.matrix_create("ACGT", 5, -1)
print("Read\tReadLen\tReadStart\tReadEnd\tRef\tRefLen\tRefStart\tRefEnd\tScore\tMatches")
for read in pysam.FastxFile(args.inserts):
    # Align this read against all control sequences.
    for control_name, control_sequence in control_seqs:
        if len(read.sequence) == 0:
            continue
        # The SSW call supplies coordinates and score; the stats call
        # supplies the number of matches.
        result = parasail.ssw(read.sequence, control_sequence, 5, 4, scoring_matrix)
        result2 = parasail.sw_stats_table_striped_16(read.sequence, control_sequence, 5, 4, scoring_matrix)
        columns = [
            read.name,
            str(len(read.sequence)),
            str(result.read_begin1),
            str(result.read_end1),
            control_name,
            str(len(control_sequence)),
            str(result.ref_begin1),
            str(result.ref_end1),
            str(result.score1),
            str(result2.matches),
        ]
        print("\t".join(columns))


Пример #8
0
    def filter_sv_insertion_match(self,
                                  sv,
                                  ins,
                                  filter_item="Duplicate_with_insertion"):
        """Flag ``sv`` when its junction sequence is contained in an insertion.

        Applies only when ``sv`` carries an inserted sequence shorter than
        100 bases, ``ins`` carries one of at least 100 bases, and at least
        one breakpoint of ``sv`` is on the same chromosome as, and within
        ``self.bp_dist_margin`` of, the corresponding breakpoint of ``ins``.

        A sequence is built for ``ins`` (reference flank + inserted sequence
        + reference flank) and for ``sv`` (flank at breakpoint 1 + inserted
        sequence + flank at breakpoint 2, oriented by ``dir1``/``dir2``).
        The ``sv`` sequence and its reverse complement are aligned to the
        ``ins`` sequence with ``parasail.ssw`` (match 2, mismatch -2, gap
        open 3, gap extend 1). ``filter_item`` is appended to ``sv.filter``
        when the better alignment scores more than 0.75 of the maximum
        possible over its aligned span on the ``ins`` sequence and covers
        the ``sv`` sequence from within its first 10% to within its last 10%.

        If either alignment returns ``None`` no filter is applied.

        Returns:
            None. The only effect is the possible append to ``sv.filter``.
        """
        # only apply when the first sv is not insertion and the second sv is insertion type
        if len(sv.inseq) >= 100 or len(ins.inseq) < 100:
            return

        # At least one breakpoint pair has to be close together.
        if not ((sv.chr1 == ins.chr1
                 and abs(sv.pos1 - ins.pos1) <= self.bp_dist_margin)
                or (sv.chr2 == ins.chr2
                    and abs(sv.pos2 - ins.pos2) <= self.bp_dist_margin)):
            return

        fetch = self.reference_h.fetch
        seg_len = self.validate_seg_len
        ins_flank = seg_len + self.bp_dist_margin

        # Insertion: left flank + inserted sequence + right flank.
        ins_seg = fetch(ins.chr1, max(ins.pos1 - ins_flank - 1, 0),
                        ins.pos1 - 1)
        ins_seg = ins_seg + ins.inseq
        ins_seg = ins_seg + fetch(ins.chr1, ins.pos2 - 1,
                                  ins.pos2 + ins_flank - 1)

        # SV, first breakpoint side followed by its inserted sequence.
        if sv.dir1 == '+':
            sv_seg = fetch(sv.chr1, max(sv.pos1 - seg_len - 1, 0),
                           sv.pos1 - 1) + sv.inseq
        else:
            flank1 = fetch(sv.chr1, sv.pos1 - 1, sv.pos1 + seg_len - 1)
            sv_seg = reverse_complement(flank1) + reverse_complement(sv.inseq)

        # SV, second breakpoint side.
        if sv.dir2 == '-':
            sv_seg = sv_seg + fetch(sv.chr2, sv.pos2 - 1,
                                    sv.pos2 + seg_len - 1)
        else:
            flank2 = fetch(sv.chr2, max(sv.pos2 - seg_len - 1, 0),
                           sv.pos2 - 1)
            sv_seg = sv_seg + reverse_complement(flank2)

        user_matrix = parasail.matrix_create("ACGT", 2, -2)
        res = parasail.ssw(sv_seg, ins_seg, 3, 1, user_matrix)
        res_r = parasail.ssw(reverse_complement(sv_seg), ins_seg, 3, 1,
                             user_matrix)

        # A failed alignment cannot demonstrate a match, so leave sv alone.
        if res is None or res_r is None:
            return

        # Forward wins only on a strictly higher score.
        best = res if res.score1 > res_r.score1 else res_r

        # Score relative to a perfect match (2 per base) over the aligned
        # span on ins_seg.
        match_ratio = float(
            best.score1) / (2 * (best.ref_end1 - best.ref_begin1 + 1))
        covers_sv = (best.read_begin1 < 0.1 * len(sv_seg)
                     and best.read_end1 > 0.9 * len(sv_seg))
        if match_ratio > 0.75 and covers_sv:
            sv.filter.append(filter_item)

        return