示例#1
0
def polish_annotation(ref_gene, target_gene, ref_children, target_children, ref_fa, target_fa, output_sam):
    """Align every merged reference exon interval against its target interval and write the result.

    Exon features are used when the reference has any; otherwise CDS features
    stand in for exons on both the reference and the target side. For each
    merged reference interval the matching target interval is looked up, both
    sequences are fetched, aligned with parasail and handed to write_sam_file.

    Args:
        ref_gene: reference gene feature (its ``strand`` and ``seqid`` are read).
        target_gene: target gene feature (its ``strand`` and ``seqid`` are read).
        ref_children: child features of the reference gene; filtered on ``featuretype``.
        target_children: child features of the target gene; filtered on ``featuretype``.
        ref_fa: reference sequence source, forwarded to get_feature_sequence.
        target_fa: target sequence source, forwarded to get_feature_sequence.
        output_sam: output handle/path forwarded to write_sam_file.

    Returns:
        None.
    """
    def features_of_type(features, wanted_type):
        return [feature for feature in features if feature.featuretype == wanted_type]

    ref_exons = features_of_type(ref_children, "exon")
    target_exons = features_of_type(target_children, "exon")
    ref_CDS = features_of_type(ref_children, "CDS")
    if not ref_exons:
        # No exon records on the reference: treat CDS records as the exons on both sides.
        ref_exons = ref_CDS
        target_exons = features_of_type(target_children, "CDS")

    ref_CDS_intervals = liftoff_utils.merge_children_intervals(ref_CDS)
    # add_splice_sites is paired with remove_splice_sites at the end of this function.
    splice_sites = add_splice_sites(ref_exons, ref_gene)
    merged_ref_intervals = liftoff_utils.merge_children_intervals(ref_exons)
    exon_group_dict = find_overlapping_exon_groups(merged_ref_intervals, ref_exons)
    matrix = make_scoring_matrix(3)

    for group_index, ref_interval in enumerate(merged_ref_intervals):
        target_interval = get_target_interval(exon_group_dict[group_index], target_exons, ref_interval)
        ref_span = ref_interval[1] - ref_interval[0]
        target_span = target_interval[1] - target_interval[0]
        # Pad the target by however much shorter it is than the reference, plus 10.
        flank = max(0, ref_span - target_span) + 10
        if target_interval == [0, 0]:
            # [0, 0] is the value this function treats as "nothing to align against".
            continue

        is_reverse = ref_gene.strand != target_gene.strand
        ref_seq = get_feature_sequence(ref_interval, ref_fa, ref_gene.seqid, 0)
        capitalized_ref_seq = cds_and_splice_sites_to_upper(
            ref_interval, splice_sites, ref_CDS_intervals, ref_seq, is_reverse)
        target_seq = get_feature_sequence(target_interval, target_fa, target_gene.seqid, flank)
        # NOTE(review): 10 and 1 are presumably the gap open/extend penalties - confirm against parasail.
        alignment = parasail.sg_dx_trace_scan_sat(capitalized_ref_seq, target_seq, 10, 1, matrix)
        write_sam_file(ref_interval, target_interval, ref_gene, target_gene, capitalized_ref_seq,
                       alignment, output_sam, flank)

    remove_splice_sites(ref_exons, ref_gene)
示例#2
0
def get_aligned_blocks(alignment, aln_id, feature_hierarchy, search_type):
    """Walk an alignment's CIGAR and build the list of blocks filled in by add_block.

    Args:
        alignment: alignment record; ``cigar``, ``query_name`` and
            ``reference_start`` are read from it.
        aln_id: identifier forwarded unchanged to add_block.
        feature_hierarchy: object whose ``children`` and ``parents`` mappings
            are indexed by the original (converted) query ID.
        search_type: when equal to ``"copies"``, an alignment for which
            is_end_to_end_alignment returns ``False`` yields no blocks.

    Returns:
        The ``new_blocks`` list handed to add_block (presumably appended to by
        it), or an empty list for a rejected "copies" alignment.
    """
    cigar_operations = get_cigar_operations()
    cigar = alignment.cigar
    query_start, query_end = get_query_start_and_end(alignment, cigar, cigar_operations)

    original_id = liftoff_utils.convert_id_to_original(alignment.query_name)
    children = feature_hierarchy.children[original_id]
    parent = feature_hierarchy.parents[original_id]

    if search_type == "copies":
        # Identity check against False is deliberate: only an explicit False rejects.
        if is_end_to_end_alignment(parent, query_start, query_end) is False:
            return []

    block_ref_start = ref_pos = alignment.reference_start
    block_query_start = query_pos = query_start
    new_blocks = []
    mismatches = []
    merged_children_coords = liftoff_utils.merge_children_intervals(children)

    def emit_block():
        # Reads the enclosing function's current positions at call time.
        add_block(query_pos, ref_pos, aln_id, alignment, block_query_start, block_ref_start,
                  mismatches, new_blocks, merged_children_coords, parent)

    for operation, length in cigar:
        if base_is_aligned(operation, cigar_operations):
            query_pos, ref_pos = add_aligned_base(operation, query_pos, ref_pos, length,
                                                  cigar_operations, mismatches)
            # Reaching the end of the aligned query closes the final block.
            if query_pos == query_end:
                emit_block()
            continue
        if is_alignment_gap(operation, cigar_operations):
            # A gap closes the current block; end_block_at_gap supplies the state for the next one.
            emit_block()
            mismatches, block_query_start, block_ref_start, query_pos, ref_pos = end_block_at_gap(
                operation, query_pos, ref_pos, length)
    return new_blocks
示例#3
0
def find_best_mapping(alignments, query_length, parent, feature_heirarchy,
                      previous_feature_start, previous_feature_ref_start,
                      previous_gene_seq, inter, lifted_features_list, args):
    """Build the alignment graph for ``parent`` and map its children along the shortest path.

    The graph is populated in three steps (single alignments, chained
    alignments, target node) before find_shortest_path is asked for the
    resulting node sequence.

    Args:
        alignments: candidate alignments forwarded to add_single_alignments.
        query_length: forwarded to add_target_node.
        parent: parent feature; ``parent.id`` indexes ``feature_heirarchy.children``.
        feature_heirarchy: object exposing ``children`` and ``parents`` mappings.
        previous_feature_start, previous_feature_ref_start, previous_gene_seq:
            forwarded unchanged to add_single_alignments.
        inter: forwarded to add_single_alignments and chain_alignments.
        lifted_features_list: forwarded to add_single_alignments and chain_alignments.
        args: forwarded to the graph-building helpers.

    Returns:
        ``(mapped_children, alignment_coverage, seq_id)`` from
        convert_all_children_coords, or ``({}, 0, 0)`` when the shortest path
        contains no nodes.
    """
    children = feature_heirarchy.children[parent.id]
    children_coords = liftoff_utils.merge_children_intervals(children)
    node_dict, aln_graph = intialize_graph()

    parent_lookup = feature_heirarchy.parents
    head_nodes = add_single_alignments(
        node_dict, aln_graph, alignments, children_coords, parent,
        previous_feature_start, previous_feature_ref_start, previous_gene_seq,
        inter, parent_lookup, lifted_features_list, args)
    chain_alignments(head_nodes, node_dict, aln_graph, parent, children_coords,
                     inter, feature_heirarchy.parents, lifted_features_list, args)
    add_target_node(aln_graph, node_dict, query_length, children_coords, parent, args)

    shortest_path_nodes = find_shortest_path(node_dict, aln_graph)
    if len(shortest_path_nodes) > 0:
        mapped_children, alignment_coverage, seq_id = convert_all_children_coords(
            shortest_path_nodes, children, parent)
        return mapped_children, alignment_coverage, seq_id
    # Nothing on the path: report an empty mapping with zeroed coverage and sequence ID.
    return {}, 0, 0
示例#4
0
def find_best_mapping(alignments, query_length, parent, coords_to_exclude,
                      children_dict, previous_gene_start, copy_tag):
    """Build the alignment graph for ``parent`` and map its children along the shortest path.

    Args:
        alignments: candidate alignments forwarded to add_single_alignments.
        query_length: forwarded to add_target_node.
        parent: parent feature; ``parent.id`` indexes ``children_dict``.
        coords_to_exclude: forwarded to add_single_alignments and chain_alignments.
        children_dict: mapping from parent ID to that parent's child features.
        previous_gene_start: forwarded to add_single_alignments.
        copy_tag: forwarded to convert_all_children_coords.

    Returns:
        ``(mapped_children, shortest_path_weight, alignment_coverage, seq_id)``.
        When the path has no interior nodes the result is
        ``({}, shortest_path_weight, 0, 0)``.
    """
    children = children_dict[parent.id]
    children_coords = liftoff_utils.merge_children_intervals(children)
    node_dict, aln_graph = intialize_graph()
    head_nodes = add_single_alignments(node_dict, aln_graph, alignments,
                                       children_coords, parent,
                                       coords_to_exclude, previous_gene_start)
    chain_alignments(head_nodes, node_dict, aln_graph, coords_to_exclude,
                     parent, children_coords)
    add_target_node(aln_graph, node_dict, query_length, children_coords,
                    parent)

    def edge_weight(u, v, d):
        return get_weight(u, v, d, aln_graph)

    # The search runs from node 0 to the highest-numbered key count in node_dict.
    first_node = 0
    last_node = len(node_dict) - 1
    # NOTE(review): path and length are requested separately over the same graph and weights.
    path = nx.shortest_path(aln_graph, source=first_node, target=last_node,
                            weight=edge_weight)
    shortest_path_weight = nx.shortest_path_length(aln_graph, source=first_node,
                                                   target=last_node,
                                                   weight=edge_weight)

    # Drop the first and last entries of the path; only interior nodes are mapped.
    shortest_path_nodes = [node_dict[node_name] for node_name in path[1:-1]]
    if not shortest_path_nodes:
        return {}, shortest_path_weight, 0, 0

    mapped_children, alignment_coverage, seq_id = convert_all_children_coords(
        shortest_path_nodes, children, parent, copy_tag)
    return mapped_children, shortest_path_weight, alignment_coverage, seq_id
示例#5
0
def find_best_mapping(alignments, query_length, parent, coords_to_exclude, children_dict, previous_gene_start):
    """Build the alignment graph for ``parent`` and map its children along the shortest path.

    Args:
        alignments: candidate alignments forwarded to add_single_alignments.
        query_length: forwarded to add_target_node.
        parent: parent feature; ``parent.id`` indexes ``children_dict``.
        coords_to_exclude: forwarded to add_single_alignments and chain_alignments.
        children_dict: mapping from parent ID to that parent's child features.
        previous_gene_start: forwarded to add_single_alignments.

    Returns:
        ``(mapped_children, alignment_coverage, seq_id)`` from
        convert_all_children_coords, or ``({}, 0, 0)`` when the shortest path
        contains no nodes.
    """
    children = children_dict[parent.id]
    children_coords = liftoff_utils.merge_children_intervals(children)

    # Graph construction: single alignments first, then chains, then the target node.
    node_dict, aln_graph = intialize_graph()
    head_nodes = add_single_alignments(
        node_dict, aln_graph, alignments, children_coords, parent,
        coords_to_exclude, previous_gene_start)
    chain_alignments(head_nodes, node_dict, aln_graph, coords_to_exclude, parent, children_coords)
    add_target_node(aln_graph, node_dict, query_length, children_coords, parent)

    shortest_path_nodes = find_shortest_path(node_dict, aln_graph)
    if len(shortest_path_nodes) > 0:
        mapped_children, alignment_coverage, seq_id = convert_all_children_coords(
            shortest_path_nodes, children, parent)
        return mapped_children, alignment_coverage, seq_id
    # Nothing on the path: report an empty mapping with zeroed coverage and sequence ID.
    return {}, 0, 0
示例#6
0
def get_aligned_blocks(alignment, aln_id, children_dict, parent_dict,
                       search_type):
    """Walk an alignment's CIGAR and build the list of blocks filled in by add_block.

    Args:
        alignment: alignment record; ``cigar``, ``query_name``,
            ``query_alignment_start``, ``query_alignment_end`` and
            ``reference_start`` are read from it.
        aln_id: identifier forwarded unchanged to add_block.
        children_dict: mapping from original query ID to child features.
        parent_dict: mapping from original query ID to the parent feature.
        search_type: when equal to ``"copies"``, an alignment whose aligned
            query span differs from the parent's length yields no blocks.

    Returns:
        The ``new_blocks`` list handed to add_block (presumably appended to by
        it), or an empty list for a rejected "copies" alignment.
    """
    # NOTE(review): numeric codes presumably follow the SAM/BAM CIGAR numbering
    # (1=I, 2=D, 5=H, 7='=', 8='X') - confirm against the alignment library.
    leading_shift_op = 5
    mismatch_op = 8
    block_extending_ops = (7, 8)
    block_breaking_ops = (1, 2)

    cigar = alignment.cigar
    query_start = alignment.query_alignment_start
    query_end = alignment.query_alignment_end
    original_id = liftoff_utils.convert_id_to_original(alignment.query_name)
    children = children_dict[original_id]
    parent = parent_dict[original_id]

    parent_length = parent.end - parent.start + 1
    aligned_length = query_end - query_start
    if parent_length != aligned_length and search_type == "copies":
        return []

    block_ref_start = alignment.reference_start
    ref_pos = block_ref_start

    # A leading op with code 5 shifts both query bounds by that op's length.
    first_op, first_len = cigar[0]
    if first_op == leading_shift_op:
        query_start += first_len
        query_end += first_len

    block_query_start = query_start
    query_pos = block_query_start
    new_blocks = []
    mismatches = []
    merged_children_coords = liftoff_utils.merge_children_intervals(children)

    for operation, length in cigar:
        if operation in block_extending_ops:
            query_pos, ref_pos = adjust_position(operation, query_pos, ref_pos, length)
            if operation == mismatch_op:
                # Recorded after the position has been advanced past the op.
                mismatches.append(query_pos)
            if query_pos == query_end:
                add_block(query_pos, ref_pos, aln_id, alignment,
                          block_query_start, block_ref_start, mismatches,
                          new_blocks, merged_children_coords, parent)
                # Rebind rather than clear: add_block was handed the old list object.
                mismatches = []
        elif operation in block_breaking_ops:
            # Close the current block, then start the next one after the gap.
            add_block(query_pos, ref_pos, aln_id, alignment,
                      block_query_start, block_ref_start, mismatches,
                      new_blocks, merged_children_coords, parent)
            mismatches = []
            query_pos, ref_pos = adjust_position(operation, query_pos, ref_pos, length)
            block_query_start = query_pos
            block_ref_start = ref_pos
    return new_blocks