def polish_annotation(ref_gene, target_gene, ref_children, target_children, ref_fa, target_fa, output_sam):
    """Realign each reference exon group to its lifted-over target interval and write the alignments to a SAM file.

    For every merged reference exon interval, the corresponding target interval is located,
    the reference sequence (with CDS bases and splice sites upper-cased) is aligned to the
    flanked target sequence with parasail semi-global alignment, and the result is appended
    to ``output_sam``.

    :param ref_gene: gene feature on the reference assembly
    :param target_gene: the lifted-over gene feature on the target assembly
    :param ref_children: child features (exon/CDS/...) of the reference gene
    :param target_children: child features of the lifted-over gene
    :param ref_fa: reference genome FASTA accessor
    :param target_fa: target genome FASTA accessor
    :param output_sam: path/handle of the SAM file the alignments are written to
    """
    ref_exons = [feature for feature in ref_children if feature.featuretype == "exon"]
    target_exons = [feature for feature in target_children if feature.featuretype == "exon"]
    ref_CDS = [feature for feature in ref_children if feature.featuretype == "CDS"]
    # Some annotations have no explicit exon features; fall back to CDS features on both sides.
    if len(ref_exons) == 0:
        ref_exons = ref_CDS
        target_exons = [feature for feature in target_children if feature.featuretype == "CDS"]
    ref_CDS_intervals = liftoff_utils.merge_children_intervals(ref_CDS)
    # Temporarily extend exons with splice-site bases so they are included in the alignment;
    # undone by remove_splice_sites() below.
    splice_sites = add_splice_sites(ref_exons, ref_gene)
    merged_ref_intervals = liftoff_utils.merge_children_intervals(ref_exons)
    exon_group_dict = find_overlapping_exon_groups(merged_ref_intervals, ref_exons)
    matrix = make_scoring_matrix(3)
    for i, ref_interval in enumerate(merged_ref_intervals):
        target_interval = get_target_interval(exon_group_dict[i], target_exons, ref_interval)
        # Pad the target window when it is shorter than the reference interval, plus a fixed
        # 10 bp margin, so the full reference sequence can align.
        flank = max(0, (ref_interval[1] - ref_interval[0]) - (target_interval[1] - target_interval[0])) + 10
        # [0, 0] is the sentinel for "no target interval found" — skip unmapped exon groups.
        if target_interval != [0, 0]:
            is_reverse = ref_gene.strand != target_gene.strand
            ref_seq = get_feature_sequence(ref_interval, ref_fa, ref_gene.seqid, 0)
            # Upper-case CDS bases and splice sites so they can be distinguished downstream.
            capitalized_ref_seq = cds_and_splice_sites_to_upper(ref_interval, splice_sites, ref_CDS_intervals,
                                                               ref_seq, is_reverse)
            target_seq = get_feature_sequence(target_interval, target_fa, target_gene.seqid, flank)
            # Semi-global alignment (free end gaps on the target side); gap open 10, extend 1.
            alignment = parasail.sg_dx_trace_scan_sat(capitalized_ref_seq, target_seq, 10, 1, matrix)
            write_sam_file(ref_interval, target_interval, ref_gene, target_gene, capitalized_ref_seq,
                           alignment, output_sam, flank)
    remove_splice_sites(ref_exons, ref_gene)
def get_aligned_blocks(alignment, aln_id, feature_hierarchy, search_type):
    """Split one alignment into gap-free aligned blocks by walking its CIGAR string.

    Each block covers a run of aligned (match/mismatch) bases; blocks are ended at
    insertions/deletions. Mismatch positions within the current block are collected and
    handed to add_block(). Returns the list of new blocks (empty when a "copies" search
    requires an end-to-end alignment and this alignment is partial).
    """
    cigar_operations = get_cigar_operations()
    cigar = alignment.cigar
    query_start, query_end = get_query_start_and_end(alignment, cigar, cigar_operations)
    children = feature_hierarchy.children[liftoff_utils.convert_id_to_original(alignment.query_name)]
    parent = feature_hierarchy.parents[liftoff_utils.convert_id_to_original(alignment.query_name)]
    # When searching for extra gene copies, only full-length (end-to-end) alignments count.
    if search_type == "copies" and is_end_to_end_alignment(parent, query_start, query_end) is False:
        return []
    # Running block state: *_start marks the current block's origin, *_pos the walk position.
    reference_block_start, reference_block_pos = alignment.reference_start, alignment.reference_start
    query_block_start, query_block_pos = query_start, query_start
    new_blocks, mismatches = [], []
    merged_children_coords = liftoff_utils.merge_children_intervals(children)
    for operation, length in cigar:
        if base_is_aligned(operation, cigar_operations):
            # Advance both coordinates; add_aligned_base also records mismatch positions
            # into `mismatches` (mutated in place).
            query_block_pos, reference_block_pos = add_aligned_base(operation, query_block_pos,
                                                                    reference_block_pos, length,
                                                                    cigar_operations, mismatches)
            # Reached the end of the aligned query span — close out the final block.
            # NOTE(review): `mismatches` is not reset here, unlike in the gap branch where
            # end_block_at_gap returns a fresh list — presumably safe because no aligned op
            # follows query_end, but confirm a trailing gap op cannot duplicate mismatches.
            if query_block_pos == query_end:
                add_block(query_block_pos, reference_block_pos, aln_id, alignment, query_block_start,
                          reference_block_start, mismatches, new_blocks, merged_children_coords, parent)
        elif is_alignment_gap(operation, cigar_operations):
            # An indel ends the current block; emit it, then reset all block state
            # (end_block_at_gap advances past the gap and returns a fresh mismatch list).
            add_block(query_block_pos, reference_block_pos, aln_id, alignment, query_block_start,
                      reference_block_start, mismatches, new_blocks, merged_children_coords, parent)
            mismatches, query_block_start, reference_block_start, query_block_pos, reference_block_pos = \
                end_block_at_gap(operation, query_block_pos, reference_block_pos, length)
    return new_blocks
def find_best_mapping(alignments, query_length, parent, feature_heirarchy, previous_feature_start,
                      previous_feature_ref_start, previous_gene_seq, inter, lifted_features_list, args):
    """Pick the best combination of alignments for one parent feature.

    Builds an alignment graph (single alignments as nodes, compatible chains as edges),
    finds the shortest path through it, and converts the chosen path back into mapped
    child-feature coordinates.

    Returns ``(mapped_children, alignment_coverage, seq_id)``; ``({}, 0, 0)`` when no
    path through the alignment graph exists.
    """
    feature_children = feature_heirarchy.children[parent.id]
    merged_coords = liftoff_utils.merge_children_intervals(feature_children)
    node_lookup, graph = intialize_graph()
    first_nodes = add_single_alignments(
        node_lookup, graph, alignments, merged_coords, parent, previous_feature_start,
        previous_feature_ref_start, previous_gene_seq, inter, feature_heirarchy.parents,
        lifted_features_list, args)
    chain_alignments(
        first_nodes, node_lookup, graph, parent, merged_coords, inter,
        feature_heirarchy.parents, lifted_features_list, args)
    add_target_node(graph, node_lookup, query_length, merged_coords, parent, args)
    best_path = find_shortest_path(node_lookup, graph)
    # An empty path means no usable mapping was found for this feature.
    if not best_path:
        return {}, 0, 0
    return convert_all_children_coords(best_path, feature_children, parent)
def find_best_mapping(alignments, query_length, parent, coords_to_exclude, children_dict,
                      previous_gene_start, copy_tag):
    """Pick the best combination of alignments for one parent feature.

    Builds an alignment graph (node 0 is the source, the last node the target), finds
    the minimum-weight path through it with Dijkstra, and converts the interior path
    nodes back into mapped child-feature coordinates.

    Returns ``(mapped_children, shortest_path_weight, alignment_coverage, seq_id)``;
    ``({}, weight, 0, 0)`` when the path contains no interior alignment nodes.
    """
    children = children_dict[parent.id]
    children_coords = liftoff_utils.merge_children_intervals(children)
    node_dict, aln_graph = intialize_graph()
    head_nodes = add_single_alignments(node_dict, aln_graph, alignments, children_coords, parent,
                                       coords_to_exclude, previous_gene_start)
    chain_alignments(head_nodes, node_dict, aln_graph, coords_to_exclude, parent, children_coords)
    add_target_node(aln_graph, node_dict, query_length, children_coords, parent)
    # Single Dijkstra run returns both the path weight and the path itself
    # (previously nx.shortest_path + nx.shortest_path_length ran Dijkstra twice).
    shortest_path_weight, shortest_path = nx.single_source_dijkstra(
        aln_graph, 0, len(node_dict) - 1,
        weight=lambda u, v, d: get_weight(u, v, d, aln_graph))
    # Drop the artificial source (first) and target (last) nodes; keep alignment nodes only.
    shortest_path_nodes = [node_dict[node_name] for node_name in shortest_path[1:-1]]
    if len(shortest_path_nodes) == 0:
        return {}, shortest_path_weight, 0, 0
    mapped_children, alignment_coverage, seq_id = convert_all_children_coords(
        shortest_path_nodes, children, parent, copy_tag)
    return mapped_children, shortest_path_weight, alignment_coverage, seq_id
def find_best_mapping(alignments, query_length, parent, coords_to_exclude, children_dict,
                      previous_gene_start):
    """Pick the best combination of alignments for one parent feature.

    Builds an alignment graph, finds the shortest path through it, and converts the
    chosen path back into mapped child-feature coordinates.

    Returns ``(mapped_children, alignment_coverage, seq_id)``; ``({}, 0, 0)`` when no
    path through the alignment graph exists.
    """
    feature_children = children_dict[parent.id]
    merged_coords = liftoff_utils.merge_children_intervals(feature_children)
    node_lookup, graph = intialize_graph()
    first_nodes = add_single_alignments(node_lookup, graph, alignments, merged_coords, parent,
                                        coords_to_exclude, previous_gene_start)
    chain_alignments(first_nodes, node_lookup, graph, coords_to_exclude, parent, merged_coords)
    add_target_node(graph, node_lookup, query_length, merged_coords, parent)
    best_path = find_shortest_path(node_lookup, graph)
    # An empty path means no usable mapping was found for this feature.
    if not best_path:
        return {}, 0, 0
    return convert_all_children_coords(best_path, feature_children, parent)
def get_aligned_blocks(alignment, aln_id, children_dict, parent_dict, search_type):
    """Split one alignment into gap-free aligned blocks by walking its CIGAR string.

    Each block covers a run of aligned (match/mismatch) bases; blocks are ended at
    insertions/deletions. Mismatch query positions within the current block are
    collected and handed to add_block(). Returns the list of new blocks (empty when a
    "copies" search requires an end-to-end alignment and this alignment is partial).
    """
    cigar = alignment.cigar
    query_start = alignment.query_alignment_start
    query_end = alignment.query_alignment_end
    children = children_dict[liftoff_utils.convert_id_to_original(
        alignment.query_name)]
    parent = parent_dict[liftoff_utils.convert_id_to_original(
        alignment.query_name)]
    # When searching for extra gene copies, only alignments covering the whole parent count.
    if parent.end - parent.start + 1 != query_end - query_start and search_type == "copies":
        return []
    reference_block_start = alignment.reference_start
    reference_block_pos = reference_block_start
    # CIGAR op 5 = hard clip: clipped bases are absent from the query sequence, so shift
    # the query coordinates by the clipped length to restore original-read coordinates.
    if cigar[0][0] == 5:
        query_start += cigar[0][1]
        query_end += cigar[0][1]
    query_block_start = query_start
    query_block_pos = query_block_start
    new_blocks = []
    mismatches = []
    merged_children_coords = liftoff_utils.merge_children_intervals(children)
    for operation, length in cigar:
        # CIGAR ops: 7 = sequence match (=), 8 = mismatch (X) — both consume query and reference.
        if operation == 7 or operation == 8:
            query_block_pos, reference_block_pos = adjust_position(
                operation, query_block_pos, reference_block_pos, length)
            if operation == 8:
                mismatches.append(query_block_pos)
            # Reached the end of the aligned query span — close out the final block.
            if query_block_pos == query_end:
                add_block(query_block_pos, reference_block_pos, aln_id, alignment, query_block_start,
                          reference_block_start, mismatches, new_blocks, merged_children_coords, parent)
                mismatches = []
        # CIGAR ops: 1 = insertion (I), 2 = deletion (D) — an indel ends the current block.
        elif operation == 1 or operation == 2:
            add_block(query_block_pos, reference_block_pos, aln_id, alignment, query_block_start,
                      reference_block_start, mismatches, new_blocks, merged_children_coords, parent)
            mismatches = []
            # Skip past the gap and start a fresh block on the far side.
            query_block_pos, reference_block_pos = adjust_position(
                operation, query_block_pos, reference_block_pos, length)
            query_block_start = query_block_pos
            reference_block_start = reference_block_pos
    return new_blocks