def get_aligned_blocks(alignment, aln_id, feature_hierarchy, search_type):
    """Walk an alignment's CIGAR and collect its aligned blocks.

    The alignment's query name is converted back to the original feature id
    to look up the parent feature and its children in ``feature_hierarchy``.
    A block is handed to ``add_block`` every time a gap operation is met and
    once more when the query position reaches the end of the aligned query.

    Args:
        alignment: Alignment record exposing ``cigar``, ``query_name`` and
            ``reference_start``.
        aln_id: Identifier forwarded unchanged to ``add_block``.
        feature_hierarchy: Object exposing ``parents`` and ``children``
            mappings keyed by original feature id.
        search_type: When equal to ``"copies"``, an alignment for which
            ``is_end_to_end_alignment`` returns ``False`` is rejected.

    Returns:
        The block list that was passed to ``add_block`` (presumably filled
        in place by it -- ``add_block`` is not visible here), or ``[]`` when
        a ``"copies"`` search rejects the alignment.
    """
    ops = get_cigar_operations()
    cigar_tuples = alignment.cigar
    query_start, query_end = get_query_start_and_end(
        alignment, cigar_tuples, ops)

    original_id = liftoff_utils.convert_id_to_original(alignment.query_name)
    children = feature_hierarchy.children[original_id]
    parent = feature_hierarchy.parents[original_id]

    # The end-to-end check is only consulted for "copies" searches.
    if search_type == "copies":
        if is_end_to_end_alignment(parent, query_start, query_end) is False:
            return []

    ref_block_start = ref_pos = alignment.reference_start
    query_block_start = query_pos = query_start
    blocks = []
    mismatches = []
    child_intervals = liftoff_utils.merge_children_intervals(children)

    for op, op_length in cigar_tuples:
        if base_is_aligned(op, ops):
            query_pos, ref_pos = add_aligned_base(
                op, query_pos, ref_pos, op_length, ops, mismatches)
            # Reaching the end of the aligned query closes the last block.
            if query_pos == query_end:
                add_block(query_pos, ref_pos, aln_id, alignment,
                          query_block_start, ref_block_start, mismatches,
                          blocks, child_intervals, parent)
            continue

        if not is_alignment_gap(op, ops):
            # Neither an aligned base nor a gap: nothing to do for this op.
            continue

        # A gap closes the current block; the helper then supplies the
        # mismatch list and the start/current positions for the next block.
        add_block(query_pos, ref_pos, aln_id, alignment,
                  query_block_start, ref_block_start, mismatches,
                  blocks, child_intervals, parent)
        (mismatches, query_block_start, ref_block_start,
         query_pos, ref_pos) = end_block_at_gap(
            op, query_pos, ref_pos, op_length)

    return blocks
def sort_alignments(parent_dict, alignments):
    """Return the values of ``alignments`` ordered by their parent feature.

    Each value is a sequence whose first element exposes ``query_name``;
    that name is converted to the original feature id and looked up in
    ``parent_dict``. Parents are ranked by ``(seqid, start)`` and the
    alignment groups are returned in that rank order.

    Args:
        parent_dict: Mapping from original feature id to a parent feature
            exposing ``seqid``, ``start`` and ``id``.
        alignments: Mapping whose values are non-empty sequences of
            alignment records.

    Returns:
        A new list holding the values of ``alignments`` in sorted order.
    """
    def original_id(aligned_group):
        return liftoff_utils.convert_id_to_original(
            aligned_group[0].query_name)

    ranked_parents = sorted(
        (parent_dict[original_id(group)] for group in alignments.values()),
        key=lambda feature: (feature.seqid, feature.start),
    )
    # When a parent occurs more than once, its last position wins.
    rank_by_parent_id = {
        parent.id: rank for rank, parent in enumerate(ranked_parents)
    }
    return sorted(
        alignments.values(),
        key=lambda group: rank_by_parent_id[original_id(group)],
    )
def lift_single_feature(
        threshold, feature_order, features_to_lift, feature_hierarchy,
        previous_feature_start, previous_feature_ref_start,
        previous_gene_seq, unmapped_features, aligned_feature,
        seq_id_threshold, feature_locations, lifted_features_list,
        distance_factor,
):
    """Lift one parent feature from its aligned blocks.

    The parent is identified from the query name of the first element of
    ``aligned_feature``; the best mapping is chosen by
    ``find_best_mapping.find_best_mapping`` and the result is assembled by
    ``merge_lifted_features.merge_lifted_features``.

    Args:
        threshold: Forwarded to ``merge_lifted_features``.
        feature_order: Forwarded to ``merge_lifted_features``.
        features_to_lift: Mapping from original feature id to the parent
            feature (exposing ``start`` and ``end``).
        feature_hierarchy: Forwarded to both helpers.
        previous_feature_start: Forwarded to ``find_best_mapping``.
        previous_feature_ref_start: Forwarded to ``find_best_mapping``.
        previous_gene_seq: Forwarded to ``find_best_mapping``.
        unmapped_features: List forwarded to ``merge_lifted_features``.
        aligned_feature: Non-empty sequence of aligned blocks whose first
            element exposes ``query_name``.
        seq_id_threshold: Forwarded to ``merge_lifted_features``.
        feature_locations: Forwarded to ``find_best_mapping``.
        lifted_features_list: Forwarded to ``find_best_mapping``.
        distance_factor: Forwarded to ``find_best_mapping``.

    Returns:
        A tuple ``(lifted_features, query_name)`` where ``lifted_features``
        is whatever ``merge_lifted_features`` returned and ``query_name`` is
        the query name of the first aligned block.

    Raises:
        IndexError: If ``aligned_feature`` is empty.
        KeyError: If the original feature id is not in ``features_to_lift``.
    """
    # An empty ``aligned_feature`` raises IndexError right here, so there is
    # no separate "nothing aligned" branch: it could never be reached and
    # would have left the return value unbound.
    new_parent_name = aligned_feature[0].query_name
    original_parent_name = liftoff_utils.convert_id_to_original(
        new_parent_name)
    parent = features_to_lift[original_parent_name]
    parent_length = parent.end - parent.start + 1

    lifted_children, alignment_coverage, seq_id = (
        find_best_mapping.find_best_mapping(
            aligned_feature, parent_length, parent, feature_hierarchy,
            previous_feature_start, previous_feature_ref_start,
            previous_gene_seq, feature_locations, lifted_features_list,
            distance_factor))
    lifted_features = merge_lifted_features.merge_lifted_features(
        lifted_children, parent, unmapped_features, threshold,
        new_parent_name, feature_order, feature_hierarchy,
        alignment_coverage, seq_id, seq_id_threshold)

    # Re-read the name after the helper calls, as the original did; the
    # helpers receive the list and are not visible here.
    return lifted_features, aligned_feature[0].query_name
def lift_features_subset(all_overlapping_features, threshold, feature_order,
                         parent_dict, children_dict, intermediate_dict,
                         previous_gene_start, unmapped_features,
                         aligned_feature, seq_id_threshold):
    """Lift one parent feature, taking known overlapping features into account.

    The parent is identified from the query name of the first element of
    ``aligned_feature``. Overlapping features recorded for that name (if
    any) and the name's copy tag are passed to
    ``find_best_mapping.find_best_mapping``; the result is assembled by
    ``merge_lifted_features.merge_lifted_features``.

    Args:
        all_overlapping_features: Mapping from query name to the features
            overlapping it; a missing name is treated as an empty list.
        threshold: Forwarded to ``merge_lifted_features``.
        feature_order: Forwarded to ``merge_lifted_features``.
        parent_dict: Mapping from original feature id to the parent feature
            (exposing ``start`` and ``end``).
        children_dict: Forwarded to ``find_best_mapping``.
        intermediate_dict: Forwarded to ``merge_lifted_features``.
        previous_gene_start: Forwarded to ``find_best_mapping``.
        unmapped_features: List forwarded to ``merge_lifted_features``.
        aligned_feature: Non-empty sequence of aligned blocks whose first
            element exposes ``query_name``.
        seq_id_threshold: Forwarded to ``merge_lifted_features``.

    Returns:
        A tuple ``(lifted_feature_list, query_name, feature_start)``; the
        first and last items come from ``merge_lifted_features`` and
        ``query_name`` is the query name of the first aligned block.

    Raises:
        IndexError: If ``aligned_feature`` is empty.
        KeyError: If the original feature id is not in ``parent_dict``.
    """
    # An empty ``aligned_feature`` raises IndexError right here, so there is
    # no separate "nothing aligned" branch: it could never be reached and
    # would have left ``lifted_feature_list`` unbound.
    new_parent_name = aligned_feature[0].query_name
    original_parent_name = liftoff_utils.convert_id_to_original(
        new_parent_name)
    copy_tag = liftoff_utils.get_copy_tag(new_parent_name)
    overlapping_features = all_overlapping_features.get(new_parent_name, [])
    parent = parent_dict[original_parent_name]
    parent_length = parent.end - parent.start + 1

    # The second returned value (shortest path weight) is not used here.
    lifted_children, _shortest_path_weight, alignment_coverage, seq_id = (
        find_best_mapping.find_best_mapping(
            aligned_feature, parent_length, parent, overlapping_features,
            children_dict, previous_gene_start, copy_tag))
    lifted_feature_list, feature_start = (
        merge_lifted_features.merge_lifted_features(
            lifted_children, parent, unmapped_features, threshold,
            new_parent_name, feature_order, parent_dict, intermediate_dict,
            alignment_coverage, seq_id, seq_id_threshold))

    # Re-read the name after the helper calls, as the original did; the
    # helpers receive the list and are not visible here.
    return lifted_feature_list, aligned_feature[0].query_name, feature_start
def remove_alignments_without_children(all_aligned_blocks, unmapped_features,
                                       feature_hierarchy):
    """Drop entries with no aligned blocks and report their parents as unmapped.

    Every key of ``all_aligned_blocks`` whose value equals ``[]`` is removed
    from the mapping in place. For each such key the parent feature, looked
    up in ``feature_hierarchy.parents`` under the key's original feature id,
    is appended to ``unmapped_features``.

    Args:
        all_aligned_blocks: Mapping from query name to a list of blocks;
            modified in place.
        unmapped_features: List that receives the parents of removed entries.
        feature_hierarchy: Object exposing a ``parents`` mapping keyed by
            original feature id.

    Returns:
        The same ``all_aligned_blocks`` object, after removal.
    """
    empty_names = [
        name for name, blocks in all_aligned_blocks.items() if blocks == []
    ]
    # Report every parent first, then delete: the mapping is left untouched
    # until all lookups have succeeded.
    for name in empty_names:
        original_id = liftoff_utils.convert_id_to_original(name)
        unmapped_features.append(feature_hierarchy.parents[original_id])
    for name in empty_names:
        del all_aligned_blocks[name]
    return all_aligned_blocks
def get_aligned_blocks(alignment, aln_id, children_dict, parent_dict,
                       search_type):
    """Walk an alignment's CIGAR and collect its aligned blocks.

    The alignment's query name is converted back to the original feature id
    to look up the parent feature and its children. A block is handed to
    ``add_block`` before every operation with code 1 or 2 and once more when
    the query position reaches the end of the aligned query.

    Args:
        alignment: Alignment record exposing ``cigar``, ``query_name``,
            ``query_alignment_start``, ``query_alignment_end`` and
            ``reference_start``.
        aln_id: Identifier forwarded unchanged to ``add_block``.
        children_dict: Mapping from original feature id to child features.
        parent_dict: Mapping from original feature id to the parent feature
            (exposing ``start`` and ``end``).
        search_type: When equal to ``"copies"``, an alignment whose aligned
            query span differs from the parent's length is rejected.

    Returns:
        The block list that was passed to ``add_block`` (presumably filled
        in place by it -- ``add_block`` is not visible here), or ``[]`` when
        a ``"copies"`` search rejects the alignment.
    """
    # CIGAR operation codes tested below. The labels follow the SAM/pysam
    # numbering and assume ``alignment`` is a pysam AlignedSegment -- confirm.
    insertion, deletion, hard_clip, seq_match, seq_mismatch = 1, 2, 5, 7, 8

    cigar_tuples = alignment.cigar
    query_start = alignment.query_alignment_start
    query_end = alignment.query_alignment_end

    original_id = liftoff_utils.convert_id_to_original(alignment.query_name)
    children = children_dict[original_id]
    parent = parent_dict[original_id]

    parent_length = parent.end - parent.start + 1
    aligned_length = query_end - query_start
    if search_type == "copies" and parent_length != aligned_length:
        return []

    ref_block_start = ref_pos = alignment.reference_start

    # A leading operation with code 5 shifts both query bounds by its length.
    leading_op = cigar_tuples[0]
    if leading_op[0] == hard_clip:
        query_start += leading_op[1]
        query_end += leading_op[1]

    query_block_start = query_pos = query_start
    blocks, mismatches = [], []
    child_intervals = liftoff_utils.merge_children_intervals(children)

    for op, op_length in cigar_tuples:
        if op in (seq_match, seq_mismatch):
            query_pos, ref_pos = adjust_position(
                op, query_pos, ref_pos, op_length)
            # NOTE(review): one position (the one after the advance) is
            # recorded per mismatch operation, whatever its length --
            # confirm this is intended.
            if op == seq_mismatch:
                mismatches.append(query_pos)
            # Reaching the end of the aligned query closes the last block.
            if query_pos == query_end:
                add_block(query_pos, ref_pos, aln_id, alignment,
                          query_block_start, ref_block_start, mismatches,
                          blocks, child_intervals, parent)
                mismatches = []
        elif op in (insertion, deletion):
            # A gap closes the current block; the next one starts at the
            # positions reached after stepping over the gap.
            add_block(query_pos, ref_pos, aln_id, alignment,
                      query_block_start, ref_block_start, mismatches,
                      blocks, child_intervals, parent)
            mismatches = []
            query_pos, ref_pos = adjust_position(
                op, query_pos, ref_pos, op_length)
            query_block_start, ref_block_start = query_pos, ref_pos

    return blocks
def find_neighbor_location(ref_parents, alignment, lifted_feature_list,
                           ref_parent_order):
    """Locate the already-lifted upstream neighbor of an aligned feature.

    The reference feature is found from the query name of ``alignment[0]``.
    Its upstream neighbor is requested from
    ``liftoff_utils.find_nonoverlapping_upstream_neighbor`` and then looked
    up in ``lifted_feature_list`` under the neighbor's name with ``"_0"``
    appended.

    Args:
        ref_parents: Mapping from original feature id to the reference
            parent feature (exposing ``id`` and ``start``).
        alignment: Sequence whose first element exposes ``query_name``.
        lifted_feature_list: Mapping from lifted feature name to a sequence
            whose first element exposes ``start`` and ``seqid``.
        ref_parent_order: Forwarded to the neighbor-finding helper.

    Returns:
        ``(lifted_start, lifted_seqid, reference_start)`` for the neighbor,
        or ``(0, "", 0)`` when no neighbor is reported or it has not been
        lifted.
    """
    query_id = liftoff_utils.convert_id_to_original(alignment[0].query_name)
    ref_feature = ref_parents[query_id]
    neighbor_name = liftoff_utils.find_nonoverlapping_upstream_neighbor(
        ref_parent_order, ref_feature.id)

    if neighbor_name is None:
        return 0, "", 0

    lifted_key = neighbor_name + "_0"
    if lifted_key not in lifted_feature_list:
        return 0, "", 0

    lifted_neighbor = lifted_feature_list[lifted_key][0]
    return (lifted_neighbor.start,
            lifted_neighbor.seqid,
            ref_parents[neighbor_name].start)
def parse_alignment(file, parent_dict, children_dict, unmapped_features,
                    search_type, max_alns):
    """Read a SAM file and group aligned blocks by (edited) query name.

    Each record flagged as unmapped has its parent, looked up in
    ``parent_dict`` by the raw query name, appended to ``unmapped_features``.
    Every other record is renamed via ``edit_name`` and counted per name;
    only the first ``max_alns`` records of a name are converted to blocks
    with ``get_aligned_blocks``. Names that end up with no blocks are
    removed from the result and their parents are reported as unmapped.

    Args:
        file: Path (or file object) passed to ``pysam.AlignmentFile``.
        parent_dict: Mapping from original feature id to parent feature.
        children_dict: Forwarded to ``get_aligned_blocks``.
        unmapped_features: List that receives parents of unmapped or
            block-less features; modified in place.
        search_type: Forwarded to ``edit_name`` and ``get_aligned_blocks``.
        max_alns: Maximum number of alignments per query name that are
            converted to blocks.

    Returns:
        Dict mapping each query name to its non-empty list of blocks.
    """
    all_aligned_blocks = {}
    name_dict = {}
    align_count_dict = {}
    aln_id = 0

    # Open the file in a ``with`` block so the handle is closed on exit,
    # including when a lookup or helper raises part-way through.
    # NOTE(review): assumes the blocks built by get_aligned_blocks do not
    # need the open file handle afterwards -- confirm against add_block.
    with pysam.AlignmentFile(file, 'r', check_sq=False,
                             check_header=False) as sam_file:
        for ref_seq in sam_file.fetch():
            if ref_seq.is_unmapped:
                unmapped_features.append(parent_dict[ref_seq.query_name])
                continue

            ref_seq.query_name = edit_name(search_type, ref_seq, name_dict)
            query_name = ref_seq.query_name
            aln_id += 1

            # Zero-based index of this alignment among those of its name.
            align_count = align_count_dict.get(query_name, -1) + 1
            align_count_dict[query_name] = align_count
            if align_count >= max_alns:
                continue

            aligned_blocks = get_aligned_blocks(
                ref_seq, aln_id, children_dict, parent_dict, search_type)
            if query_name in all_aligned_blocks:
                all_aligned_blocks[query_name].extend(aligned_blocks)
            else:
                all_aligned_blocks[query_name] = aligned_blocks

    # Names whose alignments produced no blocks are reported as unmapped.
    unaligned_names = [
        name for name, blocks in all_aligned_blocks.items() if not blocks
    ]
    for name in unaligned_names:
        unmapped_features.append(
            parent_dict[liftoff_utils.convert_id_to_original(name)])
    for name in unaligned_names:
        del all_aligned_blocks[name]
    return all_aligned_blocks
def resolve_overlapping_homologues(all_aligned_segs, lifted_feature_list,
                                   features_to_remap, unmapped_features,
                                   threshold, parent_dict, intermediate_dict,
                                   children_dict, feature_db,
                                   original_parent_order, seq_id_threshold):
    """Repeatedly re-lift features until none is left to remap.

    In each round every feature in ``features_to_remap`` is removed from
    ``lifted_feature_list``, recorded via ``add_overlapping_feature`` and
    lifted again with ``lift_features.lift_all_features``. After
    ``clean_overlapping_features`` runs, the features that were lifted again
    are passed to ``check_homologues``, whose result becomes the next
    ``features_to_remap``. The number of rounds is capped at ten times the
    initial number of features to remap. Anything still pending afterwards
    is removed from ``lifted_feature_list`` and its parent appended to
    ``unmapped_features``.

    Args:
        all_aligned_segs: Mapping from feature name to its aligned segments.
        lifted_feature_list: Mapping of lifted features; modified in place.
        features_to_remap: Iterable of feature names to re-lift; also
            passed to ``add_overlapping_feature``.
        unmapped_features: List receiving parents of unresolved features.
        threshold: Forwarded to ``lift_all_features``.
        parent_dict: Mapping from original feature id to parent feature.
        intermediate_dict: Forwarded to ``lift_all_features``.
        children_dict: Forwarded to ``lift_all_features``.
        feature_db: Forwarded to ``lift_all_features``.
        original_parent_order: Forwarded to ``check_homologues``.
        seq_id_threshold: Forwarded to ``lift_all_features``.

    Returns:
        The same ``lifted_feature_list`` object after all updates.
    """
    all_overlapping_features = {}
    max_rounds = 10 * len(features_to_remap)

    for _ in range(max_rounds):
        if len(features_to_remap) == 0:
            break

        aligned_segs_to_remap = {}
        for name in features_to_remap:
            del lifted_feature_list[name]
            aligned_segs_to_remap[name] = all_aligned_segs[name]
            add_overlapping_feature(
                features_to_remap, name, all_overlapping_features)

        lift_features.lift_all_features(
            aligned_segs_to_remap, all_overlapping_features, threshold,
            feature_db, parent_dict, children_dict, intermediate_dict,
            unmapped_features, lifted_feature_list, seq_id_threshold)
        clean_overlapping_features(
            lifted_feature_list, all_overlapping_features, parent_dict,
            features_to_remap, unmapped_features)

        # Only features that were lifted again are re-checked.
        features_to_check = {
            name: lifted_feature_list[name]
            for name in features_to_remap
            if name in lifted_feature_list
        }
        features_to_remap = check_homologues(
            lifted_feature_list, features_to_check, parent_dict,
            original_parent_order)

    # Whatever is still pending after the last round is given up on.
    for unresolved in features_to_remap:
        original_id = liftoff_utils.convert_id_to_original(unresolved)
        unmapped_features.append(parent_dict[original_id])
        del lifted_feature_list[unresolved]
    return lifted_feature_list
def lift_single_feature(all_overlapping_features, threshold, feature_order,
                        features_to_lift, feature_hierarchy,
                        previous_gene_start, unmapped_features,
                        aligned_feature, seq_id_threshold):
    """Lift one parent feature, taking known overlapping features into account.

    The parent is identified from the query name of the first element of
    ``aligned_feature``. The overlapping features for that name are obtained
    from ``get_overlapping_features_list`` and passed, with
    ``feature_hierarchy.children``, to
    ``find_best_mapping.find_best_mapping``; the result is assembled by
    ``merge_lifted_features.merge_lifted_features``.

    Args:
        all_overlapping_features: Forwarded to
            ``get_overlapping_features_list``.
        threshold: Forwarded to ``merge_lifted_features``.
        feature_order: Forwarded to ``merge_lifted_features``.
        features_to_lift: Mapping from original feature id to the parent
            feature (exposing ``start`` and ``end``).
        feature_hierarchy: Object exposing ``children``; also forwarded to
            ``merge_lifted_features``.
        previous_gene_start: Forwarded to ``find_best_mapping``.
        unmapped_features: List forwarded to ``merge_lifted_features``.
        aligned_feature: Non-empty sequence of aligned blocks whose first
            element exposes ``query_name``.
        seq_id_threshold: Forwarded to ``merge_lifted_features``.

    Returns:
        A tuple ``(lifted_feature_list, query_name, feature_start)``; the
        first and last items come from ``merge_lifted_features`` and
        ``query_name`` is the query name of the first aligned block.

    Raises:
        IndexError: If ``aligned_feature`` is empty.
        KeyError: If the original feature id is not in ``features_to_lift``.
    """
    # An empty ``aligned_feature`` raises IndexError right here, so there is
    # no separate "nothing aligned" branch: it could never be reached and
    # would have left ``lifted_feature_list`` unbound.
    new_parent_name = aligned_feature[0].query_name
    original_parent_name = liftoff_utils.convert_id_to_original(
        new_parent_name)
    overlapping_features = get_overlapping_features_list(
        new_parent_name, all_overlapping_features)
    parent = features_to_lift[original_parent_name]
    parent_length = parent.end - parent.start + 1

    lifted_children, alignment_coverage, seq_id = (
        find_best_mapping.find_best_mapping(
            aligned_feature, parent_length, parent, overlapping_features,
            feature_hierarchy.children, previous_gene_start))
    lifted_feature_list, feature_start = (
        merge_lifted_features.merge_lifted_features(
            lifted_children, parent, unmapped_features, threshold,
            new_parent_name, feature_order, feature_hierarchy,
            alignment_coverage, seq_id, seq_id_threshold))

    # Re-read the name after the helper calls, as the original did; the
    # helpers receive the list and are not visible here.
    return lifted_feature_list, aligned_feature[0].query_name, feature_start
def remove_unresolved_features(features_to_remap, parent_dict,
                               lifted_feature_list, unmapped_features):
    """Give up on features that are still waiting to be remapped.

    For every name in ``features_to_remap`` the parent feature, looked up
    in ``parent_dict`` under the name's original feature id, is appended to
    ``unmapped_features`` and the name is deleted from
    ``lifted_feature_list``.

    Args:
        features_to_remap: Iterable of lifted feature names.
        parent_dict: Mapping from original feature id to parent feature.
        lifted_feature_list: Mapping of lifted features; modified in place.
        unmapped_features: List receiving the parents; modified in place.

    Returns:
        None.
    """
    for unresolved_name in features_to_remap:
        original_id = liftoff_utils.convert_id_to_original(unresolved_name)
        parent_feature = parent_dict[original_id]
        unmapped_features.append(parent_feature)
        del lifted_feature_list[unresolved_name]