def check_tiling(reads, breaks, contig_len, debug=False):
    """Checks if there are reads tiling across breakpoints with no gaps

    This will be used for checking the integrity of a breakpoint where there
    is a novel sequence of considerable size and there are not enough
    flanking sequences for read pairs to suggest validity of the fragment

    Args:
        reads: (list) Pysam AlignedRead objects
        breaks: (tuple) sorted coordinates of breakpoint positions in contigs
        contig_len: (int) length of the contig
    Returns:
        Boolean if there are reads spanning across breakpoints with no gaps
    """
    span = None
    for read in reads:
        # skip reads that are unmapped, not properly paired, the second mate,
        # or not fully mapped
        if not read.alen or not is_fully_mapped(read, contig_len):
            continue
        # skip reads that don't overlap the breakpoints
        if read.pos + read.alen < breaks[0] or read.pos > breaks[1]:
            continue

        read_span = intspan('%d-%d' % (read.pos + 1, read.pos + read.alen))
        span = read_span if span is None else span.union(read_span)

    if span is not None:
        break_span = intspan('%d-%d' % (breaks[0], breaks[1]))
        # make sure the tiling reads have no gap and span the entire breakpoint
        if len(span.ranges()) == 1 and len(span & break_span) == len(break_span):
            return True

    return False

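# A minimal, self-contained sketch of the gap-detection idea above (the read
# coordinates and the breakpoint interval are made up for illustration): two
# reads covering 10-30 and 25-50 merge into one gapless block that fully
# contains the breakpoint interval 20-40.
from intspan import intspan

_read_spans = [(10, 30), (25, 50)]  # hypothetical 1-based (start, end) read spans
_span = None
for _start, _end in _read_spans:
    _piece = intspan('%d-%d' % (_start, _end))
    _span = _piece if _span is None else _span.union(_piece)
_break_span = intspan('20-40')
# a single contiguous range that covers every breakpoint position
assert len(_span.ranges()) == 1
assert len(_span & _break_span) == len(_break_span)
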
def find_untemplated_sequence(aligns, contig_seq):
    """Finds untemplated sequence in chimeric breakpoint

    This corresponds to any sequence at the breakpoint that is not covered
    by the 2 alignments in the chimera. The sequence will be given in the
    same strand as the first alignment (the first and second alignments
    should have the same strand)

    Args:
        aligns: (list) 2 Alignment objects of chimera
        contig_seq: (str) Contig sequence
    Returns:
        Untemplated sequence or None
    """
    untemplated_seq = '-'
    contig_span1 = intspan('%s-%s' % (aligns[0].qstart, aligns[0].qend))
    contig_span2 = intspan('%s-%s' % (aligns[1].qstart, aligns[1].qend))
    sorted_contig_coords = sorted([aligns[0].qstart, aligns[0].qend,
                                   aligns[1].qstart, aligns[1].qend])
    whole_span = intspan('%s-%s' % (min(sorted_contig_coords),
                                    max(sorted_contig_coords)))
    unmapped = whole_span - contig_span1 - contig_span2
    if len(unmapped) > 0:
        unmapped_coords = unmapped.ranges()
        untemplated_seq = contig_seq[unmapped_coords[0][0] - 1:unmapped_coords[0][1]]
        # sequence given in relation to strand of first alignment
        if aligns[0].strand == '-':
            untemplated_seq = reverse_complement(untemplated_seq)

    return untemplated_seq

def create_span(blocks):
    """Creates intspan for each block"""
    span = None
    for block in blocks:
        if isinstance(block, (tuple, list)) and len(block) == 2:
            if span is None:
                span = intspan('%s-%s' % (block[0], block[1]))
            else:
                span = span.union(intspan('%s-%s' % (block[0], block[1])))

    return span

def create_span(cls, blocks):
    """Creates intspan for each block

    Used by self.overlap()
    """
    span = None
    for block in blocks:
        block_span = intspan('%s-%s' % (block[0], block[1]))
        span = block_span if span is None else span.union(block_span)

    return span

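# Example (illustrative, not from the original sources): two coordinate
# blocks collapse into a single intspan covering both ranges.
assert create_span([(1, 3), (5, 8)]) == intspan('1-3,5-8')
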
def preprocess_variant(q_i):
    """
    Preprocess question's variant (if necessary)

    :param q_i: the Q_info object, whose fields are
                'id,kind,subkind,nb_source,options,order'
    :type q_i: Q_info (named tuple)
    """
    if q_i.id == 'order_of_operations':
        default_variant = {
            'order_of_operations': {'variant': '0-23,100-87'}
        }
        if ('variant' not in q_i.options
                or ('variant' in q_i.options and q_i.options['variant'] == '')):
            q_i.options.update(default_variant[q_i.id])
        try:
            variants_to_pick_from = intspan(q_i.options['variant'])
        except ParseError:
            raise ValueError('Incorrect variant in xml file: {}'
                             .format(q_i.options['variant']))
        raw_query = '('
        last = len(variants_to_pick_from.ranges()) - 1
        for i, r in enumerate(variants_to_pick_from.ranges()):
            if r[0] == r[1]:
                raw_query += 'nb1 = ' + str(r[0])
            else:
                raw_query += '(nb1 >= {} AND nb1 <= {})'.format(r[0], r[1])
            if i < last:
                raw_query += ' OR '
        raw_query += ')'
        q_i.options.update(
            {'variant': int(shared
                            .order_of_operations_variants_source
                            .next(**{'raw': raw_query})[0])})

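# A standalone sketch of the query-building step above: each contiguous range
# of the chosen intspan becomes one SQL predicate, and singletons become
# equality tests. The helper name and the sample span string are hypothetical.
def _span_to_raw_query(span_str):
    pieces = []
    for lo, hi in intspan(span_str).ranges():
        pieces.append('nb1 = {}'.format(lo) if lo == hi
                      else '(nb1 >= {} AND nb1 <= {})'.format(lo, hi))
    return '(' + ' OR '.join(pieces) + ')'


assert _span_to_raw_query('0-3,8') == '((nb1 >= 0 AND nb1 <= 3) OR nb1 = 8)'
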
def locate_features(breaks, orients, features):
    """Find the 'best' gene features of the breakpoints of a given event

    It will first determine which features overlap which breakpoint.
    If there are features that overlap both breakpoints, only they will be
    considered in picking the 'best' suitable one.

    Args:
        breaks: (tuple) the 2 breakpoints ((chr1, pos1), (chr2, pos2))
        orients: (tuple) the 2 orientations ('L|R', 'L|R')
        features: (list) Interval objects of a given event from parsing the
                  bedpe overlap file
    Returns:
        A tuple of the 2 features (Interval objects) picked to annotate the
        2 breakpoints (can be (None, None) if nothing is found)
    """
    # use intspan to intersect individual breakpoints with feature coordinates
    break1_span = intspan('%s-%s' % (breaks[0][1], breaks[0][1]))
    break2_span = intspan('%s-%s' % (breaks[1][1], breaks[1][1]))

    # categorize features by whether breakpoint 1, 2, or both overlap;
    # use sets because there may be redundancy
    overlaps = {'both': set(), '1': set(), '2': set()}
    for feature in features:
        feature_span = intspan('%s-%s' % (feature.start + 1, feature.stop))
        overlap1 = bool(feature.chrom == breaks[0][0] and feature_span & break1_span)
        overlap2 = bool(feature.chrom == breaks[1][0] and feature_span & break2_span)

        if overlap1 and overlap2:
            overlaps['both'].add(feature)
        elif overlap1:
            overlaps['1'].add(feature)
        elif overlap2:
            overlaps['2'].add(feature)

    # only consider features that overlap both breakpoints if such are found
    if overlaps['both']:
        best_feature1 = pick_feature(breaks[0], orients[0], overlaps['both'])
        best_feature2 = pick_feature(breaks[1], orients[1], overlaps['both'])
    else:
        best_feature1 = pick_feature(breaks[0], orients[0], overlaps['1'])
        best_feature2 = pick_feature(breaks[1], orients[1], overlaps['2'])

    return best_feature1, best_feature2

def get_contig_coverage(aligns, end_to_end=False):
    """Coverage of the contig by the union of the primary_aligns alignments

    Args:
        aligns: (list) All Alignments constituting a chimera
        end_to_end: (bool) measure the extent from the first to the last
                    aligned base instead of the union of aligned bases
    Returns:
        Fraction corresponding to coverage
    """
    span = intspan('%d-%d' % (aligns[0].qstart, aligns[0].qend))
    for i in range(1, len(aligns)):
        span = span.union(intspan('%d-%d' % (aligns[i].qstart, aligns[i].qend)))

    if not end_to_end:
        return len(span) / float(aligns[0].query_len)
    else:
        return (max(span) - min(span) + 1) / float(aligns[0].query_len)

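# Illustration with stub alignments (attribute names follow the code above;
# all numbers are made up): query blocks 1-40 and 35-90 on a 100-base contig
# cover 90 of its 100 positions.
from collections import namedtuple

_Aln = namedtuple('_Aln', 'qstart qend query_len')
assert get_contig_coverage([_Aln(1, 40, 100), _Aln(35, 90, 100)]) == 0.9
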
def check_inv_dup(adj, aligns):
    if adj.rearrangement == 'inv':
        target_span_before_bp = intspan('%s-%s' % (aligns[0].tstart, aligns[0].tend))
        target_span_after_bp = intspan('%s-%s' % (aligns[1].tstart, aligns[1].tend))
        # intspan keeps set semantics: '<' tests whether the span after the
        # breakpoint is a proper subset of the span before it
        if target_span_after_bp < target_span_before_bp:
            adj.rearrangement = 'inv-dup'
            # reverse breakpoint and orientation to make it same as a dup
            if adj.target_breaks[1] == aligns[1].tstart:
                adj.target_breaks[1] = aligns[1].tend
            else:
                adj.target_breaks[1] = aligns[1].tstart
            if adj.orients[1] == 'L':
                adj.orients[1] = 'R'
            else:
                adj.orients[1] = 'L'

def remove_rows(table_name, id_span):
    """Remove rows matching the ids from id_span from the table."""
    _assert_table_exists(table_name)
    for id_ in list(intspan(id_span)):
        _assert_row_exists(table_name, id_)
    values = _intspan2sqllist(id_span)
    cmd = f'DELETE FROM {table_name} WHERE id IN {values};'
    _exec(table_name, cmd)
    _reset_table_ids(table_name)

def find_columns(lines):
    """
    Given a list of text lines, assume they represent a fixed-width "ASCII
    table" and guess the column indices therein. Depends on finding
    typographical "rivers" of spaces running vertically through the text,
    indicating column breaks. This is a high-probability heuristic (based on
    the many tests performed on it). There are some cases where all rows
    happen to include aligned spaces that do *not* signify a column break.
    In this case, it is recommended you modify the table with a separator
    line (e.g. using --- characters) showing where the columns should be.
    Since separators are stripped out, adding an explicit set of separators
    will not alter result data.
    """
    # Partition lines into seps (separators and blank lines) and nonseps (content)
    nonseps, seps = partition(is_separator, lines)

    # Find max length of content lines. This defines the "universe" of
    # available content columns. Use only non-separator lines because they
    # are the content we care most about.
    maxlen = max(len(l) for l in nonseps)
    universe = intspan.from_range(0, maxlen - 1)

    # If there are separator lines, try to find definitive vertical
    # separation markers in them to define column boundaries.
    if seps:
        indices = col_break_indices(seps)
        iranges = (universe - indices).ranges()
    else:
        indices = None

    if not seps or not indices:
        # If horizontal separators are not present, or are present but lack
        # the vertical separation indicators needed to determine column
        # locations, look for vertical separators common to all rows.
        # A rare, but genuine case.
        indices = col_break_indices(nonseps, 'intersection_update')

        if not indices:
            # Vertical separators not found. Fall back to using vertical
            # whitespace rivers as column separators. Find where spaces are
            # in every column.
            indices = intspan.from_range(0, maxlen - 1)
            for l in lines:
                line_spaces = intspan(all_indices(l, ' '))
                indices.intersection_update(line_spaces)

        # indices is now an intspan showing where spaces or vertical seps are

        # Find inclusive ranges where content would be
        iranges = (universe - indices).ranges()

    # Convert inclusive ranges to half-open Python ranges
    hranges = [(s, e + 1) for s, e in iranges]
    return seps, nonseps, hranges

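# A tiny illustration of the whitespace-"rivers" fallback above: intersect
# the space positions of every row; the surviving indices are gaps common to
# all rows, and the leftover runs are the columns. (Sample rows are made up.)
_rows = ['alpha  one', 'beta   two']
_universe = intspan.from_range(0, max(len(r) for r in _rows) - 1)
_spaces = intspan(_universe)
for _r in _rows:
    _spaces.intersection_update(intspan(i for i, c in enumerate(_r) if c == ' '))
assert (_universe - _spaces).ranges() == [(0, 4), (7, 9)]
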
def col_break_indices(lines, combine='update'):
    """
    Given a set of horizontal separator lines, return a guess as to which
    indices have column breaks, based on common indicator characters.
    """
    all_lines_indices = [vertical_sep_in_line(line) for line in lines]
    combined = intspan(all_lines_indices[0])
    update_func = getattr(combined, combine)
    for line_indices in all_lines_indices[1:]:
        update_func(line_indices)
    return combined

def _coverage(path):
    # `aligns` is presumably a list of Alignment objects from the enclosing scope
    spans = [intspan('%d-%d' % (aligns[i].qstart, aligns[i].qend)) for i in path]
    covered = spans[0]
    overlaps = []
    for i in range(1, len(spans)):
        covered = covered.union(spans[i])
        overlap = spans[i - 1].intersection(spans[i])
        if len(overlap) > 0:
            overlaps.append(overlap)
    return covered, overlaps

def scan_missing_segments(group_name):
    """Scan for previously missed segments."""
    log.info('missing: checking for missed segments')

    with db_session() as db:
        # recheck for anything to delete
        expired = db.query(Miss).filter(
            Miss.attempts >= config.scan.get('miss_retry_limit')).filter(
            Miss.group_name == group_name).delete()
        db.commit()

        if expired:
            log.info('missing: deleted {} expired misses'.format(expired))

        # get missing articles for this group
        missing_messages = [
            r for r, in db.query(Miss.message).filter(
                Miss.group_name == group_name).all()
        ]

        if missing_messages:
            # mash it into ranges
            missing_ranges = intspan(missing_messages).ranges()

            server = Server()
            server.connect()

            status, parts, messages, missed = server.scan(
                group_name, message_ranges=missing_ranges)

            # if we got some missing parts, save them
            if parts:
                pynab.parts.save_all(parts)

            # even if they got blacklisted, delete the ones we got from the misses
            if messages:
                db.query(Miss).filter(Miss.message.in_(messages)).filter(
                    Miss.group_name == group_name).delete(False)
                db.commit()

            if missed:
                # clear up those we didn't get
                save_missing_segments(group_name, missed)

            if server.connection:
                try:
                    server.connection.quit()
                except Exception:
                    pass

def find_microhomology(aligns, contig_seq):
    """Finds microhomology given 2 alignments and contig sequence

    The homology sequence is based on the contig sequence.
    Homology is found based on the fact that BWA-mem will report overlapping
    contig coordinates in chimeric alignments.

    Args:
        aligns: (list) 2 Alignment objects of chimera
        contig_seq: (str) Contig sequence
    Returns:
        Tuple of homology sequence (str) and homology (contig) coordinates
        ((int, int))
    """
    homol_seq = None
    homol_coords = None
    contig_span1 = intspan('%s-%s' % (aligns[0].qstart, aligns[0].qend))
    contig_span2 = intspan('%s-%s' % (aligns[1].qstart, aligns[1].qend))
    overlap = contig_span1.intersection(contig_span2)
    if len(overlap) > 0:
        homol_coords = overlap.ranges()[0]
        homol_seq = contig_seq[homol_coords[0] - 1:homol_coords[1]]

    return homol_seq, homol_coords

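# Illustration with stub chimeric alignments (attribute names follow the code
# above; coordinates and sequence are made up): query blocks 1-12 and 10-20
# overlap at 10-12, so contig bases 10-12 are the microhomology.
from collections import namedtuple

_Chim = namedtuple('_Chim', 'qstart qend')
_seq = 'ACGTACGTACGTACGTACGT'
_homol_seq, _homol_coords = find_microhomology([_Chim(1, 12), _Chim(10, 20)], _seq)
assert _homol_coords == (10, 12)
assert _homol_seq == _seq[9:12]
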
def derangify(nodes):
    data_array = []
    for node in nodes:
        match = re.match(r'(.*)\[(.*)\](\..*)', node)
        if match:
            (prefix, nrange, suffix) = match.groups()
            length = 0
            if re.match(r'.*\-.*', nrange):
                (r1, r2) = nrange.split('-')
                length = len(str(r2))
            for num in intspan(nrange):
                data_array.append("%s%s%s" % (prefix, str(num).zfill(length), suffix))
        else:
            data_array.append(node)
    return data_array

def derangify(nodes):
    data_array = []
    for node in nodes:
        # e.g. vsccwn[100-300]-brn.vscc.vrsn.com
        # ^([a-zA-Z]+)(\d+)([a-zA-Z0-9-]+)(\..*)$
        match = re.match(r'(.*)\[(.*)\](\-.*)', node)
        if match:
            (prefix, nrange, suffix) = match.groups()
            length = 0
            if re.match(r'.*\-.*', nrange):
                (r1, r2) = nrange.split('-')
                length = len(str(r2))
            for num in intspan(nrange):
                data_array.append("%s%s%s" % (prefix, str(num).zfill(length), suffix))
        else:
            data_array.append(node)
    return data_array

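# Example (hostname pattern taken from the comment above, with a shorter
# range for brevity): bracketed ranges expand via intspan, zero-padded to the
# width of the range's upper bound; nodes without brackets pass through.
assert derangify(['vsccwn[100-102]-brn.vscc.vrsn.com', 'plain-node']) == [
    'vsccwn100-brn.vscc.vrsn.com',
    'vsccwn101-brn.vscc.vrsn.com',
    'vsccwn102-brn.vscc.vrsn.com',
    'plain-node',
]
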
def update_years2str(year_str, years=None):
    """
    Given a string of years of the form ``"2014, 2016-2017"``, update the
    string if necessary to include the given years (default: the current
    year).

    >>> update_years2str('2015', [2015])
    '2015'
    >>> update_years2str('2015', [2016])
    '2015-2016'
    >>> update_years2str('2015', [2017])
    '2015, 2017'
    >>> update_years2str('2014-2015', [2016])
    '2014-2016'
    >>> update_years2str('2013, 2015', [2016])
    '2013, 2015-2016'
    >>> update_years2str('2013, 2015', [2017, 2014])
    '2013-2015, 2017'
    """
    if years is None:
        years = [time.localtime().tm_year]
    yearspan = intspan(year_str)
    yearspan.update(years)
    return years2str(yearspan)

def _intspan2sqllist(s):
    """Turn an ints' span (given as str) into a SQLite list of values."""
    values = ', '.join([str(n) for n in list(intspan(s))])
    return f'({values})'

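# Example (illustrative): the span string '1-3,7' expands to its individual
# ids, formatted as a parenthesized SQL value list.
assert _intspan2sqllist('1-3,7') == '(1, 2, 3, 7)'
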
def test_intspansproduct_filter_packs():
    packs_list = \
        [[[intspan('1-2'), intspan('1-2')], [intspan('3-4'), intspan('5-6')]],
         [[intspan('1-2'), intspan('3-4')], [intspan('1-2'), intspan('5-6')]],
         [[intspan('1-2'), intspan('5-6')], [intspan('1-2'), intspan('3-4')]]]
    assert IntspansProduct._filter_packs(packs_list) == []
    packs_list = \
        [[[intspan('1-2'), intspan('1-2'), intspan('3-4')], [intspan('5-6')]],
         [[intspan('1-2'), intspan('1-2'), intspan('5-6')], [intspan('3-4')]],
         [[intspan('1-2'), intspan('3-4'), intspan('5-6')], [intspan('1-2')]]]
    assert IntspansProduct._filter_packs(packs_list) == []
    packs_list = \
        [[[intspan('1-2'), intspan('1,5')], [intspan('1,3'), intspan('3-4')]],
         [[intspan('1-2'), intspan('3-4')], [intspan('1,3'), intspan('1,5')]],
         [[intspan('1-2'), intspan('1,3')], [intspan('1,5'), intspan('3-4')]]]
    assert IntspansProduct._filter_packs(packs_list) == \
        [[intspan('1'), intspan('3')]]
    packs_list = \
        [[[intspan('1-2'), intspan('1,3'), intspan('1,5')], [intspan('3-4')]],
         [[intspan('1-2'), intspan('1,5'), intspan('3-4')], [intspan('1,3')]],
         [[intspan('1-2'), intspan('1,3'), intspan('3-4')], [intspan('1,5')]],
         [[intspan('1,3'), intspan('1,5'), intspan('3-4')], [intspan('1-2')]]]
    assert IntspansProduct._filter_packs(packs_list) == \
        [[intspan('1'), intspan('3-4')]]
    packs_list = \
        [[[intspan('20-30'), intspan('20-40'), intspan('20-50')],
          [intspan('20-60'), intspan('20-90')]],
         [[intspan('20-30'), intspan('20-40'), intspan('20-60')],
          [intspan('20-50'), intspan('20-90')]],
         [[intspan('20-30'), intspan('20-40'), intspan('20-90')],
          [intspan('20-50'), intspan('20-60')]],
         [[intspan('20-30'), intspan('20-50'), intspan('20-60')],
          [intspan('20-40'), intspan('20-90')]],
         [[intspan('20-30'), intspan('20-50'), intspan('20-90')],
          [intspan('20-40'), intspan('20-60')]],
         [[intspan('20-30'), intspan('20-60'), intspan('20-90')],
          [intspan('20-40'), intspan('20-50')]],
         [[intspan('20-40'), intspan('20-50'), intspan('20-60')],
          [intspan('20-30'), intspan('20-90')]],
         [[intspan('20-40'), intspan('20-50'), intspan('20-90')],
          [intspan('20-30'), intspan('20-60')]],
         [[intspan('20-40'), intspan('20-60'), intspan('20-90')],
          [intspan('20-30'), intspan('20-50')]],
         [[intspan('20-50'), intspan('20-60'), intspan('20-90')],
          [intspan('20-30'), intspan('20-40')]]]
    assert IntspansProduct._filter_packs(packs_list) == \
        [[intspan('20-30'), intspan('20-60')],
         [intspan('20-30'), intspan('20-50')],
         [intspan('20-30'), intspan('20-40')],
         [intspan('20-40'), intspan('20-30')],
         [intspan('20-50'), intspan('20-30')],
         ]

def years2str(years: List[int]) -> str:
    return str(intspan(years)).replace(",", ", ")

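# Example (matches the doctests of update_years2str above): contiguous years
# collapse into a range.
assert years2str([2013, 2015, 2016]) == '2013, 2015-2016'
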
def groupVlan(vlanlist):
    # Grouping vlans, e.g. 2,3,4,7,8,10 -> 2-4,7-8,10
    vlan_merged = str(intspan(vlanlist))
    return vlan_merged

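# Example: the merged form is exactly intspan's string representation.
assert groupVlan([2, 3, 4, 7, 8, 10]) == '2-4,7-8,10'
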
def test_intspansproduct_rebuild_spans_from_packs():
    filtered_packs = [[intspan('1'), intspan('3')]]
    assert IntspansProduct._rebuild_spans_from_packs(filtered_packs, '3_2') \
        == [[intspan('1'), intspan('1'), intspan('1'), intspan('3'), intspan('3')]]
    assert IntspansProduct._rebuild_spans_from_packs(filtered_packs, '2_2') \
        == [[intspan('1'), intspan('1'), intspan('3'), intspan('3')]]
    assert IntspansProduct._rebuild_spans_from_packs(filtered_packs, '1_1') \
        == [[intspan('1'), intspan('3')]]
    filtered_packs = [[intspan('1'), intspan('3')],
                      [intspan('4'), intspan('5-6')]]
    assert IntspansProduct._rebuild_spans_from_packs(filtered_packs, '3_1') \
        == [[intspan('1'), intspan('1'), intspan('1'), intspan('3')],
            [intspan('4'), intspan('4'), intspan('4'), intspan('5-6')]]

def call_event(align1, align2, query_seq=None, no_sort=False,
               max_inv_target_olap=30000, debug=False):
    """Curates adj based on info given by primary_aligns alignments

    Args:
        align1: First Alignment object
        align2: Second Alignment object
        query_seq: (str) Query sequence
        no_sort: (bool) Do not sort the 2 alignments by target coordinate
        max_inv_target_olap: (int) Maximum target overlap allowed for calling
                             an inversion
        debug: (bool) Report when no event can be called
    Returns:
        Adjacency object
    """
    # figure out breakpoints using query positions
    target_breaks = [None, None]
    orients = [None, None]
    query_breaks = [None, None]
    homol_seq = None
    homol_seq_coords = None
    novel_seq = None
    novel_seq_coords = None

    align1_tpos = (align1.tstart, align1.tend) if align1.strand == '+' \
        else (align1.tend, align1.tstart)
    align2_tpos = (align2.tstart, align2.tend) if align2.strand == '+' \
        else (align2.tend, align2.tstart)

    if align1.qstart < align2.qstart:
        aligns = [align1, align2]
        target_breaks[0] = align1.tend if align1.strand == '+' else align1.tstart
        orients[0] = 'L' if max(align1.tstart, align1.tend) == target_breaks[0] else 'R'
        target_breaks[1] = align2.tstart if align2.strand == '+' else align2.tend
        orients[1] = 'L' if max(align2.tstart, align2.tend) == target_breaks[1] else 'R'
        query_breaks = [align1.qend, align2.qstart]
    else:
        aligns = [align2, align1]
        target_breaks[0] = align2.tend if align2.strand == '+' else align2.tstart
        orients[0] = 'L' if max(align2.tstart, align2.tend) == target_breaks[0] else 'R'
        target_breaks[1] = align1.tstart if align1.strand == '+' else align1.tend
        orients[1] = 'L' if max(align1.tstart, align1.tend) == target_breaks[1] else 'R'
        query_breaks = [align2.qend, align1.qstart]

    if not no_sort:
        if (aligns[0].target != aligns[1].target
                and compare_chr(aligns[0].target, aligns[1].target) > 0) or \
                (aligns[0].target == aligns[1].target
                 and target_breaks[0] > target_breaks[1]):
            aligns.reverse()
            target_breaks.reverse()
            orients.reverse()

    rearrangement = None
    if aligns[0].target != aligns[1].target:
        rearrangement = 'trl'
    elif orients[0] == orients[1]:
        span1 = intspan('%s-%s' % (aligns[0].tstart, aligns[0].tend))
        span2 = intspan('%s-%s' % (aligns[1].tstart, aligns[1].tend))
        olap = span1 & span2
        if len(olap) <= max_inv_target_olap:
            rearrangement = 'inv'
        else:
            print('%s:potential inv disallowed - target overlap %d bigger than %s'
                  % (aligns[0].query, len(olap), max_inv_target_olap))
    elif orients[0] == 'L' and orients[1] == 'L':
        rearrangement = 'inv'
    elif orients[0] == 'L' and orients[1] == 'R':
        if target_breaks[0] < target_breaks[1]:
            if target_breaks[0] + 1 == target_breaks[1]:
                # deletion of tandem duplication
                if query_breaks[0] >= query_breaks[1]:
                    rearrangement = 'del'
                    target_breaks = [target_breaks[1] + 1,
                                     target_breaks[0] + (query_breaks[0] - query_breaks[1] + 1)]
                else:
                    rearrangement = 'ins'
            else:
                # deletion with or without microhomology
                rearrangement = 'del'
        elif target_breaks[0] > target_breaks[1]:
            rearrangement = 'dup'
        else:
            if query_breaks[0] < query_breaks[1]:
                rearrangement = 'ins'
            else:
                # deletion of tandem duplication
                rearrangement = 'del'
                target_breaks = [target_breaks[1] + 1,
                                 target_breaks[0] + (query_breaks[0] - query_breaks[1] + 1)]
    elif orients[0] == 'R' and orients[1] == 'R':
        rearrangement = 'inv'
    elif orients[0] == 'R' and orients[1] == 'L':
        if target_breaks[0] == target_breaks[1]:
            rearrangement = 'ins'
        elif target_breaks[0] < target_breaks[1]:
            rearrangement = 'dup'
        else:
            rearrangement = 'del'

    # novel seq
    if query_seq is not None and query_breaks[1] - query_breaks[0] > 1:
        novel_seq = query_seq[query_breaks[0]:query_breaks[1] - 1]
        if aligns[0].strand == '-':
            novel_seq = reverse_complement(novel_seq)
        novel_seq_coords = (query_breaks[0] + 1, query_breaks[1] - 1)

    # homol seq
    if query_seq is not None and query_breaks[0] >= query_breaks[1]:
        homol_seq_coords = [query_breaks[1], query_breaks[0]]
        homol_seq = query_seq[query_breaks[1] - 1:query_breaks[0]]
        if aligns[0].strand == '-':
            homol_seq = reverse_complement(homol_seq)
        homol_seq_coords = (query_breaks[1], query_breaks[0])

    adj = None
    if rearrangement is not None:
        adj = Adjacency(align1.query,
                        (aligns[0].target, aligns[1].target),
                        query_breaks,
                        target_breaks,
                        rearrangement=rearrangement,
                        orients=orients,
                        homol_seq=homol_seq,
                        homol_seq_coords=homol_seq_coords,
                        novel_seq=novel_seq,
                        novel_seq_coords=novel_seq_coords,
                        )
    elif debug:
        sys.stdout.write('cannot figure out event of primary_aligns alignment '
                         'contig:%s targets:%s,%s orients:%s breaks:%s query_breaks:%s\n'
                         % (aligns[0].query, aligns[0].target, aligns[1].target,
                            orients, target_breaks, query_breaks))

    return adj

def test_intspansproduct_group_by_packs():
    r = IntspansProduct('1,2×1,2×3,4×5,6')
    assert r._group_by_packs(r.spans, '2_2') == \
        [[[intspan('1-2'), intspan('1-2')], [intspan('3-4'), intspan('5-6')]],
         [[intspan('1-2'), intspan('3-4')], [intspan('1-2'), intspan('5-6')]],
         [[intspan('1-2'), intspan('5-6')], [intspan('1-2'), intspan('3-4')]]]
    assert r._group_by_packs(r.spans, '3_1') == \
        [[[intspan('1-2'), intspan('1-2'), intspan('3-4')], [intspan('5-6')]],
         [[intspan('1-2'), intspan('1-2'), intspan('5-6')], [intspan('3-4')]],
         [[intspan('1-2'), intspan('3-4'), intspan('5-6')], [intspan('1-2')]]]
    r = IntspansProduct('1,5×1,2×1,3×3,4')
    assert r._group_by_packs(r.spans, '2_2') == \
        [[[intspan('1-2'), intspan('1,5')], [intspan('1,3'), intspan('3-4')]],
         [[intspan('1-2'), intspan('3-4')], [intspan('1,3'), intspan('1,5')]],
         [[intspan('1-2'), intspan('1,3')], [intspan('1,5'), intspan('3-4')]]]
    assert r._group_by_packs(r.spans, '3_1') == \
        [[[intspan('1-2'), intspan('1,3'), intspan('1,5')], [intspan('3-4')]],
         [[intspan('1-2'), intspan('1,5'), intspan('3-4')], [intspan('1,3')]],
         [[intspan('1-2'), intspan('1,3'), intspan('3-4')], [intspan('1,5')]],
         [[intspan('1,3'), intspan('1,5'), intspan('3-4')], [intspan('1-2')]]]
    assert r._group_by_packs(r.spans, '1_1_1_1') == \
        [[[intspan('1-2')], [intspan('1,3')], [intspan('1,5')], [intspan('3-4')]]]
    assert r._group_by_packs(r.spans, '4') == \
        [[[intspan('1-2'), intspan('1,3'), intspan('1,5'), intspan('3-4')]]]
    r = IntspansProduct('1×2,3×2,4')
    with pytest.raises(ValueError) as excinfo:
        r._group_by_packs(r.spans, '3_2_1')
    assert str(excinfo.value) == "dist_code '3_2_1' cannot be used for a " \
        'list of 3 intspans.'
    assert r._group_by_packs(r.spans, '2_1') == \
        [[[intspan('1'), intspan('2-3')], [intspan('2,4')]],
         [[intspan('1'), intspan('2,4')], [intspan('2-3')]],
         [[intspan('2-3'), intspan('2,4')], [intspan('1')]]]
    r = IntspansProduct('20-30×20-40×20-50×20-60×20-90')
    assert r._group_by_packs(r.spans, '3_2') == \
        [[[intspan('20-30'), intspan('20-40'), intspan('20-50')],
          [intspan('20-60'), intspan('20-90')]],
         [[intspan('20-30'), intspan('20-40'), intspan('20-60')],
          [intspan('20-50'), intspan('20-90')]],
         [[intspan('20-30'), intspan('20-40'), intspan('20-90')],
          [intspan('20-50'), intspan('20-60')]],
         [[intspan('20-30'), intspan('20-50'), intspan('20-60')],
          [intspan('20-40'), intspan('20-90')]],
         [[intspan('20-30'), intspan('20-50'), intspan('20-90')],
          [intspan('20-40'), intspan('20-60')]],
         [[intspan('20-30'), intspan('20-60'), intspan('20-90')],
          [intspan('20-40'), intspan('20-50')]],
         [[intspan('20-40'), intspan('20-50'), intspan('20-60')],
          [intspan('20-30'), intspan('20-90')]],
         [[intspan('20-40'), intspan('20-50'), intspan('20-90')],
          [intspan('20-30'), intspan('20-60')]],
         [[intspan('20-40'), intspan('20-60'), intspan('20-90')],
          [intspan('20-30'), intspan('20-50')]],
         [[intspan('20-50'), intspan('20-60'), intspan('20-90')],
          [intspan('20-30'), intspan('20-40')]]]

def inspect_project(dirpath=None):
    if dirpath is None:
        dirpath = Path()
    if not (dirpath / 'setup.py').exists():
        raise ValueError('No setup.py in project root')
    if not (dirpath / 'setup.cfg').exists():
        raise ValueError('No setup.cfg in project root')
    cfg = read_configuration(str(dirpath / 'setup.cfg'))
    env = {
        "project_name": cfg["metadata"]["name"],
        "short_description": cfg["metadata"]["description"],
        "author": cfg["metadata"]["author"],
        "author_email": cfg["metadata"]["author_email"],
        "python_requires": cfg["options"]["python_requires"],
        "install_requires": cfg["options"].get("install_requires", []),
        "importable": "version" in cfg["metadata"],
    }

    if cfg["options"].get("packages"):
        env["is_flat_module"] = False
        env["import_name"] = cfg["options"]["packages"][0]
    else:
        env["is_flat_module"] = True
        env["import_name"] = cfg["options"]["py_modules"][0]

    env["python_versions"] = []
    for clsfr in cfg["metadata"]["classifiers"]:
        m = re.fullmatch(r'Programming Language :: Python :: (\d+\.\d+)', clsfr)
        if m:
            env["python_versions"].append(m.group(1))

    env["commands"] = {}
    try:
        commands = cfg["options"]["entry_points"]["console_scripts"]
    except KeyError:
        pass
    else:
        for cmd in commands:
            k, v = re.split(r'\s*=\s*', cmd, maxsplit=1)
            env["commands"][k] = v

    m = re.fullmatch(
        r'https://github.com/([^/]+)/([^/]+)',
        cfg["metadata"]["url"],
    )
    assert m, 'Project URL is not a GitHub URL'
    env["github_user"] = m.group(1)
    env["repo_name"] = m.group(2)

    if "Documentation" in cfg["metadata"]["project_urls"]:
        m = re.fullmatch(
            r'https?://([-a-zA-Z0-9]+)\.(?:readthedocs|rtfd)\.io',
            cfg["metadata"]["project_urls"]["Documentation"],
        )
        assert m, 'Documentation URL is not a Read the Docs URL'
        env["rtfd_name"] = m.group(1)
    else:
        env["rtfd_name"] = env["project_name"]

    if "Say Thanks!" in cfg["metadata"]["project_urls"]:
        m = re.fullmatch(
            r'https://saythanks\.io/to/([^/]+)',
            cfg["metadata"]["project_urls"]["Say Thanks!"],
        )
        assert m, 'Invalid Say Thanks! URL'
        env["saythanks_to"] = m.group(1)
    else:
        env["saythanks_to"] = None

    if (dirpath / 'tox.ini').exists():
        toxcfg = ConfigParser(interpolation=None)
        toxcfg.read(str(dirpath / 'tox.ini'))
        env["has_tests"] = toxcfg.has_section("testenv")
    else:
        env["has_tests"] = False

    env["has_travis"] = (dirpath / '.travis.yml').exists()
    env["has_docs"] = (dirpath / 'docs' / 'index.rst').exists()

    env["travis_user"] = env["codecov_user"] = env["github_user"]
    try:
        with (dirpath / 'README.rst').open(encoding='utf-8') as fp:
            rdme = Readme.parse(fp)
    except FileNotFoundError:
        env["has_pypi"] = False
    else:
        for badge in rdme.badges:
            m = re.fullmatch(
                r'https://travis-ci\.(?:com|org)/([^/]+)/[^/]+\.svg'
                r'(?:\?branch=.+)?',
                badge.href,
            )
            if m:
                env["travis_user"] = m.group(1)
            m = re.fullmatch(
                r'https://codecov\.io/gh/([^/]+)/[^/]+/branch/.+'
                r'/graph/badge\.svg',
                badge.href,
            )
            if m:
                env["codecov_user"] = m.group(1)
        env["has_pypi"] = any(link["label"] == "PyPI"
                              for link in rdme.header_links)

    with (dirpath / 'LICENSE').open(encoding='utf-8') as fp:
        for line in fp:
            m = re.match(r'^Copyright \(c\) (\d[-,\d\s]+\d) \w+', line)
            if m:
                env["copyright_years"] = list(intspan(m.group(1)))
                break
        else:
            raise ValueError('Copyright years not found in LICENSE')

    return env

    try:
        rdme = Readme.load(fp)
    except FileNotFoundError:
        env["has_pypi"] = False
    else:
        for badge in rdme.badges:
            if m := re.fullmatch(
                r"https://codecov\.io/gh/([^/]+)/[^/]+/branch/.+"
                r"/graph/badge\.svg",
                badge.href,
            ):
                env["codecov_user"] = m[1]
        env["has_pypi"] = any(
            link["label"] == "PyPI" for link in rdme.header_links
        )
    with (directory / "LICENSE").open(encoding="utf-8") as fp:
        for line in fp:
            if m := re.match(r"^Copyright \(c\) (\d[-,\d\s]+\d) \w+", line):
                env["copyright_years"] = list(intspan(m[1]))
                break
        else:
            raise InvalidProjectError("Copyright years not found in LICENSE")
    env["extra_testenvs"] = parse_extra_testenvs(
        directory / ".github" / "workflows" / "test.yml"
    )
    return env


class ModuleInfo(BaseModel):
    import_name: str
    is_flat_module: bool
    src_layout: bool