Пример #1
0
def setUpModule():
    """Populate the module-level REFERENCE_GENOME and BAM_CACHE globals.

    The genome is loaded from 'mock_reference_genome.fa' and sanity-checked
    against a known 50-base prefix before the BAM cache is opened.

    Raises:
        AssertionError: if the first 50 bases of the 'fake' sequence
            (upper-cased) differ from the expected prefix.
    """
    global REFERENCE_GENOME, BAM_CACHE

    expected_prefix = 'CTCCAAAGAAATTGTAGTTTTCTTCTGGCTTAGAGGTAGATCATCTTGGT'
    REFERENCE_GENOME = load_reference_genome(get_data('mock_reference_genome.fa'))
    observed_prefix = REFERENCE_GENOME['fake'].seq[0:50].upper()
    if expected_prefix != observed_prefix:
        raise AssertionError('fake genome file does not have the expected contents')

    BAM_CACHE = BamCache(get_data('mini_mock_reads_for_events.sorted.bam'))
Пример #2
0
def setUpModule():
    """Populate the module-level REFERENCE_GENOME global and sanity-check it.

    Raises:
        AssertionError: if the first 50 bases of the REF_CHR sequence
            (upper-cased) differ from the expected prefix.
    """
    global REFERENCE_GENOME

    genome_path = get_data('mock_reference_genome.fa')
    REFERENCE_GENOME = load_reference_genome(genome_path)

    expected_prefix = 'CTCCAAAGAAATTGTAGTTTTCTTCTGGCTTAGAGGTAGATCATCTTGGT'
    observed_prefix = REFERENCE_GENOME[REF_CHR].seq[0:50].upper()
    if expected_prefix != observed_prefix:
        raise AssertionError(
            'fake genome file does not have the expected contents')
Пример #3
0
def setUpModule():
    """Silence warnings, then populate and sanity-check REFERENCE_GENOME.

    Raises:
        AssertionError: if the first 50 bases of the 'fake' sequence
            (upper-cased) differ from the expected prefix.
    """
    global REFERENCE_GENOME

    # Installed before anything is loaded, so it also covers the genome load.
    warnings.simplefilter('ignore')

    REFERENCE_GENOME = load_reference_genome(REFERENCE_GENOME_FILE)

    expected_prefix = 'CTCCAAAGAAATTGTAGTTTTCTTCTGGCTTAGAGGTAGATCATCTTGGT'
    observed_prefix = REFERENCE_GENOME['fake'].seq[0:50].upper()
    if expected_prefix != observed_prefix:
        raise AssertionError(
            'fake genome file does not have the expected contents')
Пример #4
0
def setUpModule():
    """Populate the module-level REFERENCE_GENOME and BAM_CACHE globals.

    The genome is loaded from REFERENCE_GENOME_FILE and checked against a
    known 50-base prefix; the BAM cache is then opened on BAM_INPUT.

    Raises:
        AssertionError: if the first 50 bases of the 'fake' sequence
            (upper-cased) differ from the expected prefix.
    """
    global REFERENCE_GENOME, BAM_CACHE

    expected_prefix = 'CTCCAAAGAAATTGTAGTTTTCTTCTGGCTTAGAGGTAGATCATCTTGGT'
    REFERENCE_GENOME = load_reference_genome(REFERENCE_GENOME_FILE)
    observed_prefix = REFERENCE_GENOME['fake'].seq[0:50].upper()
    if expected_prefix != observed_prefix:
        raise AssertionError(
            'fake genome file does not have the expected contents')

    BAM_CACHE = BamCache(BAM_INPUT)
Пример #5
0
def setUpModule():
    """Load the annotation, genome, template and BAM globals for this module.

    Progress is reported on stdout before and after loading.
    """
    # `masking` is declared global here but is not assigned in this function.
    global annotations, reference_genome, template_metadata
    global genome_bam_fh, trans_bam_fh, masking

    print('setup start')

    # Reference data.
    annotations = load_reference_genes(FULL_REFERENCE_ANNOTATIONS_FILE_JSON)
    reference_genome = load_reference_genome(REFERENCE_GENOME_FILE)
    template_metadata = load_templates(TEMPLATE_METADATA_FILE)

    # Alignment file handles; they are left open as module globals.
    genome_bam_fh = pysam.AlignmentFile(FULL_BAM_INPUT)
    trans_bam_fh = pysam.AlignmentFile(TRANSCRIPTOME_BAM_INPUT)

    print('setup loading is complete')
Пример #6
0
 def __init__(self, input_bams, reference_genome, max_event_size=6, buffer=1):
     """Set up the reference genome, the input BAMs and an empty cache.

     Args:
         input_bams: passed unchanged to ``self._load_bams``.
         reference_genome: either an already-loaded genome object, or a
             ``str`` that is handed to ``load_reference_genome``.
         max_event_size: stored on the instance as-is.
         buffer: stored on the instance as-is.
     """
     genome = reference_genome
     if isinstance(genome, str):
         # A string is resolved through the loader; anything else is kept as given.
         log('loading:', genome, time_stamp=True)
         genome = load_reference_genome(genome)
     self.reference_genome = genome

     self._load_bams(input_bams)

     self.bpp_cache = {}
     self.max_event_size = max_event_size
     self.buffer = buffer
Пример #7
0
 def __init__(self, input_bams, reference_genome, max_event_size=6, buffer=1):
     """Set up the reference genome, the input BAMs and an empty cache.

     Args:
         input_bams: passed unchanged to ``self._load_bams``.
         reference_genome: either an already-loaded genome object, or a
             ``str`` that is handed to ``load_reference_genome``.
         max_event_size: stored on the instance as-is.
         buffer: stored on the instance as-is.
     """
     genome = reference_genome
     if isinstance(genome, str):
         # A string is resolved through the loader; anything else is kept as given.
         logger.info(f'loading: {reference_genome}')
         genome = load_reference_genome(genome)
     self.reference_genome = genome

     self._load_bams(input_bams)

     self.bpp_cache = {}
     self.max_event_size = max_event_size
     self.buffer = buffer
Пример #8
0
def _iter_repeat_runs(seq, repseq):
    """Yield ``(start, end)`` for each run of back-to-back copies of *repseq*.

    Bounds are 0-based and half-open. After a run ends, scanning resumes at
    the end of that run. An empty *repseq* yields nothing: ``str.find``
    matches the empty string at every position, so the scan would otherwise
    never advance.
    """
    unit = len(repseq)
    if unit == 0:
        return
    cursor = 0
    while cursor < len(seq):
        run_start = seq.find(repseq, cursor)
        if run_start < 0:
            return
        # The copy at run_start is known to match; extend over following copies.
        cursor = run_start + unit
        while seq.startswith(repseq, cursor):
            cursor += unit
        yield run_start, cursor


def main():
    """Scan a reference genome for tandem repeats and write them as a table.

    Arguments come from ``parse_arguments()``: ``input`` (genome file),
    ``output`` (table to write), ``min_length`` and ``repeat_seq`` (repeat
    units to search for). Repeat units are lower-cased and de-duplicated;
    empty units are skipped because they would never terminate the scan.

    The output starts with ``##`` comment lines describing the run, then a
    ``chr/start/end/name`` header, then one tab-separated row per repeat span.
    """
    args = parse_arguments()
    repeat_sequences = sorted({s.lower() for s in args.repeat_seq if s})
    log('loading:', args.input)
    reference_genome = load_reference_genome(args.input)
    comments = [
        os.path.basename(__file__),
        'input: {}'.format(args.input),
        'min_length: {}'.format(args.min_length),
        'repeat_seq: {}'.format(', '.join(args.repeat_seq)),
    ]
    log('writing:', args.output)
    with open(args.output, 'w') as fh:
        for comment in comments:
            fh.write('## {}\n'.format(comment))
        fh.write('chr\tstart\tend\tname\n')
        # Sequences already scanned; a chromosome with identical content is skipped.
        visited = set()
        for chrom, seq in sorted(reference_genome.items()):
            if chrom.startswith('chr'):
                chrom = chrom[3:]
            seq = str(seq.seq).lower()
            if seq in visited:
                continue
            visited.add(seq)
            spans = []
            for repseq in repeat_sequences:
                log(
                    'finding {}_repeat (min_length: {}), for chr{} (length: {})'.format(
                        repseq, args.min_length, chrom, len(seq)
                    )
                )
                span_name = 'repeat_{}'.format(repseq)
                for run_start, run_end in _iter_repeat_runs(seq, repseq):
                    # Convert the 0-based half-open run to a 1-based inclusive span.
                    span = BioInterval(chrom, run_start + 1, run_end, name=span_name)
                    # Keep only spans that are long enough and hold at least two copies.
                    if len(span) >= args.min_length and len(span) >= 2 * len(repseq):
                        spans.append(span)
            log('found', len(spans), 'spans', time_stamp=False)
            for span in spans:
                fh.write(
                    '{}\t{}\t{}\t{}\n'.format(
                        span.reference_object, span.start, span.end, span.name
                    )
                )
Пример #9
0
def set_example_genes():
    """Build a lookup of the example genes keyed by name and by every alias.

    Genes come from 'example_genes.json' and sequences from
    'example_genes.fa', both under DATA_DIR. A gene whose name has an entry
    in the sequence file gets its ``seq`` attribute set to that sequence as a
    string. The resulting keys are printed to stdout.

    Returns:
        dict: maps each gene name and each alias to its gene object.
    """
    annotations = load_annotations(os.path.join(DATA_DIR, 'example_genes.json'))
    sequences = load_reference_genome(os.path.join(DATA_DIR, 'example_genes.fa'))

    lookup = {}
    all_genes = (
        gene for chrom_genes in annotations.values() for gene in chrom_genes
    )
    for gene in all_genes:
        if gene.name in sequences:
            gene.seq = str(sequences[gene.name].seq)
        lookup[gene.name] = gene
        if gene.aliases:
            lookup.update((alias, gene) for alias in gene.aliases)

    print(lookup.keys())
    return lookup
Пример #10
0
def setUpModule():
    """Populate the module-level genome, BAM cache and READS globals.

    READS maps each read name fetched from 'reference3' positions 1-8000 of
    the mini BAM to a two-item list ``[read1, read2]``. Supplementary reads
    create the entry if it is missing but never fill a slot.

    Raises:
        AssertionError: if the first 50 bases of the 'fake' sequence
            (upper-cased) differ from the expected prefix.
    """
    global REFERENCE_GENOME, BAM_CACHE, FULL_BAM_CACHE, READS

    expected_prefix = 'CTCCAAAGAAATTGTAGTTTTCTTCTGGCTTAGAGGTAGATCATCTTGGT'
    REFERENCE_GENOME = load_reference_genome(get_data('mock_reference_genome.fa'))
    observed_prefix = REFERENCE_GENOME['fake'].seq[0:50].upper()
    if expected_prefix != observed_prefix:
        raise AssertionError('fake genome file does not have the expected contents')

    BAM_CACHE = BamCache(get_data('mini_mock_reads_for_events.sorted.bam'))
    FULL_BAM_CACHE = BamCache(get_data('mock_reads_for_events.sorted.bam'))

    READS = {}
    for read in BAM_CACHE.fetch('reference3', 1, 8000):
        mates = READS.setdefault(read.qname, [None, None])
        if not read.is_supplementary:
            # Slot 0 holds read1; anything else goes to slot 1.
            mates[0 if read.is_read1 else 1] = read