Example #1
def test_pileup_truncate():
    kwargs_notrunc = {'chrom': 'Pf3D7_01_v3',
                      'start': 2000,
                      'end': 2100,
                      'one_based': False,
                      'truncate': False}
    kwargs_trunc = {'chrom': 'Pf3D7_01_v3',
                    'start': 2000,
                    'end': 2100,
                    'one_based': False,
                    'truncate': True}
    for f, needs_ref in pileup_functions:
        debug(f.__name__)
        # test no truncate
        if needs_ref:
            a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs_notrunc)
        else:
            a = f(Samfile('fixture/test.bam'), **kwargs_notrunc)
        debug(a[:5])
        eq_(1952, a['pos'][0])
        eq_(2154, a['pos'][-1])
        # test truncate
        if needs_ref:
            a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs_trunc)
        else:
            a = f(Samfile('fixture/test.bam'), **kwargs_trunc)
        eq_(2000, a['pos'][0])
        eq_(2099, a['pos'][-1])
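For orientation, here is a minimal standalone call showing what truncate controls (a sketch, assuming the pysamstats API and the fixture files used by these tests): without truncation the pileup extends to the edges of all reads overlapping the window; with it, records are clipped to exactly [start, end).

import pysamstats
from pysam import Samfile, Fastafile

bam = Samfile('fixture/test.bam')
fa = Fastafile('fixture/ref.fa')

# truncate=False (the default) reports positions covering whole overlapping reads;
# truncate=True reports only positions in [2000, 2100)
for rec in pysamstats.stat_variation(bam, fa, chrom='Pf3D7_01_v3',
                                     start=2000, end=2100,
                                     one_based=False, truncate=True):
    print(rec['pos'], rec['reads_all'])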
Example #2
def test_pileup_limit():

    for f, needs_ref in pileup_functions:
        debug(f.__name__)

        # test with effectively no limit
        kwargs = dict(fields=['reads_all'], max_depth=1000000)
        if needs_ref:
            a = f(Samfile('fixture/deep.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs)
        else:
            a = f(Samfile('fixture/deep.bam'), **kwargs)
        eq_(26169, a[70])

        # test with specific limit
        kwargs = dict(fields=['reads_all'], max_depth=12000)
        if needs_ref:
            a = f(Samfile('fixture/deep.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs)
        else:
            a = f(Samfile('fixture/deep.bam'), **kwargs)
        eq_(12046, a[70])  # no idea why limit is not exact

        # test with default limit
        kwargs = dict(fields=['reads_all'])
        if needs_ref:
            a = f(Samfile('fixture/deep.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs)
        else:
            a = f(Samfile('fixture/deep.bam'), **kwargs)
        eq_(8052, a[70])  # no idea why limit is not exact
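The inexact counts noted in the comments are consistent with how the underlying pileup engine applies its depth cap: pysam defaults to a maximum depth of about 8000, and the limit appears to be enforced as reads are buffered rather than exactly per column, so observed depths can slightly overshoot. A minimal sketch of raising the cap explicitly (assuming the load_* variants of the pileup functions, which these tests appear to exercise):

import pysamstats
from pysam import Samfile

bam = Samfile('fixture/deep.bam')
# raise the engine's depth cap; the default silently subsamples deep columns
a = pysamstats.load_coverage(bam, fields=['reads_all'], max_depth=1000000)
print(a[70])  # 26169 on the fixture above, per the assertion in the test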
Example #3
    def __getitem__(self, idx):
        if self.fasta is None:
            self.fasta = Fastafile(self.fasta_file)

        interval = self.bt[idx]

        # Intervals can't be bigger than 1000bp
        if (interval.stop - interval.start) > 1000:
            raise Exception("Input sequences should be at maximum 1000bp.")

        # Fetch the fasta line
        seq = self.fasta.fetch(str(interval.chrom), interval.start,
                               interval.stop).upper()

        # Reverse complement the sequence if the interval is on the minus strand
        if interval.strand == "-":
            seq = rc_str(seq)
        """
        # generate an id
        id = str(interval.chrom) + ":" + str(interval.start) + "-" + str(interval.stop)
        if interval.name not in ["", ".", "*"]:
            id = interval.name
        """

        return {
            "inputs": seq,
            "metadata": {
                "ranges": GenomicRanges.from_interval(interval)
            }
        }
Example #4
def test_binned_pad_wg():
    expected = stat_coverage_binned_refimpl(
        Samfile('fixture/test.bam'),
        Fastafile('fixture/ref.fa'))

    actual = pysamstats.stat_coverage_binned(Samfile('fixture/test.bam'),
                                             Fastafile('fixture/ref.fa'))
    compare_iterators(expected, actual)
    kwargs = {'window_size': 200,
              'window_offset': 100}
    for f, needs_ref in binned_functions:
        debug(f.__name__)
        if needs_ref:
            a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs)
        else:
            a = f(Samfile('fixture/test.bam'), **kwargs)
        assert sorted(set(a['chrom'])) == [b'Pf3D7_01_v3', b'Pf3D7_02_v3',
                                           b'Pf3D7_03_v3']
        eq_(100, a[a['chrom'] == b'Pf3D7_01_v3']['pos'][0])
        eq_(50100, a[a['chrom'] == b'Pf3D7_01_v3']['pos'][-1])
        eq_(100, a[a['chrom'] == b'Pf3D7_02_v3']['pos'][0])
        eq_(60100, a[a['chrom'] == b'Pf3D7_02_v3']['pos'][-1])
        eq_(100, a[a['chrom'] == b'Pf3D7_03_v3']['pos'][0])
        eq_(70100, a[a['chrom'] == b'Pf3D7_03_v3']['pos'][-1])
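Note that the reported pos values are bin midpoints: with window_size=200 and window_offset=100, each bin is labelled at its start plus 100. A short sketch of a direct call (assuming the same pysamstats binned API and fixtures):

import pysamstats
from pysam import Samfile, Fastafile

bam = Samfile('fixture/test.bam')
fa = Fastafile('fixture/ref.fa')
# each 200 bp bin is reported at its start + window_offset, i.e. its midpoint here
for rec in pysamstats.stat_coverage_binned(bam, fa, chrom='Pf3D7_01_v3',
                                           window_size=200, window_offset=100):
    print(rec['pos'], rec['reads_all'])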
Example #5
def test_pileup_pad():
    kwargs_nopad = {'chrom': 'Pf3D7_01_v3',
                    'start': 0,
                    'end': 20000,
                    'one_based': False,
                    'pad': False}
    kwargs_pad = {'chrom': 'Pf3D7_01_v3',
                  'start': 0,
                  'end': 20000,
                  'one_based': False,
                  'pad': True}
    for f, needs_ref in pileup_functions:
        debug(f.__name__)
        # test no pad
        if needs_ref:
            a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs_nopad)
        else:
            a = f(Samfile('fixture/test.bam'), **kwargs_nopad)
        eq_(924, a['pos'][0])
        eq_(9935, a['pos'][-1])
        # test pad
        if needs_ref:
            a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs_pad)
        else:
            a = f(Samfile('fixture/test.bam'), **kwargs_pad)
        eq_(0, a['pos'][0])
        eq_(19999, a['pos'][-1])
        assert np.all(np.diff(a['pos']) == 1)
Example #6
def test_pileup_pad_wg():
    # whole genome
    expected = stat_coverage_refimpl(Samfile('fixture/test.bam'))
    actual = pysamstats.stat_coverage(Samfile('fixture/test.bam'))
    compare_iterators(expected, actual)
    kwargs_nopad = {'pad': False}
    kwargs_pad = {'pad': True}
    for f, needs_ref in pileup_functions:
        debug(f.__name__)
        # test no pad
        if needs_ref:
            a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs_nopad)
        else:
            a = f(Samfile('fixture/test.bam'), **kwargs_nopad)
        eq_(sorted(set(a['chrom'])), [b'Pf3D7_01_v3', b'Pf3D7_02_v3'])
        eq_(924, a[a['chrom'] == b'Pf3D7_01_v3']['pos'][0])
        eq_(9935, a[a['chrom'] == b'Pf3D7_01_v3']['pos'][-1])
        eq_(926, a[a['chrom'] == b'Pf3D7_02_v3']['pos'][0])
        eq_(10074, a[a['chrom'] == b'Pf3D7_02_v3']['pos'][-1])
        # test pad
        if needs_ref:
            a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs_pad)
        else:
            a = f(Samfile('fixture/test.bam'), **kwargs_pad)
        eq_(sorted(set(a['chrom'])),
            [b'Pf3D7_01_v3', b'Pf3D7_02_v3', b'Pf3D7_03_v3'])
        eq_(0, a[a['chrom'] == b'Pf3D7_01_v3']['pos'][0])
        eq_(50000, a[a['chrom'] == b'Pf3D7_01_v3']['pos'][-1])
        eq_(0, a[a['chrom'] == b'Pf3D7_02_v3']['pos'][0])
        eq_(60000, a[a['chrom'] == b'Pf3D7_02_v3']['pos'][-1])
        eq_(0, a[a['chrom'] == b'Pf3D7_03_v3']['pos'][0])
        eq_(70000, a[a['chrom'] == b'Pf3D7_03_v3']['pos'][-1])
Example #7
def bias_correction(bam, signal, fBiasDict, rBiasDict, genome_file_name, chrName, start, end):

  # Parameters
  window = 50
  defaultKmerValue = 1.0

  # Initialization
  fastaFile = Fastafile(genome_file_name)
  k_nb = len(fBiasDict.keys()[0])
  p1 = start; p2 = end
  p1_w = p1 - (window/2); p2_w = p2 + (window/2)
  p1_wk = p1_w - (k_nb/2); p2_wk = p2_w + (k_nb/2)

  # Raw counts
  nf = [0.0] * (p2_w-p1_w); nr = [0.0] * (p2_w-p1_w)
  for r in bam.fetch(chrName, p1_w, p2_w):
    if((not r.is_reverse) and (r.pos > p1_w)): nf[r.pos-p1_w] += 1.0
    if((r.is_reverse) and ((r.aend-1) < p2_w)): nr[r.aend-1-p1_w] += 1.0

  # Smoothed counts
  Nf = []; Nr = [];
  fSum = sum(nf[:window]); rSum = sum(nr[:window]);
  fLast = nf[0]; rLast = nr[0]
  for i in range((window/2),len(nf)-(window/2)):
    Nf.append(fSum)
    Nr.append(rSum)
    fSum -= fLast; fSum += nf[i+(window/2)]; fLast = nf[i-(window/2)+1]
    rSum -= rLast; rSum += nr[i+(window/2)]; rLast = nr[i-(window/2)+1]

  # Fetching sequence
  currStr = str(fastaFile.fetch(chrName, p1_wk-1, p2_wk-2)).upper()
  currRevComp = revcomp(str(fastaFile.fetch(chrName,p1_wk+2, p2_wk+1)).upper())

  # Iterating on sequence to create signal
  af = []; ar = []
  for i in range((k_nb/2),len(currStr)-(k_nb/2)+1):
    fseq = currStr[i-(k_nb/2):i+(k_nb/2)]
    rseq = currRevComp[len(currStr)-(k_nb/2)-i:len(currStr)+(k_nb/2)-i]
    try: af.append(fBiasDict[fseq])
    except Exception: af.append(defaultKmerValue)
    try: ar.append(rBiasDict[rseq])
    except Exception: ar.append(defaultKmerValue)

  # Calculating bias and writing to wig file
  fSum = sum(af[:window]); rSum = sum(ar[:window]);
  fLast = af[0]; rLast = ar[0]
  bias_corrected_signal = []
  for i in range((window/2),len(af)-(window/2)):
    nhatf = Nf[i-(window/2)]*(af[i]/fSum)
    nhatr = Nr[i-(window/2)]*(ar[i]/rSum)
    zf = log(nf[i]+1)-log(nhatf+1)
    zr = log(nr[i]+1)-log(nhatr+1)
    bias_corrected_signal.append(zf+zr)
    fSum -= fLast; fSum += af[i+(window/2)]; fLast = af[i-(window/2)+1]
    rSum -= rLast; rSum += ar[i+(window/2)]; rLast = ar[i-(window/2)+1]

  # Termination
  fastaFile.close()
  return bias_corrected_signal
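The Nf/Nr arrays above are rolling window sums maintained in O(1) per step: on each iteration the element leaving the window is subtracted and the element entering it is added. A self-contained Python 3 sketch of the same pattern, with hypothetical data:

def window_sums(values, window):
    # rolling sum over a centered window, updated incrementally
    # exactly as in the Nf/Nr smoothing loops above
    half = window // 2
    out = []
    acc = sum(values[:window])
    last = values[0]
    for i in range(half, len(values) - half):
        out.append(acc)
        acc -= last
        acc += values[i + half]
        last = values[i - half + 1]
    return out

print(window_sums([1, 2, 3, 4, 5, 6], 4))  # [10, 14]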
Example #8
    def __init__(self, intervals_file, fasta_file, use_linecache=False):

        # intervals
        if use_linecache:
            self.bt = BedToolLinecache(intervals_file)
        else:
            self.bt = BedTool(intervals_file)
        self.fasta = Fastafile(fasta_file)
Example #9
    def test_should_get_correct_chrom_length(self):
        chrom = 'chr20'
        seq = "TAGCATTATTATTATTATTATTATTA"
        fasta_file = self.__build_fasta_file({
            chrom: seq,
        })

        fasta_file = Fastafile(fasta_file.filename)
        self.assertEqual(fasta_file.get_reference_length(chrom), len(seq))
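The __build_fasta_file helper used by these tests is not shown in the snippets. A plausible stand-in (hypothetical; the real helper apparently returns an object with a .filename attribute, while this sketch simply returns a path) writes the mapping to a temporary FASTA and indexes it with pysam.faidx:

import tempfile

import pysam
from pysam import Fastafile

def build_fasta_file(sequences):
    # hypothetical stand-in for the tests' __build_fasta_file helper
    tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.fa', delete=False)
    for chrom, seq in sequences.items():
        tmp.write('>%s\n%s\n' % (chrom, seq))
    tmp.close()
    pysam.faidx(tmp.name)  # build the .fai index that Fastafile expects
    return tmp.name

seq = 'TAGCATTATTATTATTATTATTATTA'
fasta = Fastafile(build_fasta_file({'chr20': seq}))
assert fasta.get_reference_length('chr20') == len(seq)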
Example #10
    def test_should_return_capitalised_sequence_from_ref_file(self):
        fasta_file = self.__build_fasta_file({
            'chr20':
            "tagcattattattattattattatta",
        })

        fasta_file = Fastafile(fasta_file.filename)
        self.assertEqual(
            fasta_file.fetch('chr20', 10, 20).upper(), "ATTATTATTA")
Example #11
    def test_should_be_able_to_fetch_section_of_genome(self):
        fasta_file = self.__build_fasta_file({
            'chr20':
            "TAGCATTATTATTATTATTATTATTA",
        })

        fasta_file = Fastafile(fasta_file.filename)
        self.assertEqual(
            fasta_file.fetch('chr20', 10, 20).upper(), "ATTATTATTA")
Example #12
  def __init__(self, gene_db, reference_fasta):
    self.reference  = Fastafile(reference_fasta)
    self.con        = open_genedb(gene_db)
    self.gene_cache = OrderedDict()

    self.band_map = band_map = defaultdict(IntervalTree)
    for band in get_cytobands(self.con):
      band_map[band.chrom].insert(band.start,band.end,band)
      if band.chrom.startswith('chr') and band.chrom[3:] not in band_map:
        band_map[band.chrom[3:]] = band_map[band.chrom]

    trans = get_transcripts(self.con)
    trans = progress_loop(trans, label='Loading transcripts: ', units='transcripts')

    self.feature_map = feature_map = defaultdict(IntervalTree)
    for gene in trans:
      feature_map[gene.chrom].insert(gene.txStart,gene.txEnd,gene)

      if 0: # DEBUG
        parts = self.decode_gene(gene)
        for part in parts:
          if part.type not in ('intron','UTR5','UTR3','UTR') and '_' not in part.chrom:
            print '\t'.join(map(str,[part.chrom,part.start,part.end,gene.symbol]))

    sys.stderr.write('Loading complete.\n')
Example #13
File: ORF.py Project: neevor/grit
def find_gene_orfs_worker( input_queue, gtf_ofp, fa_ofp, fasta_fn ):
    # open fasta file in each thread separately
    fasta = Fastafile( fasta_fn )
    
    # process genes for orfs until input queue is empty
    while not input_queue.empty():
        try:
            gene = input_queue.get(block=False)
        except Queue.Empty:
            break
        
        if VERBOSE: print >> sys.stderr, '\tProcessing ' + gene.id
        ann_trans = find_cds_for_gene( gene, fasta, ONLY_USE_LONGEST_ORF )
        op_str = "\n".join( [ tr.build_gtf_lines( gene.id, {} ) 
                              for tr in ann_trans ] )
        gtf_ofp.write( op_str + "\n" )
        
        if fa_ofp != None:
            for trans in ann_trans:
                fa_ofp.write( ">%s\n" % trans.id )
                for line in iter_x_char_lines(trans.coding_sequence):
                    fa_ofp.write(line+"\n")
                
        if VERBOSE: print >> sys.stderr, '\tFinished ' + gene.id
    
    return
Example #14
def compare_stats_withref(impl, refimpl, bam_fn='fixture/test.bam',
                          fasta_fn='fixture/ref.fa'):
    # no read filters
    kwargs = {'chrom': 'Pf3D7_01_v3',
              'start': 0,
              'end': 2000,
              'one_based': False}
    expected = refimpl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs)
    actual = impl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs)
    compare_iterators(expected, actual)
    # read filters
    kwargs['min_mapq'] = 1
    kwargs['no_dup'] = True
    expected = refimpl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs)
    actual = impl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs)
    compare_iterators(expected, actual)
Example #15
def profile_withrefseq(fun, end=1000):
    samfile = Samfile('fixture/test.bam')
    fafile = Fastafile('fixture/ref.fa')
    count = 0
    f = getattr(pysamstats, fun)
    for _ in f(samfile, fafile, chrom='Pf3D7_01_v3', start=0, end=end):
        count += 1
Example #16
def normalize(args):
    """Normalize variants."""
    refs = Fastafile(expanduser(args.reference))
    variants = VariantFile(args.sample)

    with VariantFile(args.output, 'w', header=variants.header) as out:
        # Create parallel locus iterator by chromosome
        for _, ref, loci in records_by_chromosome(refs, [variants], [None], args):
            loci = sort_almost_sorted(loci[0], key=NormalizedLocus.left_order_key)

            for locus in loci:
                record  = locus.record
                start   = locus.left.start
                stop    = locus.left.stop
                alleles = locus.left.alleles

                if '' in alleles:
                    pad = ref[start - 1:start]
                    start -= 1
                    alleles = [pad + a for a in alleles]

                record.alleles = alleles
                record.start   = start
                record.stop    = stop

                out.write(record)
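The padding branch implements VCF-style left anchoring: when normalization leaves an empty allele, the reference base immediately to the left is prepended to every allele and the start coordinate moves one base left. A tiny worked illustration with hypothetical values:

# stand-in for the chromosome sequence and a deletion whose ALT trimmed to ''
ref = 'ACGTACGT'
start, stop = 4, 5
alleles = ['A', '']
if '' in alleles:
    pad = ref[start - 1:start]  # base immediately left of the variant
    start -= 1
    alleles = [pad + a for a in alleles]
assert alleles == ['TA', 'T'] and start == 3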
Example #17
    def test_should_be_able_to_list_all_chromosomes(self):
        fasta_file = self.__build_fasta_file({
            'chr5': "T",
            'chrX': "T",
            'chr20': "T",
        })

        fasta_file = Fastafile(fasta_file.filename)
        self.assertEqual(sorted(fasta_file.references),
                         sorted(['chr5', 'chr20', 'chrX']))
Example #18
def main():
    # setup a reverse_complement translation
    rev_table = string.maketrans('ACGTacgt', 'TGCAtgca')
    def revcomp(seq, rev_table):
        # complement, then reverse to get the true reverse complement
        return seq.translate(rev_table)[::-1]
        
    # open your fasta file
    fasta  = Fastafile("bedtools/tests/data/chr21.fa")
    # open your bed file
    bed    = IntervalFile("bedtools/tests/data/rmsk.hg18.chr21.bed")
    
    # for each bed, grab the DNA in that interval
    for b in bed:
        # grab the seq, rev. comp if necessary
        seq = fasta.fetch(b.chrom, b.start, b.end)  
        if b.strand == "-":
            seq = revcomp(seq, rev_table)
        # print the interval and the seq
        print b.chrom, b.start, b.end, b.strand, seq
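The example above is Python 2 (string.maketrans and the print statement). A Python 3 port, keeping the reversal fix, might look like this (the original's imports are not shown, so the pybedtools import path for IntervalFile is an assumption):

from pysam import Fastafile
from pybedtools import IntervalFile  # assumed import; not shown in the original

# str.maketrans replaces string.maketrans in Python 3
REV_TABLE = str.maketrans('ACGTacgt', 'TGCAtgca')

def revcomp(seq):
    # complement, then reverse
    return seq.translate(REV_TABLE)[::-1]

fasta = Fastafile("bedtools/tests/data/chr21.fa")
bed = IntervalFile("bedtools/tests/data/rmsk.hg18.chr21.bed")

for b in bed:
    seq = fasta.fetch(b.chrom, b.start, b.end)
    if b.strand == "-":
        seq = revcomp(seq)
    print(b.chrom, b.start, b.end, b.strand, seq)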
Example #19
    def removeHomopolymers(self, variants, outFile, distance):
        startTime = Helper.getTime()
        Helper.info(
            " [%s] remove Missmatches from homopolymers " %
            (startTime.strftime("%c")), self.rnaEdit.logFile,
            self.rnaEdit.textField)

        tempBedFile = open(outFile + "_tmp.bed", "w+")
        tempSeqFile = outFile + "_tmp.tsv"

        refGenome = "/media/Storage/databases/rnaEditor_annotations/human/human_g1k_v37.fasta"
        fastaFile = Fastafile(self.rnaEdit.params.refGenome)
        mmNumberTotal = len(variants.variantDict)
        # print temporary BedFile
        numberPassed = 0
        for key in list(variants.variantDict.keys()):  # copy keys; entries are deleted during iteration
            chr, position, ref, alt = key
            startPos = position - distance if position >= distance else 0
            endpos = position + distance
            sequence = fastaFile.fetch(chr, startPos, endpos)
            pattern = ref * distance
            """ !!!Test if this gives better results
                !!!ONLY DELETE IF MM IS AT THE END OF A HOMOPOLYMER OF NUCLEOTIDES
            if sequence.startswith(pattern):
                del mmDict[site] 
            elif sequence.endswith(pattern):
                del mmDict[site]
            """
            if pattern in sequence:
                try:
                    del variants.variantDict[key]
                except KeyError:
                    pass
            else:
                numberPassed += 1

        # output statistics
        Helper.info(
            "\t\t %d out of %d passed the Homopolymer-Filter" %
            (numberPassed, mmNumberTotal), self.rnaEdit.logFile,
            self.rnaEdit.textField)
        Helper.printTimeDiff(startTime, self.rnaEdit.logFile,
                             self.rnaEdit.textField)
Example #21
    def __init__(self, genome_fasta_file, logger):
        """
        Create new ReferenceGenome

        Args:
            genome_fasta_file (string): Path to whole genome FASTA file
            logger (logging.Logger): Logger for reporting warnings/errors

        Returns:
            ReferenceGenome
        """
        self._logger = logger
        self.genome_fasta_file = genome_fasta_file
        self._validate_reference_file()

        try:
            self._fasta_file = Fastafile(self.genome_fasta_file)
        except Exception:
            raise IOError("Could not read genome file: " +
                          self.genome_fasta_file)
Example #22
def build_transcripts_worker(elements, output, gtf_ofp, tracking_ofp, fasta_fp,
                             ref_genes):
    # if appropriate, open the fasta file
    if fasta_fp is not None: fasta = Fastafile(fasta_fp.name)
    else: fasta = None
    while True:
        #config.log_statement("Waiting for gene to process. (%i)" % elements.qsize())
        gene_elements = elements.get()
        if gene_elements == 'FINISHED':
            return
        build_and_write_gene(gene_elements, output, gtf_ofp, tracking_ofp,
                             fasta, ref_genes)
    return
Example #23
class SeqDataset(Dataset):
    """
    Args:
        intervals_file: bed3 file containing intervals
        fasta_file: file path; Genome sequence
        target_file: file path; path to the targets in the csv format
    """
    def __init__(self, intervals_file, fasta_file, use_linecache=True):

        # intervals
        if use_linecache:
            self.bt = BedToolLinecache(intervals_file)
        else:
            self.bt = BedTool(intervals_file)
        self.fasta_file = fasta_file
        self.fasta = None

    def __len__(self):
        return len(self.bt)

    def __getitem__(self, idx):
        if self.fasta is None:
            self.fasta = Fastafile(self.fasta_file)

        interval = self.bt[idx]

        # Intervals can't be bigger than 1000bp
        if (interval.stop - interval.start) > 1000:
            raise Exception("Input sequences should be at maximum 1000bp.")

        # Fetch the fasta line
        seq = self.fasta.fetch(str(interval.chrom), interval.start,
                               interval.stop).upper()

        # Reverse complement the sequence if the interval is on the minus strand
        if interval.strand == "-":
            seq = rc_str(seq)
        """
        # generate an id
        id = str(interval.chrom) + ":" + str(interval.start) + "-" + str(interval.stop)
        if interval.name not in ["", ".", "*"]:
            id = interval.name
        """

        return {
            "inputs": seq,
            "metadata": {
                "ranges": GenomicRanges.from_interval(interval)
            }
        }
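Opening the Fastafile lazily in __getitem__, instead of in __init__ as Example #8 does, keeps the dataset object picklable; pysam's htslib handles cannot be pickled, which matters when a PyTorch-style DataLoader forks worker processes. A hedged usage sketch with hypothetical file paths:

# hypothetical paths: a BED3 file of intervals and an indexed genome FASTA
ds = SeqDataset('intervals.bed', 'genome.fa')
print(len(ds))
sample = ds[0]  # sequence of the first interval plus its genomic range
print(sample['inputs'][:50])
print(sample['metadata']['ranges'])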
Example #24
def test_binned_pad_region():
    kwargs = {'chrom': 'Pf3D7_01_v3',
              'start': 1000,
              'end': 20000,
              'one_based': False,
              'window_size': 200,
              'window_offset': 100}
    for f, needs_ref in binned_functions:
        debug(f.__name__)
        if needs_ref:
            a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs)
        else:
            a = f(Samfile('fixture/test.bam'), **kwargs)
        assert set(a['chrom']) == {b'Pf3D7_01_v3'}
        eq_(1100, a['pos'][0])
        eq_(19900, a['pos'][-1])
Example #25
def main():
    gtf_fp, fasta_fn, ofp = parse_arguments()
    mapping = load_id_mapping()
    genes = load_gtf(gtf_fp.name)
    fa = Fastafile(fasta_fn)
    for gene in genes:
        for i, t in enumerate(gene.transcripts):
            if not t.is_protein_coding: continue
            protein_seq = find_coding_sequence(t, fa)
            domains = find_domains(protein_seq, min_eval=1e-5)
            for d_id, region in convert_to_genome_coords(
                    t, protein_seq, domains):
                name = "%s.%i.%s" % (gene.meta_data['gene_name'], i,
                                     mapping[d_id])
                print create_gff_line(GenomicInterval(*region), name)

    return
Example #26
def match_database2(args):
    """Match a genome to a database of alleles."""
    refs = Fastafile(expanduser(args.reference))
    db = VariantFile(expanduser(args.database))
    sample = VariantFile(expanduser(args.sample))

    try:
        sample_name = sample.header.samples[args.name]
    except TypeError:
        sample_name = args.name

    if db.index is None:
        raise ValueError('database file must be indexed')
    if sample.index is None:
        raise ValueError('sample file must be indexed')

    # Open tabular output file, if requested
    table = None
    if args.table:
        tablefile = open(args.table, 'w') if args.table != '-' else sys.stdout
        table = csv.writer(tablefile, delimiter='\t', lineterminator='\n')
        write_table_header(table)

    update_info_header(sample.header)

    with VariantFile(args.output, 'w', header=sample.header) as out:
        for superlocus, matches in generate_matches(refs, sample, db, args):
            clear_info_fields(superlocus)

            for allele_locus, allele, match in matches:
                dbvar = allele.record
                var_id = dbvar.id or f'{dbvar.chrom}_{dbvar.start+1}_{dbvar.stop}_{dbvar.alts[0]}'

                status, times = translate_match(match)

                for locus in allele_locus:
                    info = locus.record.info
                    info[status] = info.get(status, ()) + (var_id, ) * times

                write_table_row(table, sample_name, var_id, allele_locus,
                                status, match)

            for locus in sorted(superlocus,
                                key=NormalizedLocus.record_order_key):
                out.write(locus.record)
Example #27
def match_database(args):
    """Match a genome to a database of alleles."""
    refs = Fastafile(expanduser(args.reference))
    db = VariantFile(expanduser(args.database))
    sample = VariantFile(expanduser(args.sample))

    format_meta, info_meta = build_new_metadata(db, sample)

    with VariantFile(args.output, 'w', header=sample.header) as out:
        for superlocus, matches in generate_matches(refs, sample, db, args):
            for allele_locus, allele, match in matches:
                # Annotate results of search
                status, times = translate_match(match)
                suffix = '_' + status

                for locus in allele_locus:
                    annotate_info(locus, allele, info_meta, suffix, times)
                    annotate_format(locus, allele, format_meta, suffix, times)

            for locus in sorted(superlocus,
                                key=NormalizedLocus.record_order_key):
                out.write(locus.record)
Example #28
      p2 = min(p1 + rightExt, self.length)
      for i in range(p1, p2): self.vectorF[i] += 1.0 
    else:
      p2 = min(alignment.aend+rShift+leftExt,self.end)-self.start
      p1 = max(p2 - rightExt, 0)
      for i in range(p1, p2): self.vectorR[i] += 1.0

revDict = {"A":"T", "T":"A", "C":"G", "G":"C", "N":"N"}

#################################################################################################
# INITIALIZATION
#################################################################################################

# Initializing bam and fasta
bamFile = Samfile(bamFileName, "rb")
fastaFile = Fastafile(fastaFileName)

# Reading bias dictionaries
fBiasDict = dict(); rBiasDict = dict()
fBiasFile = open(fBiasFileName,"r"); rBiasFile = open(rBiasFileName,"r")
fBiasFile.readline(); rBiasFile.readline()
for line in fBiasFile:
  ll = line.strip().split("\t")
  fBiasDict[ll[0]] = float(ll[1])
for line in rBiasFile:
  ll = line.strip().split("\t")
  rBiasDict[ll[0]] = float(ll[1])
fBiasFile.close(); rBiasFile.close()
k_nb = len(fBiasDict.keys()[0])

# Creating output file
Example #29
header = "CHR\tSTART\tEND\tTC_50\tTC_100\tTC_200"
counter = 1
for e in pfmFileNameListShort:
    myvec = []
    if ("HUMAN" in e): factorname = "\tHOCOMOCO"
    else:
        factorname = "\tDENOVO_POS_" + str(counterPos)
        counter += 1
    for k in metricForEachMotif:
        myvec.append(factorname + "_" + k)
    header += "".join(myvec)

# Iterating on main bed file
bedFile = open(bedFileName, "r")
outFile = open(outputFileName, "w")
genomeFile = Fastafile(genomeFileName)
for line in bedFile:

    # Fetching line
    ll = line.strip().split("\t")
    chrName = ll[0]
    p1 = int(ll[1])
    p2 = int(ll[2])

    # Starting result structures
    regionTagCount = 0
    resVec = []
    for m in motifList:
        vec = []
        # For each motif print: 0. Best motif bit-score; 1. Relative motif position; 2. FS of motif; 3. PS of motif;
        vec.append(globalMin)
Example #30
###########################################################

# Fetching motifs & evaluating global minimum
motifList = []
for pfmFileName in pfmFileNameList:
  motifList.append(Motif(pfmFileName))
globalMin = min([e.min for e in motifList])

# Opening BAM files for DNase-seq and footprints (footprint bed files were converted to BAM for efficiency)
dnaseBam = Samfile(dnaseBamFileName,"rb")
fpBam = Samfile(footprintBamFileName,"rb")

# Iterating on main bed file
bedFile = open(bedFileName,"r")
outFile = open(outputFileName,"w")
genomeFile = Fastafile(genomeFileName)
for line in bedFile:

  # Fetching line
  ll = line.strip().split("\t")
  chrName = ll[0]; p1 = int(ll[1]); p2 = int(ll[2])

  # Starting result structures
  regionTagCount = 0
  resVec = [globalMin,0,0,0,0,0] # BIT-SCORE, MOTIF_P1, MOTIF_P2, FP_OVERLAP, FP_P1, FP_P2
  counter = 0

  # Evaluating Overall TC
  try: regionTagCount = tag_count(chrName, p1, p2, dnaseBam, tcHalfWindow)
  except Exception: 
    print "Exception TC raised in "+line
Example #31
from collections import defaultdict as dd
from string import maketrans


def usage():
    return "usage: %s <reference genome fasta> <refGenes.txt.gz>" % sys.argv[0]


def rc(dna):
    ''' reverse complement '''
    complements = maketrans('acgtrymkbdhvACGTRYMKBDHV', 'tgcayrkmvhdbTGCAYRKMVHDB')
    return dna.translate(complements)[::-1]


if len(sys.argv) == 3:
    fa = Fastafile(sys.argv[1])

    assert sys.argv[2].endswith('.gz'), "refGenes.txt must be gzipped"

    genes = dd(list)

    with open(sys.argv[2], 'r') as ref:
        for line in ref:

            (bin,
            name,
            chrom,
            strand,
            txStart,
            txEnd,
            cdsStart,
Example #32
    def bias_correction(self, signal, bias_table, genome_file_name, chrName, start, end,
                        forward_shift, reverse_shift, strands_specific):
        """
        Performs bias correction.

        Keyword arguments:
        signal -- Input signal.
        bias_table -- Bias table.

        Return:
        bias_corrected_signal -- Bias-corrected sequence.
        """

        if (not bias_table): return signal

        # Parameters
        window = 50
        defaultKmerValue = 1.0

        # Initialization
        fastaFile = Fastafile(genome_file_name)
        fBiasDict = bias_table[0]
        rBiasDict = bias_table[1]
        k_nb = len(fBiasDict.keys()[0])
        p1 = start
        p2 = end
        p1_w = p1 - (window / 2)
        p2_w = p2 + (window / 2)
        p1_wk = p1_w - int(floor(k_nb / 2.))
        p2_wk = p2_w + int(ceil(k_nb / 2.))
        if (p1 <= 0 or p1_w <= 0 or p1_wk <= 0): return signal

        # Raw counts
        nf = [0.0] * (p2_w - p1_w)
        nr = [0.0] * (p2_w - p1_w)
        for read in self.bam.fetch(chrName, p1_w, p2_w):
            if (not read.is_reverse):
                cut_site = read.pos + forward_shift
                if cut_site >= start and cut_site < end:
                    nf[cut_site - p1_w] += 1.0
                    # for i in range(max(read.pos + forward_shift, start), min(read.pos + forward_shift + 1, end - 1)):
                    #    nf[i - start] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if cut_site >= start and cut_site < end:
                    nr[cut_site - p1_w] += 1.0
                    # for i in range(max(read.aend + reverse_shift - 1, start), min(read.aend + reverse_shift, end - 1)):
                    #    nr[i - start] += 1.0

                    # if ((not read.is_reverse) and (read.pos > p1_w)): nf[read.pos - p1_w] += 1.0
                    # if ((read.is_reverse) and ((read.aend - 1) < p2_w)): nr[read.aend - 1 - p1_w] += 1.0

        # Smoothed counts
        Nf = []
        Nr = []
        fSum = sum(nf[:window])
        rSum = sum(nr[:window])
        fLast = nf[0]
        rLast = nr[0]
        for i in range((window / 2), len(nf) - (window / 2)):
            Nf.append(fSum)
            Nr.append(rSum)
            fSum -= fLast
            fSum += nf[i + (window / 2)]
            fLast = nf[i - (window / 2) + 1]
            rSum -= rLast
            rSum += nr[i + (window / 2)]
            rLast = nr[i - (window / 2) + 1]

        # Fetching sequence
        currStr = str(fastaFile.fetch(chrName, p1_wk-1, p2_wk-2)).upper()
        currRevComp = AuxiliaryFunctions.revcomp(str(fastaFile.fetch(chrName,p1_wk+2, p2_wk+1)).upper())
        #currStr = str(fastaFile.fetch(chrName, p1_wk, p2_wk - 1)).upper()
        #currRevComp = AuxiliaryFunctions.revcomp(str(fastaFile.fetch(chrName, p1_wk + 1, p2_wk)).upper())

        # Iterating on sequence to create signal
        af = []
        ar = []
        for i in range(int(ceil(k_nb / 2.)), len(currStr) - int(floor(k_nb / 2)) + 1):
            fseq = currStr[i - int(floor(k_nb / 2.)):i + int(ceil(k_nb / 2.))]
            rseq = currRevComp[len(currStr) - int(ceil(k_nb / 2.)) - i:len(currStr) + int(floor(k_nb / 2.)) - i]
            try:
                af.append(fBiasDict[fseq])
            except Exception:
                af.append(defaultKmerValue)
            try:
                ar.append(rBiasDict[rseq])
            except Exception:
                ar.append(defaultKmerValue)

        # Calculating bias and writing to wig file
        fSum = sum(af[:window])
        rSum = sum(ar[:window])
        fLast = af[0]
        rLast = ar[0]
        bias_corrected_signal = []
        bias_corrected_signal_forward = []
        bias_corrected_signal_reverse = []
        for i in range((window / 2), len(af) - (window / 2)):
            nhatf = Nf[i - (window / 2)] * (af[i] / fSum)
            nhatr = Nr[i - (window / 2)] * (ar[i] / rSum)
            zf = log(nf[i] + 1) - log(nhatf + 1)
            zr = log(nr[i] + 1) - log(nhatr + 1)
            bias_corrected_signal_forward.append(zf)
            bias_corrected_signal_reverse.append(zr)
            bias_corrected_signal.append(zf + zr)
            fSum -= fLast
            fSum += af[i + (window / 2)]
            fLast = af[i - (window / 2) + 1]
            rSum -= rLast
            rSum += ar[i + (window / 2)]
            rLast = ar[i - (window / 2) + 1]

        # Fixing the negative number in bias corrected signal
        min_value = abs(min(bias_corrected_signal_forward))
        bias_fixed_signal_forward = [e + min_value for e in bias_corrected_signal_forward]

        min_value = abs(min(bias_corrected_signal_reverse))
        bias_fixed_signal_reverse = [e + min_value for e in bias_corrected_signal_reverse]

        min_value = abs(min(bias_corrected_signal))
        bias_fixed_signal = [e + min_value for e in bias_corrected_signal]

        # Termination
        fastaFile.close()
        if not strands_specific:
            return bias_corrected_signal
        else:
            return bias_fixed_signal_forward, bias_fixed_signal_reverse
Example #33
def bias_correction(chrom, start, end, bam, bias_table, genome_file_name,
                    forward_shift, reverse_shift):
    # Parameters
    window = 50
    defaultKmerValue = 1.0

    # Initialization
    fastaFile = Fastafile(genome_file_name)
    fBiasDict = bias_table[0]
    rBiasDict = bias_table[1]
    k_nb = len(list(fBiasDict.keys())[0])
    p1 = start
    p2 = end
    p1_w = p1 - (window // 2)
    p2_w = p2 + (window // 2)
    p1_wk = p1_w - int(floor(k_nb / 2.))
    p2_wk = p2_w + int(ceil(k_nb / 2.))
    if p1 <= 0 or p1_w <= 0 or p1_wk <= 0 or p2_wk <= 0:
        # Return raw counts
        bc_signal = [0.0] * (p2 - p1)
        for read in bam.fetch(chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue

            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1 <= cut_site < p2:
                    bc_signal[cut_site - p1] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    bc_signal[cut_site - p1] += 1.0

        return bc_signal

    # Raw counts
    nf = [0.0] * (p2_w - p1_w)
    nr = [0.0] * (p2_w - p1_w)
    for read in bam.fetch(chrom, p1_w, p2_w):
        # check if the read is unmapped, according to issue #112
        if read.is_unmapped:
            continue

        if not read.is_reverse:
            cut_site = read.pos + forward_shift
            if p1_w <= cut_site < p2_w:
                nf[cut_site - p1_w] += 1.0
        else:
            cut_site = read.aend + reverse_shift - 1
            if p1_w <= cut_site < p2_w:
                nr[cut_site - p1_w] += 1.0

    # Smoothed counts
    Nf = []
    Nr = []
    f_sum = sum(nf[:window])
    r_sum = sum(nr[:window])
    f_last = nf[0]
    r_last = nr[0]
    for i in range(int(window / 2), len(nf) - int(window / 2)):
        Nf.append(f_sum)
        Nr.append(r_sum)
        f_sum -= f_last
        f_sum += nf[i + int(window / 2)]
        f_last = nf[i - int(window / 2) + 1]
        r_sum -= r_last
        r_sum += nr[i + int(window / 2)]
        r_last = nr[i - int(window / 2) + 1]

    # Fetching sequence
    currStr = str(fastaFile.fetch(chrom, p1_wk, p2_wk - 1)).upper()
    currRevComp = AuxiliaryFunctions.revcomp(
        str(fastaFile.fetch(chrom, p1_wk + 1, p2_wk)).upper())

    # Iterating on sequence to create signal
    af = []
    ar = []
    for i in range(int(ceil(k_nb / 2.)),
                   len(currStr) - int(floor(k_nb / 2)) + 1):
        fseq = currStr[i - int(floor(k_nb / 2.)):i + int(ceil(k_nb / 2.))]
        rseq = currRevComp[len(currStr) - int(ceil(k_nb / 2.)) -
                           i:len(currStr) + int(floor(k_nb / 2.)) - i]
        try:
            af.append(fBiasDict[fseq])
        except Exception:
            af.append(defaultKmerValue)
        try:
            ar.append(rBiasDict[rseq])
        except Exception:
            ar.append(defaultKmerValue)

    # Calculating bias and writing to wig file
    f_sum = sum(af[:window])
    r_sum = sum(ar[:window])
    f_last = af[0]
    r_last = ar[0]
    bc_signal = []
    for i in range(int(window / 2), len(af) - int(window / 2)):
        nhatf = Nf[i - int(window / 2)] * (af[i] / f_sum)
        nhatr = Nr[i - int(window / 2)] * (ar[i] / r_sum)
        bc_signal.append(nhatf + nhatr)
        f_sum -= f_last
        f_sum += af[i + int(window / 2)]
        f_last = af[i - int(window / 2) + 1]
        r_sum -= r_last
        r_sum += ar[i + int(window / 2)]
        r_last = ar[i - int(window / 2) + 1]

    # Termination
    fastaFile.close()
    return bc_signal
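This function is effectively the Python 3 port of the method in Example #32. The load-bearing changes are integer division (window // 2 rather than window / 2, since / returns a float in Python 3) and list(fBiasDict.keys())[0], since dict views are no longer indexable. A two-line illustration:

window = 50
print(window / 2, window // 2)  # 25.0 25; only the second is a valid list index
d = {'ACGT': 1.0}
print(list(d.keys())[0])  # dict views must be wrapped in list() before indexing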
Example #34
    def print_signal(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                     initial_clip=1000, per_norm=98, per_slope=98, bias_table=None, genome_file_name=None,
                     raw_signal_file=None, bc_signal_file=None, norm_signal_file=None, strand_specific=False):

        if raw_signal_file:
            pileup_region = PileupRegion(start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift)
            if ps_version == "0.7.5":
                self.bam.fetch(reference=ref, start=start, end=end, callback=pileup_region)
            else:
                iter = self.bam.fetch(reference=ref, start=start, end=end)
                for alignment in iter:
                    pileup_region.__call__(alignment)
            raw_signal = array([min(e, initial_clip) for e in pileup_region.vector])

            f = open(raw_signal_file, "a")
            f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                [str(e) for e in nan_to_num(raw_signal)]) + "\n")
            f.close()

        if bc_signal_file or norm_signal_file:
            # Parameters
            window = 50
            defaultKmerValue = 1.0

            # Initialization
            fasta = Fastafile(genome_file_name)
            fBiasDict = bias_table[0]
            rBiasDict = bias_table[1]
            k_nb = len(fBiasDict.keys()[0])
            p1 = start
            p2 = end
            p1_w = p1 - (window / 2)
            p2_w = p2 + (window / 2)
            p1_wk = p1_w - int(k_nb / 2.)
            p2_wk = p2_w + int(k_nb / 2.)

            currStr = str(fasta.fetch(ref, p1_wk, p2_wk - 1)).upper()
            currRevComp = AuxiliaryFunctions.revcomp(str(fasta.fetch(ref, p1_wk + 1, p2_wk)).upper())

            # Iterating on sequence to create the bias signal
            signal_bias_f = []
            signal_bias_r = []
            for i in range(int(k_nb / 2.), len(currStr) - int(k_nb / 2) + 1):
                fseq = currStr[i - int(k_nb / 2.):i + int(k_nb / 2.)]
                rseq = currRevComp[len(currStr) - int(k_nb / 2.) - i:len(currStr) + int(k_nb / 2.) - i]
                try:
                    signal_bias_f.append(fBiasDict[fseq])
                except Exception:
                    signal_bias_f.append(defaultKmerValue)
                try:
                    signal_bias_r.append(rBiasDict[rseq])
                except Exception:
                    signal_bias_r.append(defaultKmerValue)

            # Raw counts
            signal_raw_f = [0.0] * (p2_w - p1_w)
            signal_raw_r = [0.0] * (p2_w - p1_w)
            for read in self.bam.fetch(ref, p1_w, p2_w):
                if not read.is_reverse:
                    cut_site = read.pos + forward_shift
                    if p1_w <= cut_site < p2_w:
                        signal_raw_f[cut_site - p1_w] += 1.0
                else:
                    cut_site = read.aend + reverse_shift - 1
                    if p1_w <= cut_site < p2_w:
                        signal_raw_r[cut_site - p1_w] += 1.0

            # Smoothed counts
            Nf = []
            Nr = []
            fSum = sum(signal_raw_f[:window])
            rSum = sum(signal_raw_r[:window])
            fLast = signal_raw_f[0]
            rLast = signal_raw_r[0]
            for i in range((window / 2), len(signal_raw_f) - (window / 2)):
                Nf.append(fSum)
                Nr.append(rSum)
                fSum -= fLast
                fSum += signal_raw_f[i + (window / 2)]
                fLast = signal_raw_f[i - (window / 2) + 1]
                rSum -= rLast
                rSum += signal_raw_r[i + (window / 2)]
                rLast = signal_raw_r[i - (window / 2) + 1]

            # Calculating bias and writing to wig file
            fSum = sum(signal_bias_f[:window])
            rSum = sum(signal_bias_r[:window])
            fLast = signal_bias_f[0]
            rLast = signal_bias_r[0]
            signal_bc = []
            signal_bc_f = []
            signal_bc_r = []
            for i in range((window / 2), len(signal_bias_f) - (window / 2)):
                nhatf = Nf[i - (window / 2)] * (signal_bias_f[i] / fSum)
                nhatr = Nr[i - (window / 2)] * (signal_bias_r[i] / rSum)
                signal_bc.append(nhatf + nhatr)
                signal_bc_f.append(nhatf)
                signal_bc_r.append(nhatr)
                fSum -= fLast
                fSum += signal_bias_f[i + (window / 2)]
                fLast = signal_bias_f[i - (window / 2) + 1]
                rSum -= rLast
                rSum += signal_bias_r[i + (window / 2)]
                rLast = signal_bias_r[i - (window / 2) + 1]

            if bc_signal_file:
                f = open(bc_signal_file, "a")
                f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                    [str(e) for e in nan_to_num(signal_bc)]) + "\n")
                f.close()

                if strand_specific:
                    prefix = bc_signal_file.split(".")[0]
                    bc_signal_file_f = prefix + "_Forward" + ".bc.wig"
                    bc_signal_file_r = prefix + "_Reverse" + ".bc.wig"
                    f = open(bc_signal_file_f, "a")
                    f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                        [str(e) for e in nan_to_num(signal_bc_f)]) + "\n")
                    f.close()
                    f = open(bc_signal_file_r, "a")
                    f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                        [str(e) for e in nan_to_num(signal_bc_r)]) + "\n")
                    f.close()

            if norm_signal_file:
                norm_signal_bc = self.boyle_norm(signal_bc)
                perc = scoreatpercentile(norm_signal_bc, 98)
                std = np.std(norm_signal_bc)
                norm_signal_bc = self.hon_norm_atac(norm_signal_bc, perc, std)
                f = open(norm_signal_file, "a")
                f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                    [str(e) for e in nan_to_num(norm_signal_bc)]) + "\n")
                f.close()

                if strand_specific:
                    prefix = bc_signal_file.split(".")[0]
                    norm_signal_file_f = prefix + "_Forward" + ".norm.wig"
                    norm_signal_file_r = prefix + "_Reverse" + ".norm.wig"

                    signal_norm_f = self.boyle_norm(signal_bc_f)
                    perc = scoreatpercentile(signal_norm_f, 98)
                    std = np.std(signal_norm_f)
                    signal_norm_f = self.hon_norm_atac(signal_norm_f, perc, std)

                    signal_norm_r = self.boyle_norm(signal_bc_r)
                    perc = scoreatpercentile(signal_norm_r, 98)
                    std = np.std(signal_norm_r)
                    signal_norm_r = self.hon_norm_atac(signal_norm_r, perc, std)

                    f = open(norm_signal_file_f, "a")
                    f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                        [str(e) for e in nan_to_num(signal_norm_f)]) + "\n")
                    f.close()
                    f = open(norm_signal_file_r, "a")
                    f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                        [str(e) for e in nan_to_num(signal_norm_r)]) + "\n")
                    f.close()
Example #35
def estimate_bias_pwm(args):
    # Parameters
    max_duplicates = 100

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    obs_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    exp_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    obs_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    exp_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])

    # Iterating on HS regions
    for region in regions:
        # Initialization
        prev_pos = -1
        true_counter = 0

        # Evaluating observed frequencies
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):
            # Calculating positions
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            else:
                cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts
            if p1 == prev_pos:
                true_counter += 1
            else:
                prev_pos = p1
                true_counter = 0
            if true_counter > max_duplicates: continue

            # Fetching k-mer
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary
            if not r.is_reverse:
                for i in range(0, len(currStr)):
                    obs_f_pwm_dict[currStr[i]][i] += 1
            else:
                for i in range(0, len(currStr)):
                    obs_r_pwm_dict[currStr[i]][i] += 1

        # Evaluating expected frequencies
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue

        # Iterating on each sequence position
        s = None
        for i in range(0, len(currStr) - args.k_nb):
            # Counting k-mer in dictionary
            s = currStr[i:i + args.k_nb]
            for j in range(0, len(s)):
                exp_f_pwm_dict[s[j]][j] += 1

            # Counting k-mer in dictionary for reverse complement
            s = AuxiliaryFunctions.revcomp(s)
            for j in range(0, len(s)):
                exp_r_pwm_dict[s[j]][j] += 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Output pwms
    os.system("mkdir -p " + os.path.join(args.output_location, "pfm"))
    pwm_dict_list = [obs_f_pwm_dict, obs_r_pwm_dict, exp_f_pwm_dict, exp_r_pwm_dict]
    pwm_file_list = []
    pwm_obs_f = os.path.join(args.output_location, "pfm", "obs_{}_f.pfm".format(str(args.k_nb)))
    pwm_obs_r = os.path.join(args.output_location, "pfm", "obs_{}_r.pfm".format(str(args.k_nb)))
    pwm_exp_f = os.path.join(args.output_location, "pfm", "exp_{}_f.pfm".format(str(args.k_nb)))
    pwm_exp_r = os.path.join(args.output_location, "pfm", "exp_{}_r.pfm".format(str(args.k_nb)))

    pwm_file_list.append(pwm_obs_f)
    pwm_file_list.append(pwm_obs_r)
    pwm_file_list.append(pwm_exp_f)
    pwm_file_list.append(pwm_exp_r)

    for i in range(len(pwm_dict_list)):
        with open(pwm_file_list[i], "w") as pwm_file:
            for e in ["A", "C", "G", "T"]:
                pwm_file.write(" ".join([str(int(f)) for f in pwm_dict_list[i][e]]) + "\n")

    motif_obs_f = motifs.read(open(pwm_obs_f), "pfm")
    motif_obs_r = motifs.read(open(pwm_obs_r), "pfm")
    motif_exp_f = motifs.read(open(pwm_exp_f), "pfm")
    motif_exp_r = motifs.read(open(pwm_exp_r), "pfm")

    # Output logos
    os.system("mkdir -p " + os.path.join(args.output_location, "logo"))
    logo_obs_f = os.path.join(args.output_location, "logo", "obs_{}_f.pdf".format(str(args.k_nb)))
    logo_obs_r = os.path.join(args.output_location, "logo", "obs_{}_r.pdf".format(str(args.k_nb)))
    logo_exp_f = os.path.join(args.output_location, "logo", "exp_{}_f.pdf".format(str(args.k_nb)))
    logo_exp_r = os.path.join(args.output_location, "logo", "exp_{}_r.pdf".format(str(args.k_nb)))

    motif_obs_f.weblogo(logo_obs_f, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_obs_r.weblogo(logo_obs_r, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_exp_f.weblogo(logo_exp_f, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)
    motif_exp_r.weblogo(logo_exp_r, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)

    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    k_mer_comb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    bias_table_F = dict([(e, 0.0) for e in k_mer_comb])
    bias_table_R = dict([(e, 0.0) for e in k_mer_comb])
    for k_mer in k_mer_comb:
        obs_f = get_ppm_score(k_mer, motif_obs_f.pwm, args.k_nb)
        exp_f = get_ppm_score(k_mer, motif_exp_f.pwm, args.k_nb)
        bias_table_F[k_mer] = round(obs_f / exp_f, 6)
        obs_r = get_ppm_score(k_mer, motif_obs_r.pwm, args.k_nb)
        exp_r = get_ppm_score(k_mer, motif_exp_r.pwm, args.k_nb)
        bias_table_R[k_mer] = round(obs_r / exp_r, 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
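The final loop condenses the whole procedure: the bias assigned to each k-mer is the ratio of its observed cut-site preference to its expected background preference. A toy version of that ratio, with a hypothetical stand-in for get_ppm_score that multiplies per-position probabilities:

def ppm_score(kmer, ppm):
    # probability of the k-mer under a position probability matrix
    p = 1.0
    for i, base in enumerate(kmer):
        p *= ppm[base][i]
    return p

obs = {'A': [0.7, 0.1], 'C': [0.1, 0.7], 'G': [0.1, 0.1], 'T': [0.1, 0.1]}
exp = {'A': [0.25, 0.25], 'C': [0.25, 0.25], 'G': [0.25, 0.25], 'T': [0.25, 0.25]}
print(round(ppm_score('AC', obs) / ppm_score('AC', exp), 6))  # 7.84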
Example #36
def main_matching():
    """
    Performs motif matching.

    Authors: Eduardo G. Gusmao.
    """

    ###################################################################################################
    # Processing Input Arguments
    ###################################################################################################

    # Initializing Error Handler
    main_error_handler = ErrorHandler()

    # Parameters
    usage_message = "%prog --matching [options] <experiment_matrix>"

    # Initializing Option Parser
    parser = PassThroughOptionParser(usage = usage_message)

    # Parameters Options
    parser.add_option("--organism", dest = "organism", type = "string", metavar="STRING", default = "hg19",
                      help = ("Organism considered on the analysis. Check our full documentation for all available "
                              "options. All default files such as genomes will be based on the chosen organism "
                              "and the data.config file."))
    parser.add_option("--fpr", dest = "fpr", type = "float", metavar="FLOAT", default = 0.0001,
                      help = ("False positive rate cutoff for motif matching."))
    parser.add_option("--precision", dest = "precision", type = "int", metavar="INT", default = 10000,
                      help = ("Score distribution precision for determining false positive rate cutoff."))
    parser.add_option("--pseudocounts", dest = "pseudocounts", type = "float", metavar="FLOAT", default = 0.1,
                      help = ("Pseudocounts to be added to raw counts of each PFM."))
    parser.add_option("--rand-proportion", dest = "rand_proportion", type = "float", metavar="FLOAT", default = 10.0,
                      help = ("If random coordinates need to be created (for further motif enrichment),"
                              "then it will be created a number of coordinates that equals this"
                              "parameter x the number of input regions (in case of multiple regions, the"
                              "larger is considered). If zero (0) is passed, then no random coordinates are created."))
    parser.add_option("--norm-threshold", dest = "norm_threshold", action = "store_true", default = False,
                      help = ("If this option is used, the thresholds for all PWMs will be normalized by their length."
                              "In this scheme, the threshold cutoff is evaluated in the regular way by the given fpr."
                              "Then, all thresholds are divided by the lenght of the motif. The final threshold consists"
                              "of the average between all normalized motif thresholds. This single threshold will be"
                              "applied to all motifs."))

    # Output Options
    parser.add_option("--output-location", dest = "output_location", type = "string", metavar="PATH", default = os.getcwd(),
                      help = ("Path where the output files will be written."))
    parser.add_option("--bigbed", dest = "bigbed", action = "store_true", default = False,
                      help = ("If this option is used, all bed files will be written as bigbed."))
    parser.add_option("--normalize-bitscore", dest = "normalize_bitscore", action = "store_false", default = True,
                      help = ("In order to print bigbed files the scores need to be normalized between 0 and 1000."
                              "This option should be used if real bitscores should be printed in the resulting bed file."
                              "In this case, a bigbed file will not be created."))

    # Processing Options
    options, arguments = parser.parse_args()

    # Additional Parameters
    matching_folder_name = "Match"
    random_region_name = "random_regions"

    ###################################################################################################
    # Initializations
    ###################################################################################################

    # Output folder
    matching_output_location = os.path.join(options.output_location,matching_folder_name)
    try:
        if(not os.path.isdir(matching_output_location)): os.makedirs(matching_output_location)
    except Exception: main_error_handler.throw_error("MM_OUT_FOLDER_CREATION")

    # Default genomic data
    genome_data = GenomeData(options.organism)

    # Default motif data
    motif_data = MotifData()

    ###################################################################################################
    # Reading Input Matrix
    ###################################################################################################

    # Reading arguments
    try:
        input_matrix = arguments[0]
        if(len(arguments) > 1): main_error_handler.throw_warning("MM_MANY_ARG")
    except Exception: main_error_handler.throw_error("MM_NO_ARGUMENT")

    # Create experimental matrix
    try:
        exp_matrix = ExperimentalMatrix()
        exp_matrix.read(input_matrix)
    except Exception: main_error_handler.throw_error("MM_WRONG_EXPMAT")

    ###################################################################################################
    # Reading Regions
    ###################################################################################################

    # Initialization
    max_region_len = 0
    max_region = None
    input_regions = []

    try:
        exp_matrix_objects_dict = exp_matrix.objectsDict
    except Exception: main_error_handler.throw_error("MM_WRONG_EXPMAT")

    # Iterating on experimental matrix objects
    for k in exp_matrix_objects_dict.keys():

        curr_genomic_region = exp_matrix_objects_dict[k]

        # If the object is a GenomicRegionSet
        if(isinstance(curr_genomic_region,GenomicRegionSet)):

            # Sorting input region
            curr_genomic_region.sort()

            # Append label and GenomicRegionSet
            input_regions.append(curr_genomic_region)

            # Verifying max_region_len for random region generation
            curr_len = len(curr_genomic_region)
            if(curr_len > max_region_len):
                max_region_len = curr_len
                max_region = exp_matrix_objects_dict[k]

    ###################################################################################################
    # Creating random region
    ###################################################################################################

    # Create random coordinates
    rand_region = None
    if(options.rand_proportion > 0):

        # Create random coordinates and name it random_regions
        rand_region = max_region.random_regions(options.organism, multiply_factor=options.rand_proportion, chrom_X=True)
        rand_region.sort()
        rand_region.name = random_region_name

        # Put random regions in the end of the input regions
        input_regions.append(rand_region)

        # Writing random regions
        output_file_name = os.path.join(matching_output_location, random_region_name)
        rand_bed_file_name = output_file_name+".bed"
        rand_region.write_bed(rand_bed_file_name)

        # Verifying condition to write bb
        if(options.bigbed):

            # Fetching file with chromosome sizes
            chrom_sizes_file = genome_data.get_chromosome_sizes()

            # Converting to big bed
            rand_bb_file_name = output_file_name+".bb"
            try:
                os.system(" ".join(["bedToBigBed", rand_bed_file_name, chrom_sizes_file, rand_bb_file_name, "-verbose=0"]))
                os.remove(rand_bed_file_name)
            except Exception: pass # WARNING

    else: main_error_handler.throw_error("MM_WRONG_RANDPROP")

    ###################################################################################################
    # Creating PWMs
    ###################################################################################################

    # Initialization
    motif_list = []

    # Creating thresholds object
    thresholds = Thresholds(motif_data)

    # Fetching list with all motif file names
    motif_file_names = []
    for motif_repository in motif_data.get_pwm_list():
        for motif_file_name in glob(os.path.join(motif_repository,"*.pwm")):
            motif_file_names.append(motif_file_name)

    # Iterating on grouped file name list
    for motif_file_name in motif_file_names:

        # Append motif to motif_list
        motif_list.append(Motif(motif_file_name, options.pseudocounts, options.precision, options.fpr, thresholds))

    # Performing normalized threshold strategy if requested
    if(options.norm_threshold):
        threshold_list = [motif.threshold/motif.len for motif in motif_list]
        unique_threshold = sum(threshold_list)/len(threshold_list)
    else: unique_threshold = None

    ###################################################################################################
    # Motif Matching
    ###################################################################################################

    # Creating genome file
    genome_file = Fastafile(genome_data.get_genome())

    # Iterating on list of genomic regions
    for genomic_region_set in input_regions:

        # Initializing output bed file
        output_file_name = os.path.join(matching_output_location, genomic_region_set.name+"_mpbs")
        bed_file_name = output_file_name+".bed"
        output_file = open(bed_file_name,"w")
        
        # Iterating on genomic regions
        for genomic_region in genomic_region_set.sequences:

            # Reading sequence associated to genomic_region
            sequence = str(genome_file.fetch(genomic_region.chrom, genomic_region.initial, genomic_region.final))

            # Splitting the sequence in smaller sequences to remove the "N" regions
            sequence_list = filter(None,sequence.split("N"))

            # Perform motif matching for each motif in each sequence
            for seq in sequence_list:
                for motif in motif_list: match_single(motif, seq, genomic_region, output_file, unique_threshold, options.normalize_bitscore)

        # Closing file
        output_file.close()

        # Verifying condition to write bb
        if(options.bigbed and options.normalize_bitscore):

            # Fetching file with chromosome sizes
            chrom_sizes_file = genome_data.get_chromosome_sizes()

            # Converting to big bed
            sort_file_name = output_file_name+"_sort.bed"
            bb_file_name = output_file_name+".bb"
            os.system("sort -k1,1 -k2,2n "+bed_file_name+" > "+sort_file_name)
            os.system(" ".join(["bedToBigBed", sort_file_name, chrom_sizes_file, bb_file_name, "-verbose=0"]))
            os.remove(bed_file_name); os.remove(sort_file_name)
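
A side note on the matching loop above: sequence.split("N") discards the offset of each N-free fragment inside the region, so any absolute coordinates must be recovered elsewhere. A minimal offset-preserving sketch (this helper is illustrative and not part of the original module):

import re

def split_on_n(sequence):
    """Yield (offset, fragment) for every maximal N-free stretch."""
    for m in re.finditer(r"[^Nn]+", sequence):
        yield m.start(), m.group()

# Offsets are relative to the region start; an absolute coordinate would be
# genomic_region.initial + offset for each fragment.
for offset, frag in split_on_n("ACGTNNNNGGTA"):
    print(offset, frag)  # 0 ACGT, then 8 GGTA
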
Example #37
def estimate_bias_kmer(args):
    # Parameters
    maxDuplicates = 100
    pseudocount = 1.0

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    # Initializing dictionaries
    obsDictF = dict()
    obsDictR = dict()
    expDictF = dict()
    expDictR = dict()

    ct_reads_r = 0
    ct_reads_f = 0
    ct_kmers = 0

    # Iterating on HS regions
    for region in regions:

        # Initialization
        prevPos = -1
        trueCounter = 0

        # Evaluating observed frequencies ####################################
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):

            # Calculating positions
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            else:
                cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts
            if p1 == prevPos:
                trueCounter += 1
            else:
                prevPos = p1
                trueCounter = 0
            if trueCounter > maxDuplicates: continue

            # Fetching k-mer
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary
            if not r.is_reverse:
                ct_reads_f += 1
                try:
                    obsDictF[currStr] += 1
                except Exception:
                    obsDictF[currStr] = 1
            else:
                ct_reads_r += 1
                try:
                    obsDictR[currStr] += 1
                except Exception:
                    obsDictR[currStr] = 1

        # Evaluating expected frequencies ####################################
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        currRevComp = AuxiliaryFunctions.revcomp(currStr)

        # Iterating on each sequence position
        for i in range(0, len(currStr) - args.k_nb):
            ct_kmers += 1
            # Counting k-mer in dictionary
            s = currStr[i:i + args.k_nb]
            try:
                expDictF[s] += 1
            except Exception:
                expDictF[s] = 1

            # Counting k-mer in dictionary for reverse complement
            s = currRevComp[i:i + args.k_nb]
            try:
                expDictR[s] += 1
            except Exception:
                expDictR[s] = 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    kmerComb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    bias_table_F = dict([(e, 0.0) for e in kmerComb])
    bias_table_R = dict([(e, 0.0) for e in kmerComb])
    for kmer in kmerComb:
        try:
            obsF = obsDictF[kmer] + pseudocount
        except Exception:
            obsF = pseudocount
        try:
            expF = expDictF[kmer] + pseudocount
        except Exception:
            expF = pseudocount
        if ct_reads_f == 0:
            bias_table_F[kmer] = 1
        else:
            bias_table_F[kmer] = round(float(obsF / ct_reads_f) / float(expF / ct_kmers), 6)
        try:
            obsR = obsDictR[kmer] + pseudocount
        except Exception:
            obsR = pseudocount
        try:
            expR = expDictR[kmer] + pseudocount
        except Exception:
            expR = pseudocount
        if ct_reads_r == 0:
            bias_table_R[kmer] = 1
        else:
            bias_table_R[kmer] = round(float(obsR / ct_reads_r) / float(expR / ct_kmers), 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
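
write_table is defined elsewhere in the package and not shown here. A minimal sketch of what such a helper might do, assuming each table is written as tab-separated kmer/bias lines; the file naming is hypothetical:

import os

def write_table(output_location, output_prefix, tables):
    """Illustrative sketch: dump forward/reverse bias tables as TSV files."""
    for table, suffix in zip(tables, ("F", "R")):
        fname = os.path.join(output_location, "{0}_{1}.txt".format(output_prefix, suffix))
        with open(fname, "w") as fh:
            for kmer in sorted(table):
                fh.write("{0}\t{1}\n".format(kmer, table[kmer]))
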
Example #38
class VariantAnnotator(object):
  def __init__(self, gene_db, reference_fasta):
    self.reference  = Fastafile(reference_fasta)
    self.con        = open_genedb(gene_db)
    self.gene_cache = OrderedDict()

    self.band_map = band_map = defaultdict(IntervalTree)
    for band in get_cytobands(self.con):
      band_map[band.chrom].insert(band.start,band.end,band)
      if band.chrom.startswith('chr') and band.chrom[3:] not in band_map:
        band_map[band.chrom[3:]] = band_map[band.chrom]

    trans = get_transcripts(self.con)
    trans = progress_loop(trans, label='Loading transcripts: ', units='transcripts')

    self.feature_map = feature_map = defaultdict(IntervalTree)
    for gene in trans:
      feature_map[gene.chrom].insert(gene.txStart,gene.txEnd,gene)

      if 0: # DEBUG
        parts = self.decode_gene(gene)
        for part in parts:
          if part.type not in ('intron','UTR5','UTR3','UTR') and '_' not in part.chrom:
            print '\t'.join(map(str,[part.chrom,part.start,part.end,gene.symbol]))

    sys.stderr.write('Loading complete.\n')


  def decode_gene(self,gene):
    gene_cache = self.gene_cache
    key = gene.id

    try:
      parts = gene_cache.pop(key)
    except KeyError:
      partlist = list(decode_gene(gene))

      parts = IntervalTree()
      for part in partlist:
        parts.insert(part.start,part.end,part)

      if len(gene_cache)>=300:
        gene_cache.popitem(0)

    # Add result to end of LRU
    gene_cache[key] = parts

    return parts


  def annotate(self, chrom, ref_start, ref_end, variant, nsonly=False):
    variant = variant.replace('-','')

    ref_nuc = self.reference.fetch(chrom,ref_start,ref_end).upper()
    var_nuc = variant.upper()

    evidence = []
    for feature in self.feature_map[chrom].find(ref_start, ref_end):
      evidence.extend( self.classify_feature(feature.value, ref_start, ref_end, ref_nuc, var_nuc) )

    #ns = any('NON-SYNONYMOUS' in e[3] for e in evidence)
    #if nsonly and not ns:
    #  return []

    # If not in a gene, check to see if there are any genes nearby
    if not evidence:
      five_prime  = set()
      three_prime = set()

      for feature in self.feature_map[chrom].find(ref_start-2000, ref_end+2000):
        gene = feature.value
        if (0<ref_end-gene.txStart<=2000) ^ (gene.strand=='-'):
          five_prime.add(gene)
        else:
          three_prime.add(gene)

      for gene in five_prime:
        evidence.append( ['UPSTREAM_GENE',gene,'',False,'','',ref_nuc,var_nuc,'',''] )

      for gene in three_prime:
        evidence.append( ['DOWNSTREAM_GENE',gene,'',False,'','',ref_nuc,var_nuc,'',''] )

    if not evidence:
      evidence.append( ['intergenic','','',False,'','',ref_nuc,var_nuc,'',''] )

    evidence = group_evidence(evidence)
    cytoband = cytoband_name(self.band_map[chrom].find_values(ref_start,ref_end))
    context  = [ chrom,cytoband,ref_start,ref_end ]

    if 0: # evidence:
      print
      for e in evidence:
        values = context+e
        for f,v in zip(GeneEvidence.__slots__,values):
          print '%15s = %s' % (f,v)
        print

    evidence = [ GeneEvidence._make(context+e) for e in evidence ]

    return evidence


  def classify_feature(self, gene, ref_start, ref_end, ref_nuc, var_nuc):
    gene_parts = self.decode_gene(gene)

    intersect = defaultdict(list)
    for part in gene_parts.find_values(ref_start, ref_end):
      intersect[part.type].append(part)

    evidence = []

    parts    = set(intersect)
    mut_type = set()

    for splice in gene_parts.find_values(ref_start-5,ref_end+5):
      if splice.type=='CDS' or 'UTR' in splice.type:
        if (0<splice.start-ref_end<=5) or (0<ref_start-splice.end<=5):
          mut_type.add('POSSIBLE_INTRONIC_SPLICE_VARIANT')

    parts    = ','.join(sorted(parts))
    mut_type = ','.join(sorted(mut_type))

    if len(intersect)==1 and len(intersect['CDS'])==1:
      e = self.classify_exonic_variant(gene, gene_parts, intersect['CDS'][0],
                                       ref_start, ref_end, ref_nuc, var_nuc)
      evidence.append(e)
    elif len(intersect['CDS']):
      evidence.append([parts,gene,'',True,'NON-SYNONYMOUS',mut_type,ref_nuc,var_nuc,'',''])
    elif mut_type:
      evidence.append([parts,gene,'',True,'PREDICTED-DISRUPT-TRANSCRIPT',mut_type,ref_nuc,var_nuc,'',''])
    elif len(intersect['UTR5'])+len(intersect['UTR3']):
      evidence.append([parts,gene,'',False,'UNKNOWN-UTR',mut_type,ref_nuc,var_nuc,'',''])
    elif len(intersect['intron']):
      evidence.append([parts,gene,'',False,'UNKNOWN-INTRONIC',mut_type,ref_nuc,var_nuc,'',''])
    else:
      evidence.append([parts,gene,'',False,'UNKNOWN-INTERGENIC',mut_type,ref_nuc,var_nuc,'',''])

    return evidence


  def classify_exonic_variant(self, gene, gene_parts, cds, ref_start, ref_end, ref_nuc, var_nuc):
    result = ['CDS',gene,'mRNA=%s:protein=%s:exon=%d:strand=%s' % \
                         (gene.mRNA,gene.protein,cds.exon_num,gene.strand)]

    exon_start = ref_start - cds.start
    exon_end   = ref_end   - cds.start

    # FIXME: Report ref and var nuc relative to gene strand

    var_nuc = var_nuc.upper()

    #print gene.chrom,ref_start,ref_end,ref_nuc,var_nuc
    #assert len(ref_nuc)==(ref_end-ref_start)

    if ref_nuc==var_nuc:
      result += [False,'SYNONYMOUS','REFERENCE',ref_nuc,var_nuc,'','']
      return result

    ref_frame  = len(ref_nuc)%3
    var_frame  = len(var_nuc)%3
    frameshift = (len(ref_nuc)-len(var_nuc))%3

    if 0:
      print '  REF_FRAME: %d' % ref_frame
      print '  VAR_FRAME: %d' % var_frame

    mut_type = []

    if len(ref_nuc)==len(var_nuc):
      mut_type.append('SUBSTITUTION')
    elif len(ref_nuc)>len(var_nuc):
      mut_type.append('DELETION')
    else:
      mut_type.append('INSERTION')

    if exon_start<5:
      mut_type.append('POSSIBLE-SPLICE5')
    if cds.end-exon_end<5:
      mut_type.append('POSSIBLE-SPLICE3')

    if ref_frame!=var_frame:
      mut_type.append('FRAMESHIFT')
      mut_type = ','.join(sorted(mut_type))
      result += [True,'NON-SYNONYMOUS',mut_type,ref_nuc,var_nuc,'','']
      return result

    # FIXME: Request 100 bases beyond end of transcription
    ref_var_start = 0
    ref_cds_seq   = []
    for part in gene_parts:
      if part.type=='CDS':
        seq = Seq(self.reference.fetch(part.chrom,part.start,part.end))
        #assert len(seq)==(end-start)
        ref_cds_seq.append(seq)
        if part.cds_index<cds.cds_index:
          ref_var_start += len(seq)
        elif part.cds_index==cds.cds_index:
          ref_var_start += exon_start

    #assert ref_nuc==str(ref_cds_seq[cds.cds_index][exon_start:exon_end]).upper()

    if 0:
      print '  CDS  : %d-%d' % (cds.start,cds.end)
      print '  VAR  : %d-%d' % (ref_start,ref_end)
      print '  LOCAL: %d-%d (size=%d)' % (exon_start,exon_end,len(ref_cds_seq[cds.cds_index]))

    var_cds_seq = ref_cds_seq[:]

    v = list(var_cds_seq[cds.cds_index])
    v[exon_start:exon_end] = list(var_nuc)
    var_cds_seq[cds.cds_index] = ''.join(v)

    ref_cds = Seq(''.join(str(s) for s in ref_cds_seq))
    var_cds = Seq(''.join(str(s) for s in var_cds_seq))

    if gene.strand=='-':
      ref_var_start = len(ref_cds)-ref_var_start-1
      ref_cds       = ref_cds.reverse_complement()
      var_cds       = var_cds.reverse_complement()
      ref_cds_nuc   = str(Seq(ref_nuc).reverse_complement())
      var_cds_nuc   = str(Seq(var_nuc).reverse_complement())
    else:
      ref_cds_nuc   = ref_nuc
      var_cds_nuc   = var_nuc

    try:
      ref_cds_aa = ref_cds.translate()
      var_cds_aa = var_cds.translate()
    except TranslationError:
      mut_type.append('INVALID_TRANSLATION')
      mut_type = ','.join(sorted(mut_type))
      result += [True,'PRESUMED_NON-SYNONYMOUS',mut_type,ref_cds_nuc,var_cds_nuc,'','']
      return result

    ref_aa,var_aa,aa_position = reduce_match(str(ref_cds_aa),str(var_cds_aa))

    if not ref_aa and not var_aa:
      mut_type = ','.join(sorted(mut_type))

      codon_start  = ref_var_start-ref_var_start%3
      codon_end    = ref_var_start+len(ref_nuc)
      if codon_end%3:
        codon_end += 3-codon_end%3

      aa_position = codon_start//3
      ref_frame   = ref_cds[codon_start:codon_end]
      ref_aa      = ref_frame.translate()

      #assert len(ref_aa)

      result[-1] += ':aa=%d' % (aa_position+1)
      result += [False,'SYNONYMOUS',mut_type,ref_cds_nuc,var_cds_nuc,str(ref_aa),str(ref_aa)]
      return result

    # Classify non-synonymous change by comparing AA sequences

    # Make sure ref protein doesn't appear to have spurious stops

    r = ref_cds_aa.rstrip('*')
    v = var_cds_aa.rstrip('*')

    ref_stop = r.find('*')
    var_stop = v.find('*')

    if ref_stop==-1:
      if var_stop!=-1 and not v.startswith(r):
        mut_type.append('PREMATURE_STOP')
      elif ref_cds_aa[-1]=='*' and var_cds_aa[-1]!='*':
        mut_type.append('LOSS_OF_STOP')

    if 0:
      print '  REF_NUC:',ref_cds_nuc
      print '  VAR_NUC:',var_cds_nuc
      print '   REF_AA:',ref_aa
      print '   VAR_AA:',var_aa
      #print '  NUC_DIFF:',levenshtein_sequence(str(ref_cds),str(var_cds))
      #print '  AA_DIFF: ',levenshtein_sequence(str(ref_aa), str(var_aa) )

      ref_size = ref_end-ref_start
      cds_size = len(ref_cds)
      print '  CDS_SIZE=%d (%.1f codons)' % (cds_size,cds_size/3.0)
      print '  CDS SEQ=%s' % ref_cds

      assert not ref_cds or str(ref_cds[:3])=='ATG'

    mut_type = ','.join(sorted(mut_type))
    result[-1] += ':aa=%d' % (aa_position+1)
    result     += [True,'NON-SYNONYMOUS',mut_type,ref_cds_nuc,var_cds_nuc,str(ref_aa),str(var_aa)]

    return result
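
A usage sketch for the class above. The database and FASTA paths are placeholders, and coordinates follow the 0-based, half-open convention that Fastafile.fetch expects:

# Hypothetical inputs: a gene database and an indexed reference FASTA.
annotator = VariantAnnotator('genes.db', 'reference.fa')

# Classify a single-nucleotide variant (A) at chr1:1000000-1000001.
for ev in annotator.annotate('chr1', 1000000, 1000001, 'A'):
    print(ev)
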
Example #39
    def estimate_table(self, regions, dnase_file_name, genome_file_name, k_nb, shift):
        """ 
        Estimates bias based on HS regions, DNase-seq signal and genomic sequences.

        Keyword arguments:
        regions -- DNase-seq HS regions.
        dnase_file_name -- DNase-seq file name.
        genome_file_name -- Genome to fetch genomic sequences from.
        
        Return:
        bias_table_F, bias_table_R -- Bias tables.
        """

        # Parameters
        maxDuplicates = 100
        pseudocount = 1.0

        # Initializing bam and fasta
        if(dnase_file_name.split(".")[-1].upper() != "BAM"): return None # TODO ERROR
        bamFile = Samfile(dnase_file_name, "rb")
        fastaFile = Fastafile(genome_file_name)

        # Initializing dictionaries
        obsDictF = dict(); obsDictR = dict()
        expDictF = dict(); expDictR = dict()

        ct_reads_r=0
        ct_reads_f=0
        ct_kmers=0

        # Iterating on HS regions
        for region in regions:

            # Initialization
            prevPos = -1
            trueCounter = 0

            # Evaluating observed frequencies ####################################

            # Fetching reads
            for r in bamFile.fetch(region.chrom, region.initial, region.final):

                # Calculating positions
                if(not r.is_reverse): p1 = r.pos - (k_nb/2) - 1 + shift
                else: p1 = r.aend - (k_nb/2) + 1 - shift
                p2 = p1 + k_nb

                # Verifying PCR artifacts
                if(p1 == prevPos): trueCounter += 1
                else:
                    prevPos = p1
                    trueCounter = 0
                if(trueCounter > maxDuplicates): continue

                # Fetching k-mer
                try: currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
                except Exception: continue
                if(r.is_reverse): currStr = AuxiliaryFunctions.revcomp(currStr)

                # Counting k-mer in dictionary
                if(not r.is_reverse):
                    ct_reads_f+=1
                    try: obsDictF[currStr] += 1
                    except Exception: obsDictF[currStr] = 1
                else:
                    ct_reads_r+=1
                    try: obsDictR[currStr] += 1
                    except Exception: obsDictR[currStr] = 1


            # Evaluating expected frequencies ####################################

            # Fetching whole sequence
            try: currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
            except Exception: continue
            currRevComp = AuxiliaryFunctions.revcomp(currStr)

            # Iterating on each sequence position
            for i in range(0,len(currStr)-k_nb):
                ct_kmers+=1
                # Counting k-mer in dictionary
                s = currStr[i:i+k_nb]
                try: expDictF[s] += 1
                except Exception: expDictF[s] = 1

                # Counting k-mer in dictionary for reverse complement
                s = currRevComp[i:i+k_nb]
                try: expDictR[s] += 1
                except Exception: expDictR[s] = 1

        # Closing files
        bamFile.close()
        fastaFile.close()

        # Creating bias dictionary
        alphabet = ["A","C","G","T"]
        kmerComb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
        bias_table_F = dict([(e,0.0) for e in kmerComb]) 
        bias_table_R = dict([(e,0.0) for e in kmerComb]) 
        for kmer in kmerComb:
            try: obsF = obsDictF[kmer] + pseudocount
            except Exception: obsF = pseudocount
            try: expF = expDictF[kmer] + pseudocount
            except Exception: expF = pseudocount
            if(ct_reads_f == 0): bias_table_F[kmer] = 1
            else: bias_table_F[kmer] = round(float(obsF/ct_reads_f)/float(expF/ct_kmers),6)
            try: obsR = obsDictR[kmer] + pseudocount
            except Exception: obsR = pseudocount
            try: expR = expDictR[kmer] + pseudocount
            except Exception: expR = pseudocount
            if(ct_reads_r == 0): bias_table_R[kmer] = 1
            else: bias_table_R[kmer] = round(float(obsR/ct_reads_r)/float(expR/ct_kmers),6)

        # Return
        return [bias_table_F, bias_table_R]
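
An invocation sketch for estimate_table. Judging by Example #41 it lives on a BiasTable object, but that attribution, as well as the file names below, are assumptions; region loading follows the other examples on this page:

# Hypothetical setup: HS regions plus DNase-seq BAM and genome FASTA paths.
regions = GenomicRegionSet("HS_regions")
regions.read_bed("hs_regions.bed")

tables = BiasTable().estimate_table(regions=regions,
                                    dnase_file_name="dnase.bam",
                                    genome_file_name="genome.fa",
                                    k_nb=6, shift=0)
if tables is not None:
    bias_table_F, bias_table_R = tables
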
Example #40
def snp_workflow(ex, job, assembly, minsnp=40., mincov=5, path_to_ref=None, via='local',
                 logfile=sys.stdout, debugfile=sys.stderr):
    """Main function of the workflow"""
    ref_genome = assembly.fasta_by_chrom
    sample_names = [job.groups[gid]['name'] for gid in sorted(job.files.keys())]

    logfile.write("\n* Generate vcfs for each chrom/group\n"); logfile.flush()
    vcfs = dict((chrom,{}) for chrom in ref_genome.keys()) # {chr: {}}
    bams = {}
    # Launch the jobs
    for gid in sorted(job.files.keys()):
        # Merge all bams belonging to the same group
        runs = [r['bam'] for r in job.files[gid].itervalues()]
        bam = Samfile(runs[0])
        header = bam.header
        headerfile = unique_filename_in()
        for h in header["SQ"]:
            if h["SN"] in assembly.chrmeta:
                h["SN"] = assembly.chrmeta[h["SN"]]["ac"]
        head = Samfile( headerfile, "wh", header=header )
        head.close()
        if len(runs) > 1:
            _b = merge_bam(ex,runs)
            index_bam(ex,_b)
            bams[gid] = _b
        else:
            bams[gid] = runs[0]
        # Samtools mpileup + bcftools + vcfutils.pl
        for chrom,ref in ref_genome.iteritems():
            vcf = unique_filename_in()
            vcfs[chrom][gid] = (vcf,
                                pileup.nonblocking(ex, bams[gid], ref, header=headerfile,
                                                   via=via, stdout=vcf))
        logfile.write("  ...Group %s running.\n" %job.groups[gid]['name']); logfile.flush()
    # Wait for vcfs to finish and store them in *vcfs[chrom][gid]*
    for gid in sorted(job.files.keys()):
        for chrom,ref in ref_genome.iteritems():
            vcfs[chrom][gid][1].wait()
            vcfs[chrom][gid] = vcfs[chrom][gid][0]
        logfile.write("  ...Group %s done.\n" %job.groups[gid]['name']); logfile.flush()
    # Targz the pileup files (vcf)
    tarname = unique_filename_in()
    tarfh = tarfile.open(tarname, "w:gz")
    for chrom,v in vcfs.iteritems():
        for gid,vcf in v.iteritems():
            tarfh.add(vcf, arcname="%s_%s.vcf" % (job.groups[gid]['name'],chrom))
    tarfh.close()
    ex.add( tarname, description=set_file_descr("vcfs_files.tar.gz",step="pileup",type="tar",view='admin') )

    logfile.write("\n* Merge info from vcf files\n"); logfile.flush()
    outall = unique_filename_in()
    outexons = unique_filename_in()
    with open(outall,"w") as fout:
        fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+ \
                                 ['gene','location_type','distance'])+'\n')
    with open(outexons,"w") as fout:
        fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+['exon','strand','ref_aa'] \
                                  + ['new_aa_'+s for s in sample_names])+'\n')
    msa_table = dict((s,'') for s in [assembly.name]+sample_names)
    for chrom,v in vcfs.iteritems():
        logfile.write("  > Chromosome '%s'\n" % chrom); logfile.flush()
        # Put together info from all vcf files
        logfile.write("  - All SNPs\n"); logfile.flush()
        allsnps = all_snps(ex,chrom,vcfs[chrom],bams,outall,assembly,
                           sample_names,mincov,float(minsnp),logfile,debugfile)
        # Annotate SNPs and check synonymy
        logfile.write("  - Exonic SNPs\n"); logfile.flush()
        exon_snps(chrom,outexons,allsnps,assembly,sample_names,ref_genome,logfile,debugfile)
        for snprow in allsnps:
            for n,k in enumerate([assembly.name]+sample_names):
                msa_table[k] += snprow[3+n][0]
    description = set_file_descr("allSNP.txt",step="SNPs",type="txt")
    ex.add(outall,description=description)
    description = set_file_descr("exonsSNP.txt",step="SNPs",type="txt")
    ex.add(outexons,description=description)
    msafile = unique_filename_in()
    with open(msafile,"w") as msa:
        msa.write(" %i %i\n"%(len(msa_table),len(msa_table.values()[0])))
        for name,seq in msa_table.iteritems():
            msa.write("%s\t%s\n" %(name,seq))
    msa_table = {}
    description = set_file_descr("SNPalignment.txt",step="SNPs",type="txt")
    ex.add(msafile,description=description)
    # Create UCSC bed tracks
    logfile.write("\n* Create tracks\n"); logfile.flush()
    create_tracks(ex,outall,sample_names,assembly)
    # Create quantitative tracks
    logfile.write("\n* Create heteroz. and quality tracks\n"); logfile.flush()

    def _process_pileup(pileups, seq, startpos, endpos):
        atoi = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
        vectors = ([],[],[])
        for pileupcolumn in pileups:
            position = pileupcolumn.pos
            if position < startpos: continue
            if position >= endpos: break
            coverage = pileupcolumn.n
            ref_symbol = seq[position-startpos]
            ref = atoi.get(ref_symbol, 4)
            symbols = [0,0,0,0,0]
            quality = 0
            for pileupread in pileupcolumn.pileups:
                symbols[atoi.get(pileupread.alignment.seq[pileupread.qpos], 4)] += 1
                quality += ord(pileupread.alignment.qual[pileupread.qpos])-33
            quality = float(quality)/coverage
            info = heterozygosity(ref, symbols[0:4])
            if coverage > 0: vectors[0].append((position, position+1, coverage))
            if info > 0: vectors[1].append((position, position+1, info))
            if quality > 0: vectors[2].append((position, position+1, quality))
#            yield (position, position+1, coverage, info, quality)
        return vectors

    if job.options.get('make_bigwigs',False):
        _descr = {'groupId':0,'step':"tracks",'type':"bigWig",'ucsc':'1'}
        for gid,bamfile in bams.iteritems():
            _descr['groupId'] = gid
            bamtr = track(bamfile,format="bam")
            covname = unique_filename_in()+".bw"
            out_cov = track(covname, chrmeta=assembly.chrmeta)
            hetname = unique_filename_in()+".bw"
            out_het = track(hetname, chrmeta=assembly.chrmeta)
            qualname = unique_filename_in()+".bw"
            out_qual = track(qualname, chrmeta=assembly.chrmeta)
            for chrom, cinfo in assembly.chrmeta.iteritems():
                fasta = Fastafile(ref_genome[chrom])
                #process fasta and bam by 10Mb chunks
                for chunk in range(0,cinfo["length"],10**7):
                    fastaseq = fasta.fetch(cinfo['ac'], chunk, chunk+10**7)
                    vecs = _process_pileup(bamtr.pileup(chrom, chunk, chunk+10**7), fastaseq, chunk, chunk+10**7)
                    out_cov.write(vecs[0], fields=['start','end','score'], chrom=chrom)
                    out_het.write(vecs[1], fields=['start','end','score'], chrom=chrom)
                    out_qual.write(vecs[2], fields=['start','end','score'], chrom=chrom)
            out_cov.close()
            out_het.close()
            out_qual.close()
            description = set_file_descr(job.groups[gid]['name']+"_coverage.bw",**_descr)
            ex.add(covname,description=description)
            description = set_file_descr(job.groups[gid]['name']+"_heterozygosity.bw",**_descr)
            ex.add(hetname,description=description)
            description = set_file_descr(job.groups[gid]['name']+"_quality.bw",**_descr)
            ex.add(qualname,description=description)

    return 0
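
The per-chunk pileup pattern in _process_pileup can be exercised on its own. A minimal standalone sketch with pysam; the BAM file name is a placeholder, and truncate=True keeps pileup columns inside each window:

import pysam

CHUNK = 10**7
bam = pysam.Samfile("sample.bam", "rb")
for chrom, length in zip(bam.references, bam.lengths):
    for start in range(0, length, CHUNK):
        for col in bam.pileup(chrom, start, min(start + CHUNK, length), truncate=True):
            # each column exposes .pos, .n and .pileups, as used in _process_pileup
            pass
bam.close()
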
Example #41
    def line(self):
        signal = GenomicSignal(self.bam_file)
        signal.load_sg_coefs(slope_window_size=9)
        bias_table = BiasTable()
        bias_table_list = self.bias_table.split(",")
        table = bias_table.load_table(table_file_name_F=bias_table_list[0],
                                      table_file_name_R=bias_table_list[1])
        genome_data = GenomeData(self.organism)
        fasta = Fastafile(genome_data.get_genome())
        pwm_dict = dict([("A", [0.0] * self.window_size), ("C", [0.0] * self.window_size),
                        ("G", [0.0] * self.window_size), ("T", [0.0] * self.window_size),
                        ("N", [0.0] * self.window_size)])


        mean_raw_signal = np.zeros(self.window_size)
        mean_bc_signal = np.zeros(self.window_size)
        mean_raw_signal_f = np.zeros(self.window_size)
        mean_bc_signal_f = np.zeros(self.window_size)
        mean_raw_signal_r = np.zeros(self.window_size)
        mean_bc_signal_r = np.zeros(self.window_size)

        mean_bias_signal_f = np.zeros(self.window_size)
        mean_bias_signal_r = np.zeros(self.window_size)
        num_sites = 0

        mpbs_regions = GenomicRegionSet("Motif Predicted Binding Sites")
        mpbs_regions.read_bed(self.motif_file)

        total_nc_signal = 0
        total_nl_signal = 0
        total_nr_signal = 0

        for region in mpbs_regions:
            if str(region.name).split(":")[-1] == "Y":
                num_sites += 1
                # Extend by 50 bp
                mid = (region.initial + region.final) / 2
                p1 = mid - (self.window_size / 2)
                p2 = mid + (self.window_size / 2)

                if not self.strands_specific:
                    # Fetch raw signal
                    raw_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                                  downstream_ext=self.atac_downstream_ext,
                                                  upstream_ext=self.atac_upstream_ext,
                                                  forward_shift=self.atac_forward_shift,
                                                  reverse_shift=self.atac_reverse_shift,
                                                  genome_file_name=genome_data.get_genome())

                    mean_raw_signal = np.add(mean_raw_signal, raw_signal)

                    # Fetch bias correction signal
                    bc_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                                     bias_table=table,
                                                     downstream_ext=self.atac_downstream_ext,
                                                     upstream_ext=self.atac_upstream_ext,
                                                     forward_shift=self.atac_forward_shift,
                                                     reverse_shift=self.atac_reverse_shift,
                                                     genome_file_name=genome_data.get_genome())

                    mean_bc_signal = np.add(mean_bc_signal, bc_signal)
                else:
                    raw_signal_f, _, raw_signal_r, _ =  signal.get_signal_per_strand(ref=region.chrom, start=p1, end=p2,
                                                                        downstream_ext=self.atac_downstream_ext,
                                                                        upstream_ext=self.atac_upstream_ext,
                                                                        forward_shift=self.atac_forward_shift,
                                                                        reverse_shift=self.atac_reverse_shift,
                                                                        genome_file_name=genome_data.get_genome())
                    mean_raw_signal_f = np.add(mean_raw_signal_f, raw_signal_f)
                    mean_raw_signal_r = np.add(mean_raw_signal_r, raw_signal_r)

                    bc_signal_f, _, bc_signal_r, _ = signal.get_signal_per_strand(ref=region.chrom, start=p1, end=p2,
                                                                                  bias_table=table,
                                                                                  downstream_ext=self.atac_downstream_ext,
                                                                                  upstream_ext=self.atac_upstream_ext,
                                                                                  forward_shift=self.atac_forward_shift,
                                                                                  reverse_shift=self.atac_reverse_shift,
                                                                                  genome_file_name=genome_data.get_genome())
                    mean_bc_signal_f = np.add(mean_bc_signal_f, bc_signal_f)
                    mean_bc_signal_r = np.add(mean_bc_signal_r, bc_signal_r)

                # Update pwm
                aux_plus = 1
                dna_seq = str(fasta.fetch(region.chrom, p1, p2)).upper()
                if (region.final - region.initial) % 2 == 0:
                    aux_plus = 0
                dna_seq_rev = AuxiliaryFunctions.revcomp(str(fasta.fetch(region.chrom,
                                                                         p1 + aux_plus, p2 + aux_plus)).upper())
                if region.orientation == "+":
                    for i in range(0, len(dna_seq)):
                        pwm_dict[dna_seq[i]][i] += 1
                elif region.orientation == "-":
                    for i in range(0, len(dna_seq_rev)):
                        pwm_dict[dna_seq_rev[i]][i] += 1

                # Create bias signal
                bias_table_f = table[0]
                bias_table_r = table[1]
                self.k_nb = len(bias_table_f.keys()[0])
                bias_signal_f = []
                bias_signal_r = []
                p1_wk = p1 - int(self.k_nb / 2)
                p2_wk = p2 + int(self.k_nb / 2)
                dna_seq = str(fasta.fetch(region.chrom, p1_wk, p2_wk - 1)).upper()
                dna_seq_rev = AuxiliaryFunctions.revcomp(str(fasta.fetch(region.chrom, p1_wk, p2_wk + 1)).upper())
                for i in range(int(self.k_nb / 2), len(dna_seq) - int(self.k_nb / 2) + 1):
                    fseq = dna_seq[i - int(self.k_nb / 2):i + int(self.k_nb / 2)]
                    rseq = dna_seq_rev[len(dna_seq) - int(self.k_nb / 2) - i:len(dna_seq) + int(self.k_nb / 2) - i]
                    try:
                        bias_signal_f.append(bias_table_f[fseq])
                    except Exception:
                        bias_signal_f.append(1)
                    try:
                        bias_signal_r.append(bias_table_r[rseq])
                    except Exception:
                        bias_signal_r.append(1)

                mean_bias_signal_f = np.add(mean_bias_signal_f, np.array(bias_signal_f))
                mean_bias_signal_r = np.add(mean_bias_signal_r, np.array(bias_signal_r))

                if self.protection_score:
                    # signal in the center of the MPBS
                    p1 = region.initial
                    p2 = region.final
                    nc_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                                     bias_table=table,
                                                     downstream_ext=self.atac_downstream_ext,
                                                     upstream_ext=self.atac_upstream_ext,
                                                     forward_shift=self.atac_forward_shift,
                                                     reverse_shift=self.atac_reverse_shift,
                                                     genome_file_name=genome_data.get_genome())
                    total_nc_signal += sum(nc_signal)
                    p1 = region.final
                    p2 = 2 * region.final - region.initial
                    nr_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                                     bias_table=table,
                                                     downstream_ext=self.atac_downstream_ext,
                                                     upstream_ext=self.atac_upstream_ext,
                                                     forward_shift=self.atac_forward_shift,
                                                     reverse_shift=self.atac_reverse_shift,
                                                     genome_file_name=genome_data.get_genome())
                    total_nr_signal += sum(nr_signal)
                    p1 = 2 * region.initial - region.final
                    p2 = region.final
                    nl_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                                     bias_table=table,
                                                     downstream_ext=self.atac_downstream_ext,
                                                     upstream_ext=self.atac_upstream_ext,
                                                     forward_shift=self.atac_forward_shift,
                                                     reverse_shift=self.atac_reverse_shift,
                                                     genome_file_name=genome_data.get_genome())
                    total_nl_signal += sum(nl_signal)


        mean_raw_signal = mean_raw_signal / num_sites
        mean_bc_signal = mean_bc_signal / num_sites

        mean_raw_signal_f = mean_raw_signal_f / num_sites
        mean_raw_signal_r = mean_raw_signal_r / num_sites
        mean_bc_signal_f = mean_bc_signal_f / num_sites
        mean_bc_signal_r = mean_bc_signal_r / num_sites

        mean_bias_signal_f = mean_bias_signal_f / num_sites
        mean_bias_signal_r = mean_bias_signal_r / num_sites

        protection_score = (total_nl_signal + total_nr_signal - 2 * total_nc_signal) / (2 * num_sites)

        # Output PWM and create logo
        pwm_fname = os.path.join(self.output_loc, "{}.pwm".format(self.motif_name))
        pwm_file = open(pwm_fname,"w")
        for e in ["A","C","G","T"]:
            pwm_file.write(" ".join([str(int(f)) for f in pwm_dict[e]])+"\n")
        pwm_file.close()

        logo_fname = os.path.join(self.output_loc, "{}.logo.eps".format(self.motif_name))
        pwm = motifs.read(open(pwm_fname), "pfm")
        pwm.weblogo(logo_fname, format="eps", stack_width="large", stacks_per_line="100",
                    color_scheme="color_classic", unit_name="", show_errorbars=False, logo_title="",
                    show_xaxis=False, xaxis_label="", show_yaxis=False, yaxis_label="",
                    show_fineprint=False, show_ends=False)

        # Output the raw, bias corrected signal and protection score
        output_fname = os.path.join(self.output_loc, "{}.txt".format(self.motif_name))
        output_file = open(output_fname, "w")
        if not self.strands_specific:
            output_file.write("raw signal: \n" + np.array_str(mean_raw_signal) + "\n")
            output_file.write("bias corrected signal: \n" + np.array_str(mean_bc_signal) + "\n")
        else:
            output_file.write("raw forward signal: \n" + np.array_str(mean_raw_signal_f) + "\n")
            output_file.write("bias corrected forward signal: \n" + np.array_str(mean_bc_signal_f) + "\n")
            output_file.write("raw reverse signal: \n" + np.array_str(mean_raw_signal_r) + "\n")
            output_file.write("bias reverse corrected signal: \n" + np.array_str(mean_bc_signal_r) + "\n")
        output_file.write("forward bias signal: \n" + np.array_str(mean_bias_signal_f) + "\n")
        output_file.write("reverse bias signal: \n" + np.array_str(mean_bias_signal_r) + "\n")
        if self.protection_score:
            output_file.write("protection score: \n" + str(protection_score) + "\n")
        output_file.close()

        if self.strands_specific:
            fig, (ax1, ax2, ax3) = plt.subplots(3, figsize=(12.0, 10.0))
        else:
            fig, (ax1, ax2) = plt.subplots(2)
        x = np.linspace(-50, 49, num=self.window_size)

        ax1.plot(x, mean_bias_signal_f, color='red', label='Forward')
        ax1.plot(x, mean_bias_signal_r, color='blue', label='Reverse')

        ax1.xaxis.set_ticks_position('bottom')
        ax1.yaxis.set_ticks_position('left')
        ax1.spines['top'].set_visible(False)
        ax1.spines['right'].set_visible(False)
        ax1.spines['left'].set_position(('outward', 15))
        ax1.spines['bottom'].set_position(('outward', 5))
        ax1.tick_params(direction='out')

        ax1.set_xticks([-50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 49])
        ax1.set_xticklabels(['-50', '-40', '-30', '-20', '-10', '0', '10', '20', '30', '40', '49'])
        min_bias_signal = min(min(mean_bias_signal_f), min(mean_bias_signal_r))
        max_bias_signal = max(max(mean_bias_signal_f), max(mean_bias_signal_r))
        ax1.set_yticks([min_bias_signal, max_bias_signal])
        ax1.set_yticklabels([str(round(min_bias_signal,2)), str(round(max_bias_signal,2))], rotation=90)

        ax1.text(-48, max_bias_signal, '# Sites = {}'.format(str(num_sites)), fontweight='bold')
        ax1.set_title(self.motif_name, fontweight='bold')
        ax1.set_xlim(-50, 49)
        ax1.set_ylim([min_bias_signal, max_bias_signal])
        ax1.legend(loc="upper right", frameon=False)
        ax1.set_ylabel("Average Bias \nSignal", rotation=90, fontweight='bold')

        if not self.strands_specific:
            mean_raw_signal = self.standardize(mean_raw_signal)
            mean_bc_signal = self.standardize(mean_bc_signal)
            ax2.plot(x, mean_raw_signal, color='red', label='Uncorrected')
            ax2.plot(x, mean_bc_signal, color='green', label='Corrected')
        else:
            mean_raw_signal_f = self.standardize(mean_raw_signal_f)
            mean_raw_signal_r = self.standardize(mean_raw_signal_r)
            mean_bc_signal_f = self.standardize(mean_bc_signal_f)
            mean_bc_signal_r = self.standardize(mean_bc_signal_r)
            ax2.plot(x, mean_raw_signal_f, color='red', label='Forward')
            ax2.plot(x, mean_raw_signal_r, color='green', label='Reverse')
            ax3.plot(x, mean_bc_signal_f, color='red', label='Forward')
            ax3.plot(x, mean_bc_signal_r, color='green', label='Reverse')

        ax2.xaxis.set_ticks_position('bottom')
        ax2.yaxis.set_ticks_position('left')
        ax2.spines['top'].set_visible(False)
        ax2.spines['right'].set_visible(False)
        ax2.spines['left'].set_position(('outward', 15))
        ax2.tick_params(direction='out')
        ax2.set_xticks([-50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 49])
        ax2.set_xticklabels(['-50', '-40', '-30', '-20', '-10', '0', '10', '20', '30', '40', '49'])
        ax2.set_yticks([0, 1])
        ax2.set_yticklabels([str(0), str(1)], rotation=90)
        ax2.set_xlim(-50, 49)
        ax2.set_ylim([0, 1])

        if not self.strands_specific:
            ax2.spines['bottom'].set_position(('outward', 40))
            ax2.set_xlabel("Coordinates from Motif Center", fontweight='bold')
            ax2.set_ylabel("Average ATAC-seq \nSignal", rotation=90, fontweight='bold')
            ax2.legend(loc="center", frameon=False, bbox_to_anchor=(0.85, 0.06))
        else:
            ax2.spines['bottom'].set_position(('outward', 5))
            ax2.set_ylabel("Average ATAC-seq \n Uncorrected Signal", rotation=90, fontweight='bold')
            ax2.legend(loc="lower right", frameon=False)

            ax3.xaxis.set_ticks_position('bottom')
            ax3.yaxis.set_ticks_position('left')
            ax3.spines['top'].set_visible(False)
            ax3.spines['right'].set_visible(False)
            ax3.spines['left'].set_position(('outward', 15))
            ax3.tick_params(direction='out')
            ax3.set_xticks([-50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 49])
            ax3.set_xticklabels(['-50', '-40', '-30', '-20', '-10', '0', '10', '20', '30', '40', '49'])
            ax3.set_yticks([0, 1])
            ax3.set_yticklabels([str(0), str(1)], rotation=90)
            ax3.set_xlim(-50, 49)
            ax3.set_ylim([0, 1])
            ax3.legend(loc="lower right", frameon=False)
            ax3.spines['bottom'].set_position(('outward', 40))
            ax3.set_xlabel("Coordinates from Motif Center", fontweight='bold')
            ax3.set_ylabel("Average ATAC-seq \n Corrected Signal", rotation=90, fontweight='bold')
            ax3.text(-48, 0.05, '# K-mer = {}\n# Forward Shift = {}'.format(str(self.k_nb), str(self.atac_forward_shift)),
                     fontweight='bold')

        figure_name = os.path.join(self.output_loc, "{}.line.eps".format(self.motif_name))
        fig.subplots_adjust(bottom=.2, hspace=.5)
        fig.tight_layout()
        fig.savefig(figure_name, format="eps", dpi=300)

        # Creating canvas and printing eps / pdf with merged results
        output_fname = os.path.join(self.output_loc, "{}.eps".format(self.motif_name))
        c = pyx.canvas.canvas()
        c.insert(pyx.epsfile.epsfile(0, 0, figure_name, scale=1.0))
        if self.strands_specific:
            c.insert(pyx.epsfile.epsfile(2.76, 1.58, logo_fname, width=27.2, height=2.45))
        else:
            c.insert(pyx.epsfile.epsfile(2.5, 1.54, logo_fname, width=16, height=1.75))
        c.writeEPSfile(output_fname)
        os.system("epstopdf " + figure_name)
        os.system("epstopdf " + logo_fname)
        os.system("epstopdf " + output_fname)
Example #42
    exprFile.readline()
    for line in exprFile:
        ll = line.strip().split(",")
        ensg = ll[0].replace("\"", "")
        log2FoldChange = ll[3]
        minusAux = str(round((float(ll[8]) + float(ll[9])) / 2, 2))
        plusAux = str(round((float(ll[10]) + float(ll[11])) / 2, 2))
        exprDictList[i][ensg] = [log2FoldChange, minusAux, plusAux]
    exprFile.close()

###################################################################################################
# Execution
###################################################################################################

# Open bam files
genomeFile = Fastafile(genomeFileName)
regionsFile = Samfile(regionsFileName, "rb")
#chrommHmmFile = Samfile(chrommHmmFileName, "rb")
enhancersFile = Samfile(enhancersFileName, "rb")
signalFileList = [Samfile(e, "rb") for e in signalFileNameList]
controlFileList = [Samfile(e, "rb") for e in controlFileNameList]
motifFileList = [Samfile(e, "rb") for e in motifFileNameList]

# Creating RPM list
rpmList = [1000000. / e for e in signalCountList]
rpmControlList = [1000000. / e for e in controlCountList]

# Fetching index of CTCF motifs
ctcfIndexList = []
for i in range(0, len(motifLabelList)):
    if ("CTCF_" in motifLabelList[i]): ctcfIndexList.append(i)
Example #43
def main(args):
    """
    Performs motif matching.
    """

    ###################################################################################################
    # Processing Input Arguments
    ###################################################################################################

    # Initializing Error Handler
    err = ErrorHandler()

    # Additional Parameters
    matching_folder_name = "match"
    random_region_name = "random_regions"

    filter_values = parse_filter(args.filter)

    ###################################################################################################
    # Initializations
    ###################################################################################################

    # Output folder
    if args.output_location:
        output_location = args.output_location
    else:
        output_location = npath(matching_folder_name)
    print(">> output location:", output_location)

    # Default genomic data
    genome_data = GenomeData(args.organism)

    print(">> genome:", genome_data.organism)
    print(">> pseudocounts:", args.pseudocounts)
    print(">> fpr threshold:", args.fpr)

    ###################################################################################################
    # Reading Input Regions
    ###################################################################################################

    genomic_regions_dict = {}

    # get experimental matrix, if available
    if args.input_matrix:
        try:
            exp_matrix = ExperimentalMatrix()
            exp_matrix.read(args.input_matrix)

            # if the matrix is present, the (empty) dictionary is overwritten
            genomic_regions_dict = exp_matrix.objectsDict

            print(">>> experimental matrix loaded")

        except Exception:
            err.throw_error("MM_WRONG_EXPMAT")
    elif args.input_files:
        # get input files, if available
        for input_filename in args.input_files:
            name, _ = os.path.splitext(os.path.basename(input_filename))

            regions = GenomicRegionSet(name)
            regions.read(npath(input_filename))

            genomic_regions_dict[name] = regions

            print(">>> input file", name, "loaded:", len(regions), "regions")

    # we put this here because we don't want to create the output directory unless we
    # are sure the initialisation (including loading input files) worked
    try:
        if not os.path.isdir(output_location):
            os.makedirs(output_location)
    except Exception:
        err.throw_error("MM_OUT_FOLDER_CREATION")

    annotation = None
    target_genes = None
    # get promoter regions from list of genes (both target and background)
    # TODO: should be more clever, allow precomputed regions etc
    if args.target_genes_filename:
        annotation = AnnotationSet(args.organism, alias_source=args.organism,
                                   protein_coding=True, known_only=True)

        target_genes = GeneSet("target_genes")
        target_genes.read(args.target_genes_filename)

        # TODO: what do we do with unmapped genes? maybe just print them out
        target_regions = annotation.get_promoters(gene_set=target_genes, promoter_length=args.promoter_length)
        target_regions.name = "target_regions"
        target_regions.sort()
        output_file_name = npath(os.path.join(output_location, target_regions.name + ".bed"))
        target_regions.write(output_file_name)

        genomic_regions_dict[target_regions.name] = target_regions

        print(">>> target promoter file created:", len(target_regions), "regions")

    # we make a background in case it's requested, but also in case a list of target genes has not been
    # provided
    if args.promoter_make_background or (args.promoters_only and not args.target_genes_filename):
        if not annotation:
            annotation = AnnotationSet(args.organism, alias_source=args.organism,
                                       protein_coding=True, known_only=True)

        # background is made of all known genes minus the target genes (if any)
        background_genes = GeneSet("background_genes")
        background_genes.get_all_genes(organism=args.organism)

        if target_genes:
            background_genes.subtract(target_genes)

        background_regions = annotation.get_promoters(gene_set=background_genes,
                                                      promoter_length=args.promoter_length)
        background_regions.name = "background_regions"
        background_regions.sort()
        output_file_name = npath(os.path.join(output_location, background_regions.name + ".bed"))
        background_regions.write(output_file_name)

        genomic_regions_dict[background_regions.name] = background_regions

        print(">>> background promoter file created:", len(background_regions), "regions")

    if not genomic_regions_dict:
        err.throw_error("DEFAULT_ERROR", add_msg="You must either specify an experimental matrix, or at least a "
                                                 "valid input file, or one of the 'promoter test' options.")

    max_region_len = 0
    max_region = None
    regions_to_match = []

    # Iterating on experimental matrix objects
    for k in genomic_regions_dict.keys():

        curr_genomic_region = genomic_regions_dict[k]

        # If the object is a GenomicRegionSet
        if isinstance(curr_genomic_region, GenomicRegionSet):

            if args.rmdup:
                # remove duplicates and sort regions
                curr_genomic_region.remove_duplicates(sort=True)
            else:
                # sort regions
                curr_genomic_region.sort()

            # Append label and GenomicRegionSet
            regions_to_match.append(curr_genomic_region)

            # Verifying max_region_len for random region generation
            curr_len = len(curr_genomic_region)
            if curr_len > max_region_len:
                max_region_len = curr_len
                max_region = curr_genomic_region

    print(">> all files loaded")

    ###################################################################################################
    # Creating random regions
    ###################################################################################################

    # if a random proportion is set, create random regions
    if args.rand_proportion:

        # Create random coordinates and name it random_regions
        rand_region = max_region.random_regions(args.organism, multiply_factor=args.rand_proportion, chrom_X=True)
        rand_region.sort()
        rand_region.name = random_region_name

        # Add random regions to the list of regions to perform matching on
        regions_to_match.append(rand_region)

        # Writing random regions
        output_file_name = npath(os.path.join(output_location, random_region_name))
        rand_bed_file_name = output_file_name + ".bed"
        rand_region.write(rand_bed_file_name)

        # Verifying condition to write bb
        if args.bigbed:

            # Fetching file with chromosome sizes
            chrom_sizes_file = genome_data.get_chromosome_sizes()

            try:
                # Converting to big bed
                bed_to_bb(rand_bed_file_name, chrom_sizes_file)

                # removing previously-created BED file
                os.remove(rand_bed_file_name)
            except Exception:
                err.throw_warning("DEFAULT_WARNING")  # FIXME: maybe error instead?

        print(">> random regions file created:", len(rand_region), "regions")

    ###################################################################################################
    # Creating PWMs
    ###################################################################################################

    if args.motif_dbs:
        ms = MotifSet(preload_motifs=args.motif_dbs, motif_dbs=True)
        # if --motif_dbs is set, it already defines the databases, so drop any 'database' filter
        if 'database' in filter_values:
            del filter_values['database']
    else:
        if 'database' in filter_values:
            ms = MotifSet(preload_motifs=filter_values['database'])
        else:
            ms = MotifSet(preload_motifs="default")

    print(">> used database(s):", ",".join([str(db) for db in ms.motif_data.repositories_list]))

    # applying filtering pattern, taking a subset of the motif set
    if args.filter:
        ms = ms.filter(filter_values, search=args.filter_type)

    motif_list = ms.get_motif_list(args.pseudocounts, args.fpr)

    print(">> motifs loaded:", len(motif_list))

    # Performing normalized threshold strategy if requested
    if args.norm_threshold:
        threshold_list = [motif.threshold / motif.len for motif in motif_list]
        unique_threshold = sum(threshold_list) / len(threshold_list)
    else:
        unique_threshold = None

    scanner = scan.Scanner(7)
    pssm_list = []
    thresholds = []
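    # each motif contributes two consecutive entries to both lists below:
    # one for its PSSM and one for the reverse-complement PSSM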
    for motif in motif_list:
        if unique_threshold:
            thresholds.append(0.0)
            thresholds.append(0.0)
        else:
            thresholds.append(motif.threshold)
            thresholds.append(motif.threshold)

        pssm_list.append(motif.pssm)
        pssm_list.append(motif.pssm_rc)

    # Performing motif matching
    # TODO: we can expand this to use bg from sequence, for example,
    # or from organism.
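    # flat_bg(4) builds a uniform background distribution over the four
    # nucleotides ([0.25] * 4) for the MOODS scanner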
    bg = tools.flat_bg(4)
    scanner.set_motifs(pssm_list, bg, thresholds)

    ###################################################################################################
    # Motif Matching
    ###################################################################################################

    # Creating genome file
    genome_file = Fastafile(genome_data.get_genome())

    print()

    # Iterating on list of genomic region sets
    for grs in regions_to_match:

        start = time.time()
        print(">> matching [", grs.name, "], ", len(grs), " regions... ", sep="", end='')
        sys.stdout.flush()

        # Initializing output bed file
        output_bed_file = os.path.join(output_location, grs.name + "_mpbs.bed")

        # must remove it because we append the MPBS
        if os.path.isfile(output_bed_file):
            os.remove(output_bed_file)

        # Iterating on genomic region set
        for genomic_region in grs:

            # Reading sequence associated to genomic_region
            sequence = str(genome_file.fetch(genomic_region.chrom, genomic_region.initial, genomic_region.final))

            grs_tmp = match_multiple(scanner, motif_list, sequence, genomic_region)

            # post-processing: if required, remove duplicate regions on opposing strands (keep highest score)
            if len(grs_tmp) > 1 and args.remove_strand_duplicates:
                grs_tmp.sort()
                seqs = grs_tmp.sequences
                seqs_new = []
                cur_pos = 0
                end_pos = len(seqs) - 1
                while cur_pos < end_pos:
                    gr = seqs[cur_pos]

                    new_pos = cur_pos + 1
                    while new_pos < end_pos:
                        gr2 = seqs[new_pos]

                        # stop when this pair is unrelated, or on the same strand
                        # (only opposite-strand duplicates are merged)
                        if (gr.name != gr2.name or gr.chrom != gr2.chrom
                                or gr.initial != gr2.initial or gr.final != gr2.final
                                or gr.orientation == gr2.orientation):
                            break

                        if float(gr.data) < float(gr2.data):
                            gr = gr2

                        new_pos = new_pos + 1

                    # adding the currently-selected genomic region
                    seqs_new.append(gr)

                    # on the next iteration, resume from the first sequence to the right of this duplicate group
                    cur_pos = new_pos

                # edge case: the last element was not considered
                # (when it is, cur_pos == end_pos+1)
                if cur_pos == end_pos:
                    seqs_new.append(seqs[cur_pos])

                grs_tmp.sequences = seqs_new

            grs_tmp.write(output_bed_file, mode="a")

        del grs.sequences[:]

        # Verifying condition to write bb
        if args.bigbed and args.normalize_bitscore:
            # Fetching file with chromosome sizes
            chrom_sizes_file = genome_data.get_chromosome_sizes()

            # Converting to big bed
            bed_to_bb(output_bed_file, chrom_sizes_file)

            # removing BED file
            os.remove(output_bed_file)

        secs = time.time() - start
        print("[", "%02.3f" % secs, " seconds]", sep="")
Example #44
def diff_analysis_run(args):
    # Initializing Error Handler
    err = ErrorHandler()

    output_location = os.path.join(args.output_location, "Lineplots")
    try:
        if not os.path.isdir(output_location):
            os.makedirs(output_location)
    except Exception:
        err.throw_error("MM_OUT_FOLDER_CREATION")

    # check if they have same length
    mpbs_files = args.mpbs_files.strip().split(",")
    reads_files = args.reads_files.strip().split(",")
    conditions = args.conditions.strip().split(",")

    if args.colors is not None:
        colors = args.colors.strip().split(",")
    else:
        colors = [
            "#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33",
            "#a65628", "#f781bf", "#66c2a5", "#fc8d62", "#8da0cb", "#e78ac3",
            "#a6d854", "#ffd92f", "#e5c494", "#b3b3b3", "#8dd3c7", "#ffffb3",
            "#bebada", "#fb8072", "#80b1d3", "#fdb462", "#b3de69", "#fccde5",
            "#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#66a61e", "#e6ab02",
            "#a6761d", "#666666", "#7fc97f", "#beaed4", "#fdc086", "#ffff99",
            "#386cb0", "#f0027f", "#bf5b17", "#666666"
        ]

    assert len(mpbs_files) == len(reads_files) == len(conditions), \
        "Numbers of motif, read and condition entries do not match: {}, {}, {}".format(
            len(mpbs_files), len(reads_files), len(conditions))

    # Check if the index file exists
    for reads_file in reads_files:
        base_name = "{}.bai".format(reads_file)
        if not os.path.exists(base_name):
            pysam.index(reads_file)

    mpbs = GenomicRegionSet("Motif Predicted Binding Sites of All Conditions")
    for i, mpbs_file in enumerate(mpbs_files):
        mpbs.read(mpbs_file)

    mpbs.sort()
    mpbs.remove_duplicates()
    mpbs_name_list = list(set(mpbs.get_names()))
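    # set() deduplicates the names but makes their order arbitrary; this is
    # fine here because all downstream arrays are indexed by position in
    # this same list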

    signals = np.zeros(shape=(len(conditions), len(mpbs_name_list),
                              args.window_size),
                       dtype=np.float32)
    motif_len = list()
    motif_num = list()
    motif_pwm = list()

    print((" {} cpus are detected and {} of them will be used...\n".format(
        cpu_count(), args.nc)))

    genome_data = GenomeData(args.organism)
    fasta = Fastafile(genome_data.get_genome())

    print("generating signal for each motif and condition...\n")
    # differential analysis using bias corrected signal
    if args.bc:
        hmm_data = HmmData()
        table_forward = hmm_data.get_default_bias_table_F_ATAC()
        table_reverse = hmm_data.get_default_bias_table_R_ATAC()
        bias_table = BiasTable().load_table(table_file_name_F=table_forward,
                                            table_file_name_R=table_reverse)

        # do not use multi-processing
        if args.nc == 1:
            for i, condition in enumerate(conditions):
                for j, mpbs_name in enumerate(mpbs_name_list):
                    mpbs_regions = mpbs.by_names([mpbs_name])
                    arguments = (mpbs_regions, reads_files[i], args.organism,
                                 args.window_size, args.forward_shift,
                                 args.reverse_shift, bias_table)
                    try:
                        signals[i, j, :] = get_bc_signal(arguments)
                    except Exception:
                        logging.exception("get bias corrected signal failed")

                    # get motif length, number and pwm matrix
                    motif_len.append(mpbs_regions[0].final -
                                     mpbs_regions[0].initial)
                    motif_num.append(len(mpbs_regions))
                    motif_pwm.append(
                        get_pwm(fasta, mpbs_regions, args.window_size))

        # use multi-processing
        else:
            for i, condition in enumerate(conditions):
                print((
                    "generating signal for condition {} \n".format(condition)))
                with Pool(processes=args.nc) as pool:
                    arguments_list = list()
                    for mpbs_name in mpbs_name_list:
                        mpbs_regions = mpbs.by_names([mpbs_name])
                        arguments = (mpbs_regions, reads_files[i],
                                     args.organism, args.window_size,
                                     args.forward_shift, args.reverse_shift,
                                     bias_table)
                        arguments_list.append(arguments)

                        # get motif length, number and pwm matrix
                        motif_len.append(mpbs_regions[0].final -
                                         mpbs_regions[0].initial)
                        motif_num.append(len(mpbs_regions))
                        motif_pwm.append(
                            get_pwm(fasta, mpbs_regions, args.window_size))

                    res = pool.map(get_bc_signal, arguments_list)
                    signals[i] = np.array(res)

    # differential analysis using raw signal
    else:
        # do not use multi-processing
        if args.nc == 1:
            for i, condition in enumerate(conditions):
                for j, mpbs_name in enumerate(mpbs_name_list):
                    mpbs_regions = mpbs.by_names([mpbs_name])
                    arguments = (mpbs_regions, reads_files[i], args.organism,
                                 args.window_size, args.forward_shift,
                                 args.reverse_shift)
                    signals[i, j, :] = get_raw_signal(arguments)

                    # get motif length, number and pwm matrix
                    motif_len.append(mpbs_regions[0].final -
                                     mpbs_regions[0].initial)
                    motif_num.append(len(mpbs_regions))
                    motif_pwm.append(
                        get_pwm(fasta, mpbs_regions, args.window_size))

        # use multi-processing
        else:
            for i, condition in enumerate(conditions):
                print((
                    "generating signal for condition {} \n".format(condition)))
                with Pool(processes=args.nc) as pool:
                    arguments_list = list()
                    for mpbs_name in mpbs_name_list:
                        mpbs_regions = mpbs.by_names([mpbs_name])
                        arguments = (mpbs_regions, reads_files[i],
                                     args.organism, args.window_size,
                                     args.forward_shift, args.reverse_shift)
                        arguments_list.append(arguments)

                        # get motif length, number and pwm matrix
                        motif_len.append(mpbs_regions[0].final -
                                         mpbs_regions[0].initial)
                        motif_num.append(len(mpbs_regions))
                        motif_pwm.append(
                            get_pwm(fasta, mpbs_regions, args.window_size))

                    res = pool.map(get_raw_signal, arguments_list)
                    signals[i] = np.array(res)

    print("signal generation is done!\n")

    # compute a normalization factor for each condition
    factors = compute_factors(signals)
    output_factor(args, factors, conditions)

    # normalize signals by factor and number of motifs
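    # (each profile becomes an average per-site signal on a scale that is
    # comparable across conditions)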
    for i in range(len(conditions)):
        for j in range(len(mpbs_name_list)):
            signals[i, j, :] = signals[i, j, :] / (factors[i] * motif_num[j])

    if args.output_profiles:
        output_profiles(mpbs_name_list, signals, conditions,
                        args.output_location)

    print("generating line plot for each motif...\n")
    if args.nc == 1:
        for i, mpbs_name in enumerate(mpbs_name_list):
            output_line_plot(
                (mpbs_name, motif_num[i], signals[:, i, :], conditions,
                 motif_pwm[i], output_location, args.window_size, colors))
    else:
        with Pool(processes=args.nc) as pool:
            arguments_list = list()
            for i, mpbs_name in enumerate(mpbs_name_list):
                arguments_list.append(
                    (mpbs_name, motif_num[i], signals[:, i, :], conditions,
                     motif_pwm[i], output_location, args.window_size, colors))
            pool.map(output_line_plot, arguments_list)

    ps_tc_results = list()
    for i, mpbs_name in enumerate(mpbs_name_list):
        ps_tc_results.append(
            get_ps_tc_results(signals[:, i, :], motif_len[i],
                              args.window_size))

    # find the significant motifs and generate a scatter plot if two conditions are given
    if len(conditions) == 2:
        ps_tc_results = scatter_plot(args, ps_tc_results, mpbs_name_list,
                                     conditions)

    output_stat_results(ps_tc_results, conditions, mpbs_name_list, motif_num,
                        args)
Example #45
def bias_correction(chrom, start, end, bam, bias_table, genome_file_name, forward_shift, reverse_shift):
    # Parameters
    window = 50
    defaultKmerValue = 1.0

    # Initialization
    fastaFile = Fastafile(genome_file_name)
    fBiasDict = bias_table[0]
    rBiasDict = bias_table[1]
    k_nb = len(next(iter(fBiasDict)))  # k-mer length, taken from any key of the bias table
    p1 = start
    p2 = end
    p1_w = p1 - (window // 2)
    p2_w = p2 + (window // 2)
    p1_wk = p1_w - int(floor(k_nb / 2.))
    p2_wk = p2_w + int(ceil(k_nb / 2.))
    if p1 <= 0 or p1_w <= 0 or p2_wk <= 0:
        # Return raw counts
        bc_signal = [0.0] * (p2 - p1)
        for read in bam.fetch(chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue

            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1 <= cut_site < p2:
                    bc_signal[cut_site - p1] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    bc_signal[cut_site - p1] += 1.0

        return bc_signal

    # Raw counts
    nf = [0.0] * (p2_w - p1_w)
    nr = [0.0] * (p2_w - p1_w)
    for read in bam.fetch(chrom, p1_w, p2_w):
        # check if the read is unmapped, according to issue #112
        if read.is_unmapped:
            continue

        if not read.is_reverse:
            cut_site = read.pos + forward_shift
            if p1_w <= cut_site < p2_w:
                nf[cut_site - p1_w] += 1.0
        else:
            cut_site = read.aend + reverse_shift - 1
            if p1_w <= cut_site < p2_w:
                nr[cut_site - p1_w] += 1.0

    # Smoothed counts
    Nf = []
    Nr = []
    f_sum = sum(nf[:window])
    r_sum = sum(nr[:window])
    f_last = nf[0]
    r_last = nr[0]
    # running sum of `window` consecutive raw counts (integer division keeps
    # the offsets valid as list indices)
    for i in range((window // 2), len(nf) - (window // 2)):
        Nf.append(f_sum)
        Nr.append(r_sum)
        f_sum -= f_last
        f_sum += nf[i + (window // 2)]
        f_last = nf[i - (window // 2) + 1]
        r_sum -= r_last
        r_sum += nr[i + (window // 2)]
        r_last = nr[i - (window // 2) + 1]

    # Fetching sequence
    currStr = str(fastaFile.fetch(chrom, p1_wk, p2_wk - 1)).upper()
    currRevComp = AuxiliaryFunctions.revcomp(str(fastaFile.fetch(chrom, p1_wk + 1, p2_wk)).upper())

    # Iterating on sequence to create signal
    af = []
    ar = []
    for i in range(int(ceil(k_nb / 2.)), len(currStr) - int(floor(k_nb / 2)) + 1):
        fseq = currStr[i - int(floor(k_nb / 2.)):i + int(ceil(k_nb / 2.))]
        rseq = currRevComp[len(currStr) - int(ceil(k_nb / 2.)) - i:len(currStr) + int(floor(k_nb / 2.)) - i]
        try:
            af.append(fBiasDict[fseq])
        except Exception:
            af.append(defaultKmerValue)
        try:
            ar.append(rBiasDict[rseq])
        except Exception:
            ar.append(defaultKmerValue)

    # Calculating the bias-corrected signal
    f_sum = sum(af[:window])
    r_sum = sum(ar[:window])
    f_last = af[0]
    r_last = ar[0]
    bc_signal = []
    for i in range((window // 2), len(af) - (window // 2)):
        # expected cleavage: smoothed read count scaled by the k-mer's
        # relative bias within the current window
        nhatf = Nf[i - (window // 2)] * (af[i] / f_sum)
        nhatr = Nr[i - (window // 2)] * (ar[i] / r_sum)
        bc_signal.append(nhatf + nhatr)
        f_sum -= f_last
        f_sum += af[i + (window // 2)]
        f_last = af[i - (window // 2) + 1]
        r_sum -= r_last
        r_sum += ar[i + (window // 2)]
        r_last = ar[i - (window // 2) + 1]

    # Termination
    fastaFile.close()
    return bc_signal
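The Nf/Nr smoothing and the f_sum/r_sum updates above are the same running-sum pattern; a minimal standalone sketch of that update rule (illustrative only, not part of the original code):

def sliding_window_sum(signal, window):
    # sum of `window` consecutive values around each position, matching the
    # update rule used for Nf/Nr and for the bias window above
    half = window // 2
    total = sum(signal[:window])
    last = signal[0]
    out = []
    for i in range(half, len(signal) - half):
        out.append(total)
        total -= last               # drop the element leaving the window
        total += signal[i + half]   # add the element entering on the right
        last = signal[i - half + 1]
    return out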
Example #46
def create_signal(args, regions):
    def revcomp(s):
        rev_dict = dict([("A", "T"), ("T", "A"), ("C", "G"), ("G", "C"), ("N", "N")])
        return "".join([rev_dict[e] for e in s[::-1]])

    alphabet = ["A", "C", "G", "T"]
    kmer_comb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    f_obs_dict = dict([(e, 0.0) for e in kmer_comb])
    r_obs_dict = dict([(e, 0.0) for e in kmer_comb])
    f_exp_dict = dict([(e, 0.0) for e in kmer_comb])
    r_exp_dict = dict([(e, 0.0) for e in kmer_comb])
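    # "obs" dictionaries count k-mers centred on read cut sites; "exp"
    # dictionaries count all k-mers in the regions (background frequencies)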

    bam_file = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fasta_file = Fastafile(genome_data.get_genome())

    for region in regions:
        # Fetching observed reads
        reads = bam_file.fetch(reference=region.chrom, start=region.initial, end=region.final)
        for read in reads:
            if not read.is_reverse:
                p1 = read.pos - int(floor(args.k_nb / 2)) + args.forward_shift - 1
            else:
                p1 = read.aend - int(floor(args.k_nb / 2)) + args.reverse_shift + 1
            p2 = p1 + args.k_nb
            try:
                dna_sequence_obs = str(fasta_file.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if 'N' not in dna_sequence_obs:
                if read.is_reverse:
                    dna_sequence_obs = revcomp(dna_sequence_obs)
                    r_obs_dict[dna_sequence_obs] += 1
                else:
                    f_obs_dict[dna_sequence_obs] += 1

        # Fetching whole sequence
        try:
            dna_sequence_exp = str(fasta_file.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        dna_sequence_exp_rev = revcomp(dna_sequence_exp)
        for i in range(0, len(dna_sequence_exp) - args.k_nb):
            s = dna_sequence_exp[i:i + args.k_nb]
            if "N" not in s:
                f_exp_dict[s] += 1
            s = dna_sequence_exp_rev[i:i + args.k_nb]
            if "N" not in s:
                r_exp_dict[s] += 1

    output_fname_f_obs = os.path.join(args.output_location, "{}_f_obs.fa".format(str(args.k_nb)))
    output_fname_f_exp = os.path.join(args.output_location, "{}_f_exp.fa".format(str(args.k_nb)))
    output_fname_r_obs = os.path.join(args.output_location, "{}_r_obs.fa".format(str(args.k_nb)))
    output_fname_r_exp = os.path.join(args.output_location, "{}_r_exp.fa".format(str(args.k_nb)))

    output_file_f_obs = open(output_fname_f_obs, "w")
    output_file_f_exp = open(output_fname_f_exp, "w")
    output_file_r_obs = open(output_fname_r_obs, "w")
    output_file_r_exp = open(output_fname_r_exp, "w")

    for kmer in kmer_comb:
        if f_obs_dict[kmer] > 0:
            output_file_f_obs.write(kmer + "\t" + str(f_obs_dict[kmer]) + "\n")
    for kmer in kmer_comb:
        if f_exp_dict[kmer] > 0:
            output_file_f_exp.write(kmer + "\t" + str(f_exp_dict[kmer]) + "\n")
    for kmer in kmer_comb:
        if r_obs_dict[kmer] > 0:
            output_file_r_obs.write(kmer + "\t" + str(r_obs_dict[kmer]) + "\n")
    for kmer in kmer_comb:
        if r_exp_dict[kmer] > 0:
            output_file_r_exp.write(kmer + "\t" + str(r_exp_dict[kmer]) + "\n")

    output_file_f_obs.close()
    output_file_f_exp.close()
    output_file_r_obs.close()
    output_file_r_exp.close()
Example #47
                if k.startswith('PRIMER_RIGHT'):
                    rightprimers[int(k.split('_')[2])] = v

        return leftprimers, rightprimers
    except Exception:
        sys.stderr.write("warning: primer design failed for " + name + "\n")
        return {}, {}

def rc(dna):
    ''' reverse complement '''
    complements = str.maketrans('acgtrymkbdhvACGTRYMKBDHV', 'tgcayrkmvhdbTGCAYRKMVHDB')
    return dna.translate(complements)[::-1]
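# e.g. rc("AACGT") == "ACGTT"; lowercase letters and IUPAC ambiguity codes
# are complemented as well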


if len(sys.argv) == 3:
    ref = Fastafile(sys.argv[1])
    with open(sys.argv[2], 'r') as l1seq:
        for line in l1seq:
            if not line.startswith('Chr'): # header
                c = line.strip().split('\t')
                chrom  = c[0]
                strand = c[6]
                pos = min(int(c[1]), int(c[2]))

                if strand == '+':
                    pos = max(int(c[1]), int(c[2]))

                iname   = 'c' + chrom + 'p' + str(pos) + 'IN'
                oname   = 'c' + chrom + 'p' + str(pos) + 'OUT'

                outerstart = pos - 600