Python fetchSeq 예제들, utils.fetchSeq Python 예제들

예제 #1

0

파일 보기

파일: CRC2.py 프로젝트: melnuesch/CLL_TFnetworks_2018

def generateSubpeakFASTA(TFtoEnhancerDict, subpeaks, genomeDirectory,
                         projectName, projectFolder, constExtension):
    '''
    From a BED file of constituents, generate a BED and a FASTA file for the
    constituents contained within the candidate supers.

    For every gene, the subpeaks overlapping each of its regions are extended
    by constExtension on both sides, stitched together, and recorded.

    Args:
        TFtoEnhancerDict: mapping of gene name -> iterable of region loci.
        subpeaks: path to a tab-delimited table; columns 0-2 are read as
            chromosome, start and end.
        genomeDirectory: genome location handed to utils.fetchSeq.
        projectName: used in the BED track line and in both output names.
        projectFolder: output prefix; file names are appended by plain
            string concatenation.
        constExtension: extension passed (twice) to utils.makeSearchLocus.

    Returns:
        None. Writes '<projectFolder><projectName>_subpeaks.bed' and
        '<projectFolder><projectName>_SUBPEAKS.fa'.
    '''

    subpeakDict = {}
    subpeakBED = [['track name=' + projectName + ' color=204,0,204']]
    subpeakTable = utils.parseTable(subpeaks, '\t')

    subpeakLoci = [
        utils.Locus(row[0], int(row[1]), int(row[2]), '.')
        for row in subpeakTable
    ]
    subpeakCollection = utils.LocusCollection(subpeakLoci, 50)

    for gene in TFtoEnhancerDict:
        subpeakDict[gene] = []
        for region in TFtoEnhancerDict[gene]:
            overlaps = subpeakCollection.getOverlap(region)
            extendedOverlaps = [
                utils.makeSearchLocus(x, constExtension, constExtension)
                for x in overlaps
            ]

            # stitch the extended constituents so each stretch is emitted once
            overlapCollectionTemp = utils.LocusCollection(extendedOverlaps, 50)
            overlapCollection = overlapCollectionTemp.stitchCollection()
            for overlap in overlapCollection.getLoci():
                subpeakBED.append(
                    [overlap.chr(),
                     overlap.start(),
                     overlap.end()])
                subpeakDict[gene].append(overlap)

    bedfilename = projectFolder + projectName + '_subpeaks.bed'
    utils.unParseTable(subpeakBED, bedfilename, '\t')

    fasta = []

    for gene in subpeakDict:
        for subpeak in subpeakDict[gene]:

            fastaTitle = gene + '|' + subpeak.chr() + '|' + str(
                subpeak.start()) + '|' + str(subpeak.end())
            # NOTE(review): both coordinates are shifted by +1 before the
            # fetch, presumably to match fetchSeq's convention -- confirm.
            fastaLine = utils.fetchSeq(genomeDirectory, subpeak.chr(),
                                       int(subpeak.start() + 1),
                                       int(subpeak.end() + 1))

            fasta.append('>' + fastaTitle)
            # str.upper() works on Python 2 and 3; the bare upper() relied on
            # `from string import upper`, which only exists in Python 2
            fasta.append(fastaLine.upper())

    outname = projectFolder + projectName + '_SUBPEAKS.fa'

    utils.unParseTable(fasta, outname, '')

예제 #2

0

파일 보기

파일: CRCmapper.py 프로젝트: younglab/CRCmapper

def generateSubpeakFASTA(TFandSuperDict, subpeaks, genomeDirectory, projectName, projectFolder, motifExtension):
    '''
    takes as input a BED file of constituents
    outputs a FASTA file of merged extended super-enhancer constituents and
    associated formatted name

    Args:
        TFandSuperDict: mapping of gene name -> iterable of region loci.
        subpeaks: path to a tab-delimited table; columns 0-2 are read as
            chromosome, start and end.
        genomeDirectory: genome location handed to utils.fetchSeq.
        projectName: used in the BED track line and in both output names.
        projectFolder: output prefix; file names are appended by plain
            string concatenation.
        motifExtension: extension passed (twice) to utils.makeSearchLocus.

    Returns:
        None. Writes '<projectFolder><projectName>_subpeaks.bed' and
        '<projectFolder><projectName>_SUBPEAKS.fa'.
    '''

    # parenthesised form prints the same text on Python 2 and parses on Python 3
    print('MAKE FASTA')

    subpeakDict = {}
    subpeakBED = [['track name=' + projectName + ' color=204,0,204']]
    subpeakTable = utils.parseTable(subpeaks, '\t')

    subpeakLoci = [utils.Locus(row[0], int(row[1]), int(row[2]), '.') for row in subpeakTable]
    subpeakCollection = utils.LocusCollection(subpeakLoci, 50)

    for gene in TFandSuperDict:
        subpeakDict[gene] = []
        for region in TFandSuperDict[gene]:
            overlaps = subpeakCollection.getOverlap(region)
            extendedOverlaps = [utils.makeSearchLocus(x, motifExtension, motifExtension) for x in overlaps]

            # stitch the extended constituents so each stretch is emitted once
            overlapCollectionTemp = utils.LocusCollection(extendedOverlaps, 50)
            overlapCollection = overlapCollectionTemp.stitchCollection()
            for overlap in overlapCollection.getLoci():
                subpeakBED.append([overlap.chr(), overlap.start(), overlap.end()])
                subpeakDict[gene].append(overlap)

    bedfilename = projectFolder + projectName + '_subpeaks.bed'
    utils.unParseTable(subpeakBED, bedfilename, '\t')

    fasta = []

    for gene in subpeakDict:
        for subpeak in subpeakDict[gene]:

            fastaTitle = gene + '|' + subpeak.chr() + '|' + str(subpeak.start()) + '|' + str(subpeak.end())
            # NOTE(review): both coordinates are shifted by +1 before the
            # fetch, presumably to match fetchSeq's convention -- confirm.
            fastaLine = utils.fetchSeq(genomeDirectory, subpeak.chr(), int(subpeak.start() + 1), int(subpeak.end() + 1))

            fasta.append('>' + fastaTitle)
            # str.upper() replaces the Python 2-only `string.upper` import
            fasta.append(fastaLine.upper())

    # Output the fasta file of extended SE constituents
    outname = projectFolder + projectName + '_SUBPEAKS.fa'

    utils.unParseTable(fasta, outname, '')

예제 #3

0

파일 보기

파일: CRC3.py 프로젝트: linlabcode/pipeline

def generateSubpeakFASTA(gene_to_enhancer_dict, subpeaks, genome, projectName, projectFolder, constExtension):
    '''
    from a BED file of constituents
    generate a FASTA for the constituents contained within the candidate supers

    Args:
        gene_to_enhancer_dict: mapping of gene name -> iterable of region loci.
        subpeaks: path to a tab-delimited table; columns 0-2 are read as
            chromosome, start and end.
        genome: object whose directory() method gives the genome location
            handed to utils.fetchSeq.
        projectName: used in the BED track line.
        projectFolder: not used by this function; kept for interface
            compatibility.
        constExtension: extension passed (twice) to utils.makeSearchLocus.

    Returns:
        (subpeakBED, fasta): the BED rows (track line first) and the FASTA
        lines (alternating '>' title and upper-cased sequence). Nothing is
        written to disk here.
    '''
    genomeDirectory = genome.directory()
    subpeakDict = {}
    subpeakBED = [['track name=' + projectName + ' color=204,0,204']]
    subpeakTable = utils.parseTable(subpeaks, '\t')

    subpeakLoci = [utils.Locus(row[0], int(row[1]), int(row[2]), '.') for row in subpeakTable]
    subpeakCollection = utils.LocusCollection(subpeakLoci, 50)

    for gene in gene_to_enhancer_dict:
        subpeakDict[gene] = []
        for region in gene_to_enhancer_dict[gene]:
            overlaps = subpeakCollection.getOverlap(region)
            extendedOverlaps = [utils.makeSearchLocus(x, constExtension, constExtension) for x in overlaps]

            # stitch the extended constituents so each stretch is emitted once
            overlapCollectionTemp = utils.LocusCollection(extendedOverlaps, 50)
            overlapCollection = overlapCollectionTemp.stitchCollection()
            for overlap in overlapCollection.getLoci():
                subpeakBED.append([overlap.chr(), overlap.start(), overlap.end()])
                subpeakDict[gene].append(overlap)

    fasta = []

    for gene in subpeakDict:
        for subpeak in subpeakDict[gene]:

            fastaTitle = gene + '|' + subpeak.chr() + '|' + str(subpeak.start()) + '|' + str(subpeak.end())
            # NOTE(review): both coordinates are shifted by +1 before the
            # fetch, presumably to match fetchSeq's convention -- confirm.
            fastaLine = utils.fetchSeq(genomeDirectory, subpeak.chr(), int(subpeak.start() + 1),
                                       int(subpeak.end() + 1))

            fasta.append('>' + fastaTitle)
            # string.upper() was removed in Python 3; the str method is
            # equivalent and works on both major versions
            fasta.append(fastaLine.upper())

    return subpeakBED, fasta

예제 #4

0

파일 보기

파일: enhancerPromoter.py 프로젝트: mufrdrk/pipeline

def makePeakTable(paramDict,
                  splitGFFPath,
                  averageTablePath,
                  startDict,
                  geneList,
                  genomeDirectory,
                  tads_path=''):
    '''
    makes the final peak table with ebox info

    Args:
        paramDict: dict; only the 'cpgPath' entry (path to a tab-delimited
            CpG island table) is read here.
        splitGFFPath: path to a tab-delimited GFF; columns 0 (chrom),
            1 (peak ID), 3 (start), 4 (stop) and 7 (TSS status) are read.
        averageTablePath: path to a tab-delimited signal table; row i + 1 is
            paired with GFF row i and its columns from index 2 onwards are
            parsed as floats.
        startDict: mapping of refID -> record with a 'name' entry; also
            handed to utils.makeTSSLocus.
        geneList: refIDs used to build the TSS collections; when empty, all
            keys of startDict are used.
        genomeDirectory: genome location handed to utils.fetchSeq.
        tads_path: optional path to TAD regions; when non-empty, gene
            assignment is restricted to genes sharing a TAD with the peak.

    Returns:
        list of rows; the first row is the header, followed by one row per
        GFF line.

    Exits the process (sys.exit) if a region yields an empty sequence.
    '''

    peakTable = [[
        'REGION_ID', 'CHROM', 'START', 'STOP', 'LENGTH', 'TSS', 'CPG',
        'CPG_FRACTION', 'GC_FREQ', 'SIGNAL', 'CANON_EBOX_COUNT',
        'NON_CANON_EBOX_COUNT', 'TOTAL_EBOX_COUNT', 'OVERLAPPING_GENES',
        'PROXIMAL_GENES'
    ]]

    print('LOADING PEAK REGIONS')
    peakGFF = utils.parseTable(splitGFFPath, '\t')

    print('LOADING BINDING DATA')
    signalTable = utils.parseTable(averageTablePath, '\t')

    print('LOADING CPGS ISLANDS')
    cpgBed = utils.parseTable(paramDict['cpgPath'], '\t')
    cpgLoci = [
        utils.Locus(line[0], line[1], line[2], '.', line[-1])
        for line in cpgBed
    ]
    cpgCollection = utils.LocusCollection(cpgLoci, 50)

    print("MAKING TSS COLLECTIONS")
    if len(geneList) == 0:
        geneList = startDict.keys()

    tss_1kb_loci = []
    tss_50kb_loci = []
    for refID in geneList:
        tss_1kb_loci.append(utils.makeTSSLocus(refID, startDict, 1000, 1000))
        tss_50kb_loci.append(utils.makeTSSLocus(refID, startDict, 50000,
                                                50000))

    #make a 1kb flanking and 50kb flanking collection
    tss_1kb_collection = utils.LocusCollection(tss_1kb_loci, 50)
    tss_50kb_collection = utils.LocusCollection(tss_50kb_loci, 50)

    use_tads = len(tads_path) > 0
    if use_tads:
        print('LOADING TADS FROM %s' % (tads_path))
        tad_collection = utils.importBoundRegion(tads_path, 'tad')

        #building a tad dict keyed by tad ID w/ genes in that tad provided
        tad_dict = defaultdict(list)
        for tss_locus in tss_1kb_loci:
            overlapping_tads = tad_collection.getOverlap(tss_locus, 'both')
            for tad_locus in overlapping_tads:
                tad_dict[tad_locus.ID()].append(tss_locus.ID())

    print('CLASSIFYING PEAKS')

    no_tad_count = 0
    for i, gffLine in enumerate(peakGFF):
        #progress ticker
        if i % 1000 == 0:
            print(i)

        #getting the particulars of the region
        peakID = gffLine[1]
        chrom = gffLine[0]
        start = int(gffLine[3])
        stop = int(gffLine[4])
        lineLocus = utils.Locus(chrom, start, stop, '.', peakID)

        #getting the mapped signal (row 0 of the signal table is skipped)
        signalLine = signalTable[(i + 1)]
        signalVector = [float(x) for x in signalLine[2:]]

        #setting up the new line
        newLine = [peakID, chrom, start, stop, lineLocus.len()]

        #get the tss status from the gff itself (we are able to do this nicely from the split gff code earlier
        newLine.append(gffLine[7])

        #check cpg status; the overlap query is done once and reused below
        overlappingCpGLoci = cpgCollection.getOverlap(lineLocus, 'both')
        newLine.append(1 if overlappingCpGLoci else 0)

        #now do fractional cpgOverlap
        overlappingBases = 0
        for locus in overlappingCpGLoci:
            cpgStart = max(locus.start(), lineLocus.start())
            cpgEnd = min(locus.end(), lineLocus.end())
            overlappingBases += (cpgEnd - cpgStart)
        overlapFraction = float(overlappingBases) / lineLocus.len()

        newLine.append(round(overlapFraction, 2))

        #now get the seq
        #str.upper() replaces string.upper(), which was removed in Python 3
        lineSeq = utils.fetchSeq(genomeDirectory, chrom, start, stop,
                                 True).upper()
        if len(lineSeq) == 0:
            print('UH OH')
            print(lineSeq)
            print(gffLine)
            print(i)
            print(chrom)
            print(start)
            print(stop)
            sys.exit()

        #NOTE(review): this counts 'GC' and 'CG' dinucleotides, not single
        #G/C bases -- confirm that is the intended definition of GC_FREQ
        gcFreq = float(lineSeq.count('GC') +
                       lineSeq.count('CG')) / len(lineSeq)
        newLine.append(gcFreq)

        #this is where we add the ChIP-Seq signal
        newLine += signalVector

        #an empty match list naturally yields [0, 0, 0]
        eboxMatchList = re.findall('CA..TG', lineSeq)
        totalCount = len(eboxMatchList)
        canonCount = eboxMatchList.count('CACGTG')
        newLine += [canonCount, totalCount - canonCount, totalCount]

        #now find the overlapping and proximal genes
        #here each overlapping gene the tss 1kb locus overlaps the peak
        tad_genes = []
        if use_tads:
            for tad_locus in tad_collection.getOverlap(lineLocus, 'both'):
                tad_genes += tad_dict[tad_locus.ID()]
            if len(tad_genes) == 0:
                no_tad_count += 1

        overlappingLoci = tss_1kb_collection.getOverlap(lineLocus, 'both')
        proximalLoci = tss_50kb_collection.getOverlap(lineLocus, 'both')
        if len(tad_genes) > 0:
            #restrict to genes that share a tad with the peak
            tad_gene_set = set(tad_genes)
            overlappingLoci = [
                locus for locus in overlappingLoci
                if locus.ID() in tad_gene_set
            ]
            proximalLoci = [
                locus for locus in proximalLoci
                if locus.ID() in tad_gene_set
            ]

        overlappingGenes = [
            startDict[locus.ID()]['name'] for locus in overlappingLoci
        ]
        proximalGenes = [
            startDict[locus.ID()]['name'] for locus in proximalLoci
        ]

        overlappingGenes = utils.uniquify(overlappingGenes)
        #here the tss 50kb locus overlaps the peak
        #overlap takes priority over proximal
        proximalGenes = [
            gene for gene in proximalGenes if gene not in overlappingGenes
        ]
        proximalGenes = utils.uniquify(proximalGenes)

        #str.join replaces string.join(), which was removed in Python 3
        overlappingString = ','.join(overlappingGenes)
        proximalString = ','.join(proximalGenes)

        newLine += [overlappingString, proximalString]

        peakTable.append(newLine)

    #no_tad_count tallies regions WITHOUT tad genes, so the assigned count is
    #its complement; the header row is not a region
    region_count = len(peakTable) - 1
    assigned_count = (region_count - no_tad_count) if use_tads else 0
    print('Out of %s regions, %s were assigned to at least 1 tad' %
          (region_count, assigned_count))
    return peakTable

예제 #5

0

파일 보기

def makeEboxBeds(gff_path, name='', overwrite=False):
    '''
    makes an ebox bed for the corresponding gff

    Every region's sequence is scanned for canonical (CACGTG) and
    non-canonical (any other CA..TG) e-boxes, and each set is written to its
    own BED file.

    Relies on the module-level names bedFolder and genomeDirectory, which are
    defined outside this function.

    Args:
        gff_path: path to a tab-delimited GFF; columns 0 (chrom), 3 (start)
            and 4 (end) are read.
        name: output name stem; when empty, the GFF file name up to its first
            '.' is used.
        overwrite: when False, existing outputs (as judged by
            utils.checkOutput) are reused and returned without recomputing.

    Returns:
        (canon_path, non_path): paths of the canonical and non-canonical
        e-box BED files.
    '''

    if len(name) == 0:
        gff_name = gff_path.split('/')[-1].split('.')[0]
    else:
        gff_name = name

    #set output
    canon_path = '%s%s_CANON_EBOX.bed' % (bedFolder, gff_name)
    non_path = '%s%s_NONCANON_EBOX.bed' % (bedFolder, gff_name)

    #check to see if already done
    if not overwrite:
        if utils.checkOutput(canon_path, 0.1, 0.1) and utils.checkOutput(
                non_path, 0.1, 0.1):
            print('Found bed output at %s and %s' % (canon_path, non_path))
            return canon_path, non_path

    #for each region spit out canonical and non canonical eboxes
    canonBed = []
    nonBed = []

    #open up the repeat filtered gff
    region_gff = utils.parseTable(gff_path, '\t')

    for ticker, line in enumerate(region_gff):

        if ticker % 100 == 0:
            print(ticker)
        chrom = line[0]
        start = int(line[3])
        end = int(line[4])

        #str.upper() replaces string.upper(), which was removed in Python 3
        seq = utils.fetchSeq(genomeDirectory, chrom, start, end, True).upper()

        #get the canonical starts
        canonStarts = [
            start + match.start() for match in re.finditer('CACGTG', seq)
        ]

        #get the non canonical starts, filtering out the canonicals
        #(set lookup instead of list.count per candidate)
        canonStartSet = set(canonStarts)
        nonStarts = [
            start + match.start() for match in re.finditer('CA..TG', seq)
            if start + match.start() not in canonStartSet
        ]

        #now fill out the bed; each ebox spans 6 bases
        for ebox_start in canonStarts:
            canonBed.append([chrom, ebox_start, (ebox_start + 6), '.'])

        for ebox_start in nonStarts:
            nonBed.append([chrom, ebox_start, (ebox_start + 6), '.'])

    print('FOUND %s CANONICAL EBOXES' % (len(canonBed)))
    print('FOUND %s NON CANONICAL EBOXES' % (len(nonBed)))
    utils.unParseTable(canonBed, canon_path, '\t')
    utils.unParseTable(nonBed, non_path, '\t')

    return canon_path, non_path

예제 #6

0

파일 보기

파일: extendedSuperNetwork.py 프로젝트: qiusir1/CLL_TFnetworks_2018

    se_id = line[0]
    se_namelist.append(se_id)
    subpeak_dict[se_id] = []

    se_locus = utils.Locus(line[1], line[2], line[3], '.')
    overlaps = subpeak_collection.getOverlap(se_locus)

    for overlap in overlaps:
        subpeak_dict[se_id].append(overlap)

        subpeak = overlap

        fastaTitle = se_id + '|' + subpeak.chr() + '|' + str(
            subpeak.start()) + '|' + str(subpeak.end())
        fastaLine = utils.fetchSeq(genomeDirectory, subpeak.chr(),
                                   int(subpeak.start() + 1),
                                   int(subpeak.end() + 1))

        fasta.append('>' + fastaTitle)
        fasta.append(upper(fastaLine))

# Write the accumulated FASTA lines to <projectFolder><projectName>_SUBPEAKS.fa
# (the output path is built by plain string concatenation).
outname = projectFolder + projectName + '_SUBPEAKS.fa'
utils.unParseTable(fasta, outname, '')

# call FIMO and find the motifs within each enhancer

# Hard-coded absolute paths to the motif annotation files.
motifConvertFile = '/ark/home/af661/src/coreTFnetwork/annotations/MotifDictionary.txt'
motifDatabaseFile = '/ark/home/af661/src/coreTFnetwork/annotations/VertebratePWMs.txt'

# NOTE(review): despite its name, motifDatabase is parsed from
# motifConvertFile, not from motifDatabaseFile -- confirm this is intended.
motifDatabase = utils.parseTable(motifConvertFile, '\t')
motifDatabaseDict = {}

예제 #7

0

파일 보기

def rank_eboxes(nb_all_chip_dataFile,mycn_gff_path,macsFolder,genomeDirectory,window = 100):

    '''
    uses the conserved MYCN sites and ranks eboxes within them
    by average background subtracted signal
    searches 100bp (window variable) from mycn summits

    Relies on the module-level names gffFolder and tableFolder, which are
    defined outside this function.

    Args:
        nb_all_chip_dataFile: data table path handed to
            pipeline_dfci.loadDataTable and pipeline_dfci.map_regions.
        mycn_gff_path: path to the tab-delimited GFF of conserved MYCN regions.
        macsFolder: folder prefix holding '<name>/<name>_summits.bed' for
            every MYCN dataset.
        genomeDirectory: genome location handed to utils.fetchSeq.
        window: number of bases added on each side of every summit.

    Returns:
        path of the written e-box ranking table.
    '''

    window = int(window)

    #bring in the conserved mycn region
    print('making gff of nb mycn summits')
    nb_mycn_gff = utils.parseTable(mycn_gff_path,'\t')

    nb_mycn_collection = utils.gffToLocusCollection(nb_mycn_gff,50)

    dataDict = pipeline_dfci.loadDataTable(nb_all_chip_dataFile)
    names_list = sorted(name for name in dataDict.keys() if name.count('MYCN') == 1)

    summit_loci = []
    #first makes a gff of all summits +/- 100bp for all nb mycn datasets
    for name in names_list:
        summit_bed_path = '%s%s/%s_summits.bed' % (macsFolder,name,name)
        summit_bed = utils.parseTable(summit_bed_path,'\t')
        for line in summit_bed:
            summit_locus = utils.Locus(line[0],int(line[1])-window,int(line[2])+window,'.',line[3])
            #only keep summits that fall in a conserved mycn region
            if len(nb_mycn_collection.getOverlap(summit_locus)) > 0:
                summit_loci.append(summit_locus)

    summit_collection = utils.LocusCollection(summit_loci,50)
    summit_merged_collection = summit_collection.stitchCollection()

    summit_gff = utils.locusCollectionToGFF(summit_merged_collection)
    summit_gff_path = '%sHG19_NB_MYCN_SUMMITS_-%s_+%s.gff' % (gffFolder,window,window)
    utils.unParseTable(summit_gff,summit_gff_path,'\t')

    #this is borrowed from above and maps chip-seq signal to the gff
    print('mapping to nb mycn summits and making signal dict')
    gffList = [summit_gff_path]
    summit_signal_path = pipeline_dfci.map_regions(nb_all_chip_dataFile,gffList)

    mycnSignalTable = utils.parseTable(summit_signal_path,'\t')

    #making a signal dictionary for MYCN binding
    names_list = ['BE2C_MYCN','KELLY_MYCN','NGP_MYCN','SHEP21_0HR_MYCN_NOSPIKE']
    background_list = [dataDict[name]['background'] for name in names_list]
    header = mycnSignalTable[0]
    chip_columns = [header.index(name) for name in names_list]
    background_columns = [header.index(background_name) for background_name in background_list]

    mycn_sig_dict = {}
    for line in mycnSignalTable[1:]:
        line_sig = [
            float(line[chip_col]) - float(line[background_col])
            for chip_col, background_col in zip(chip_columns,background_columns)
        ]
        region_id = line[1]
        #region ids look like chrom(sense):start-end; recover the length
        coords = [int(x) for x in region_id.split(':')[-1].split('-')]
        line_length = coords[1]-coords[0]
        #mean background subtracted signal scaled by the region length
        mycn_sig_dict[region_id] = numpy.mean(line_sig)*line_length

    #now for each region find the eboxes and then add up the signal
    print('making ebox ranking')
    ebox_list = ['CACGTG','CAGTTG','CAAGTG','CAGGTG','CAATTG','CAAATG','CATCTG','CAGCTG','CATGTG','CATATG']
    eboxDict = {}
    for ebox in ebox_list:
        eboxDict[ebox] = []

    for ticker, line in enumerate(summit_gff):
        if ticker % 1000 == 0:
            print(ticker)

        chrom = line[0]
        start = int(line[3])
        end = int(line[4])
        region_id = '%s(%s):%s-%s' % (line[0],line[6],line[3],line[4])
        signal = mycn_sig_dict[region_id]

        sequenceLine = utils.fetchSeq(genomeDirectory,chrom,start,end,True)

        #every motif occurrence counts (no uniquify per line)
        for motif in re.findall('CA..TG',sequenceLine.upper()):
            if motif in eboxDict:
                eboxDict[motif].append(signal)
                continue
            #the 10 listed eboxes cover each CA..TG motif or its reverse
            #complement; anything else (e.g. a motif containing N) is skipped
            #rather than raising a KeyError
            rev_motif = utils.revComp(motif)
            if rev_motif in eboxDict:
                eboxDict[rev_motif].append(signal)

    eboxTable = []
    eboxTableOrdered = [['EBOX','OCCURENCES','AVG_HEIGHT']]
    for ebox in eboxDict:
        newLine = [ebox,len(eboxDict[ebox]),numpy.mean(eboxDict[ebox])]
        eboxTable.append(newLine)

    #rows are ordered by the AVG_HEIGHT column (index 2), highest first
    heightOrder = utils.order([line[2] for line in eboxTable],decreasing=True)

    for x in heightOrder:
        eboxTableOrdered.append(eboxTable[x])
    print(eboxTableOrdered)
    ebox_outfile = '%sHG19_NB_MYCN_CONSERVED_SUMMITS_-%s_+%s_EBOX_RANK.txt' % (tableFolder,window,window)
    utils.unParseTable(eboxTableOrdered,ebox_outfile,'\t')
    return ebox_outfile

예제 #8

0

파일 보기

def make_mycn_stats_table(nb_all_chip_dataFile,outFile):

    '''
    making a table of conserved mycn peaks w/ some additional stats
    mycn and h3k27ac signal is avg. background normalized across 4 samples
    active tss defined as the union of all H3K27ac occupied promoters in NB
    active enhancers defined as the union of all H3K27ac sites outside of promoters

    Relies on the module-level names signalFolder, projectFolder,
    geneListFolder, annotFile, genomeDirectory and nmers, which are defined
    outside this function.

    Args:
        nb_all_chip_dataFile: data table path handed to
            pipeline_dfci.loadDataTable.
        outFile: path the tab-delimited stats table is written to.

    Returns:
        outFile, after the table has been written.
    '''
    dataDict = pipeline_dfci.loadDataTable(nb_all_chip_dataFile)

    print('SETTING UP OUTPUT TABLE')
    outTable = [['PEAK_ID','CHROM','START','STOP','LENGTH','ACTIVE_TSS_OVERLAP','ENHANCER_OVERLAP','CPG_ISLAND_OVERLAP','CPG_ISLAND_FRACTION','GC_FREQ','MYCN_RANK','AVG_MYCN_SIGNAL','AVG_H3K27AC_SIGNAL','CANON_EBOX_COUNT','NONCANON_EBOX_COUNT','TOTAL_EBOX_COUNT','CANON_EXP','NON_CANON_EXP','GABPA_COUNT','GABPA_EXP','GATA_COUNT','GATA_EXP']]

    dinuc = nmers(2,['A','T','G','C'])

    #input files
    mycnSignalFile = '%sHG19_NB_MYCN_CONSERVED_-0_+0_NB_ALL_SIGNAL.txt' % (signalFolder)
    h3k27acSignalFile = '%sHG19_NB_MYCN_CONSERVED_-500_+500_NB_ALL_SIGNAL.txt' % (signalFolder)
    mycnRankFile = '%smeta_rose/NB_MYCN/NB_MYCN_0KB_STITCHED_ENHANCER_REGION_RANK_CONSERVED.txt' % (projectFolder)
    activeGeneFile = '%sHG19_NB_H3K27AC_ACTIVE_UNION.txt' % (geneListFolder)
    #note, this is the ucsc hg19 cpg islands extended file
    #to download and format run ./beds/download_cpg.sh
    cpgFile = '%sbeds/hg19_cpg_islands.bed' % (projectFolder)
    enhancerFile = '%smeta_rose/NB_H3K27AC/NB_H3K27AC_AllEnhancers.table.txt' % (projectFolder)

    print('LOADING MYCN BINDING DATA')
    mycnSignalTable = utils.parseTable(mycnSignalFile,'\t')

    #making a signal dictionary for MYCN binding
    mycn_names = ['BE2C_MYCN','KELLY_MYCN','NGP_MYCN','SHEP21_0HR_MYCN_NOSPIKE']
    mycn_sig_dict = _background_subtracted_signal(mycnSignalTable,dataDict,mycn_names,mycnSignalFile)

    print('LOADING MYCN RANK DATA')
    mycnRankTable = utils.parseTable(mycnRankFile,'\t')

    print('LOADING H3K27AC BINDING DATA')
    h3k27acSignalTable = utils.parseTable(h3k27acSignalFile,'\t')
    #making a signal dictionary for background subtracted H3K27ac binding
    h3k27ac_names = ['BE2C_H3K27AC','KELLY_H3K27AC','NGP_H3K27AC','SHEP21_0HR_H3K27AC_NOSPIKE']
    h3k27ac_sig_dict = _background_subtracted_signal(h3k27acSignalTable,dataDict,h3k27ac_names,h3k27acSignalFile)

    #making the cpg collection
    print('LOADING CPGS ISLANDS')
    cpgBed = utils.parseTable(cpgFile,'\t')
    cpgLoci = [utils.Locus(line[0],line[1],line[2],'.',line[-1]) for line in cpgBed]
    cpgCollection = utils.LocusCollection(cpgLoci,50)

    #next make the tss collection of active promoters
    print('LOADING ACTIVE PROMOTERS')
    startDict = utils.makeStartDict(annotFile)
    activeTable = utils.parseTable(activeGeneFile,'\t')
    tss_1kb_loci = [utils.makeTSSLocus(line[1],startDict,1000,1000) for line in activeTable]
    tss_1kb_collection = utils.LocusCollection(tss_1kb_loci,50)

    #enhancer file
    print("LOADING ACTIVE ENHANCERS")
    enhancerTable = utils.parseTable(enhancerFile,'\t')
    print('STARTING WITH THE FOLLOWING NUMBER OF ENHANCERS IN NB')
    print(len(enhancerTable) - 6)
    enhancerLoci = []
    for line in enhancerTable:
        #skip rows whose first field starts with '#' or 'R'
        if line[0][0] != '#' and line[0][0] != 'R':
            try:
                lineLocus = utils.Locus(line[1],int(line[2]),int(line[3]),'.',line[0])
                enhancerLoci.append(lineLocus)
            except IndexError:
                print(line)
                sys.exit()
    enhancerCollection = utils.LocusCollection(enhancerLoci,50)

    print('CLASSIFYING MYCN PEAKS')
    #i starts at 1 because row 0 of the signal table is the header;
    #the same index is used to pick the matching rank table row
    for i, line in enumerate(mycnSignalTable[1:],1):
        if (i - 1) % 100 == 0:
            print(i - 1)

        mycn_signal = round(mycn_sig_dict[line[0]],4)
        h3k27ac_signal = round(h3k27ac_sig_dict[line[0]],4)

        peakID = line[0]
        locusString = line[1]
        chrom = locusString.split('(')[0]
        [start,stop] = [int(x) for x in locusString.split(':')[-1].split('-')]
        lineLocus = utils.Locus(chrom,start,stop,'.',peakID)

        tssOverlap = 0
        if tss_1kb_collection.getOverlap(lineLocus,'both'):
            tssOverlap = 1

        #enhancers are only called outside of active promoters
        enhancerOverlap = 0
        if enhancerCollection.getOverlap(lineLocus,'both') and tssOverlap == 0:
            enhancerOverlap = 1

        #the cpg overlap query is done once and reused for the fraction
        overlappingCpGLoci = cpgCollection.getOverlap(lineLocus,'both')
        cpgIslandOverlap = 1 if overlappingCpGLoci else 0

        #now do fractional cpgOverlap
        overlappingBases = 0
        for locus in overlappingCpGLoci:
            cpgStart = max(locus.start(),lineLocus.start())
            cpgEnd = min(locus.end(),lineLocus.end())
            overlappingBases += (cpgEnd-cpgStart)
        overlapFraction = round(float(overlappingBases)/lineLocus.len(),2)

        #now get the seq
        #str.upper() replaces string.upper(), which was removed in Python 3
        lineSeq = utils.fetchSeq(genomeDirectory,chrom,start,stop,True).upper()
        seqLength = len(lineSeq)
        gcFreq = round(float(lineSeq.count('GC') + lineSeq.count('CG'))/seqLength,2)

        #frequency of each dinucleotide in this sequence; the original counted
        #'GC' for every entry, which gave all dinucleotides the same frequency
        #NOTE(review): str.count is non-overlapping, so runs such as 'AAAA'
        #count 2 occurrences of 'AA' -- confirm this is acceptable
        dinuc_dict = {}
        for nmer in dinuc:
            dinuc_dict[nmer] = float(lineSeq.count(nmer))/seqLength

        mycnRankLine = mycnRankTable[i]
        mycnRank = numpy.mean([float(x) for x in mycnRankLine[6:]])

        canon_count = len(re.findall('CACGTG',lineSeq))
        ebox_count = len(re.findall('CA..TG',lineSeq))
        non_canon_count = ebox_count-canon_count

        #number of 6-mer start positions, used for all 6bp motif expectations
        window_count = seqLength - 5

        #get the expected values
        canon_exp = dinuc_dict['CA']*dinuc_dict['CG']*dinuc_dict['TG']*window_count
        canon_exp = round(canon_exp,2)
        notCG = 1- dinuc_dict['CG']
        non_exp = dinuc_dict['CA']*notCG*dinuc_dict['TG']*window_count
        non_exp = round(non_exp,2)

        #for gata and GABPA (both strands)
        gabpaMatchList = re.findall('CGGAAG',lineSeq) + re.findall('CTTCCG',lineSeq)
        gabpa_count = len(gabpaMatchList)

        gabpa_exp_f = dinuc_dict['CG'] * dinuc_dict['GA'] * dinuc_dict['AG']*window_count
        gabpa_exp_r = dinuc_dict['CT'] * dinuc_dict['TC'] * dinuc_dict['CG']*window_count

        gabpa_exp = round(gabpa_exp_f,2) + round(gabpa_exp_r,2)

        gataMatchList = re.findall('GATAA',lineSeq) + re.findall('TTATC',lineSeq)
        gata_count = len(gataMatchList)

        an_freq = 1 - dinuc_dict['AA'] - dinuc_dict['AT'] - dinuc_dict['AG'] -dinuc_dict['AC']
        cn_freq = 1 - dinuc_dict['CA'] - dinuc_dict['CT'] - dinuc_dict['CG'] -dinuc_dict['CC']
        gata_exp_f = dinuc_dict['GA'] * dinuc_dict['TA'] * an_freq*window_count
        gata_exp_r = dinuc_dict['TT'] * dinuc_dict['AT'] * cn_freq*window_count
        gata_exp = round(gata_exp_f,2) + round(gata_exp_r,2)

        newLine = [peakID,chrom,start,stop,lineLocus.len(),tssOverlap,enhancerOverlap,cpgIslandOverlap,overlapFraction,gcFreq,mycnRank,mycn_signal,h3k27ac_signal,canon_count,non_canon_count,ebox_count,canon_exp,non_exp,gabpa_count,gabpa_exp,gata_count,gata_exp]
        outTable.append(newLine)

    utils.unParseTable(outTable,outFile,'\t')

    return outFile


def _background_subtracted_signal(signalTable,dataDict,names_list,signalFile):

    '''
    builds a dict of row id -> mean background subtracted signal

    Args:
        signalTable: parsed signal table; row 0 is the header and column 0 of
            every row is the row id.
        dataDict: mapping of dataset name -> record with a 'background' entry
            naming that dataset's background column.
        names_list: dataset names whose columns are averaged.
        signalFile: path of the table, used only in the error message.

    Returns:
        dict keyed by row id holding the mean of (chip - background) across
        names_list.

    Exits the process (sys.exit) if column 0 holds duplicate ids.
    '''
    background_list = [dataDict[name]['background'] for name in names_list]
    header = signalTable[0]
    chip_columns = [header.index(name) for name in names_list]
    background_columns = [header.index(background_name) for background_name in background_list]

    #this only works if the first column are unique identifiers
    if len(signalTable) != len(utils.uniquify([line[0] for line in signalTable])):
        #the original format string had no %s placeholder, so the error
        #message itself raised a TypeError
        print('Error: Column 1 of %s must contain unique identifiers.' % (signalFile))
        sys.exit()

    sig_dict = {}
    for line in signalTable[1:]:
        line_sig = [
            float(line[chip_col]) - float(line[background_col])
            for chip_col, background_col in zip(chip_columns,background_columns)
        ]
        sig_dict[line[0]] = numpy.mean(line_sig)
    return sig_dict