示例#1
0
def test_bigWig(inMem):
    """Time loading a bigWig file and a series of fixed-size interval reads.

    Loads ``../input/ENCFF966IHQ.bigWig`` through ``bigWigReader`` and then
    calls ``get_interval`` on consecutive ``step``-sized windows of chr1,
    printing the elapsed time of each phase.

    Relies on a ``genome`` name that is not defined inside this function.

    Args:
        inMem: Forwarded to ``bigWigReader`` as ``inMemory``.
    """
    print ("Loading data")
    load_started = datetime.datetime.now()
    reader = bigWigReader("../input/ENCFF966IHQ.bigWig", name="Test",
                          genome=genome, inMemory=inMem).readData()
    print ("Time:", datetime.datetime.now() - load_started)

    print ("Extracting data, inMem=", str(inMem))
    query_started = datetime.datetime.now()
    start, stop, step = 10000000, 101000000, 1000000
    window_starts = range(start, stop, step)
    for left in window_starts:
        # Only the elapsed time matters here; the returned values are discarded.
        reader.get_interval(Interval("chr1", left, left + step))
    print ("Time:", datetime.datetime.now() - query_started)
    print (str(len(window_starts)) + " extractions of length " + str(step))
示例#2
0
def calc_sparsity():
    """Print how sparse the chr2 CTCF ChIP-seq signal is.

    Loads chr2 of hg38 and the CTCF bigWig track (in memory), then prints
    three counts for the chr2 signal array, one per line: its total length,
    the number of non-zero entries, and the number of non-zero entries that
    are also finite.
    """
    # set to INFO for less detailed output
    logging.basicConfig(level=logging.DEBUG)

    chrom = "chr2"

    # Genome, restricted to the chromosome of interest.
    fasta = fastaReader("../input/hg38/hg38.fa",
                        useOnlyChromosomes=[chrom]).read_data()

    # ChIP-seq track.
    track = bigWigReader("../input/ENCFF473IZV_H1_CTCF.bigWig",
                         genome=fasta,
                         inMemory=True).readData()

    signal = track.data[chrom]
    print(len(signal))

    nonzero_values = signal[np.nonzero(signal)]
    print(len(nonzero_values))

    finite_values = nonzero_values[np.isfinite(nonzero_values)]
    print(len(finite_values))
示例#3
0
def calc_insulation_around_CTCF(chr, resolution=5000, window_size=20):
    """Write the strongest CTCF bins of one chromosome to ``../out/test.bed``.

    The CTCF signal over the whole chromosome (NaNs replaced via
    ``np.nan_to_num``) is summed in 1000 bp bins. Every bin whose sum is
    strictly above the 90th percentile of all bin sums is written as one
    tab-separated line: chromosome, bin start, bin start + 1000.

    Args:
        chr: Chromosome name; used to subset the fasta, to query the signal
            and as the first column of the output.
        resolution: Passed to ``hicReader`` as ``resolution``.
        window_size: Not used by this function.

    NOTE(review): the Hi-C data is loaded but never read afterwards.
    """
    # set to INFO for less detailed output
    logging.basicConfig(level=logging.DEBUG)

    # Genome, restricted to the requested chromosome.
    fasta = fastaReader("../input/hg38/hg38.fa",
                        useOnlyChromosomes=[chr]).read_data()

    # CTCF ChIP-seq track.
    ctcf_track = bigWigReader("../input/ENCFF473IZV_H1_CTCF.bigWig",
                              genome=fasta,
                              inMemory=True).readData()

    # Hi-C contacts (loaded, but not consulted below).
    hic = hicReader("../input/4DNFI2TK7L2F.hic",
                    genome=fasta,
                    resolution=resolution).read_data()

    # Length of the requested chromosome.
    chrom_length = fasta.get_chr_sizes()[chr]

    signal = np.nan_to_num(
        ctcf_track.get_interval(Interval(chr, 0, chrom_length)))

    binsize = 1000
    bin_starts = np.arange(0, chrom_length - 1, binsize)
    bin_sums = [np.sum(signal[left:left + binsize]) for left in bin_starts]

    # Keep the bins whose summed signal lies above the 90th percentile.
    threshold = np.percentile(bin_sums, 90)
    peak_starts = bin_starts[np.asarray(bin_sums) > threshold]

    with open("../out/test.bed", "w") as fout:
        for left in peak_starts:
            fields = (chr, str(left), str(left + binsize))
            fout.write("\t".join(fields) + "\n")
示例#4
0
def calc_corr(chr, resolution=5000, window_size=20):
    """Correlate Hi-C contacts with ChIP-seq and sequence signals on ``chr``.

    Draws 5000 random start positions, snaps them down to multiples of
    ``resolution`` and pairs each with ``start + window_size``. For every
    pair it asks ``hic.get_contact`` for the contact of the interval
    ``(start, end)`` (a missing contact counts as 0), sums the CTCF signal
    over ``(start + resolution, end)`` and sums whatever
    ``faReader.get_interval`` returns for ``(start, end)``. Samples with a
    non-finite contact or ChIP sum are skipped.

    Args:
        chr: Chromosome name.
        resolution: Passed to ``hicReader``; also the grid that start
            positions are aligned to.
        window_size: Added directly to each start position to obtain the
            interval end. Must be at least ``5 * resolution``.

    Returns:
        A tab-separated string of four correlation coefficients, in order:
        Spearman and Pearson of contacts vs. ChIP signal, then Spearman and
        Pearson of contacts vs. sequence signal.

    Raises:
        AssertionError: If ``window_size < 5 * resolution``.

    NOTE(review): the default ``window_size=20`` does not satisfy the
    precondition for the default ``resolution=5000``, so callers have to
    pass ``window_size`` explicitly - confirm the intended default.
    """
    # The precondition does not depend on the sampled data, so check it once
    # and before any expensive file loading instead of on every iteration.
    assert window_size >= 5 * resolution, \
        "window_size must be at least 5 * resolution"

    logging.basicConfig(
        level=logging.DEBUG)  # set to INFO for less detailed output

    ### load data ###
    # load genome
    faReader = fastaReader("../input/hg38/hg38.fa", useOnlyChromosomes=[chr])
    faReader = faReader.read_data()

    # load chipSeq1
    bwReader1 = bigWigReader("../input/ENCFF473IZV_H1_CTCF.bigWig",
                             genome=faReader,
                             inMemory=True)
    bwReader1 = bwReader1.readData()

    # load contacts
    hic = hicReader("../input/4DNFI2TK7L2F.hic",
                    genome=faReader,
                    resolution=resolution)
    hic = hic.read_data()

    ### run simple check that contact count correlate with ChipSeq signal ###

    ### generate some random samples ####
    # size of the requested chromosome
    total_length = faReader.get_chr_sizes()[chr]

    # number of random intervals to draw
    sample_size = 5000

    # Select random points on the chromosome. randint excludes its upper
    # bound, hence the "+ 1" to keep the inclusive range that the deprecated
    # np.random.random_integers used.
    random_points_starts = np.random.randint(
        0, total_length - window_size + 1, sample_size)
    random_points_starts = np.array(
        (random_points_starts // resolution) * resolution, dtype=np.uint64)
    random_points_ends = random_points_starts + window_size

    # for each selected point get the contact of (point, point + window_size)
    contacts = []
    chipSignals = []
    seqSignals = []
    now = datetime.datetime.now()  # start timer

    logging.info("Starting data generation")
    # Iterate over plain Python ints: under legacy NumPy scalar promotion,
    # np.uint64 + int yields float64, which would make the window start below
    # a float coordinate.
    for start, end in zip(random_points_starts.tolist(),
                          random_points_ends.tolist()):
        interval = Interval(chr, start, end)
        window = Interval(chr, start + resolution, end)
        contact = hic.get_contact(interval)
        if contact is None:
            contact = 0
        if not np.isfinite(contact):
            continue

        chipSignal = np.sum(np.nan_to_num(bwReader1.get_interval(window)))
        if not np.isfinite(chipSignal):
            continue

        chipSignals.append(chipSignal)
        seqSignals.append(np.sum(faReader.get_interval(interval)))
        contacts.append(contact)

    logging.info("Time for data generation: " +
                 str(datetime.datetime.now() - now))
    from scipy.stats import spearmanr, pearsonr

    contacts = np.array(contacts)
    chipSignals = np.array(chipSignals)
    seqSignals = np.array(seqSignals)
    res = [
        spearmanr(contacts, chipSignals)[0],
        pearsonr(contacts, chipSignals)[0],
        spearmanr(contacts, seqSignals)[0],
        pearsonr(contacts, seqSignals)[0],
    ]

    return "\t".join(map(str, res))
示例#5
0
def simple_test(input_folder="/home/minja/PycharmProjects/3Dpredictor/nn/input/"):
    """Check whether Hi-C contact counts correlate with the CTCF signal on chr1.

    Samples 100000 random, resolution-aligned intervals of 20 bins each on
    chr1, collects for every interval that has a Hi-C contact the contact
    value, the NaN-ignoring sum of the CTCF signal and the sum of whatever
    ``faReader.get_interval`` returns, then prints the collected values and
    their Spearman correlation and shows a scatter plot.

    Args:
        input_folder: Directory prefix (with trailing separator) that holds
            ``hg38/hg38.fa``, the two bigWig files and the ``.hic`` file.
            Defaults to the path that used to be hard-coded here.
    """
    logging.basicConfig(
        level=logging.DEBUG)  # set to INFO for less detailed output

    ### load data ###
    # load genome
    faReader = fastaReader(input_folder + "hg38/hg38.fa",
                           useOnlyChromosomes=["chr1"])
    faReader = faReader.read_data()
    # load chipSeq (kept in memory)
    bwReader1 = bigWigReader(input_folder + "ENCFF473IZV_H1_CTCF.bigWig",
                             genome=faReader,
                             inMemory=True)
    bwReader1 = bwReader1.readData()

    # load second chipSeq track (inMemory=False)
    # NOTE: bwReader2 is not used below. It was the reader of a disabled
    # second pass that repeated the sampling loop against it and logged
    # "Time for data generation2"; it is still loaded so that this function
    # touches the same input files as before.
    bwReader2 = bigWigReader(input_folder + "ENCFF966IHQ.bigWig",
                             genome=faReader,
                             inMemory=False)
    bwReader2 = bwReader2.readData()

    # load contacts
    resolution = 5000
    hic = hicReader(input_folder + "4DNFI2TK7L2F.hic",
                    genome=faReader,
                    binsize=resolution,
                    indexedData=True)
    hic = hic.read_data()

    ### run simple check that contact count correlate with ChipSeq signal ###

    ### generate some random samples ####
    # get size of the chr1
    total_length = faReader.get_chr_sizes()["chr1"]

    # Interval length: 20 bins of `resolution` each, in the same units as the
    # start positions.
    window_size = 20 * resolution

    sample_size = 100000

    # Select random points on chr1. randint excludes its upper bound, hence
    # the "+ 1" to keep the inclusive range that the deprecated
    # np.random.random_integers used.
    random_points_starts = np.random.randint(
        0, total_length - window_size + 1, sample_size)
    random_points_starts = np.array(
        (random_points_starts // resolution) * resolution, dtype=np.uint64)
    random_points_ends = random_points_starts + window_size

    # for each selected point get the contact of (point, point + window_size)
    contacts = []
    chipSignals = []
    seqSignals = []
    now = datetime.datetime.now()  # start timer

    logging.info("Starting data generation")
    for start, end in zip(random_points_starts, random_points_ends):
        interval = Interval("chr1", start, end)
        contact = hic.get_contact(interval)
        if contact is None:
            # no contact for this interval - skip the sample
            continue

        chipSignal = np.nansum(bwReader1.get_interval(interval))
        if np.isfinite(chipSignal):
            chipSignals.append(chipSignal)
            seqSignal = np.sum(faReader.get_interval(interval))
            seqSignals.append(seqSignal)
            contacts.append(contact)

    logging.info("Time for data generation1: " +
                 str(datetime.datetime.now() - now))

    from scipy.stats import spearmanr
    import matplotlib.pyplot as plt

    print(contacts)
    print(chipSignals)

    print(spearmanr(np.array(contacts), np.array(chipSignals)))
    print(np.all(np.isfinite(contacts)))
    print(np.all(np.isfinite(chipSignals)))

    plt.scatter(contacts, chipSignals)
    plt.show()