예제 #1
0
def output_boundary_npa(boundaries, ref_genome_file, segment_size, data_file, label_file, n_features=5, isreverse=True):
    '''
    Write encoded boundary sequences (and optionally their labels) to HDF5 files.

    For every boundary the sequence returned by
    mutation_function.infer_true_seq is encoded with line_to_tensor and stored
    as one row of the 'data' dataset in data_file. When isreverse is true, the
    reverse complement of the same sequence is encoded and stored in the
    following row, with the same label.

    boundaries: sized iterable of boundary objects; .chrom and .label are read
        here (other attributes may be used by infer_true_seq)
    ref_genome_file: passed to get_chromlist to obtain per-chromosome data,
        which is then indexed by boundary.chrom
    segment_size: required length of every inferred sequence
    data_file: path of the HDF5 file to create for the encoded sequences
    label_file: path of the HDF5 file to create for labels; a falsy value
        disables label output
    n_features: size of the last axis of the 'data' dataset
        (assumed to match the width of line_to_tensor's output -- TODO confirm)
    isreverse: output sequence of the reverse strand as well

    Raises Exception if an inferred sequence is not segment_size long.
    '''
    chrom_list = get_chromlist(ref_genome_file)

    hdf5_file = None
    hdf5_label_file = None
    label = None

    try:
        hdf5_file = h5py.File(data_file, mode='w')

        # Allocate exactly the rows that will be written: one per boundary,
        # doubled only when the reverse strand is emitted too. Allocating
        # len(boundaries) * 2 unconditionally would leave unwritten padding
        # rows (and padding labels) when isreverse is False.
        n_rows = len(boundaries) * (2 if isreverse else 1)

        dt = hdf5_file.create_dataset('data', (n_rows, segment_size, n_features), compression="gzip", compression_opts=5)

        if label_file:
            hdf5_label_file = h5py.File(label_file, mode='w')
            label = hdf5_label_file.create_dataset('label', (n_rows,))

        k = 0
        for b in boundaries:
            seq = mutation_function.infer_true_seq(b, chrom_list[b.chrom], segment_size)

            if len(seq) != segment_size:
                raise Exception('length of sequences is wrong: %d' % (len(seq)))

            # Keep `seq` as the raw sequence object: it is still needed below
            # to build the reverse complement.
            dt[k,] = line_to_tensor(str(seq).upper())
            if label is not None:
                label[k] = b.label

            k += 1

            # complement sequences
            if isreverse:
                if type(seq) is MutableSeq:
                    seq2 = seq.toseq().reverse_complement()
                else:
                    seq2 = seq.reverse_complement()

                # Encode the reverse strand the same way as the forward one
                dt[k,] = line_to_tensor(str(seq2).upper())

                if label is not None:
                    label[k] = b.label

                k += 1

    finally:
        if hdf5_file is not None:
            hdf5_file.close()

        if hdf5_label_file is not None:
            hdf5_label_file.close()
예제 #2
0
def output_boundary(boundaries, ref_genome_file, segment_size, data_file, label_file, isreverse=True):
    '''
    Write boundary sequences (and optionally their labels) to HDF5 files,
    one named dataset per sequence.

    Dataset names are "<chrom>_<start>_<end>_<suffix>_1" for the forward
    sequence and "<chrom>_<start>_<end>_<suffix>_2" for its reverse
    complement. Labels, when requested, are stored under the same names in
    label_file.

    boundaries: iterable of boundary objects; .chrom, .start, .end, .suffix
        and .label are read here
    ref_genome_file: passed to get_chromlist to obtain per-chromosome data,
        which is then indexed by boundary.chrom
    segment_size: required length of every inferred sequence
    data_file: path of the HDF5 file to create for sequences (written through
        output_seq)
    label_file: path of the HDF5 file to create for labels; a falsy value
        disables label output
    isreverse: output sequence of the reverse strand as well

    Raises Exception if an inferred sequence is not segment_size long.
    '''
    chrom_list = get_chromlist(ref_genome_file)

    hdf5_file = None
    hdf5_label_file = None

    try:
        hdf5_file = h5py.File(data_file, mode='w')

        if label_file:
            hdf5_label_file = h5py.File(label_file, mode='w')

        for b in boundaries:
            seq = mutation_function.infer_true_seq(b, chrom_list[b.chrom], segment_size)

            if len(seq) != segment_size:
                raise Exception('length of sequences is wrong: %d' % (len(seq)))

            # Shared prefix of every dataset written for this boundary
            name = b.chrom + "_" + str(b.start) + "_" + str(b.end) + "_" + b.suffix

            output_seq(hdf5_file, name + "_1", seq)

            # complement sequences
            if isreverse:
                if type(seq) is MutableSeq:
                    seq2 = seq.toseq().reverse_complement()
                else:
                    # Reverse-complement the sequence itself; `seq2` has no
                    # value yet on this branch.
                    seq2 = seq.reverse_complement()

                output_seq(hdf5_file, name + "_2", seq2)

            if hdf5_label_file is not None:
                hdf5_label_file.create_dataset(name + "_1", data=b.label)

                if isreverse:
                    hdf5_label_file.create_dataset(name + "_2", data=b.label)

    finally:
        if hdf5_file is not None:
            hdf5_file.close()

        if hdf5_label_file is not None:
            hdf5_label_file.close()
예제 #3
0
def output_loop_npa(data,
                    ref_genome_file,
                    segment_size,
                    data_file,
                    label_file,
                    n_feature=5):
    '''
    Write encoded loop data (two boundaries per loop) to HDF5 arrays.

    Each loop contributes two consecutive rows to the 'data' dataset of
    data_file, which has shape (len(data) * 2, 2, segment_size, n_feature):
      - forward row: [encoded b1 sequence, encoded b2 sequence]
      - reverse row: [encoded reverse complement of b2,
                      encoded reverse complement of b1]
    When label_file is truthy, loop.label is stored for both rows in the
    'label' dataset of that file (per the original notes: 0 = no loop,
    1 = loop).

    data: sized iterable of loop objects exposing .b1, .b2 and .label
    ref_genome_file: passed to get_chromlist; the result is indexed by the
        chrom attribute of each boundary
    segment_size: required length of every sequence
    data_file: path of the HDF5 file to create for the encoded sequences
    label_file: path of the HDF5 file to create for labels, or a falsy value
    n_feature: size of the last axis of the 'data' dataset

    Raises Exception when a sequence (or its reverse complement) is not
    segment_size long.
    '''

    def _reverse_complement(sequence):
        # MutableSeq instances are converted with toseq() before complementing
        if type(sequence) is MutableSeq:
            return sequence.toseq().reverse_complement()
        return sequence.reverse_complement()

    chrom_list = get_chromlist(ref_genome_file)

    hdf5_file = None
    hdf5_label_file = None
    lb = None

    try:
        hdf5_file = h5py.File(data_file, mode='w')

        # Two rows per loop: forward strand and reverse strand
        n_rows = len(data) * 2

        # axes: row, boundary slot (0 or 1), position, feature
        dt = hdf5_file.create_dataset('data',
                                      (n_rows, 2, segment_size, n_feature),
                                      dtype='b1',
                                      compression="gzip",
                                      compression_opts=5)

        if label_file:
            hdf5_label_file = h5py.File(label_file, mode='w')
            lb = hdf5_label_file.create_dataset('label', (n_rows),
                                                dtype='b1',
                                                compression="gzip",
                                                compression_opts=5)

        for idx, loop in enumerate(data):
            fwd_row = 2 * idx
            rev_row = fwd_row + 1

            first, second = loop.b1, loop.b2

            first_seq = mutation_function.infer_true_seq(
                first, chrom_list[first.chrom], segment_size)
            second_seq = mutation_function.infer_true_seq(
                second, chrom_list[second.chrom], segment_size)

            if not (len(first_seq) == segment_size
                    and len(second_seq) == segment_size):
                raise Exception('length of sequences is wrong: %d, %d' %
                                (len(first_seq), len(second_seq)))

            first_code = line_to_tensor(str(first_seq).upper())
            second_code = line_to_tensor(str(second_seq).upper())

            # forward strand row
            dt[fwd_row, 0, ] = first_code
            dt[fwd_row, 1, ] = second_code
            if lb:
                lb[fwd_row] = loop.label

            # reverse strand: complement both boundaries
            first_rc = _reverse_complement(first_seq)
            second_rc = _reverse_complement(second_seq)

            if first_rc is None or second_rc is None:
                raise Exception('type sequences is wrong {}'.format(
                    type(first_seq)))

            if not (len(first_rc) == segment_size
                    and len(second_rc) == segment_size):
                raise Exception('length of sequences is wrong: %d, %d' %
                                (len(first_rc), len(second_rc)))

            first_rc_code = line_to_tensor(str(first_rc).upper())
            second_rc_code = line_to_tensor(str(second_rc).upper())

            # boundary order is swapped on the reverse strand
            dt[rev_row, 0, ] = second_rc_code
            dt[rev_row, 1, ] = first_rc_code
            if lb:
                lb[rev_row] = loop.label

    finally:
        if hdf5_file:
            hdf5_file.close()

        if hdf5_label_file:
            hdf5_label_file.close()
예제 #4
0
def output_loop(data, ref_genome_file, segment_size, data_file, label_file):
    """Write loop data (two boundaries per loop) as named HDF5 datasets.

    For every loop two datasets are created in data_file, named
    "<b1.chrom>_<b1.start>_<b1.end>_<b2.start>_<b2.end>_1" (forward) and
    "..._2" (reverse):
      - forward: np.hstack of the encoded b1 and b2 sequences
      - reverse: np.hstack of the encoded reverse complements of b2 and b1

    When label_file is truthy, loop.label is stored under the same two names
    in that file (per the original notes: 0 = no loop, 1 = loop).

    data: iterable of loop objects exposing .b1, .b2 and .label
    ref_genome_file: passed to get_chromlist; the result is indexed by the
        chrom attribute of each boundary
    segment_size: required length of every sequence
    data_file: path of the HDF5 file to create for the encoded sequences
    label_file: path of the HDF5 file to create for labels, or a falsy value

    Raises Exception when a sequence (or its reverse complement) is not
    segment_size long.
    """

    def _reverse_complement(sequence):
        # MutableSeq instances are converted with toseq() before complementing
        if type(sequence) is MutableSeq:
            return sequence.toseq().reverse_complement()
        return sequence.reverse_complement()

    chrom_list = get_chromlist(ref_genome_file)

    hdf5_file = None
    hdf5_label_file = None

    try:
        hdf5_file = h5py.File(data_file, mode='w')
        if label_file:
            hdf5_label_file = h5py.File(label_file, mode='w')

        for loop in data:
            first, second = loop.b1, loop.b2

            first_seq = mutation_function.infer_true_seq(
                first, chrom_list[first.chrom], segment_size)
            second_seq = mutation_function.infer_true_seq(
                second, chrom_list[second.chrom], segment_size)

            if not (len(first_seq) == segment_size
                    and len(second_seq) == segment_size):
                raise Exception('length of sequences is wrong: %d, %d' %
                                (len(first_seq), len(second_seq)))

            first_code = line_to_tensor(str(first_seq).upper())
            second_code = line_to_tensor(str(second_seq).upper())

            forward = np.hstack((first_code, second_code))

            # Common prefix of the dataset names written for this loop
            prefix = (first.chrom + "_" + str(first.start) + "_" +
                      str(first.end) + "_" + str(second.start) + "_" +
                      str(second.end))

            hdf5_file.create_dataset(prefix + "_1",
                                     data=forward,
                                     compression="gzip",
                                     compression_opts=5)

            # reverse strand: complement both boundaries
            first_rc = _reverse_complement(first_seq)
            second_rc = _reverse_complement(second_seq)

            if first_rc is None or second_rc is None:
                raise Exception('type sequences is wrong {}'.format(
                    type(first_seq)))

            if not (len(first_rc) == segment_size
                    and len(second_rc) == segment_size):
                raise Exception('length of sequences is wrong: %d, %d' %
                                (len(first_rc), len(second_rc)))

            first_rc_code = line_to_tensor(str(first_rc).upper())
            second_rc_code = line_to_tensor(str(second_rc).upper())

            # boundary order is swapped on the reverse strand
            backward = np.hstack((second_rc_code, first_rc_code))

            hdf5_file.create_dataset(prefix + "_2",
                                     data=backward,
                                     compression="gzip",
                                     compression_opts=5)

            if hdf5_label_file:
                for tag in ("_1", "_2"):
                    hdf5_label_file.create_dataset(prefix + tag,
                                                   data=loop.label)

    finally:
        if hdf5_file is not None:
            hdf5_file.close()

        if hdf5_label_file is not None:
            hdf5_label_file.close()