예제 #1
0
def main(species,
         window_size,
         motif_metadata,
         bin_sorted_hits,
         group_loci=100000):
    """Stream bin-sorted motif hits into a sparse binding dataset.

    Reads a tab-separated motif metadata file and a whitespace-delimited
    hits file (``motif_id bin_num score`` per line, sorted by ``bin_num``),
    then appends the hits to the DataInterface store as CSR matrix
    segments of roughly ``group_loci`` buffered rows each.

    Parameters
    ----------
    species : str
        Species identifier passed to DataInterface.
    window_size : int
        Genome bin size passed to DataInterface.
    motif_metadata : str
        Path to a headerless TSV with columns dataset_id, factor, source.
    bin_sorted_hits : str
        Path to the hits file, sorted ascending by bin number.
    group_loci : int, optional
        Minimum number of buffered rows before a matrix segment is
        flushed (default 100000).

    Raises
    ------
    Exception
        If the hits file is not sorted by bin number.
    """
    motif_metadata = pd.read_csv(motif_metadata, sep='\t', header=None)
    motif_metadata.columns = ['dataset_id', 'factor', 'source']
    motif_metadata = motif_metadata.set_index('dataset_id')
    motif_metadata = motif_metadata.drop_duplicates()

    data = DataInterface(species,
                         window_size=window_size,
                         download_if_not_exists=False,
                         make_new=False,
                         load_genes=False)

    # NOTE(review): removed leftover debug statements (``print(data.path)``
    # followed by a bare ``raise Exception()``) that made everything below
    # unreachable.

    data.create_binding_dataset(TECHNOLOGY, motif_metadata.index.values,
                                **motif_metadata.to_dict('list'))

    # Map dataset id -> column index in the binding matrix.
    id_to_idx_map = dict(
        zip(data.list_binding_datasets(TECHNOLOGY),
            np.arange(len(data.list_binding_datasets(TECHNOLOGY)))))

    current_pos = 0        # last bin number seen (sortedness check)
    last_added_chunk = 0   # bin offset of the current matrix segment
    i = 0                  # rows buffered since the last flush
    rows, cols, scores = [], [], []

    with open(bin_sorted_hits, 'r') as f:

        for line in f:
            motif_id, bin_num, score = line.strip().split()

            bin_num = int(bin_num)

            if bin_num < current_pos:
                raise Exception('Input file not sorted!')
            elif bin_num > current_pos and i >= group_loci:
                # Flush the buffered segment once we cross a bin boundary
                # with at least ``group_loci`` rows accumulated, so a bin's
                # rows are never split across two segments.
                print('Adding matrix segment ...')
                matrix_form = sparse.coo_matrix((scores, (rows, cols))).tocsr()
                data.append_csr(TECHNOLOGY, matrix_form)
                last_added_chunk = bin_num
                i = 0
                rows, cols, scores = [], [], []

            tf_idx = id_to_idx_map[motif_id]
            # Row indices are relative to the start of the current segment.
            rows.append(bin_num - last_added_chunk)
            cols.append(tf_idx)
            scores.append(int(score))
            current_pos = bin_num
            i += 1

        # Flush whatever remains after the last line.
        if len(rows) > 0:
            matrix_form = sparse.coo_matrix((scores, (rows, cols))).tocsr()
            data.append_csr(TECHNOLOGY, matrix_form)
예제 #2
0
def main(args):
    """Attach coverage profiles to the dataset, building RP maps on demand.

    Loads the cistrome metadata table, ensures the 'basic_10K' and
    'enhanced_10K' RP maps exist (building and persisting them if the
    store has none), then registers each coverage array in
    ``args.coverage_arrays`` together with its metadata.
    """
    cistrome_metadata = pd.read_csv(args.cistrome_metadata,
                                    sep='\t').set_index('DCid')
    cistrome_metadata.index = cistrome_metadata.index.astype(str)

    data = DataInterface(args.species,
                         window_size=args.window_size,
                         download_if_not_exists=False,
                         make_new=False,
                         load_genes=True)

    rp_map_styles = data.get_rp_maps()

    # Reuse stored RP maps when any exist; otherwise build and persist them.
    if len(rp_map_styles) > 0:
        basic_rp_map = data.get_rp_map('basic_10K')
        enhanced_rp_map = data.get_rp_map('enhanced_10K')
    else:
        basic_rp_map = data.build_binned_rp_map('basic', 10000)
        enhanced_rp_map = data.build_binned_rp_map('enhanced', 10000)
        data.add_rp_map('basic_10K', basic_rp_map)
        data.add_rp_map('enhanced_10K', enhanced_rp_map)

    for array_path in args.coverage_arrays:

        coverage_array = np.load(array_path)

        # File names follow ``<technology>_<dataset_id>.<ext>``; strip the
        # final extension to recover the dataset id.
        technology, dataset_file = os.path.basename(array_path).split('_')
        dataset_id = '.'.join(dataset_file.split('.')[:-1])

        headers = data.get_metadata_headers(technology)
        meta_dict = cistrome_metadata.loc[dataset_id, headers].to_dict()

        data.add_profile_data(technology, dataset_id, coverage_array,
                              [basic_rp_map, enhanced_rp_map],
                              ['basic_10K', 'enhanced_10K'], **meta_dict)
예제 #3
0
def main(species, window_size, cistrome_metadata, motif_metadata, index_files):
    """Create a 'Motif' binding dataset from the motif metadata table.

    Reads both metadata tables and registers a binding dataset keyed by
    the motif dataset ids.
    """
    # NOTE(review): the cistrome table is loaded (mirroring sibling
    # scripts) but is not used below.
    cistrome_metadata = (pd.read_csv(cistrome_metadata, sep='\t')
                         .set_index('DCid'))
    cistrome_metadata.index = cistrome_metadata.index.astype(str)

    motifs = (pd.read_csv(motif_metadata, sep='\t', header=None,
                          names=['dataset_id', 'factor', 'source'])
              .set_index('dataset_id')
              .drop_duplicates())

    data = DataInterface(species,
                         window_size=window_size,
                         download_if_not_exists=False,
                         make_new=False,
                         load_genes=False)

    data.create_binding_dataset('Motif', motifs.index.values)
예제 #4
0
def main(species, window_size, cistrome_metadata, motif_metadata, index_files):
    """Register per-dataset binding hits from bin-index files.

    Each index file is named ``<technology>_<dataset_id>.<ext>`` and holds
    one genomic bin number per line. Metadata for 'Motifs' datasets comes
    from the motif table (with source forced to 'jaspar'); all other
    technologies look up the cistrome table.
    """
    cistrome_meta = (pd.read_csv(cistrome_metadata, sep='\t')
                     .set_index('DCid'))
    cistrome_meta.index = cistrome_meta.index.astype(str)

    motif_meta = (pd.read_csv(motif_metadata, sep='\t', header=None,
                              names=['dataset_id', 'factor', 'source'])
                  .set_index('dataset_id')
                  .drop_duplicates())

    data = DataInterface(species,
                         window_size=window_size,
                         download_if_not_exists=False,
                         make_new=False,
                         load_genes=False)

    for index_path in index_files:

        with open(index_path, 'r') as handle:
            hit_bins = np.array([int(line.strip()) for line in handle])

        technology, remainder = os.path.basename(index_path).split('_')
        dataset_id = '.'.join(remainder.split('.')[:-1])

        headers = data.get_metadata_headers(technology)

        # NOTE(review): this branch tests 'Motifs' while another script in
        # this file creates datasets under 'Motif' — confirm which spelling
        # the file names actually use.
        if technology == 'Motifs':
            meta_dict = motif_meta.loc[dataset_id, headers].to_dict()
            meta_dict['source'] = 'jaspar'
        else:
            meta_dict = cistrome_meta.loc[dataset_id, headers].to_dict()

        data.add_binding_data(technology, dataset_id, hit_bins, **meta_dict)