Exemplo n.º 1
0
def _hits_to_csr(rows, cols, scores, shape):
    """Build a CSR matrix of an explicit shape from coordinate-format hits."""
    # An explicit shape keeps every chunk the same width and preserves
    # all-zero rows/columns, which scipy would otherwise trim when it infers
    # the shape from the largest index present.
    return sparse.coo_matrix((scores, (rows, cols)), shape=shape).tocsr()


def main(species,
         window_size,
         motif_metadata,
         bin_sorted_hits,
         group_loci=100000):
    """Load bin-sorted motif hits into the binding dataset in chunks.

    Args:
        species: Passed through to ``DataInterface``.
        window_size: Passed through to ``DataInterface`` as ``window_size``.
        motif_metadata: Path to a header-less, tab-separated file whose three
            columns are read as ``dataset_id``, ``factor`` and ``source``.
        bin_sorted_hits: Path to a whitespace-separated text file with one hit
            per line: ``motif_id  bin_num  score``. ``bin_num`` and ``score``
            are parsed as integers and lines must be ordered by
            non-decreasing ``bin_num``.
        group_loci: Minimum number of hits to accumulate before a chunk is
            flushed. A chunk is only cut when the bin number advances, so all
            hits of one bin always land in the same chunk.

    Raises:
        Exception: If a line's ``bin_num`` is lower than the previous one.
        KeyError: If a hit refers to a motif id that is not among the
            datasets listed by ``data.list_binding_datasets(TECHNOLOGY)``.
    """
    motif_metadata = pd.read_csv(motif_metadata, sep='\t', header=None)
    motif_metadata.columns = ['dataset_id', 'factor', 'source']
    motif_metadata = motif_metadata.set_index('dataset_id')
    # NOTE(review): drop_duplicates() compares column values only (factor,
    # source) and ignores the index, so distinct dataset_ids that share both
    # values collapse into one row -- confirm this is intended.
    motif_metadata = motif_metadata.drop_duplicates()

    data = DataInterface(species,
                         window_size=window_size,
                         download_if_not_exists=False,
                         make_new=False,
                         load_genes=False)

    print(data.path)

    data.create_binding_dataset(TECHNOLOGY, motif_metadata.index.values,
                                **motif_metadata.to_dict('list'))

    # Query the dataset list once; column j of every chunk is dataset j.
    dataset_ids = data.list_binding_datasets(TECHNOLOGY)
    num_datasets = len(dataset_ids)
    id_to_idx_map = {
        dataset_id: idx for idx, dataset_id in enumerate(dataset_ids)
    }

    current_pos = 0       # bin number of the most recently read hit
    chunk_start = 0       # bin number that maps to row 0 of the open chunk
    hits_in_chunk = 0
    rows, cols, scores = [], [], []

    with open(bin_sorted_hits, 'r') as f:

        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            motif_id, bin_num, score = fields

            bin_num = int(bin_num)

            if bin_num < current_pos:
                raise Exception('Input file not sorted!')

            if bin_num > current_pos and hits_in_chunk >= group_loci:
                print('Adding matrix segment ...')
                # The chunk spans bins [chunk_start, bin_num): sizing it that
                # way keeps hit-less bins before ``bin_num`` as empty rows so
                # the next chunk starts at the right row offset.
                # NOTE(review): assumes append_csr stacks chunks row-wise.
                data.append_csr(
                    TECHNOLOGY,
                    _hits_to_csr(rows, cols, scores,
                                 (bin_num - chunk_start, num_datasets)))
                chunk_start = bin_num
                hits_in_chunk = 0
                rows, cols, scores = [], [], []

            rows.append(bin_num - chunk_start)
            cols.append(id_to_idx_map[motif_id])
            scores.append(int(score))
            current_pos = bin_num
            hits_in_chunk += 1

    if rows:
        # Final chunk: ends at the last bin that actually had a hit.
        data.append_csr(
            TECHNOLOGY,
            _hits_to_csr(rows, cols, scores,
                         (current_pos - chunk_start + 1, num_datasets)))
Exemplo n.º 2
0
def main(species, window_size, cistrome_metadata, motif_metadata, index_files):
    """Load metadata tables and create the 'Motif' binding dataset.

    Args:
        species: Passed through to ``DataInterface``.
        window_size: Passed through to ``DataInterface`` as ``window_size``.
        cistrome_metadata: Path to a tab-separated file with a ``DCid``
            column, which becomes the (string-typed) index.
        motif_metadata: Path to a header-less, tab-separated file whose three
            columns are read as ``dataset_id``, ``factor`` and ``source``.
        index_files: Not used in the portion of the function shown here.

    NOTE(review): the loaded ``cistrome_metadata`` table and ``index_files``
    are not referenced by any statement visible here -- confirm whether the
    function continues beyond this point.
    """

    cistrome_metadata = pd.read_csv(cistrome_metadata,
                                    sep='\t').set_index('DCid')
    # Normalise the index to strings.
    cistrome_metadata.index = cistrome_metadata.index.astype(str)
    motif_metadata = pd.read_csv(motif_metadata, sep='\t', header=None)
    motif_metadata.columns = ['dataset_id', 'factor', 'source']
    motif_metadata = motif_metadata.set_index('dataset_id')
    # NOTE(review): drop_duplicates() compares column values only (factor,
    # source) and ignores the index -- confirm this is intended.
    motif_metadata = motif_metadata.drop_duplicates()

    data = DataInterface(species,
                         window_size=window_size,
                         download_if_not_exists=False,
                         make_new=False,
                         load_genes=False)

    dataset_ids = motif_metadata.index.values

    # Only the ids are passed; factor/source columns are not forwarded here.
    data.create_binding_dataset('Motif', dataset_ids)