def configure(len_size=None, genomic_distance=None, methods_name='ours',
              dataset_path=None,
              raw_path='raw',
              raw_hic='Rao2014-GM12878-DpnII-allreps-filtered.10kb.cool',
              input_path='input',
              output_path='output'):
    """Open the raw Hi-C cooler file and collect the data-preparation settings.

    Args:
        len_size: Passed through to the result and embedded in the
            input/output directory names; 40 when None.
        genomic_distance: Passed through to the result and embedded in the
            input/output directory names; 200000 when None.
        methods_name: Tag embedded in the input/output directory names.
        dataset_path: Root data directory. When None, it is the ``data``
            folder under the path returned by
            ``operations.redircwd_back_projroot(project_name='refine_resolution')``.
        raw_path: Sub-directory of ``dataset_path`` holding the raw file.
        raw_hic: File name of the raw cooler file. The first three
            '-'-separated fields and the second '.'-separated field are
            joined with '_' to form the input/output file stem, e.g.
            ``Rao2014-GM12878-DpnII-allreps-filtered.10kb.cool`` gives
            ``Rao2014_GM12878_DpnII_10kb``.
        input_path: Prefix of the input directory name.
        output_path: Prefix of the output directory name.

    Returns:
        A 10-tuple ``(cool_hic, resolution, scale, len_size,
        genomic_distance, block_size, dataset_path, [raw_path, raw_hic],
        [input_path, input_file], [output_path, output_file])`` where
        ``cool_hic`` is the ``cooler.Cooler`` opened on the raw file,
        ``resolution`` is its ``binsize``, ``scale`` is always 4 and
        ``block_size`` is always 2048.

    Raises:
        IndexError: If ``raw_hic`` has fewer than three '-'-separated
            fields or contains no '.'.
    """
    scale = 4
    block_size = 2048  # number of entries in one file
    len_size = 40 if len_size is None else len_size
    genomic_distance = 200000 if genomic_distance is None else genomic_distance

    if dataset_path is None:
        # Assume the project is laid out as
        #   pathto/proj/data
        #   pathto/proj/our_method
        project_root = operations.redircwd_back_projroot(
            project_name='refine_resolution')
        dataset_path = os.path.join(project_root, 'data')
    print('data path: ', dataset_path)

    # File stem shared by the input and output files.
    dash_fields = raw_hic.split('-')
    dot_fields = raw_hic.split('.')
    input_file = '_'.join(
        [dash_fields[0], dash_fields[1], dash_fields[2], dot_fields[1]])
    output_file = input_file

    # Both directories carry the same "<method>_<distance>_<len_size>" suffix.
    name_suffix = [methods_name, str(genomic_distance), str(len_size)]
    input_path = '_'.join([input_path] + name_suffix)
    output_path = '_'.join([output_path] + name_suffix)

    # Load the raw Hi-C matrix. The file must already be present on disk;
    # no download is attempted here.
    raw_file = os.path.join(dataset_path, raw_path, raw_hic)
    print('raw hic data: ', raw_file)
    cool_hic = cooler.Cooler(raw_file)
    resolution = cool_hic.binsize  # bin size reported by the cooler file

    return (cool_hic, resolution, scale, len_size, genomic_distance,
            block_size, dataset_path,
            [raw_path, raw_hic],
            [input_path, input_file],
            [output_path, output_file])
""" configure data: dataset_path-raw -input -output """

if __name__ == '__main__':
    # Command line: <chromosome> <len_size> <genomic_distance>

    # Rao2014 10kb cooler files. This list is not read anywhere in this
    # block; only `raw_hic` below is processed.
    raw_list = [
        'Rao2014-CH12LX-MboI-allreps-filtered.10kb.cool',
        'Rao2014-GM12878-DpnII-allreps-filtered.10kb.cool',
        'Rao2014-GM12878-MboI-allreps-filtered.10kb.cool',
        'Rao2014-HMEC-MboI-allreps-filtered.10kb.cool',
        'Rao2014-HUVEC-MboI-allreps-filtered.10kb.cool',
        'Rao2014-IMR90-MboI-allreps-filtered.10kb.cool',
        'Rao2014-K562-MboI-allreps-filtered.10kb.cool',
        'Rao2014-KBM7-MboI-allreps-filtered.10kb.cool',
        'Rao2014-NHEK-MboI-allreps-filtered.10kb.cool',
    ]

    # The returned value is unused below; the call is kept in place
    # (NOTE(review): presumably it also moves the working directory to the
    # project root -- confirm against `operations`).
    root = operations.redircwd_back_projroot(project_name='refine_resolution')

    raw_hic = 'Rao2014-GM12878-MboI-allreps-filtered.10kb.cool'
    # alternative: raw_hic = 'Rao2014-GM12878-DpnII-allreps-filtered.10kb.cool'

    # argv[2] and argv[3] are parsed, in that order, before argv[1] is read.
    cli_len_size = int(sys.argv[2])
    cli_genomic_distance = int(sys.argv[3])
    config = configure(raw_hic=raw_hic,
                       len_size=cli_len_size,
                       genomic_distance=cli_genomic_distance)

    # A single chromosome is taken from the command line; the list form
    # leaves room for processing several.
    chromosome_list = [str(sys.argv[1])]
    for chri in chromosome_list:
        save_samples(config, chromosome=chri)
if hic_lr is None: hic_lr = data['hic'] else: hic_lr = np.concatenate((hic_lr, data['hic']), axis=0) return hic_hr, hic_lr if __name__ == '__main__': # the size of input len_size = int(sys.argv[1]) # 40, 128, 200 scale = 4 # genomic_disstance is used for input path, nothing to do with model genomic_distance = int(sys.argv[2]) # 2000000, 2560000 EPOCHS = 300 BATCH_SIZE = 9 root_path = redircwd_back_projroot(project_name='refine_resolution') data_path = os.path.join(root_path, 'data') # raw_hic = 'Rao2014-GM12878-DpnII-allreps-filtered.10kb.cool' raw_hic = 'Rao2014-GM12878-MboI-allreps-filtered.10kb.cool' input_path = '_'.join(['input', 'ours', str(genomic_distance), str(len_size)]) input_file = raw_hic.split('-')[0] + '_' + raw_hic.split('-')[1] + '_' + raw_hic.split('-')[2] + '_' + raw_hic.split('.')[1] log_dir = os.path.join(root_path, 'our_model', 'logs', 'model') logging.basicConfig(filename=os.path.join(log_dir, 'training.log'), level=logging.INFO) # ['1', '2', '3', '4', '5','6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16'] # ['17', '18'] # ['19', '20', '21', '22', 'X'] train_chr_list = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16'] valid_chr_list = ['17', '18']