def main(args):
    """Apply one trained model to every chromosome of a contact map.

    The input at ``args.path`` is treated as a .hic file when
    ``utils.read_hic_header`` returns a header, and is opened with
    ``cooler.Cooler`` otherwise. For each chromosome an intra-chromosomal
    sparse matrix is wrapped in ``scoreUtils.Chromosome``; its ``score()``
    result is passed to ``writeBed`` together with ``args.output``.

    Attributes read from ``args``: ``output`` (directory, created if
    missing), ``model`` (file loaded with joblib), ``path``, ``resolution``,
    ``balance``, ``lower``, ``upper`` and ``width``.
    """
    import pathlib

    import numpy as np
    try:
        # Original import path, tried first so that environments where it
        # still works behave exactly as before.
        from sklearn.externals import joblib
    except ImportError:
        # scikit-learn >= 0.23 no longer ships sklearn.externals.joblib;
        # the standalone package is a hard dependency of scikit-learn.
        import joblib

    from peakachu import scoreUtils, utils

    np.seterr(divide='ignore', invalid='ignore')

    pathlib.Path(args.output).mkdir(parents=True, exist_ok=True)

    # NOTE(review): joblib.load unpickles the file, so only trusted model
    # files should ever be passed here.
    model = joblib.load(args.model)

    # more robust to check if a file is .hic
    hic_info = utils.read_hic_header(args.path)
    hic = hic_info is not None

    if hic:
        chromosomes = utils.get_hic_chromosomes(args.path, args.resolution)
    else:
        import cooler
        Lib = cooler.Cooler(args.path)
        chromosomes = Lib.chromnames[:]

    # Only consulted for .hic input; cooler takes args.balance directly.
    normalization = 'KR' if args.balance else 'NONE'

    for key in chromosomes:
        # `key` is the label used inside the file; `cname` is the label
        # handed to Chromosome and always carries the "chr" prefix.
        cname = key if key.startswith('chr') else 'chr' + key

        if hic:
            contacts = utils.csr_contact_matrix(
                normalization, args.path, key, key, 'BP', args.resolution)
        else:
            contacts = Lib.matrix(
                balance=args.balance, sparse=True).fetch(key).tocsr()

        X = scoreUtils.Chromosome(contacts,
                                  model=model,
                                  cname=cname,
                                  lower=args.lower,
                                  upper=args.upper,
                                  res=args.resolution,
                                  width=args.width)
        result, R = X.score()
        X.writeBed(args.output, result, R)
def main(args):
    """Score the single chromosome named by the model file.

    The chromosome is derived from the base name of ``args.model``: the part
    before the first ``'.pk'`` is taken, a leading ``'chr'`` is removed, and
    the prefix returned by ``utils.find_chrom_pre`` is prepended so the name
    matches the labels used inside the contact file. The resulting matrix is
    wrapped in ``scoreUtils.Chromosome``; its ``score()`` result is passed
    to ``writeBed`` together with ``args.output``.

    Attributes read from ``args``: ``output`` (directory, created if
    missing), ``model``, ``path``, ``resolution``, ``balance``, ``lower``,
    ``upper`` and ``width``.
    """
    import os
    import pathlib

    import numpy as np
    try:
        # Original import path, tried first so that environments where it
        # still works behave exactly as before.
        from sklearn.externals import joblib
    except ImportError:
        # scikit-learn >= 0.23 no longer ships sklearn.externals.joblib.
        import joblib

    from peakachu import scoreUtils, utils

    def strip_chr_prefix(name):
        # str.lstrip('chr') removes any leading run of the characters
        # 'c', 'h' and 'r' (so 'rDNA' would become 'DNA'); remove the
        # literal prefix only.
        if name.startswith('chr'):
            return name[len('chr'):]
        return name

    np.seterr(divide='ignore', invalid='ignore')

    pathlib.Path(args.output).mkdir(parents=True, exist_ok=True)

    # NOTE(review): joblib.load unpickles the file, so only trusted model
    # files should ever be passed here.
    model = joblib.load(args.model)

    # more robust to check if a file is .hic
    hic_info = utils.read_hic_header(args.path)
    hic = hic_info is not None

    if hic:
        chromosomes = list(hic_info['chromsizes'])
    else:
        import cooler
        Lib = cooler.Cooler(args.path)
        chromosomes = Lib.chromnames[:]

    # presumably 'chr' or '' depending on how the file labels chromosomes
    # -- TODO confirm against utils.find_chrom_pre
    pre = utils.find_chrom_pre(chromosomes)

    model_file = os.path.split(args.model)[1]  # support full path
    # ccname is consistent with chromosome labels in .hic / .cool
    ccname = pre + strip_chr_prefix(model_file.split('.pk')[0])
    # cikada always has prefix "chr"
    cikada = 'chr' + strip_chr_prefix(ccname)

    if hic:
        normalization = 'KR' if args.balance else 'NONE'
        contacts = utils.csr_contact_matrix(
            normalization, args.path, ccname, ccname, 'BP', args.resolution)
    else:
        contacts = Lib.matrix(
            balance=args.balance, sparse=True).fetch(ccname).tocsr()

    X = scoreUtils.Chromosome(contacts,
                              model=model,
                              cname=cikada,
                              lower=args.lower,
                              upper=args.upper,
                              res=args.resolution,
                              width=args.width)
    result, R = X.score()
    X.writeBed(args.output, result, R)
def main(args):
    """Train one model per chromosome, each excluding that chromosome.

    Positive coordinates come from ``trainUtils.parsebed(args.bedpe, ...)``
    and negatives from ``trainUtils.negative_generating``. Feature matrices
    are built per chromosome with ``trainUtils.buildmatrix``. For every
    chromosome a model is then trained with ``trainUtils.trainRF`` on the
    features of all *other* chromosomes and written to
    ``<args.output>/<chromname>.pkl`` (xz-compressed, level 3).

    Attributes read from ``args``: ``output`` (directory, created if
    missing), ``path``, ``bedpe``, ``resolution``, ``balance`` and
    ``width``.
    """
    import pathlib

    import numpy as np
    try:
        # Original import path, tried first so that environments where it
        # still works behave exactly as before.
        from sklearn.externals import joblib
    except ImportError:
        # scikit-learn >= 0.23 no longer ships sklearn.externals.joblib.
        import joblib

    from peakachu import trainUtils, utils

    np.seterr(divide='ignore', invalid='ignore')

    out_dir = pathlib.Path(args.output)
    out_dir.mkdir(parents=True, exist_ok=True)

    # more robust to check if a file is .hic
    hic_info = utils.read_hic_header(args.path)
    hic = hic_info is not None

    coords = trainUtils.parsebed(args.bedpe, lower=2, res=args.resolution)
    kde, lower, long_start, long_end = trainUtils.learn_distri_kde(coords)

    if hic:
        chromosomes = utils.get_hic_chromosomes(args.path, args.resolution)
    else:
        import cooler
        Lib = cooler.Cooler(args.path)
        chromosomes = Lib.chromnames[:]

    # Only consulted for .hic input; cooler takes args.balance directly.
    normalization = 'KR' if args.balance else 'NONE'

    # train model per chromosome
    positive_class = {}
    negative_class = {}
    for key in chromosomes:
        # `key` is the label used inside the file; `chromname` always
        # carries the "chr" prefix and is the key used for `coords`.
        chromname = key if key.startswith('chr') else 'chr' + key
        print('collecting from {}'.format(key))

        if hic:
            X = utils.csr_contact_matrix(
                normalization, args.path, key, key, 'BP', args.resolution)
        else:
            X = Lib.matrix(balance=args.balance,
                           sparse=True).fetch(key).tocsr()

        clist = coords[chromname]

        try:
            # np.vstack needs a real sequence: generators have been
            # deprecated since NumPy 1.16 and are rejected by newer
            # releases, which previously made every chromosome "fail".
            positives = np.vstack(
                list(trainUtils.buildmatrix(X, clist, width=args.width)))
            neg_coords = trainUtils.negative_generating(
                X, kde, clist, lower, long_start, long_end)
            stop = len(clist)
            negatives = np.vstack(
                list(trainUtils.buildmatrix(X, neg_coords,
                                            width=args.width,
                                            positive=False,
                                            stop=stop)))
        except Exception as err:
            # Best effort: a chromosome without usable features is skipped,
            # but Ctrl-C / SystemExit are no longer swallowed and the
            # reason is reported.
            print(chromname, ' failed to gather fts')
            print('  reason: {!r}'.format(err))
        else:
            # Store both classes or neither, so a partial failure cannot
            # leave a chromosome with positives but no negatives.
            positive_class[chromname] = positives
            negative_class[chromname] = negatives

    for key in chromosomes:
        chromname = key if key.startswith('chr') else 'chr' + key

        # Hold out the current chromosome; train on all the others.
        Xtrain = np.vstack(
            [v for k, v in positive_class.items() if k != chromname])
        Xfake = np.vstack(
            [v for k, v in negative_class.items() if k != chromname])
        print(chromname, 'pos/neg: ', Xtrain.shape[0], Xfake.shape[0])

        model = trainUtils.trainRF(Xtrain, Xfake)

        joblib.dump(model,
                    str(out_dir / (chromname + '.pkl')),
                    compress=('xz', 3))