def main():
    """Predict which chromosome Hi-C bins belong to (command-line entry point).

    Parses the command line, loads the interaction matrix given by ``-in``
    restricted to the placed chromosomes (``-pc``), zeroes its diagonal, and
    then runs either or both of:

    * ``-cv``: for every half-window size in ``-v``, predict each bin's
      chromosome with a model fitted without the bins of the same chromosome
      lying within that distance of it; results are written to
      ``<out>_cvpred_v<v>.tab``.
    * ``-p``: fit a model on all placed bins, reload the matrix without the
      chromosome filter and predict a chromosome for every contig that is
      neither placed nor excluded (``-x``); results are written to
      ``<out>_predictions.tab``.

    Progress messages are written to stderr. Exits through argparse when the
    required ``-in``/``-out`` options are missing.
    """
    parser = argparse.ArgumentParser(
        description='Description',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-in', help='Hi-C interaction matrix',
                        dest='infile', type=str, required=True)
    parser.add_argument('-out', help='prefix for output files',
                        dest='outfile', type=str, required=True)
    parser.add_argument('-cv', help='evaluate by cross validation',
                        dest='cv', action='store_true')
    parser.add_argument('-p', help='predict chromosome of unplace contigs',
                        dest='predict_unplaced', action='store_true')
    parser.add_argument('-v',
                        help='List of leave-out half-window sizes for CV (in bps)',
                        dest='v_list', nargs='+', type=float,
                        default=[0, 0.5e6, 1e6, 2e6, 5e6, 10e6])
    parser.add_argument('-x', help='excluded chrs', dest='excluded_chrs',
                        nargs='+', type=str, default=['chrM', 'chrY'])
    parser.add_argument('-pc', help='placed chrs', dest='placed_chrs',
                        nargs='+', type=str,
                        default=['chr1', 'chr2', 'chr3', 'chr4', 'chr5',
                                 'chr6', 'chr7', 'chr8', 'chr9', 'chr10',
                                 'chr11', 'chr12', 'chr13', 'chr14', 'chr15',
                                 'chr16', 'chr17', 'chr18', 'chr19', 'chr20',
                                 'chr21', 'chr22', 'chrX'])

    args = parser.parse_args()

    infile = args.infile
    outfile = args.outfile
    cv = args.cv
    predict_unplaced = args.predict_unplaced
    v_list = args.v_list
    excluded_chrs = args.excluded_chrs
    placed_chrs = args.placed_chrs

    sys.stderr.write("Loading data\n")

    d, bin_chr, bin_position = triangulation.load_data_txt(
        infile, remove_nans=True, chrs=placed_chrs)
    bin_mean_position = np.mean(bin_position, 1)
    chrs = np.unique(bin_chr)

    n = d.shape[0]

    # ignore self-interactions
    d[np.diag_indices(n)] = 0

    if cv:
        sys.stderr.write("Evaluating in cross-validation\n")

        # reduced once over all bins; the left-out part is subtracted per bin
        d_sum = triangulation.func_reduce(d, bin_chr, func=np.sum).T

        # small slack added to the window bounds used in the float comparisons
        eps = 1e-8

        for v in v_list:
            sys.stderr.write("leaving out bins within " + str(v) + " bps\n")

            predicted_chr = []
            predicted_prob = []

            for i in range(n):
                # bins on the same chromosome within +-v of bin i are left out
                proximal_bins = (
                    (bin_chr == bin_chr[i])
                    & (bin_mean_position >= bin_mean_position[i] - v - eps)
                    & (bin_mean_position <= bin_mean_position[i] + v + eps))
                retained = ~proximal_bins

                train_vectors = d_sum.copy()
                train_vectors -= triangulation.func_reduce(
                    d[proximal_bins, :], bin_chr[proximal_bins],
                    func=np.sum, allkeys=chrs).T
                # Count the retained bins per chromosome. The vector of ones
                # must have one entry per retained bin so that it lines up
                # with the key array passed next to it.
                train_vectors /= triangulation.func_reduce(
                    np.ones(np.sum(retained)), bin_chr[retained],
                    func=np.sum, allkeys=chrs).T
                train_vectors = train_vectors[retained, :]
                train_labels = bin_chr[retained]

                model = triangulation.AugmentationChrPredModel()
                model.fit(train_vectors, train_labels)

                test_vector = triangulation.average_reduce(
                    d[i, retained], bin_chr[retained])

                pred_chr, pred_prob = model.predict(test_vector)
                predicted_chr.append(pred_chr[0])
                predicted_prob.append(pred_prob[0])

            predicted_chr = np.array(predicted_chr)
            predicted_prob = np.array(predicted_prob)

            # One row per bin, laid out like the -p output below. A plain list
            # of these arrays is ragged (bin_position is 2-D) and cannot be
            # written by savetxt.
            res = np.c_[bin_chr, bin_position.astype(int),
                        predicted_chr, predicted_prob]
            np.savetxt(outfile + '_cvpred_v' + str(v) + '.tab', res,
                       fmt='%s', delimiter='\t')

    if predict_unplaced:
        sys.stderr.write("predicting chromosome of unplaced contigs\n")

        # train on all data (without diagonal)
        model = triangulation.AugmentationChrPredModel()
        d_avg = triangulation.average_reduce(d, bin_chr).T
        model.fit(d_avg, bin_chr)

        # reload without the chromosome filter so the other contigs are present
        d, bin_chr, bin_position = triangulation.load_data_txt(
            infile, remove_nans=True)
        chrs = np.unique(bin_chr)

        # Names that are neither placed nor excluded. Built as a sorted list:
        # handing a Python set to numpy yields a single object element rather
        # than an array of names, so the mask below would never match.
        unplaced_chrs = sorted(
            set(bin_chr) - set(placed_chrs) - set(excluded_chrs))
        unplaced_chr_bins = np.isin(bin_chr, unplaced_chrs)

        d = d[unplaced_chr_bins, :]

        d_avg = triangulation.average_reduce(d.T, bin_chr).T

        # keep only the columns that belong to placed chromosomes
        d_avg = d_avg[:, np.isin(chrs, placed_chrs)]

        pred_pos, pred_prob = model.predict(d_avg)

        res = np.c_[bin_chr[unplaced_chr_bins],
                    bin_position[unplaced_chr_bins, :].astype(int),
                    pred_pos, pred_prob]

        np.savetxt(outfile + '_predictions.tab', res, fmt='%s', delimiter='\t')
def main():
    """Predict which chromosome Hi-C bins belong to (command-line entry point).

    Parses the command line, loads the interaction matrix given by ``-in``
    restricted to the placed chromosomes (``-pc``), zeroes its diagonal, and
    then runs either or both of:

    * ``-cv``: for every half-window size in ``-v``, predict each bin's
      chromosome with a model fitted without the bins of the same chromosome
      lying within that distance of it; results are written to
      ``<out>_cvpred_v<v>.tab``.
    * ``-p``: fit a model on all placed bins, reload the matrix without the
      chromosome filter and predict a chromosome for every contig that is
      neither placed nor excluded (``-x``); results are written to
      ``<out>_predictions.tab``.

    Progress messages are written to stderr. Exits through argparse when the
    required ``-in``/``-out`` options are missing.
    """
    parser = argparse.ArgumentParser(
        description='Description',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-in',
                        help='Hi-C interaction matrix',
                        dest='infile',
                        type=str,
                        required=True)
    parser.add_argument('-out',
                        help='prefix for output files',
                        dest='outfile',
                        type=str,
                        required=True)
    parser.add_argument('-cv',
                        help='evaluate by cross validation',
                        dest='cv',
                        action='store_true')
    parser.add_argument('-p',
                        help='predict chromosome of unplace contigs',
                        dest='predict_unplaced',
                        action='store_true')
    parser.add_argument(
        '-v',
        help='List of leave-out half-window sizes for CV (in bps)',
        dest='v_list',
        nargs='+',
        type=float,
        default=[0, 0.5e6, 1e6, 2e6, 5e6, 10e6])
    parser.add_argument('-x',
                        help='excluded chrs',
                        dest='excluded_chrs',
                        nargs='+',
                        type=str,
                        default=['chrM', 'chrY'])
    parser.add_argument('-pc',
                        help='placed chrs',
                        dest='placed_chrs',
                        nargs='+',
                        type=str,
                        default=[
                            'chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6',
                            'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12',
                            'chr13', 'chr14', 'chr15', 'chr16', 'chr17',
                            'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX'
                        ])

    args = parser.parse_args()

    infile = args.infile
    outfile = args.outfile
    cv = args.cv
    predict_unplaced = args.predict_unplaced
    v_list = args.v_list
    excluded_chrs = args.excluded_chrs
    placed_chrs = args.placed_chrs

    sys.stderr.write("Loading data\n")

    d, bin_chr, bin_position = triangulation.load_data_txt(infile,
                                                           remove_nans=True,
                                                           chrs=placed_chrs)
    bin_mean_position = np.mean(bin_position, 1)
    chrs = np.unique(bin_chr)

    n = d.shape[0]

    # ignore self-interactions
    d[np.diag_indices(n)] = 0

    if cv:

        sys.stderr.write("Evaluating in cross-validation\n")

        # reduced once over all bins; the left-out part is subtracted per bin
        d_sum = triangulation.func_reduce(d, bin_chr, func=np.sum).T

        # small slack added to the window bounds used in the float comparisons
        eps = 1e-8

        for v in v_list:
            sys.stderr.write("leaving out bins within " + str(v) + " bps\n")

            predicted_chr = []
            predicted_prob = []

            for i in range(n):

                # bins on the same chromosome within +-v of bin i are left out
                proximal_bins = (bin_chr == bin_chr[i]) & (
                    bin_mean_position >= bin_mean_position[i] - v - eps) & (
                        bin_mean_position <= bin_mean_position[i] + v + eps)
                retained = ~proximal_bins

                train_vectors = d_sum.copy()
                train_vectors -= triangulation.func_reduce(
                    d[proximal_bins, :],
                    bin_chr[proximal_bins],
                    func=np.sum,
                    allkeys=chrs).T
                # Count the retained bins per chromosome. The vector of ones
                # must have one entry per retained bin so that it lines up
                # with the key array passed next to it.
                train_vectors /= triangulation.func_reduce(
                    np.ones(np.sum(retained)),
                    bin_chr[retained],
                    func=np.sum,
                    allkeys=chrs).T
                train_vectors = train_vectors[retained, :]
                train_labels = bin_chr[retained]

                model = triangulation.AugmentationChrPredModel()

                model.fit(train_vectors, train_labels)

                test_d = d[i, retained]
                test_bin_chr = bin_chr[retained]

                test_vector = triangulation.average_reduce(
                    test_d, test_bin_chr)

                pred_chr, pred_prob = model.predict(test_vector)
                predicted_chr.append(pred_chr[0])
                predicted_prob.append(pred_prob[0])

            predicted_chr = np.array(predicted_chr)
            predicted_prob = np.array(predicted_prob)

            # One row per bin, laid out like the -p output below. A plain list
            # of these arrays is ragged (bin_position is 2-D) and cannot be
            # written by savetxt.
            res = np.c_[bin_chr,
                        bin_position.astype(int), predicted_chr,
                        predicted_prob]
            np.savetxt(outfile + '_cvpred_v' + str(v) + '.tab',
                       res,
                       fmt='%s',
                       delimiter='\t')

    if predict_unplaced:

        sys.stderr.write("predicting chromosome of unplaced contigs\n")

        # train on all data (without diagonal)
        model = triangulation.AugmentationChrPredModel()

        d_avg = triangulation.average_reduce(d, bin_chr).T

        model.fit(d_avg, bin_chr)

        # reload without the chromosome filter so the other contigs are present
        d, bin_chr, bin_position = triangulation.load_data_txt(
            infile, remove_nans=True)

        chrs = np.unique(bin_chr)

        # Names that are neither placed nor excluded. Built as a sorted list:
        # handing a Python set to numpy yields a single object element rather
        # than an array of names, so the mask below would never match.
        unplaced_chrs = sorted(
            set(bin_chr) - set(placed_chrs) - set(excluded_chrs))

        unplaced_chr_bins = np.isin(bin_chr, unplaced_chrs)

        d = d[unplaced_chr_bins, :]

        d_avg = triangulation.average_reduce(d.T, bin_chr).T

        # keep only the columns that belong to placed chromosomes
        d_avg = d_avg[:, np.isin(chrs, placed_chrs)]

        pred_pos, pred_prob = model.predict(d_avg)

        res = np.c_[bin_chr[unplaced_chr_bins],
                    bin_position[unplaced_chr_bins, :].astype(int), pred_pos,
                    pred_prob]

        np.savetxt(outfile + '_predictions.tab', res, fmt='%s', delimiter='\t')
# Example #3
# 0
def main():
    """Scaffold contigs de novo from an interaction matrix (CLI entry point).

    Steps, in order:

    1. Parse the command line; ``-best`` without ``-realpos`` is rejected
       immediately, before any data is loaded.
    2. Load the interaction matrix named by ``-in`` via ``tr.load_data_txt``.
    3. If ``-realpos`` is given, read the tab-separated
       ``contig<TAB>start<TAB>end`` file and derive a real position per bin;
       unless ``-keep_unreal`` is set, bins without one are dropped.
    4. Average rows/columns that share a contig id.
    5. With ``-best``, sort contigs by real position and disable shuffling.
    6. Run ``tr.assemble_chromosome`` and write ``<out>_predpos.tab``
       (plus ``<out>_predpos.png`` when real positions are known),
       ``<out>_pos_all.tab``, ``<out>_x0_all.tab``, ``<out>_fvals_all.tab``
       and ``<out>_scales_all.tab``.

    Exits via ``sys.exit`` with a message when ``-best`` is used without
    ``-realpos`` or when some contig lacks a real position under ``-best``.
    """
    parser = argparse.ArgumentParser(
        description=
        'Scaffold chromosome de novo from contig interaction matrix.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-in',
                        help='interaction frequency matrix file',
                        dest='in_file',
                        type=str,
                        required=True)
    parser.add_argument('-out',
                        help='out file prefix',
                        dest='out_file',
                        type=str,
                        required=True)
    parser.add_argument('-it',
                        help='number of times to rerun L-BFGS',
                        dest='iterations',
                        type=int,
                        default=1)
    parser.add_argument('-p',
                        help='number of processors to use',
                        dest='pnum',
                        type=int,
                        default=0)
    parser.add_argument('-seed',
                        help='seed for L-BFGS init',
                        dest='init_seed',
                        type=int,
                        default=0)
    parser.add_argument('-shuffle_seed',
                        help='seed for shuffle',
                        dest='shuffle_seed',
                        type=int,
                        default=0)
    parser.add_argument(
        '-realpos',
        help=
        'file with actual contig positions (sorted same as interaction matrix). "contig\tstart\tend"',
        dest='realposfile',
        type=str,
        default=None)
    parser.add_argument(
        '-best',
        help='sort by original positions to estimate best solution',
        dest='sort_by_realpos',
        action='store_true')
    parser.add_argument(
        '-drop',
        help=
        'leaves every nth bin in the data,  ignoring the rest. 1 will use the whole dataset',
        dest='drop',
        type=int,
        default=1)
    parser.add_argument(
        '-keep_unreal',
        help='keep contigs for which real position is not known',
        dest='keep_unreal',
        action='store_true')

    parser.add_argument('-lbfgs_pgtol',
                        help='pgtol for lbfgs',
                        dest='lbfgs_pgtol',
                        type=float,
                        default=1e-9)
    parser.add_argument('-lbfgs_factr',
                        help='factr for lbfgs',
                        dest='lbfgs_factr',
                        type=float,
                        default=1e4)
    parser.add_argument('-lbfgs_show',
                        help='show lbfgs iterations (only with pnum = 1)',
                        dest='lbfgs_show',
                        action='store_true')

    args = parser.parse_args()

    in_file = args.in_file
    out_file = args.out_file
    pnum = args.pnum
    iterations = args.iterations
    init_seed = args.init_seed
    shuffle_seed = args.shuffle_seed
    sort_by_realpos = args.sort_by_realpos
    drop = args.drop
    lbfgs_pgtol = args.lbfgs_pgtol
    lbfgs_factr = args.lbfgs_factr
    lbfgs_show = args.lbfgs_show

    realposfile = args.realposfile
    keep_unreal = args.keep_unreal
    have_realpos = realposfile is not None

    # Reject an unusable option combination before loading anything.
    if sort_by_realpos and not have_realpos:
        sys.exit('-best requires -realpos')

    logger("loading interactions from %s ..." % in_file)
    # NOTE(review): an empty list is passed as the chromosome filter;
    # presumably the loader treats that as "no filtering" - confirm.
    chrs = []
    d, bin_chr, bin_position = tr.load_data_txt(in_file,
                                                retain=drop,
                                                remove_nans=True,
                                                rename=True,
                                                chrs=chrs)
    logger(" loaded matrix with %s contigs." % d.shape[0])

    if have_realpos:
        logger("loading real positions from %s ..." % realposfile)
        contig_pos_dict = {}
        with open(realposfile, "r") as fh:
            for line in fh:
                line = line.rstrip("\n")
                if not line.strip():
                    # tolerate blank lines such as a trailing newline
                    continue
                c_name, c_start, c_end = line.split("\t")
                contig_pos_dict[c_name] = (float(c_start), float(c_end))

        # contigs missing from the file get NaN so they can be detected below
        realpos = np.array(
            [contig_pos_dict.get(i, (np.nan, np.nan)) for i in bin_chr])
        # contig start plus the bin's mean position
        realpos = realpos[:, 0] + np.mean(bin_position, 1)

        if not keep_unreal:
            logger("removing contigs without real positions...")

            relevant = ~np.isnan(realpos)
            realpos = realpos[relevant]
            d = d[relevant, :][:, relevant]
            bin_chr = bin_chr[relevant]

            logger(" %s contigs left." % d.shape[0])

    # average contigs that share the same id
    logger("averaging contigs that share the same id...")
    d = tr.average_reduce_2d(d, bin_chr)

    if have_realpos:
        realpos = tr.average_reduce(realpos, bin_chr)

    bin_chr = np.unique(bin_chr)
    logger(" %s contigs left." % d.shape[0])

    shuffle = True
    if sort_by_realpos:
        if np.any(np.isnan(realpos)):
            sys.exit(
                '-best requires real positions to be given for ALL contigs')

        rr = np.argsort(realpos)
        realpos = realpos[rr]
        d = d[rr, :][:, rr]
        bin_chr = bin_chr[rr]
        shuffle = False

    logger("scaffolding %s contigs ..." % d.shape[0])
    logger(" running %s optimisations in %s threads ..." % (iterations, pnum))
    scales, pos, x0, fvals = tr.assemble_chromosome(d,
                                                    pnum,
                                                    iterations,
                                                    shuffle,
                                                    shuffle_seed,
                                                    init_seed,
                                                    return_all=True,
                                                    log_data=True,
                                                    lbfgs_factr=lbfgs_factr,
                                                    lbfgs_pgtol=lbfgs_pgtol,
                                                    approx_grad=False,
                                                    lbfgs_show=lbfgs_show)

    logger("saving results ...")
    if have_realpos:
        # call form works on both Python 2 and Python 3
        print(pos)
        np.savetxt(out_file + '_predpos.tab',
                   np.rec.fromarrays([bin_chr, realpos, pos[0, :]]),
                   fmt='%s',
                   delimiter='\t')
        # plot
        plt.plot(realpos, pos[0, :], 'b.')
        plt.xlabel("Expected position")
        plt.ylabel("Predicted position")
        plt.savefig(out_file + '_predpos.png')
    else:
        np.savetxt(out_file + '_predpos.tab',
                   np.rec.fromarrays([bin_chr, pos[0, :]]),
                   fmt='%s',
                   delimiter='\t')
    np.savetxt(out_file + '_pos_all.tab', pos, fmt='%s', delimiter='\t')
    np.savetxt(out_file + '_x0_all.tab', x0, fmt='%s', delimiter='\t')
    np.savetxt(out_file + '_fvals_all.tab', fvals, fmt='%s', delimiter='\t')
    np.savetxt(out_file + '_scales_all.tab', scales, fmt='%s', delimiter='\t')
    logger(" done.")
def main():
    """Scaffold contigs de novo from an interaction matrix (CLI entry point).

    Steps, in order:

    1. Parse the command line; ``-best`` without ``-realpos`` is rejected
       immediately, before any data is loaded.
    2. Load the interaction matrix named by ``-in`` via
       ``triangulation.load_data_txt``.
    3. If ``-realpos`` is given, read the tab-separated
       ``contig<TAB>start<TAB>end`` file and derive a real position per bin;
       unless ``-keep_unreal`` is set, bins without one are dropped.
    4. Average rows/columns that share a contig id.
    5. With ``-best``, sort contigs by real position and disable shuffling.
    6. Run ``triangulation.assemble_chromosome`` and write
       ``<out>_predpos.tab``, ``<out>_pos_all.tab``, ``<out>_x0_all.tab``,
       ``<out>_fvals_all.tab`` and ``<out>_scales_all.tab``.

    Progress messages are written to stderr. Exits via ``sys.exit`` with a
    message when ``-best`` is used without ``-realpos`` or when some contig
    lacks a real position under ``-best``.
    """
    parser = argparse.ArgumentParser(
        description='Scaffold chromosome de novo from contig interaction matrix.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-in', help='interaction frequency matrix file',
                        dest='in_file', type=str, required=True)
    parser.add_argument('-out', help='out file prefix',
                        dest='out_file', type=str, required=True)
    parser.add_argument('-it', help='number of times to rerun L-BFGS',
                        dest='iterations', type=int, default=1)
    parser.add_argument('-p', help='number of processors to use',
                        dest='pnum', type=int, default=0)
    parser.add_argument('-seed', help='seed for L-BFGS init',
                        dest='init_seed', type=int, default=0)
    parser.add_argument('-shuffle_seed', help='seed for shuffle',
                        dest='shuffle_seed', type=int, default=0)
    parser.add_argument(
        '-realpos',
        help='file with actual contig positions (sorted same as interaction matrix). "contig\tstart\tend"',
        dest='realposfile', type=str, default=None)
    parser.add_argument(
        '-best',
        help='sort by original positions to estimate best solution',
        dest='sort_by_realpos', action='store_true')
    parser.add_argument(
        '-drop',
        help='leaves every nth bin in the data, ignoring the rest. 1 will use the whole dataset',
        dest='drop', type=int, default=1)
    parser.add_argument(
        '-keep_unreal',
        help='keep contigs for which real position is not known',
        dest='keep_unreal', action='store_true')

    parser.add_argument('-lbfgs_pgtol', help='pgtol for lbfgs',
                        dest='lbfgs_pgtol', type=float, default=1e-9)
    parser.add_argument('-lbfgs_factr', help='factr for lbfgs',
                        dest='lbfgs_factr', type=float, default=1e4)
    parser.add_argument('-lbfgs_show',
                        help='show lbfgs iterations (only with pnum=1)',
                        dest='lbfgs_show', action='store_true')

    args = parser.parse_args()

    in_file = args.in_file
    out_file = args.out_file
    pnum = args.pnum
    iterations = args.iterations
    init_seed = args.init_seed
    shuffle_seed = args.shuffle_seed
    sort_by_realpos = args.sort_by_realpos
    drop = args.drop
    lbfgs_pgtol = args.lbfgs_pgtol
    lbfgs_factr = args.lbfgs_factr
    lbfgs_show = args.lbfgs_show

    realposfile = args.realposfile
    keep_unreal = args.keep_unreal
    have_realpos = realposfile is not None

    # Reject an unusable option combination before loading anything.
    if sort_by_realpos and not have_realpos:
        sys.exit('-best requires -realpos')

    sys.stderr.write("loading interactions from " + in_file + " ...\n")

    d, bin_chr, bin_position = triangulation.load_data_txt(
        in_file, retain=drop, remove_nans=True)

    sys.stderr.write("loaded matrix with " + str(d.shape[0]) + " contigs.\n")

    if have_realpos:
        sys.stderr.write("loading real positions from " + realposfile + " ...\n")

        contig_pos_dict = {}
        with open(realposfile, "r") as fh:
            for line in fh:
                line = line.rstrip("\n")
                if not line.strip():
                    # tolerate blank lines such as a trailing newline
                    continue
                c_name, c_start, c_end = line.split("\t")
                contig_pos_dict[c_name] = (float(c_start), float(c_end))

        # contigs missing from the file get NaN so they can be detected below
        realpos = np.array(
            [contig_pos_dict.get(i, (np.nan, np.nan)) for i in bin_chr])
        # contig start plus the bin's mean position
        realpos = realpos[:, 0] + np.mean(bin_position, 1)

        if not keep_unreal:
            sys.stderr.write("removing contigs without real positions...\n")

            relevant = ~np.isnan(realpos)
            realpos = realpos[relevant]
            d = d[relevant, :][:, relevant]
            bin_chr = bin_chr[relevant]

            sys.stderr.write(str(d.shape[0]) + " contigs left.\n")

    # average contigs that share the same id
    sys.stderr.write("averaging contigs that share the same id...\n")

    d = triangulation.average_reduce_2d(d, bin_chr)

    if have_realpos:
        realpos = triangulation.average_reduce(realpos, bin_chr)

    bin_chr = np.unique(bin_chr)

    sys.stderr.write(str(d.shape[0]) + " contigs left.\n")

    shuffle = True
    if sort_by_realpos:
        if np.any(np.isnan(realpos)):
            sys.exit('-best requires real positions to be given for ALL contigs')

        rr = np.argsort(realpos)
        realpos = realpos[rr]
        d = d[rr, :][:, rr]
        bin_chr = bin_chr[rr]
        shuffle = False

    sys.stderr.write("scaffolding " + str(d.shape[0]) + " contigs ...\n")

    scales, pos, x0, fvals = triangulation.assemble_chromosome(
        d,
        pnum=pnum,
        iterations=iterations,
        shuffle=shuffle,
        return_all=True,
        shuffle_seed=shuffle_seed,
        init_seed=init_seed,
        log_data=True,
        lbfgs_factr=lbfgs_factr,
        lbfgs_pgtol=lbfgs_pgtol,
        approx_grad=False,
        lbfgs_show=lbfgs_show)

    sys.stderr.write("saving results ...\n")

    if have_realpos:
        predpos_columns = [bin_chr, realpos, pos[0, :]]
    else:
        predpos_columns = [bin_chr, pos[0, :]]
    np.savetxt(out_file + '_predpos.tab',
               np.rec.fromarrays(predpos_columns),
               fmt='%s', delimiter='\t')

    np.savetxt(out_file + '_pos_all.tab', pos, fmt='%s', delimiter='\t')
    np.savetxt(out_file + '_x0_all.tab', x0, fmt='%s', delimiter='\t')
    np.savetxt(out_file + '_fvals_all.tab', fvals, fmt='%s', delimiter='\t')
    np.savetxt(out_file + '_scales_all.tab', scales, fmt='%s', delimiter='\t')

    sys.stderr.write("done.\n")