def do_one(args):
    """Prepare cached inputs and result directories for the fast pipeline.

    The function performs, in order:

    1. Exit with status 1 if no ``Rscript`` executable is found on ``PATH``.
    2. Create ``args.output`` and attach a file logger writing to
       ``<args.output>/<args.log_file>`` (plus a console handler when
       ``args.verbose`` is true).
    3. Rebuild the cached ``.npy`` inputs when ``args.overwrite_input_bin``
       is set or any expected file is missing from ``args.output``:
       one ``t_matrix_<matrix_type>_<rbp_p>.npy`` per value in
       ``args.rbp_ps`` plus ``expr.npy``, ``Xcen.npy`` and ``genes.npy``.
    4. For every ``(rbp_p, examine_top)`` combination, create the result
       directory and copy ``do_kmeans.R``, ``do_gpd.R`` and ``qval.R`` from
       the ``silhouetteRank`` package directory unless already present.

    Args:
        args: Namespace-like object providing ``matrix_type``, ``rbp_ps``,
            ``examine_tops``, ``output``, ``log_file``, ``verbose``,
            ``overwrite_input_bin``, ``expr`` and ``centroid``.

    Raises:
        SystemExit: With code 1 when ``Rscript`` is not available.
    """
    # Local import: the file is only known to expose `copyfile` from shutil.
    import shutil

    matrix_type = args.matrix_type

    # Look the executable up directly instead of spawning a shell and
    # comparing against the shell's "command not found" status (127).
    if shutil.which("Rscript") is None:
        sys.stderr.write("Rscript is not found")
        sys.stderr.flush()
        sys.exit(1)

    # exist_ok avoids the check-then-create race and allows nested paths.
    os.makedirs(args.output, exist_ok=True)

    log_file = "%s/%s" % (args.output, args.log_file)
    logger = logging.getLogger("prep_fast")
    logger.setLevel(logging.DEBUG)
    # Inspect this logger's own handlers only. hasHandlers() also reports
    # handlers of ancestor loggers, which would silently skip the log file
    # whenever the root logger has been configured by the host application.
    if not logger.handlers:
        handler = logging.FileHandler(log_file)
        handler.setFormatter(
            logging.Formatter(
                "%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
        logger.addHandler(handler)
        if args.verbose:
            logger.addHandler(logging.StreamHandler())

    # If any binary input file is missing, all of them are regenerated.
    required = ["expr.npy", "Xcen.npy", "genes.npy"]
    required.extend(
        "t_matrix_%s_%.2f.npy" % (matrix_type, rbp_p) for rbp_p in args.rbp_ps)
    rebuild = args.overwrite_input_bin or any(
        not os.path.isfile("%s/%s" % (args.output, name)) for name in required)

    if rebuild:
        expr, genes, Xcen = read_matrix(
            f_expr=args.expr, f_Xcen=args.centroid, logger=logger)
        logger.info(
            "Calculate all pairwise Euclidean distance between cells using their physical coordinates"
        )
        euc = squareform(pdist(Xcen, metric="euclidean"))
        for rbp_p in args.rbp_ps:
            logger.info("For rbp_p %.2f:", rbp_p)
            t_matrix = spatial_genes.rank_transform_matrix(
                euc,
                reverse=False,
                rbp_p=rbp_p,
                matrix_type=matrix_type,
                logger=logger)
            np.save(
                "%s/t_matrix_%s_%.2f.npy" % (args.output, matrix_type, rbp_p),
                t_matrix)
        np.save("%s/expr.npy" % args.output, expr)
        np.save("%s/Xcen.npy" % args.output, Xcen)
        np.save("%s/genes.npy" % args.output, genes)
    else:
        # Nothing below consumes the arrays, so the cached files are left
        # on disk untouched rather than being loaded into memory.
        logger.info("Using existing input binaries...")

    r_scripts = ("do_kmeans.R", "do_gpd.R", "qval.R")
    for rbp_p in args.rbp_ps:
        for examine_top in args.examine_tops:
            if matrix_type == "dissim":
                outdir = "%s/result_fast_5000_%.2f_%.3f" % (
                    args.output, rbp_p, examine_top)
            else:
                outdir = "%s/result_fast_sim_5000_%.2f_%.3f" % (
                    args.output, rbp_p, examine_top)
            os.makedirs(outdir, exist_ok=True)

            source_path = os.path.dirname(silhouetteRank.__file__)
            for script in r_scripts:
                target = "%s/%s" % (outdir, script)
                # Never overwrite a script that is already in place.
                if not os.path.isfile(target):
                    shutil.copyfile("%s/%s" % (source_path, script), target)
def do_one(args):
    """Compute per-gene silhouette scores with the fast (approximate) method.

    Cached inputs (``expr.npy``, ``Xcen.npy``, ``genes.npy`` and
    ``t_matrix_<matrix_type>_<rbp_p>.npy``) are loaded from ``args.output``
    when all of them exist and ``args.overwrite_input_bin`` is false;
    otherwise they are rebuilt from ``args.expr`` / ``args.centroid`` and
    saved. ``spatial_genes.calc_silhouette_per_gene_approx`` is then called
    ``args.num_trials`` times, and each result is written to a tab-separated
    text file in ``args.output`` (index, first field, second field formatted
    with 10 decimals).

    Log messages go to ``<args.output>/logs.fast/real_<rbp_p>_<examine_top>.out``
    (truncated when the handler is created) and additionally to the console
    when ``args.verbose`` is true.

    Args:
        args: Namespace-like object providing ``matrix_type``, ``rbp_p``,
            ``examine_top``, ``output``, ``verbose``,
            ``overwrite_input_bin``, ``expr``, ``centroid`` and
            ``num_trials``.
    """
    matrix_type = args.matrix_type
    rbp_p = args.rbp_p
    examine_top = args.examine_top

    # makedirs creates args.output as well and tolerates concurrent jobs
    # creating the same directories (no check-then-create race).
    logdir = "%s/logs.fast" % args.output
    os.makedirs(logdir, exist_ok=True)

    log_file = "%s/real_%.2f_%.3f.out" % (logdir, rbp_p, examine_top)
    logger = logging.getLogger("real_fast_%.2f_%.3f" % (rbp_p, examine_top))
    logger.setLevel(logging.DEBUG)
    # Inspect this logger's own handlers only. hasHandlers() also reports
    # handlers of ancestor loggers, which would silently skip the log file
    # whenever the root logger has been configured by the host application.
    if not logger.handlers:
        handler = logging.FileHandler(log_file, "w")
        handler.setFormatter(
            logging.Formatter(
                "%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
        logger.addHandler(handler)
        if args.verbose:
            logger.addHandler(logging.StreamHandler())

    t_matrix_file = "%s/t_matrix_%s_%.2f.npy" % (
        args.output, matrix_type, rbp_p)
    required = [
        "%s/expr.npy" % args.output,
        "%s/Xcen.npy" % args.output,
        "%s/genes.npy" % args.output,
        t_matrix_file,
    ]
    # A single missing cache file triggers a full rebuild.
    rebuild = args.overwrite_input_bin or any(
        not os.path.isfile(path) for path in required)

    if rebuild:
        # NOTE(review): `read` is defined outside this block; confirm it
        # returns (expr, Xcen, genes) in exactly this order.
        expr, Xcen, genes = read(
            f_expr=args.expr, f_Xcen=args.centroid, logger=logger)
        logger.info(
            "Calculate all pairwise Euclidean distance between cells using their physical coordinates"
        )
        euc = squareform(pdist(Xcen, metric="euclidean"))
        logger.info(
            "Rank transform euclidean distance, and then apply exponential transform"
        )
        t_matrix = spatial_genes.rank_transform_matrix(
            euc,
            reverse=False,
            rbp_p=rbp_p,
            matrix_type=matrix_type,
            logger=logger)
        np.save(t_matrix_file, t_matrix)
        np.save("%s/expr.npy" % args.output, expr)
        np.save("%s/Xcen.npy" % args.output, Xcen)
        np.save("%s/genes.npy" % args.output, genes)
    else:
        logger.info("Using existing input binaries...")
        expr = np.load("%s/expr.npy" % args.output)
        genes = np.load("%s/genes.npy" % args.output)
        t_matrix = np.load(t_matrix_file)

    logger.info("Compute silhouette metric per gene using fast method")
    if matrix_type == "sim":
        name_pattern = "%s/silhouette.sim.fast.rbp.%.2f.top.%.3f.%d.txt"
    else:
        name_pattern = "%s/silhouette.fast.rbp.%.2f.top.%.3f.%d.txt"

    for t_trial in range(args.num_trials):
        res = spatial_genes.calc_silhouette_per_gene_approx(
            genes=genes,
            expr=expr,
            matrix=t_matrix,
            matrix_type=matrix_type,
            examine_top=examine_top,
            logger=logger)
        f_name = name_pattern % (args.output, rbp_p, examine_top, t_trial)
        # The context manager closes the file even if a write fails.
        with open(f_name, "w") as fw:
            for ind, v in enumerate(res):
                fw.write("%d\t%s\t%.10f\n" % (ind, v[0], v[1]))
continue scores = [] f = open("%s/%d" % (outdir, target)) for l in f: l = l.rstrip("\n") scores.append(float(l)) f.close() if len(scores)!=5000: size_to_do.append(target) if size_to_do==[]: sys.exit(0) expr, genes, Xcen = read_matrix() ncell = Xcen.shape[0] sys.stdout.write("Calculate all pairwise Euclidean distance between cells using their physical coordinates\n") euc = squareform(pdist(Xcen, metric="euclidean")) sys.stdout.write("Rank transform euclidean distance, and then apply exponential transform\n") t_matrix = spatial_genes.rank_transform_matrix(euc, reverse=False, rbp_p=rbp_p, matrix_type=matrix_type) sys.stdout.write("Compute silhouette metric per gene\n") source_path = os.path.dirname(silhouetteRank.__file__) if not os.path.isfile("%s/do_gpd.R" % outdir): copyfile("%s/do_gpd.R" % source_path, "%s/do_gpd.R" % outdir) if not os.path.isfile("%s/do_kmeans.R" % outdir): copyfile("%s/do_kmeans.R" % source_path, "%s/do_kmeans.R" % outdir) res = spatial_genes.random_pattern(matrix=t_matrix, matrix_type=matrix_type, num_cell = ncell, sizes=size_to_do, trials_per_gene=5000, run_gpd=True, outdir=outdir)
def do_one(args):
    """Run the random-pattern step for one ``(rbp_p, examine_top)`` setting.

    The function performs, in order:

    1. Exit with status 1 if no ``Rscript`` executable is found on ``PATH``.
    2. Create ``<args.output>/logs`` and attach a file logger writing to
       ``<args.output>/logs/<rbp_p>_<examine_top>.out``.
    3. Load the cached inputs (``expr.npy``, ``Xcen.npy``, ``genes.npy``,
       ``t_matrix_<matrix_type>_<rbp_p>.npy``) from ``args.output``, or
       rebuild and save them when ``args.overwrite_input_bin`` is set or a
       file is missing.
    4. Create the result directory, copy ``do_gpd.R`` and ``do_kmeans.R``
       from the ``silhouetteRank`` package directory unless already present.
    5. Obtain the query sizes from ``read_frequency`` and pass them to
       ``spatial_genes.random_pattern`` with ``trials_per_gene=5000`` and
       ``run_gpd=True``.

    Args:
        args: Namespace-like object providing ``matrix_type``, ``rbp_p``,
            ``examine_top``, ``output``, ``overwrite_input_bin``, ``expr``,
            ``centroid`` and ``query_sizes``. An optional ``verbose``
            attribute additionally enables console logging.

    Raises:
        SystemExit: With code 1 when ``Rscript`` is not available.
    """
    # Local import: the file is only known to expose `copyfile` from shutil.
    import shutil

    matrix_type = args.matrix_type  # "sim" or "dissim"
    rbp_p = args.rbp_p
    examine_top = args.examine_top

    # Look the executable up directly instead of spawning a shell and
    # comparing against the shell's "command not found" status (127).
    if shutil.which("Rscript") is None:
        sys.stderr.write("Rscript is not found")
        sys.stderr.flush()
        sys.exit(1)

    # makedirs creates args.output as well and tolerates concurrent jobs
    # creating the same directories (no check-then-create race).
    logdir = "%s/logs" % args.output
    os.makedirs(logdir, exist_ok=True)

    log_file = "%s/%.2f_%.3f.out" % (logdir, rbp_p, examine_top)
    logger = logging.getLogger("random_%.2f_%.3f" % (rbp_p, examine_top))
    logger.setLevel(logging.DEBUG)
    # Without a handler every logger.info() below is dropped and log_file is
    # never written; attach one once per logger (own handlers only, so a
    # configured root logger does not suppress the file).
    if not logger.handlers:
        handler = logging.FileHandler(log_file, "w")
        handler.setFormatter(
            logging.Formatter(
                "%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
        logger.addHandler(handler)
        # `verbose` is optional here so existing callers keep working.
        if getattr(args, "verbose", False):
            logger.addHandler(logging.StreamHandler())

    t_matrix_file = "%s/t_matrix_%s_%.2f.npy" % (
        args.output, matrix_type, rbp_p)
    required = [
        "%s/expr.npy" % args.output,
        "%s/Xcen.npy" % args.output,
        "%s/genes.npy" % args.output,
        t_matrix_file,
    ]
    # If any binary input file is missing, all of them are regenerated.
    rebuild = args.overwrite_input_bin or any(
        not os.path.isfile(path) for path in required)

    if rebuild:
        expr, genes, Xcen = read_matrix(
            f_expr=args.expr, f_Xcen=args.centroid, logger=logger)
        logger.info(
            "Calculate all pairwise Euclidean distance between cells using their physical coordinates"
        )
        euc = squareform(pdist(Xcen, metric="euclidean"))
        logger.info(
            "Rank transform euclidean distance, and then apply exponential transform"
        )
        t_matrix = spatial_genes.rank_transform_matrix(
            euc,
            reverse=False,
            rbp_p=rbp_p,
            matrix_type=matrix_type,
            logger=logger)
        np.save(t_matrix_file, t_matrix)
        np.save("%s/expr.npy" % args.output, expr)
        np.save("%s/Xcen.npy" % args.output, Xcen)
        np.save("%s/genes.npy" % args.output, genes)
    else:
        logger.info("Using existing input binaries...")
        expr = np.load("%s/expr.npy" % args.output)
        Xcen = np.load("%s/Xcen.npy" % args.output)
        genes = np.load("%s/genes.npy" % args.output)
        t_matrix = np.load(t_matrix_file)

    ncell = Xcen.shape[0]

    logger.info("Compute silhouette metric per gene")
    if matrix_type == "dissim":
        outdir = "%s/result_5000_%.2f_%.3f" % (args.output, rbp_p, examine_top)
    else:
        outdir = "%s/result_sim_5000_%.2f_%.3f" % (
            args.output, rbp_p, examine_top)
    os.makedirs(outdir, exist_ok=True)

    source_path = os.path.dirname(silhouetteRank.__file__)
    for script in ("do_gpd.R", "do_kmeans.R"):
        target = "%s/%s" % (outdir, script)
        # Never overwrite a script that is already in place.
        if not os.path.isfile(target):
            shutil.copyfile("%s/%s" % (source_path, script), target)

    sizes = read_frequency(
        expr=expr,
        genes=genes,
        Xcen=Xcen,
        frequency_file=None,
        read_from_file=False,
        outdir=outdir,
        examine_top=examine_top,
        num_query_sizes=args.query_sizes)
    spatial_genes.random_pattern(
        matrix=t_matrix,
        matrix_type=matrix_type,
        num_cell=ncell,
        sizes=sizes,
        trials_per_gene=5000,
        run_gpd=True,
        outdir=outdir,
        logger=logger)