# Module-level imports required by this method (it belongs to the CascadeSVM
# class, which provides name_to_kernel, _count_features, _read_dir and
# _cascade_fit).
import os
from time import time

from pycompss.api.api import barrier


def fit(self, path=None, data_format="csv", n_features=None,
        cascade_arity=2, n_partitions=4, cascade_iterations=5, tol=1e-3,
        C=1.0, kernel="rbf", gamma="auto"):
    # Resolve the kernel function, falling back to RBF if the name is unknown
    try:
        self._kernel_f = getattr(self, CascadeSVM.name_to_kernel[kernel])
    except AttributeError:
        self._kernel_f = getattr(self, CascadeSVM.name_to_kernel["rbf"])

    # Validate hyperparameters
    assert (gamma == "auto" or isinstance(gamma, float)
            or isinstance(float(gamma), float)), "Gamma is not a valid float"
    assert (kernel is None or kernel in self.name_to_kernel.keys()), \
        "Incorrect kernel value [%s], available kernels are %s" % (
            kernel, self.name_to_kernel.keys())
    assert (C is None or isinstance(C, float)
            or isinstance(float(C), float)), \
        "Incorrect C type [%s], type : %s" % (C, type(C))
    assert cascade_arity > 1, "Cascade arity must be greater than 1"
    assert cascade_iterations > 0, "Max iterations must be greater than 0"

    self._cascade_arity = cascade_arity
    self._max_iterations = cascade_iterations
    self._npartitions = n_partitions
    self._tol = tol
    self._last_W = 0
    self._clf = None
    self._clf_params = {"gamma": gamma, "C": C, "kernel": kernel}

    self.read_time = time()
    self.total_time = time()

    # The number of features must be known up front for the libsvm format
    if data_format == "libsvm":
        assert n_features and n_features > 0, \
            "Number of features is required when using libsvm format"

    files = os.listdir(path)

    if not n_features:
        n_features = self._count_features(os.path.join(path, files[0]))

    partitions = self._read_dir(files, path, data_format, n_features)

    # Uncomment to measure read time
    # barrier()
    self.read_time = time() - self.read_time

    self.fit_time = time()
    self._cascade_fit(partitions)
    barrier()
    self.fit_time = time() - self.fit_time
    self.total_time = time() - self.total_time
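# Usage sketch (illustrative, not from the source): fitting on a directory of
# CSV partitions and printing the timers that fit() sets. The path and
# hyperparameter values are hypothetical, and it is assumed the surrounding
# CascadeSVM class can be instantiated with defaults.
csvm = CascadeSVM()
csvm.fit(path="/tmp/train_parts", data_format="csv", cascade_arity=4,
         cascade_iterations=10, C=1.0, kernel="rbf", gamma="auto")
print("read: %.2fs  fit: %.2fs  total: %.2fs"
      % (csvm.read_time, csvm.fit_time, csvm.total_time))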
import argparse
import time

import numpy as np

import dislib as ds
from dislib.classification import RandomForestClassifier
from pycompss.api.api import barrier, compss_wait_on


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--svmlight", help="read files in SVMLight format",
                        action="store_true")
    parser.add_argument("-dt", "--detailed_times",
                        help="get detailed execution times (read and fit)",
                        action="store_true")
    parser.add_argument("-e", "--estimators", metavar="N_ESTIMATORS",
                        type=int, help="default is 10", default=10)
    parser.add_argument("-b", "--block_size", metavar="BLOCK_SIZE", type=str,
                        help="two comma separated ints that represent the "
                             "size of the blocks in which to divide the "
                             "input data (default is 100,100)",
                        default="100,100")
    parser.add_argument("-md", "--max_depth", metavar="MAX_DEPTH", type=int,
                        help="default is np.inf", required=False)
    parser.add_argument("-dd", "--dist_depth", metavar="DIST_DEPTH", type=int,
                        help="default is auto", required=False)
    parser.add_argument("-f", "--features", metavar="N_FEATURES",
                        help="number of features of the input data "
                             "(only for SVMLight files)",
                        type=int, default=None, required=False)
    parser.add_argument("--dense", help="use dense data structures",
                        action="store_true")
    parser.add_argument("-t", "--test-file", metavar="TEST_FILE_PATH",
                        help="test file path", type=str, required=False)
    parser.add_argument("train_data",
                        help="input file in CSV or SVMLight format", type=str)
    args = parser.parse_args()

    train_data = args.train_data

    s_time = time.time()
    read_time = 0

    sparse = not args.dense

    bsize = args.block_size.split(",")
    block_size = (int(bsize[0]), int(bsize[1]))

    if args.svmlight:
        x, y = ds.load_svmlight_file(train_data, block_size, args.features,
                                     sparse)
    else:
        x = ds.load_txt_file(train_data, block_size)
        # The last column of the text file holds the labels
        y = x[:, x.shape[1] - 1:x.shape[1]]
        x = x[:, :x.shape[1] - 1]

    if args.detailed_times:
        barrier()
        read_time = time.time() - s_time
        s_time = time.time()

    if args.dist_depth:
        dist_depth = args.dist_depth
    else:
        dist_depth = "auto"

    if args.max_depth:
        max_depth = args.max_depth
    else:
        max_depth = np.inf

    forest = RandomForestClassifier(n_estimators=args.estimators,
                                    max_depth=max_depth,
                                    distr_depth=dist_depth)
    forest.fit(x, y)

    barrier()
    fit_time = time.time() - s_time

    out = [forest.n_estimators, forest.distr_depth, forest.max_depth,
           read_time, fit_time]

    if args.test_file:
        if args.svmlight:
            x_test, y_test = ds.load_svmlight_file(args.test_file, block_size,
                                                   args.features, sparse)
        else:
            x_test = ds.load_txt_file(args.test_file, block_size)
            y_test = x_test[:, x_test.shape[1] - 1:x_test.shape[1]]
            x_test = x_test[:, :x_test.shape[1] - 1]

        out.append(compss_wait_on(forest.score(x_test, y_test)))

    print(out)
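# Toy NumPy analogy (illustrative only) of the label-splitting slices above:
# the label lives in the last column, so y takes that column (kept 2-D, as
# dislib expects) and x keeps everything before it.
import numpy as np

data = np.array([[0.1, 0.2, 1.0],
                 [0.3, 0.4, 0.0]])            # last column is the label
y = data[:, data.shape[1] - 1:data.shape[1]]  # shape (2, 1)
x = data[:, :data.shape[1] - 1]               # shape (2, 2)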
import argparse
import csv
import os
import time

import dislib as ds
from dislib.classification import CascadeSVM
from dislib.utils import shuffle
from pycompss.api.api import barrier, compss_wait_on


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--svmlight", help="read files in SVMLight format",
                        action="store_true")
    parser.add_argument("-dt", "--detailed_times",
                        help="get detailed execution times (read and fit)",
                        action="store_true")
    parser.add_argument("-k", "--kernel", metavar="KERNEL", type=str,
                        help="linear or rbf (default is rbf)",
                        choices=["linear", "rbf"], default="rbf")
    parser.add_argument("-a", "--arity", metavar="CASCADE_ARITY", type=int,
                        help="default is 2", default=2)
    parser.add_argument("-b", "--block_size", metavar="BLOCK_SIZE", type=str,
                        help="two comma separated ints that represent the "
                             "size of the blocks in which to divide the "
                             "input data (default is 100,100)",
                        default="100,100")
    parser.add_argument("-i", "--iteration", metavar="MAX_ITERATIONS",
                        type=int, help="default is 5", default=5)
    parser.add_argument("-g", "--gamma", metavar="GAMMA", type=float,
                        help="(only for rbf kernel) default is "
                             "1 / n_features", default=None)
    parser.add_argument("-c", metavar="C", type=float, default=1,
                        help="Penalty parameter C of the error term. "
                             "Default:1")
    parser.add_argument("-f", "--features", metavar="N_FEATURES",
                        help="number of features of the input data "
                             "(only for SVMLight files)",
                        type=int, default=None, required=False)
    parser.add_argument("-t", "--test-file", metavar="TEST_FILE_PATH",
                        help="test file path", type=str, required=False)
    parser.add_argument("-o", "--output_file", metavar="OUTPUT_FILE_PATH",
                        help="output file path", type=str, required=False)
    parser.add_argument("--convergence", help="check for convergence",
                        action="store_true")
    parser.add_argument("--dense", help="store data in dense format (only "
                                        "for SVMLight files)",
                        action="store_true")
    parser.add_argument("train_data",
                        help="input file in CSV or SVMLight format", type=str)
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-s", "--shuffle", help="shuffle input data",
                        action="store_true")
    args = parser.parse_args()

    train_data = args.train_data

    s_time = time.time()
    read_time = 0

    if not args.gamma:
        gamma = "auto"
    else:
        gamma = args.gamma

    sparse = not args.dense

    bsize = args.block_size.split(",")
    block_size = (int(bsize[0]), int(bsize[1]))

    if args.svmlight:
        x, y = ds.load_svmlight_file(train_data, block_size, args.features,
                                     sparse)
    else:
        x = ds.load_txt_file(train_data, block_size)
        # The last column of the text file holds the labels
        y = x[:, x.shape[1] - 1:x.shape[1]]
        x = x[:, :x.shape[1] - 1]

    if args.shuffle:
        x, y = shuffle(x, y)

    if args.detailed_times:
        barrier()
        read_time = time.time() - s_time
        s_time = time.time()

    csvm = CascadeSVM(cascade_arity=args.arity, max_iter=args.iteration,
                      kernel=args.kernel, c=args.c, gamma=gamma,
                      check_convergence=args.convergence,
                      verbose=args.verbose)

    csvm.fit(x, y)

    barrier()
    fit_time = time.time() - s_time

    out = [args.kernel, args.arity, args.block_size,
           csvm._clf_params["gamma"], args.c, csvm.iterations,
           csvm.converged, read_time, fit_time]

    if os.path.isdir(train_data):
        n_files = os.listdir(train_data)
        out.append(len(n_files))

    if args.test_file:
        if args.svmlight:
            x_test, y_test = ds.load_svmlight_file(args.test_file, block_size,
                                                   args.features, sparse)
        else:
            x_test = ds.load_txt_file(args.test_file, block_size)
            y_test = x_test[:, x_test.shape[1] - 1:x_test.shape[1]]
            x_test = x_test[:, :x_test.shape[1] - 1]

        out.append(compss_wait_on(csvm.score(x_test, y_test)))

    if args.output_file:
        # Text mode with newline="" is required by csv.writer on Python 3
        with open(args.output_file, "a", newline="") as f:
            wr = csv.writer(f)
            wr.writerow(out)
    else:
        print(out)
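# Hypothetical invocation sketch: the benchmark is normally launched through
# the PyCOMPSs runtime (e.g. via runcompss). The script name, file names and
# flag values below are illustrative, not taken from the source.
import sys

sys.argv = ["csvm_benchmark.py", "-k", "rbf", "-a", "4", "-b", "200,200",
            "-i", "10", "--convergence", "-t", "test.csv", "train.csv"]
main()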
from pycompss.api.api import barrier


def bam_merge(self, in_bam_job_files):
    """
    Wrapper task taking any number of bam files and merging them into a
    single bam file.

    Parameters
    ----------
    in_bam_job_files : list
        List of the locations of the separate bam files that are to be
        merged. The first file in the list is taken as the output file name.
    """
    merge_round = -1
    bam_job_files = list(in_bam_job_files)

    # Each round collapses the current list of bam files into a smaller one
    # by merging chunks of up to 10 files, until a single file remains
    while len(bam_job_files) > 1:
        merge_round += 1
        tmp_alignments = []

        # Merge full chunks of 10 while enough files remain
        while len(bam_job_files) >= 10:
            bam_out = bam_job_files[0] + "_merge_" + str(merge_round) + ".bam"
            tmp_alignments.append(bam_out)
            self.bam_merge_10(
                bam_job_files.pop(0), bam_job_files.pop(0),
                bam_job_files.pop(0), bam_job_files.pop(0),
                bam_job_files.pop(0), bam_job_files.pop(0),
                bam_job_files.pop(0), bam_job_files.pop(0),
                bam_job_files.pop(0), bam_job_files.pop(0), bam_out)

        # Merge a chunk of 5 if possible, leaving at most 4 files
        if len(bam_job_files) >= 5:
            bam_out = bam_job_files[0] + "_merge_" + str(merge_round) + ".bam"
            tmp_alignments.append(bam_out)
            self.bam_merge_5(
                bam_job_files.pop(0), bam_job_files.pop(0),
                bam_job_files.pop(0), bam_job_files.pop(0),
                bam_job_files.pop(0), bam_out)

        # Merge the remaining 2-4 files, or carry a single leftover over to
        # the next round. Guarding on the list being non-empty avoids
        # indexing into it when the chunk merges consumed every file.
        if bam_job_files:
            bam_out = bam_job_files[0] + "_merge_" + str(merge_round) + ".bam"

            if len(bam_job_files) == 4:
                tmp_alignments.append(bam_out)
                self.bam_merge_4(
                    bam_job_files.pop(0), bam_job_files.pop(0),
                    bam_job_files.pop(0), bam_job_files.pop(0), bam_out)
            elif len(bam_job_files) == 3:
                tmp_alignments.append(bam_out)
                self.bam_merge_3(
                    bam_job_files.pop(0), bam_job_files.pop(0),
                    bam_job_files.pop(0), bam_out)
            elif len(bam_job_files) == 2:
                tmp_alignments.append(bam_out)
                self.bam_merge_2(
                    bam_job_files.pop(0), bam_job_files.pop(0), bam_out)
            else:
                tmp_alignments.append(bam_job_files[0])

        # Wait for this round's merge tasks before starting the next round
        barrier()
        bam_job_files = list(tmp_alignments)

    return bam_job_files[0]
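# Toy illustration of the round structure above (names and the "merge" are
# stand-ins, not the real samtools-backed tasks): each pass collapses chunks
# of up to `width` entries until a single entry remains.
def toy_merge_rounds(items, width=10):
    while len(items) > 1:
        chunks = [items[i:i + width] for i in range(0, len(items), width)]
        items = ["+".join(chunk) for chunk in chunks]  # stand-in for a merge
    return items[0]

print(toy_merge_rounds(["part%d.bam" % i for i in range(23)]))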
import argparse
import time

import dislib as ds
from dislib.cluster import KMeans
from pycompss.api.api import barrier


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--svmlight", help="read files in SVMLight format",
                        action="store_true")
    parser.add_argument("-dt", "--detailed_times",
                        help="get detailed execution times (read and fit)",
                        action="store_true")
    parser.add_argument("-a", "--arity", metavar="CASCADE_ARITY", type=int,
                        help="default is 50", default=50)
    parser.add_argument("-c", "--centers", metavar="N_CENTERS", type=int,
                        help="default is 2", default=2)
    parser.add_argument("-b", "--block_size", metavar="BLOCK_SIZE", type=str,
                        help="two comma separated ints that represent the "
                             "size of the blocks in which to divide the "
                             "input data (default is 100,100)",
                        default="100,100")
    parser.add_argument("-i", "--iteration", metavar="MAX_ITERATIONS",
                        type=int, help="default is 5", default=5)
    parser.add_argument("-f", "--features", metavar="N_FEATURES",
                        help="number of features of the input data "
                             "(only for SVMLight files)",
                        type=int, default=None, required=False)
    parser.add_argument("--dense", help="store data in dense format (only "
                                        "for SVMLight files)",
                        action="store_true")
    parser.add_argument("--labeled", help="the last column of the input file "
                                          "represents labels (only for text "
                                          "files)",
                        action="store_true")
    parser.add_argument("train_data",
                        help="input file in CSV or SVMLight format", type=str)
    args = parser.parse_args()

    train_data = args.train_data

    s_time = time.time()
    read_time = 0

    sparse = not args.dense

    bsize = args.block_size.split(",")
    block_size = (int(bsize[0]), int(bsize[1]))

    if args.svmlight:
        # Labels are loaded but ignored, since clustering is unsupervised
        x, y = ds.load_svmlight_file(train_data, block_size, args.features,
                                     sparse)
    else:
        x = ds.load_txt_file(train_data, block_size)

    n_features = x.shape[1]

    if args.labeled and not args.svmlight:
        # Drop the label column before clustering
        x = x[:, :n_features - 1]

    if args.detailed_times:
        barrier()
        read_time = time.time() - s_time
        s_time = time.time()

    kmeans = KMeans(n_clusters=args.centers, max_iter=args.iteration,
                    arity=args.arity, verbose=True)

    kmeans.fit(x)

    barrier()
    fit_time = time.time() - s_time

    out = [args.centers, args.arity, args.block_size, read_time, fit_time]

    print(out)
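# Minimal sketch of the dislib calls the benchmark above drives, using
# synthetic data instead of an input file; the shapes and hyperparameters are
# illustrative, and running it requires a PyCOMPSs runtime.
import dislib as ds
from dislib.cluster import KMeans

x = ds.random_array((1000, 8), (100, 8))  # 1000 samples in 100-row blocks
kmeans = KMeans(n_clusters=2, max_iter=5, arity=50)
kmeans.fit(x)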
def main():
    import errno
    import os
    import sys
    import time
    from functools import reduce

    from pycompss.api.api import barrier, compss_wait_on

    # usage (NUM_BUCKETS is optional, so 7 or 8 arguments are accepted)
    if len(sys.argv) not in (8, 9):
        print("Usage: {} BWA_DB_FILE CONTIG_FILE REFERENCE_FILE "
              "REFERENCE_INDEX_FILE INPUT_DIR WORK_DIR NUM_PROCESSES "
              "[NUM_BUCKETS=2*NUM_PROCESSES]\n\n"
              "Program name must be called with an absolute path "
              "(starting with '/').".format(sys.argv[0]))
        return 1

    # find program directory and basenames
    cmd_dir = os.path.dirname(sys.argv[0])
    if cmd_dir == "" or cmd_dir[0] != '/':
        print("Program must be called with an absolute path "
              "(starting with '/')")
        return 1
    prog_basename = os.path.basename(os.path.splitext(sys.argv[0])[0])

    # read inputs
    bwa_db_file = sys.argv[1]
    contig_file = sys.argv[2]
    ref_file = sys.argv[3]
    ref_idx_file = sys.argv[4]
    in_dir_prefix = sys.argv[5]
    work_dir = sys.argv[6]
    num_processes = int(sys.argv[7])
    if len(sys.argv) > 8:
        num_buckets = int(sys.argv[8])
    else:
        num_buckets = 2 * num_processes

    # setup directories
    in_dirs = [in_dir_prefix + '/' + str(x) for x in range(num_processes)]
    out_dir = "{}/{}_OUT".format(work_dir, prog_basename)
    try:
        os.makedirs(out_dir, mode=0o700)
    except OSError as e:
        if e.errno != errno.EEXIST:
            print("Failed to create Directory[{}].\n".format(out_dir))
            raise

    start_time = time.time()

    # mapping & merge
    inputs = []
    for in_dir in in_dirs:
        exts = set(
            [os.path.splitext(file1)[1] for file1 in os.listdir(in_dir)])
        for ext in exts:
            elem = [in_dir + '/' + f for f in os.listdir(in_dir)
                    if f.endswith(ext)]
            elem.sort()
            inputs.append(elem)
    # inputs = [[in_dir+'part_1.'+i, in_dir+'part_2.'+i] for i in range(num_processes)]
    # ~ [[part_1.0, part_2.0], [part_1.1, part_2.1], ...]
    print("Inputs: ", str(inputs))  # dbg

    contigs = reduce(
        lambda e1, e2: mapping_merge(e1, cmd_dir, bwa_db_file, contig_file,
                                     e2),
        inputs, {})

    print("before compss_wait_on")  # dbg
    contigs = compss_wait_on(contigs)
    print("after compss_wait_on")  # dbg
    with open('output.dict', 'w') as f:  # dbg
        f.write(str(contigs))  # dbg

    buckets = split(num_buckets, contigs)

    # rm_dup & analyze
    tar_file = init_tar(out_dir)
    reduce(
        lambda tar_file1, bucket: rmdup_analyze_tar(
            cmd_dir, ref_idx_file, ref_file, bucket, tar_file1),
        buckets, tar_file)

    print("before barrier")  # dbg
    barrier()
    print("after barrier")  # dbg

    print("NGSA-mini-py with {} processes. Elapsed Time {} (s)".format(
        num_processes, time.time() - start_time))
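# Toy illustration of the reduce-based chaining used above: each step folds
# one input partition into the accumulated contig dictionary, so the tasks
# form a sequential dependency chain. The names below are stand-ins for the
# real PyCOMPSs tasks, not the actual mapping_merge implementation.
from functools import reduce

def toy_mapping_merge(acc, part):
    acc = dict(acc)        # the real task returns a new (future) dict
    acc[part] = len(part)  # stand-in for the BWA alignment + merge work
    return acc

contigs = reduce(toy_mapping_merge, ["p0", "p1", "p2"], {})
print(contigs)  # {'p0': 2, 'p1': 2, 'p2': 2}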