def column_filename(self, colname, sliceno=None):
    dc = self.columns[colname]
    jid, name = dc.location.split('/', 1)
    if dc.offsets:
        return resolve_jobid_filename(jid, name)
    else:
        if sliceno is None:
            sliceno = '%s'
        return resolve_jobid_filename(jid, name % (sliceno,))

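# Usage sketch (hypothetical dataset instance and column name, not from the
# source): a column with offsets is backed by a single shared file, while a
# per-slice column keeps a '%s' placeholder in its stored name that is filled
# with the slice number, or left in place when no sliceno is given.
#
#     path = ds.column_filename('movie_id', sliceno=3)   # concrete per-slice file
#     pattern = ds.column_filename('movie_id')            # keeps the '%s' placeholder
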
def run_job(self, jobid, subjob_cookie=None, parent_pid=0):
    W = self.workspaces[get_workspace_name(jobid)]
    # map each source workdir name to its path for the dispatched job
    active_workspaces = {}
    for name in self.source_workdirs:
        active_workspaces[name] = self.workspaces[name].get_path()
    slices = self.workspaces[self.target_workdir].get_slices()

    t0 = time.time()
    setup = update_setup(jobid, starttime=t0)
    prof = setup.profile or DotDict()
    new_prof, files, subjobs = dispatch.launch(W.path, setup, self.config, self.Methods, active_workspaces, slices, self.debug, self.daemon_url, subjob_cookie, parent_pid)
    # debug runs keep more intermediates: only files at Temp.TEMP and above are
    # deleted, otherwise Temp.DEBUG and above go too.
    if self.debug:
        delete_from = Temp.TEMP
    else:
        delete_from = Temp.DEBUG
    for filename, temp in list(files.items()):
        if temp >= delete_from:
            unlink(join(W.path, jobid, filename))
            del files[filename]
    prof.update(new_prof)
    # reset total first so a stale value is not included in the new sum
    prof.total = 0
    prof.total = sum(v for v in prof.values() if isinstance(v, (float, int)))
    data = dict(
        starttime=t0,
        endtime=time.time(),
        profile=prof,
    )
    update_setup(jobid, **data)
    data['files'] = files
    data['subjobs'] = subjobs
    json_save(data, resolve_jobid_filename(jobid, 'post.json'))

def main(urd):
    urd.build("test_json")

    print()
    print("Testing dataset creation, export, import")
    source = urd.build("test_datasetwriter")
    urd.build("test_datasetwriter_verify", datasets=dict(source=source))
    ds = Dataset(source, "passed")
    csvname = "out.csv.gz"
    csv = urd.build("csvexport", options=dict(filename=csvname, separator="\t"), datasets=dict(source=ds))
    csv_quoted = urd.build("csvexport", options=dict(filename=csvname, quote_fields='"'), datasets=dict(source=ds))
    reimp_csv = urd.build("csvimport", options=dict(filename=resolve_jobid_filename(csv, csvname), separator="\t"))
    reimp_csv_quoted = urd.build("csvimport", options=dict(filename=resolve_jobid_filename(csv_quoted, csvname), quote_support=True))
    urd.build("test_compare_datasets", datasets=dict(a=reimp_csv, b=reimp_csv_quoted))
    urd.build("test_dataset_column_names")

    print()
    print("Testing csvimport with more difficult files")
    urd.build("test_csvimport_corner_cases")
    urd.build("test_csvimport_separators")

    print()
    print("Testing subjobs and dataset typing")
    urd.build("test_subjobs_type", datasets=dict(typed=ds, untyped=reimp_csv))
    urd.build("test_dataset_old_columns")

    print()
    print("Testing dataset chaining, filtering, callbacks and rechaining")
    selfchain = urd.build("test_selfchain")
    urd.build("test_rechain", jobids=dict(selfchain=selfchain))

    print()
    print("Testing dataset sorting (with subjobs again)")
    urd.build("test_sorting")
    urd.build("test_sort_stability")

    print()
    print("Test hashlabels")
    urd.build("test_hashlabel")

def full_filename(filename, ext, sliceno=None, jobid=None):
    if not filename or not filename[0]:
        # Fallback to default in calling function
        return None
    if isinstance(filename, JobWithFile):
        if jobid:
            raise Exception("Don't specify a jobid when passing a JobWithFile as filename")
        if sliceno is None:
            assert not filename.sliced, "A sliced file requires a sliceno"
        else:
            assert filename.sliced, "An unsliced file can not have a sliceno"
        jobid, filename = filename[:2]
    if not filename.endswith(ext):
        filename += ext
    if sliceno is not None:
        filename = filename.replace(ext, '%02d' % (int(sliceno),)) + ext
    if jobid is not None:
        filename = resolve_jobid_filename(jobid, filename)
    return filename

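# Worked example (hypothetical arguments, following the logic above): the
# extension is appended when missing, a two-digit sliceno is spliced in just
# before the extension, and a jobid finally anchors the name in that job's
# directory via resolve_jobid_filename.
#
#     full_filename('data', '.csv')                        # -> 'data.csv'
#     full_filename('data', '.csv', sliceno=3)             # -> 'data03.csv'
#     full_filename('data', '.csv', sliceno=3, jobid=jid)  # -> resolve_jobid_filename(jid, 'data03.csv')
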
def analysis(sliceno, params, prepare_res):
    dw, jobs, sort_idx = prepare_res
    single_job = (len(jobs) == 1)
    if options.sort_across_slices:
        columniter = partial(Dataset.iterate_list, None, jobids=jobs)
        per_slice = len(sort_idx) // params.slices
        if sliceno + 1 == params.slices:
            sort_idx = sort_idx[per_slice * sliceno:]
        else:
            sort_idx = sort_idx[per_slice * sliceno:per_slice * (sliceno + 1)]
    else:
        columniter = partial(Dataset.iterate_list, sliceno, jobids=jobs)
        sort_idx = sort(columniter)
    if single_job and not options.sort_across_slices and sort_idx == sorted(sort_idx):
        # this slice is fully sorted as is.
        slice_dir = '%02d' % (sliceno,)
        symlink(resolve_jobid_filename(datasets.source, slice_dir), slice_dir)
        return len(sort_idx)
    for column in datasets.source.columns:
        lst = list(columniter(column))
        w = dw.writers[column].write
        for idx in sort_idx:
            w(lst[idx])

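# Partitioning sketch for sort_across_slices (hypothetical sizes): with
# len(sort_idx) == 10 and params.slices == 3, per_slice is 10 // 3 == 3, so
# slice 0 takes sort_idx[0:3], slice 1 takes sort_idx[3:6], and the last
# slice takes sort_idx[6:] (4 rows), picking up the remainder.
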
def main(urd):
    urd.build("test_report")
    urd.build("test_json")

    print()
    print("Testing dataset creation, export, import")
    source = urd.build("test_datasetwriter")
    urd.build("test_datasetwriter_verify", datasets=dict(source=source))
    ds = Dataset(source, "passed")
    csvname = "out.csv.gz"
    csvname_uncompressed = "out.csv"
    csv = urd.build("csvexport", options=dict(filename=csvname, separator="\t"), datasets=dict(source=ds))
    csv_uncompressed = urd.build("csvexport", options=dict(filename=csvname_uncompressed, separator="\t"), datasets=dict(source=ds))
    csv_quoted = urd.build("csvexport", options=dict(filename=csvname, quote_fields='"'), datasets=dict(source=ds))
    reimp_csv = urd.build("csvimport", options=dict(filename=resolve_jobid_filename(csv, csvname), separator="\t"))
    reimp_csv_uncompressed = urd.build("csvimport", options=dict(filename=resolve_jobid_filename(csv_uncompressed, csvname_uncompressed), separator="\t"))
    reimp_csv_quoted = urd.build("csvimport", options=dict(filename=resolve_jobid_filename(csv_quoted, csvname), quotes=True))
    urd.build("test_compare_datasets", datasets=dict(a=reimp_csv, b=reimp_csv_uncompressed))
    urd.build("test_compare_datasets", datasets=dict(a=reimp_csv, b=reimp_csv_quoted))
    urd.build("test_dataset_column_names")

    print()
    print("Testing csvimport with more difficult files")
    urd.build("test_csvimport_corner_cases")
    urd.build("test_csvimport_separators")

    print()
    print("Testing subjobs and dataset typing")
    urd.build("test_subjobs_type", datasets=dict(typed=ds, untyped=reimp_csv))
    urd.build("test_dataset_old_columns")
    # This one is so you get a more useful error message if numeric_comma is broken.
    urd.build("dataset_type", datasets=dict(source=source), options=dict(numeric_comma=True, column2type=dict(b="float64"), defaults=dict(b="0")))
    urd.build("test_dataset_type_corner_cases")

    print()
    print("Testing dataset chaining, filtering, callbacks and rechaining")
    selfchain = urd.build("test_selfchain")
    urd.build("test_rechain", jobids=dict(selfchain=selfchain))

    print()
    print("Testing dataset sorting and rehashing (with subjobs again)")
    urd.build("test_sorting")
    urd.build("test_sort_stability")
    urd.build("test_rehash")

    print()
    print("Test hashlabels")
    urd.build("test_hashlabel")

    print()
    print("Test dataset_checksum")
    urd.build("test_dataset_checksum")

    print()
    print("Test csvimport_zip")
    urd.build("test_csvimport_zip")

def main(urd):
    # Example 1.  Create a chain of datasets containing random data.
    jid_prev = None
    for n in range(5):
        jid_ds = urd.build(
            'example1_create_dataset',
            datasets=dict(previous=jid_prev),
            options=dict(approx_rows=100000, seed=n),
            name='Created_number_%s' % (n,),
        )
        jid_prev = jid_ds

    # Example 2.  Export the last dataset in the chain to a tab
    # separated textfile.
    jid_exp = urd.build(
        'csvexport',
        datasets=dict(source=jid_ds),
        options=dict(filename='random.tsv', separator='\t'),
    )
    filename = resolve_jobid_filename(jid_exp, 'random.tsv')
    print('Exported file stored in "%s"' % (filename,))

    # Example 3.  Import the tab separated textfile and type it.
    jid_imp = urd.build(
        'csvimport',
        options=dict(filename=filename, separator='\t', labelsonfirstline=True),
    )
    jid_typ = urd.build(
        'dataset_type',
        datasets=dict(source=jid_imp),
        options=dict(column2type=dict(rflt='number', rint='number')),
    )

    # Example 4.  Run a method computing the average of a column, in a
    # loop, one column at a time.  The column name is an input parameter.
    for column in Dataset(jid_typ).columns:
        jid_avg = urd.build(
            'example1_calc_average',
            datasets=dict(source=jid_typ),
            options=dict(column=column),
        )
        (s, n) = blob.load(jobid=jid_avg)
        print("Column %s: sum=%f, length=%d, average=%f" % (column, s, n, s / n))

    # Example 5.  Create a new column that is the product of two
    # existing columns.
    jid_add = urd.build(
        'example1_add_column',
        datasets=dict(source=jid_typ),
    )

    # Example 6.  Export a dataset with named columns in specified order.
    jid_add_exp = urd.build(
        'csvexport',
        datasets=dict(source=jid_add),
        options=dict(filename='prod.csv', labels=('prod', 'rflt', 'rint')),
    )

    print(urd.joblist.pretty)