def main():
    """Fetch the Criteo ``day_1.gz`` sample and hand it to ``hdfs_put``.

    The archive is downloaded with ``wget`` into ``../temp/day_1.gz`` unless
    that file, or the same path without the ``.gz`` suffix, already exists
    locally. Afterwards ``hdfs dfs -mkdir /scratch`` is run and the archive
    path is passed to ``hdfs_put``.

    Raises:
        RuntimeError: if ``wget`` exits with a non-zero status.
    """
    stub = 'azuremlsampleexperiments.blob.core.windows.net/criteo/day_1.gz'
    url = 'http://{}'.format(stub)
    path = '../temp/day_1.gz'

    already_fetched = (os.path.exists(path) or
                       os.path.exists(path.replace('.gz', '')))
    if not already_fetched:
        rc = os.system('wget {} -O {}'.format(url, path))
        if rc != 0:
            # ``StandardError`` exists only on Python 2; on Python 3 the old
            # ``raise StandardError(...)`` failed with NameError instead.
            # RuntimeError derives from StandardError on Python 2, so any
            # existing ``except StandardError`` handler still catches it.
            raise RuntimeError('Could not fetch data')

    # The exit status of mkdir is not checked (presumably because the
    # directory may already exist -- confirm).
    os.system('hdfs dfs -mkdir /scratch')
    # NOTE(review): ``rc`` is not inspected in the visible code.
    rc = hdfs_put(path)
# Load ../output/y<sparse_gb>_sparse.csv through the connection under the
# name y<sparse_gb>_sparse.
cxn.load_dense_matrix('../output/y{}_sparse.csv'.format(sparse_gb),
                      'y{}_sparse'.format(sparse_gb))

# Every entry of ../output except .gitignore and anything whose name contains
# '.log' or '.mtd'.
names = [name for name in os.listdir('../output')
         if name != '.gitignore'
         and '.log' not in name
         and '.mtd' not in name]
paths = [os.path.join('../output', name) for name in names]

# manifest.txt holds one previously uploaded path per line.
with open('manifest.txt') as fh:
    manifest = fh.read().split('\n')
already_uploaded = set(manifest)

# The context manager guarantees the append handle is closed even when
# write_sparse_meta or hdfs_put raises part-way through the loop.
with open('manifest.txt', 'a') as fh:
    for name, path in zip(names, paths):
        # Strip only the final extension. The previous two-way unpacking of
        # split('.') raised ValueError for any name that did not contain
        # exactly one dot; names with exactly one dot yield the same value.
        dest = name.rsplit('.', 1)[0]
        # Metadata is written for every file, including ones already listed
        # in the manifest.
        data.write_sparse_meta(dest, path, cxn)
        if path in already_uploaded:
            continue
        utils.hdfs_put(path)
        fh.write(path + '\n')
        # Flush after each entry so the manifest on disk tracks the uploads
        # completed so far.
        fh.flush()

# make sure git ignores these files
with open('../output/.gitignore', 'w') as fh:
    fh.write('*.csv\n*.mtd\n*.mtx')

# stop logging
end_make_logging()
# Fragment overview: defines the `cmd_args` template string, calls
# data.gen_data_disk('../temp/pass.csv', 2, 2, 2**12) (presumably generating
# that file -- confirm) and passes the same path to utils.hdfs_put; then, for
# each op, builds the M/w/N paths under ../external/disk_data and derives the
# *_hdfs variants by swapping that prefix for /scratch. Only 'GMM' uses the
# 'wide' matrix type for M and w.
# NOTE(review): the GMM branch assigns `NPath_disk` (capital P), but
# `Npath_hdfs` is computed from `Npath_disk`. Within the visible code
# `NPath_disk` is never read, so the GMM override appears to have no effect --
# looks like a typo; confirm against the rest of the loop before renaming.
# NOTE(review): the original line breaks of this fragment appear to be lost,
# and the `cmd_params_disk = {` literal continues past the visible text.
#all_files = os.listdir('../output/scale_nodes') #for s in systems: # for op in ops: # relevant_files = filter( # lambda x: (s in x) and (op in x) and (nodes in x), all_files) # map(lambda x: os.unlink('../output/scale_nodes/{}'.format(x)), # relevant_files) cmd_args = ('opType={opType} mattype={mattype}' ' Mpath={Mpath} Npath={Npath}' ' wPath={wPath} tableStub={tableStub}' ' nodes={nodes} passPath=/scratch/pass.csv' ' outdir=scale_nodes') data.gen_data_disk('../temp/pass.csv', 2, 2, 2**12) utils.hdfs_put('../temp/pass.csv') for op in ops: mattype_m = 'tall' if op != 'GMM' else 'wide' mattype_n = 'tall' Mpath_disk = '../external/disk_data/M{}_{}.csv'.format(matsize, mattype_m) wPath_disk = '../external/disk_data/w{}_{}.csv'.format(matsize, mattype_m) Npath_disk = '../external/disk_data/N{}_{}.csv'.format(matsize, mattype_n) if op == 'GMM': NPath_disk = '../external/disk_data/M{}_tall.csv'.format(matsize) Mpath_hdfs = Mpath_disk.replace('../external/disk_data', '/scratch') wPath_hdfs = wPath_disk.replace('../external/disk_data', '/scratch') Npath_hdfs = Npath_disk.replace('../external/disk_data', '/scratch') cmd_params_disk = {