def binpid2zip(pid, outfile, log_callback=None):
    """Generate a zip file given a canonical pid"""
    def log(msg):
        if log_callback is not None:
            log_callback(msg)
    parsed = parse_pid(pid)
    bin_pid = ''.join([parsed[NAMESPACE], parsed[BIN_LID]])
    timestamp = iso8601(strptime(parsed[TIMESTAMP], parsed[TIMESTAMP_FORMAT]))
    log('copying raw data for %s to temp files ...' % bin_pid)
    with tempfile.NamedTemporaryFile() as hdr_tmp:
        hdr_path = hdr_tmp.name
        drain(UrlSource(bin_pid+'.hdr'), LocalFileSink(hdr_path))
        hdr = parse_hdr_file(hdr_path)
    with tempfile.NamedTemporaryFile() as adc_tmp:
        adc_path = adc_tmp.name
        drain(UrlSource(bin_pid+'.adc'), LocalFileSink(adc_path))
        adc = Adc(adc_path, parsed[SCHEMA_VERSION])
        unstitched_targets = add_pids(adc.get_targets(), bin_pid)
        stitched_targets = list_stitched_targets(unstitched_targets)
    with tempfile.NamedTemporaryFile() as roi_tmp:
        roi_path = roi_tmp.name
        drain(UrlSource(bin_pid+'.roi'), LocalFileSink(roi_path))
        canonical_pid = bin_pid
        log('copied raw data for %s' % canonical_pid)
        # bin2zip arguments:
        # parsed - result of parsing pid
        # canonical_pid - canonicalized with URL prefix
        # stitched_targets - list of (stitched) targets
        # hdr - result of parsing header file
        # timestamp - timestamp (FIXME in what format?)
        # roi_path - path to ROI file (must still exist while bin2zip reads it)
        # fout - where to write resulting zip file
        log('creating zip file for %s' % bin_pid)
        with open(outfile, 'wb') as fout:
            return bin2zip(parsed, canonical_pid, stitched_targets, hdr, timestamp, roi_path, fout)
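# Usage sketch (illustrative only): the pid, output path, and callback below are
# hypothetical placeholders, not values from this codebase.
#
#   def print_progress(msg):
#       print msg
#
#   binpid2zip('http://example.org/mydata/IFCB1_2011_001_123456',
#              '/tmp/IFCB1_2011_001_123456.zip',
#              log_callback=print_progress)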
def test_integrity(self, b):
    parsed = parse_pid(b.lid)
    schema_version = parsed[SCHEMA_VERSION]
    fs = {}
    for f in b.files:
        fs[f.filetype] = f.local_path
    try:
        check_fileset(fs, schema_version)
        return True
    except Exception:
        return False
def accepts_product(product_pid):
    parsed = parse_pid(product_pid)
    namespace = parsed[NAMESPACE]
    product = parsed[PRODUCT]
    ep = '%sapi/accepts_products/%s' % (namespace, product)
    try:
        return requests.get(ep).json()[product]
    except Exception:
        return False
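# For illustration (hypothetical namespace): with a namespace of
# 'http://example.org/mydata/' and a product of 'blobs', the endpoint queried is
# 'http://example.org/mydata/api/accepts_products/blobs', and the JSON response is
# expected to carry the answer under the 'blobs' key.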
def do_webcache(pid, job):
    parsed = parse_pid(pid)
    bin_pid = ''.join([parsed[NAMESPACE], parsed[BIN_LID]])
    mosaic_base_url = '%sapi/mosaic/size/800x600/scale/0.33/page/1' % parsed[NAMESPACE]
    mosaic_json = '%s/%s.json' % (mosaic_base_url, bin_pid)
    mosaic_jpg = '%s/%s.jpg' % (mosaic_base_url, bin_pid)
    logging.warn('WEBCACHE hitting %s' % mosaic_json)
    r1 = requests.get(mosaic_json)
    json = r1.json() # read it, and throw it away
    logging.warn('WEBCACHE hitting %s' % mosaic_jpg)
    r2 = requests.get(mosaic_jpg)
    img_data = StringIO(r2.content) # read it, and throw it away
    logging.warn('WEBCACHE done for %s' % pid)
def get_product_destination(session, pid, product_type=None):
    parsed = parse_pid(pid)
    if product_type is None:
        product_type = parsed[PRODUCT]
    ts_label = parsed[TS_LABEL]
    if product_type == 'multiblob': # sidecar files for features
        product_type = 'features'
    roots = get_data_roots(session, ts_label, product_type=product_type)
    if not roots:
        raise NotFound('no product destination found')
    root = roots[0]
    S = next(get_resolver().ifcb.files.product_path(root=root, **parsed))
    return S[FILE_PATH]
def extract_features(pid, job):
    def log_callback(msg):
        logging.warn('FEATURES %s' % msg)
        client.heartbeat(pid, message=msg)
    parsed_pid = parse_pid(pid)
    bin_lid = parsed_pid[LID]
    bin_pid = ''.join([parsed_pid[NAMESPACE], parsed_pid[LID]])
    binzip_url = ''.join([bin_pid, '_binzip.zip'])
    blob_url = ''.join([bin_pid, '_blob.zip'])
    features_url = ''.join([bin_pid, '_features.csv'])
    multiblob_url = ''.join([bin_pid, '_multiblob.csv'])
    if exists(features_url):
        log_callback('skipping %s - features exist' % pid)
        return
    log_callback('computing features for %s' % pid)
    with safe_tempdir() as binzip_dir:
        # download bin zip
        binzip_path = os.path.join(binzip_dir, '%s.zip' % bin_lid)
        log_callback('downloading %s to %s' % (binzip_url, binzip_path))
        download(binzip_url, binzip_path)
        # download blob zip
        blob_path = os.path.join(binzip_dir, '%s_blob.zip' % bin_lid)
        log_callback('downloading %s to %s' % (blob_url, blob_path))
        download(blob_url, blob_path)
        # compute features
        with safe_tempdir() as job_dir:
            # output of matlab job
            feature_csv = os.path.join(job_dir, csvname(bin_pid))
            multiblob_csv = os.path.join(job_dir, 'multiblob', multiblobname(bin_pid))
            # params for matlab job
            namespace = os.path.dirname(binzip_path) + '/'
            lid = os.path.basename(binzip_path)
            matlab = Matlab(MATLAB_EXEC_PATH, MATLAB_PATH, output_callback=log_callback)
            cmd = 'bin_features(\'%s\',\'%s\',\'%s\',\'chatty\')' % (namespace, lid, job_dir + '/')
            log_callback('running %s' % cmd)
            matlab.run(cmd)
            log_callback('matlab exited')
            if os.path.exists(feature_csv):
                log_callback('features found at %s' % feature_csv)
            else:
                raise Exception('no features found')
            log_callback('uploading %s' % features_url)
            upload(feature_csv, features_url)
            if os.path.exists(multiblob_csv):
                log_callback('multiblob found at %s' % multiblob_csv)
                log_callback('uploading %s' % multiblob_url)
                upload(multiblob_csv, multiblob_url)
            log_callback('complete')
    client.wakeup()
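# Worked example of the MATLAB command built above (hypothetical paths): if
# binzip_path is '/tmp/binzips/IFCB1_2011_001_123456.zip' and job_dir is '/tmp/job',
# then namespace is '/tmp/binzips/', lid is 'IFCB1_2011_001_123456.zip', and cmd is:
#   bin_features('/tmp/binzips/','IFCB1_2011_001_123456.zip','/tmp/job/','chatty')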
def do_binzip(pid, job):
    def log_callback(msg):
        logging.warn('BINZIP %s' % msg)
        client.heartbeat(pid, message=msg)
    parsed = parse_pid(pid)
    # construct binzip URL
    binzip_url = '%s%s_binzip.zip' % (parsed[NAMESPACE], parsed[BIN_LID])
    log_callback('creating zipfile for %s' % pid)
    with tempfile.NamedTemporaryFile() as zip_tmp:
        zip_path = zip_tmp.name
        binpid2zip(pid, zip_path)
        log_callback('depositing %s' % binzip_url)
        upload(zip_path, binzip_url)
        log_callback('deposited %s' % binzip_url)
    client.wakeup()
def do_acc(pid, job):
    parsed = parse_pid(pid)
    lid = parsed[LID]
    ts_label = parsed[TS_LABEL]
    roots = get_data_roots(session, ts_label) # get raw data roots
    fileset = parsed_pid2fileset(parsed, roots)
    fileset[LID] = lid
    session.expire_all() # don't be stale!
    acc = Accession(session, ts_label) #,fast=True) # FIXME fast=True disables checksumming
    client.update(pid, ttl=3600) # allow 1hr for accession
    ret = acc.add_fileset(fileset)
    if ret == 'ADDED':
        schedule_products(pid, client)
        session.commit()
        client.wakeup()
    elif ret == 'FAILED':
        raise Exception('accession failed')
def extract_blobs(pid, job):
    def log_callback(msg):
        logging.warn('BLOBS %s' % msg)
        client.heartbeat(pid, message=msg)
    parsed_pid = parse_pid(pid)
    bin_lid = parsed_pid[LID]
    bin_pid = ''.join([parsed_pid[NAMESPACE], parsed_pid[LID]])
    binzip_url = ''.join([bin_pid, '_binzip.zip'])
    binzip_file = os.path.basename(binzip_url)
    deposit_url = '%s_blobs.zip' % bin_pid
    if exists(deposit_url):
        log_callback('skipping %s - blobs exist' % pid)
        return
    log_callback('computing blobs for %s' % pid)
    with safe_tempdir() as binzip_dir:
        # first, copy the zipfile to a temp dir
        binzip_path = os.path.join(binzip_dir, '%s.zip' % bin_lid)
        log_callback('downloading %s to %s' % (binzip_url, binzip_path))
        download(binzip_url, binzip_path)
        # now run bin_blobs
        with safe_tempdir() as job_dir:
            # configure matlab
            matlab = Matlab(MATLAB_EXEC_PATH, MATLAB_PATH, output_callback=log_callback)
            # run command
            blobs_file = os.path.join(job_dir, blob_zip_name(bin_pid))
            cmd = 'bin_blobs(\'%s\',\'%s\',\'%s\')' % (bin_pid, binzip_path, job_dir)
            log_callback('running %s' % cmd)
            matlab.run(cmd)
            log_callback('MATLAB done, checking for %s' % blobs_file)
            if not os.path.exists(blobs_file):
                raise Exception('missing output file')
            log_callback('depositing %s' % blobs_file)
            upload(blobs_file, deposit_url)
            log_callback('deposited %s' % blobs_file)
    log_callback('completed %s' % bin_pid)
    client.wakeup()
def accession_demo(session, ts_label, root):
    # now accede
    for fs in get_resolver().ifcb.files.list_raw_filesets(root):
        lid = fs['lid']
        try:
            parsed = parse_pid(lid)
        except:
            print 'barf %s' % lid
            raise
        ts = text2utcdatetime(parsed['timestamp'], parsed['timestamp_format'])
        b = Bin(ts_label=ts_label, lid=lid, sample_time=ts)
        session.add(b)
        # now make mostly bogus fixity entries
        now = datetime.now()
        paths = [fs['hdr_path'], fs['adc_path'], fs['roi_path']]
        filetypes = ['hdr', 'adc', 'roi']
        for path, filetype in zip(paths, filetypes):
            length = os.stat(path).st_size
            name = os.path.basename(path)
            #checksum = sha1_file(path)
            checksum = 'placeholder'
            f = File(local_path=path, filename=name, length=length,
                     filetype=filetype, sha1=checksum, fix_time=now)
            b.files.append(f)
    session.commit()
def new_bin(self, lid):
    parsed = parse_pid(lid)
    sample_time = get_timestamp(parsed)
    return Bin(ts_label=self.ts_label, lid=lid, sample_time=sample_time)
def pid2fileset(pid, roots):
    parsed_pid = parse_pid(pid)
    return parsed_pid2fileset(parsed_pid, roots)