def get_tags(func, args, kwargs):
    if args is None:
        args = ()
    if kwargs is None:
        kwargs = {}
    if hasattr(func.steptags, '__call__'):
        outsteptags = func.steptags(*args, **kwargs)
    else:
        outsteptags = func.steptags
    if is_string_like(outsteptags):
        outsteptags = (outsteptags,)
    assert isinstance(outsteptags, tuple) and all([is_string_like(x) for x in outsteptags])
    return outsteptags

def filter_source(filter):
    # The body references `filter` as an argument; without the parameter it
    # would silently fall through to the builtin. Accepts an open file handle,
    # a path to a pickle file, or an already-unpickled object.
    if hasattr(filter, 'read'):
        filter.seek(0)
        return cPickle.loads(filter.read())
    elif is_string_like(filter):
        return cPickle.loads(open(filter).read())
    else:
        return filter

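# A minimal usage sketch for filter_source, assuming the `filter` parameter
# fix above; 'query.pkl' is an illustrative filename, not part of this module:
#
#   q = filter_source({'name': 'x'})        # non-string, non-file: passed through
#   q = filter_source('query.pkl')          # path string: unpickled from disk
#   q = filter_source(open('query.pkl'))    # open handle: rewound, then unpickled
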
def cross(dbname, inroots, outroots, setup=None, cleanup=None):
    if is_string_like(inroots):
        inroots = [inroots]
    if is_string_like(outroots):
        outroots = [outroots]

    def func(f):
        f.dbname = dbname
        f.meta_action = cross_op
        f.action_name = "cross"
        f.inroots = inroots
        f.outroots = outroots
        f.setup = setup
        f.cleanup = cleanup
        return f
    return func

def aggregate(dbname, inroots, aggregate_on, outroots, setup=None, cleanup=None):
    if is_string_like(inroots):
        inroots = [inroots]
    if is_string_like(outroots):
        outroots = [outroots]

    def func(f):
        f.dbname = dbname
        f.aggregate_on = aggregate_on
        f.meta_action = aggregate_op
        f.action_name = "aggregate"
        f.inroots = inroots
        f.outroots = outroots
        f.setup = setup
        f.cleanup = cleanup
        return f
    return func

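# A hedged sketch of how these decorator factories are meant to be applied;
# the database name, roots, and the decorated function are hypothetical,
# not part of this module:
#
#   @aggregate('census_db', inroots='raw', aggregate_on='state', outroots='by_state')
#   def summarize(in_fhs, outconfig_path):
#       ...
#
# After decoration, summarize.meta_action is aggregate_op, summarize.action_name
# is 'aggregate', and single-string roots are promoted to lists
# (summarize.inroots == ['raw']).
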
def WgetMultiple(link, fname, maxtries=10):
    link = link if is_string_like(link) else link['URL']
    opstring = '--user-agent="Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1.7) Gecko/20091221 Firefox/3.5.7"'
    time.sleep(5)
    for i in range(maxtries):
        wget(link, fname, opstring)
        F = open(fname, 'r').read().strip()
        if F.startswith('<!DOCTYPE HTML'):
            return
        else:
            print 'download of ' + link + ' failed: ' + F[:20]
            time.sleep(15)
    print 'download of ' + link + ' failed after ' + str(maxtries) + ' attempts'
    return

def inject(dbname, outroots, generator, setup=None, cleanup=None, caches=None):
    if is_string_like(outroots):
        outroots = [outroots]

    def func(f):
        f.meta_action = inject_op
        f.action_name = "inject"
        f.dbname = dbname
        f.inroots = [""]
        f.outroots = outroots
        f.generator = generator
        f.setup = setup
        f.cleanup = cleanup
        return f
    return func

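# Likewise for inject, which seeds a pipeline from a generator rather than
# from existing input roots (the names below are hypothetical):
#
#   @inject('census_db', 'raw', generator=url_generator)
#   def fetch(in_fhs, outconfig_path):
#       ...
#
# The decorated function gets f.inroots == [""] since an inject step has no
# upstream collection to read from.
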
def location_format_check(self, path):
    return is_string_like(path)

def name_format_check(self, name):
    return is_string_like(name)

def db_computation_handler(insteptags, func, inconfig_path, outconfig_path,
                           filebuncher=None, args=None, kwargs=None,
                           inquery=None, outquery=None):
    """Apply `func` to groups of input records drawn from GridFS collections,
    write its outputs back, and record certificates for the output steps."""
    if filebuncher is None:
        filebuncher = lambda X: [[x] for x in X]
    conn = connect_to_db()
    db = conn[DATA_DB_NAME]
    if is_string_like(insteptags):
        insteptags = [insteptags]
    incertpaths = get_cert_paths(insteptags, inconfig_path, inquery)
    incertdicts = [cPickle.load(open(incertpath)) for incertpath in incertpaths]
    incolroots = [incertdict['root'] for incertdict in incertdicts]
    incolnames = [incolroot + '.files' for incolroot in incolroots]
    if inquery is None:
        inquery = {}
    if outquery is None:
        outquery = {}
    # Presumably a subset-style check that outquery refines inquery
    # (Python 2 dict comparison).
    assert inquery <= outquery
    for incolname in incolnames:
        if incolname not in db.collection_names():
            raise NoInputCollectionError(incolname)
    incols = [db[incolname] for incolname in incolnames]
    inrecs = zip(*[list(incol.find(inquery)) for incol in incols])
    inrecs = [zip(insteptags, rec) for rec in inrecs]
    outsteptags = get_tags(func, args, kwargs)
    outcolroots = [get_col_root(steptag=steptag, config_path=outconfig_path, conn=conn)
                   for steptag in outsteptags]
    recgroups = filebuncher(inrecs)
    in_fs = dict([(intag, gridfs.GridFS(db, collection=incolroot))
                  for (intag, incolroot) in zip(insteptags, incolroots)])
    out_fs = dict([(outtag, gridfs.GridFS(db, collection=outcolroot))
                   for (outtag, outcolroot) in zip(outsteptags, outcolroots)])
    if kwargs is None:
        kwargs = {}
    if args is None:
        args = ()
    for recs in recgroups:
        in_fhs = [[(r[0], in_fs[r[0]].get(r[1]['_id'])) for r in rec] for rec in recs]
        results = func(in_fhs, outconfig_path, *args, **kwargs)
        if is_string_like(results):
            results = (results,)
        if isinstance(results, tuple):
            results = [results]
        assert len(results) == len(recs), 'something wrong in your function'
        assert all([len(result) == len(outsteptags) for result in results])
        for (rec, result) in zip(recs, results):
            for (outtag, res) in zip(outsteptags, result):
                print(rec[0][1]['_id'])
                out_fs[outtag].delete(rec[0][1]['_id'])
                out_fs[outtag].put(res, **rec[0][1])
    outcertpaths = get_cert_paths(outsteptags, outconfig_path, outquery)
    for (outcertpath, outsteptag, outcolroot) in zip(outcertpaths, outsteptags, outcolroots):
        createCertificateDict(outcertpath, {'config_path': outconfig_path,
                                            'steptag': outsteptag,
                                            'root': outcolroot,
                                            'query': outquery})

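# Sketch of the contract db_computation_handler imposes on `func`, inferred
# from the code above (the tag name and function body are illustrative):
#
#   def clean_step(in_fhs, outconfig_path):
#       # in_fhs: one list per input record group, each a list of
#       # (steptag, GridFS file handle) pairs
#       outputs = []
#       for rec in in_fhs:
#           data = rec[0][1].read()
#           outputs.append((data,))   # one entry per output steptag
#       return outputs                # one tuple per record group
#   clean_step.steptags = ('clean',)  # or a callable of (*args, **kwargs)
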
def get_cert_paths(tags, config_path, query):
    if is_string_like(tags):
        tags = [tags]
    qstr = '__' + repr(query) if query else ''
    return tuple([os.path.join(CERTIFICATE_ROOT,
                               get_col_root(config_path=config_path, steptag=tag) + qstr)
                  for tag in tags])

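# For example (illustrative values): with CERTIFICATE_ROOT == '/certs' and
# get_col_root(...) returning 'mydb__step1',
#
#   get_cert_paths('step1', 'conf_path', {'year': 2009})
#
# would return ("/certs/mydb__step1__{'year': 2009}",), the repr of the query
# being appended so that differently-filtered runs get distinct certificates.
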
def SummarizeMetaData(X):
    """Render an HTML summary of the metadata dictionary X."""
    if 'image' in X.keys():
        from PIL import Image
        K = 125
        try:
            x = Image.open('..' + X['image'])
        except:
            print 'Importing image', X['image'], 'failed. here is the error:'
            print_exc()
            sizetag = ''
        else:
            (w, h) = x.size
            if w > K or h > K:
                # Scale the thumbnail so its longer side is at most K pixels.
                r = float(max(w, h))
                w = int(w * K / r)
                h = int(h * K / r)
            sizetag = 'width="' + str(w) + '" height="' + str(h) + '"'
        image = '<img src="' + X['image'] + '" ' + sizetag + '/><br/>'
    else:
        image = ''
    if 'description' in X.keys():
        description = '<strong>Description: </strong>' + X['description'].replace('\n', '<br/>')
    else:
        description = ''
    if 'author' in X.keys():
        author = '<strong>Author: </strong>' + X['author']
    else:
        author = ''
    if 'title' in X.keys():
        title = '<strong>Title: </strong>' + X['title']
    else:
        title = ''
    if 'keywords' in X.keys():
        keywords = '<strong>Keywords: </strong>' + X['keywords']
    else:
        keywords = ''
    if 'signature' in X.keys():
        if X['signature'] != 'directory':
            signature = '<strong>Signature: </strong> This appears to be a ' + X['signature'] + ' file.'
        elif 'DIR_signature' in X.keys():
            signature = '<strong>Signature: </strong> This is a directory consisting of ' + X['DIR_signature'] + ' files.'
        else:
            signature = ''
    else:
        signature = ''
        X['signature'] = ''
    nr = [x for x in X.keys() if x.endswith('nrows')]
    nc = [x for x in X.keys() if x.endswith('ncols')]
    if X['signature'] == 'tabular':
        preamble = 'It has'
    elif X['signature'] == 'directory' and 'DIR_signature' in X.keys() and X['DIR_signature'] == 'tabular':
        preamble = 'Its constituent datasets commonly have'
    else:
        preamble = 'This data has'
    if len(nr) > 0 and len(nc) > 0:
        ending = str(X[nr[0]]) + ' rows and ' + str(X[nc[0]]) + ' columns.'
    elif len(nr) > 0:
        ending = str(X[nr[0]]) + ' rows.'
    elif len(nc) > 0:
        ending = str(X[nc[0]]) + ' columns.'
    else:
        ending = ''
    if ending != '':
        tabulartext = preamble + ' ' + ending
    else:
        tabulartext = ''
    nn = [x for x in X.keys() if x.endswith('colnames')]
    if len(nn) > 0:
        names = X[nn[0]]
        nt = [x for x in X.keys() if x.endswith('coltypes') and len(X[x]) == len(X[nn[0]])]
        if len(nt):
            types = [' (' + t + ')' for t in X[nt[0]]]
        else:
            types = [''] * len(names)
        nd = [x for x in X.keys() if x.endswith('coldescrs') and isinstance(X[x], dict)
              and set(X[x].keys()).intersection(names)]
        if len(nd) > 0:
            descrs = X[nd[0]]
            descrs = [': ' + descrs[n] if n in descrs.keys() else '' for n in names]
        else:
            descrs = [''] * len(names)
        coltext = 'The columns are:<br/>' + '<br/>'.join(
            ['<strong>' + n + '</strong>' + t + d for (n, t, d) in zip(names, types, descrs)])
    else:
        coltext = ''
        nt = []
        nd = []
    # frequentwords
    if 'frequentwords' in X.keys():
        frequentwords = 'Frequent words in this data include: ' + repr(X['frequentwords']) + '.'
    else:
        frequentwords = ''
    text = '<br/><br/>'.join([x for x in [image, title, author, description, signature,
                                          tabulartext, coltext, frequentwords, keywords] if x != ''])
    OtherKeys = set(X.keys()).difference(
        ['image', 'coloring', 'description', 'author', 'title', 'keywords', 'signature',
         'frequentwords', 'colformats', 'nfiles', 'npaths'] + nr + nc + nn + nt + nd)
    if OtherKeys:
        text += '<br/><br/><strong>Other MetaData</strong>:' + '<br/><br/>'.join(
            ['<strong>' + k + ': </strong>' + (X[k] if is_string_like(X[k]) else repr(X[k]))
             for k in OtherKeys])
    return text

def ConsolidateSources(metapath, objname=None, extensions=None):
    """Merge per-source metadata from CombineSources into one consolidated dict."""
    consolidated = {}
    if extensions is None:
        extensions = ['Attached', 'Associated', 'Automatic', 'Inherited']
    combined = CombineSources(metapath, extensions=extensions)
    if 'Resources' in combined.keys():
        consolidated['Resources'] = uniqify(ListUnion(combined['Resources'].values()))
    if 'image' in combined.keys():
        consolidated['image'] = ListUnion([x.split(',') if is_string_like(x) else x
                                           for x in combined['image'].values()])
    if 'author' in combined.keys():
        consolidated['author'] = '; '.join(combined['author'].values())
    if 'title' in combined.keys():
        consolidated['title'] = '; '.join(combined['title'].values())
    if 'description' in combined.keys():
        descrs = combined['description'].items()
        if len(descrs) == 1:
            consolidated['description'] = descrs[0][1]
        else:
            consolidated['description'] = '\n\n'.join([e + ': ' + d for (e, d) in descrs])
    elif 'Verbose' in combined.keys():
        descrs = combined['Verbose'].items()
        if len(descrs) == 1:
            consolidated['description'] = descrs[0][1]
        else:
            consolidated['description'] = '\n\n'.join([e + ': ' + d for (e, d) in descrs])
    if 'keywords' in combined.keys():
        for k in combined['keywords'].keys():
            if not is_string_like(combined['keywords'][k]):
                combined['keywords'][k] = ','.join(combined['keywords'][k])
        consolidated['keywords'] = ','.join(
            [x.strip() for x in uniqify((','.join(combined['keywords'].values())).split(','))])
    if 'signature' in combined.keys():
        s = uniqify(combined['signature'].values())
        if len(s) == 1:
            consolidated['signature'] = s[0]
        else:
            consolidated['signature'] = ''
    L = ['nrows', 'ncols', 'coloring', 'wordassoc', 'colformats', 'coltypes',
         'colnames', 'frequentwords', 'nfiles', 'npaths']
    LL = L + [x for x in combined.keys() if x.startswith('DIR_')]
    # Take each value from the highest-priority extension that provides it.
    # (The 'Attached' branch originally re-read 'Automatic'; fixed here.)
    for x in LL:
        if x in combined.keys() and 'Automatic' in combined[x].keys():
            consolidated[x] = combined[x]['Automatic']
        elif x in combined.keys() and 'Attached' in combined[x].keys():
            consolidated[x] = combined[x]['Attached']
        elif x in combined.keys() and 'Associated' in combined[x].keys():
            consolidated[x] = combined[x]['Associated']
        elif x in combined.keys() and 'Inherited' in combined[x].keys():
            consolidated[x] = combined[x]['Inherited']
    if 'coldescrs' in combined.keys():
        coldescrs = {}
        for c in combined['coldescrs'].values():
            if isinstance(c, dict):
                for k in c.keys():
                    if k in coldescrs.keys():
                        coldescrs[k] += (c[k],)
                    else:
                        coldescrs[k] = (c[k],)
        for k in coldescrs.keys():
            coldescrs[k] = '\n'.join(coldescrs[k])
        consolidated['coldescrs'] = coldescrs
    OtherKeys = set(combined.keys()).difference(consolidated.keys())
    for k in OtherKeys:
        consolidated[k] = ' '.join([x if is_string_like(x) else repr(x)
                                    for x in combined[k].values()])
    return consolidated

def hsuck(seed, datadir, L, suffix='', write=True, ipath=None, getfunc0=None):
    """Build (and optionally actualize) the step list for a multi-round crawl.

    Each element of L[:-1] describes one parse/download round; a truthy final
    element is a parser that produces the Catalog.tsv.
    """
    if is_string_like(seed):
        seed = [('URL', seed)]
    if getfunc0 is None:
        getfunc0 = modwget
    if suffix and not suffix.endswith('_'):
        suffix = suffix + '_'
    if not datadir.endswith('/'):
        datadir += '/'
    D = [(suffix + 'initialize', hstart, (seed, datadir, getfunc0))]
    for (i, l) in enumerate(L[:-1]):
        round = i + 1
        oldmanifestpath = datadir + 'Manifest_' + str(round - 1) + '.tsv'
        newmanifestpath = datadir + 'Manifest_' + str(round) + '.tsv'
        oldtotallinkpath = datadir + 'TotalLinks_' + str(round - 1) + '.tsv'
        newtotallinkpath = datadir + 'TotalLinks_' + str(round) + '.tsv'
        olddownloaddir = datadir + 'Downloads_' + str(round - 1) + '/'
        newdownloaddir = datadir + 'Downloads_' + str(round) + '/'
        Suffix = suffix + 'Round' + str(round) + '_'
        if hasattr(l, '__call__'):
            # A bare callable is shorthand for a parser with default getter.
            Parser = l
            Getter = modwget
            splitfunc = None
            prefixlist = None
        else:
            assert isinstance(l, dict) and 'Parser' in l.keys()
            Parser = l['Parser']
            if 'Splitter' in l.keys():
                (splitfunc, prefixlist) = l['Splitter']
            else:
                (splitfunc, prefixlist) = (None, None)
            if 'Getter' in l.keys():
                Getter = l['Getter']
            else:
                Getter = modwget
        D += [(Suffix + 'parse', applyparser,
               (oldmanifestpath, oldtotallinkpath, olddownloaddir, newmanifestpath,
                newtotallinkpath, Parser, splitfunc, prefixlist, round))]
        if (splitfunc != None) and (prefixlist != None):
            # Shard the manifest by prefix and download each shard separately.
            assert all(['/' not in p for p in prefixlist])
            splitdir = datadir + 'SplitManifest_' + str(round) + '/'
            D += [(Suffix + 'splitmanifest', applysplitter, (newmanifestpath, splitdir))]
            D += [(Suffix + 'initializedownloads', MakeDir, (newdownloaddir,))]
            D += [(Suffix + 'download_' + pathprocessor([p]).replace('!', '_').replace('-', '_'),
                   applygetter,
                   (splitdir + 'Manifest_' + pathprocessor([p]) + '.tsv',
                    newdownloaddir + pathprocessor([p]) + '/', Getter))
                  for p in prefixlist]
        else:
            D += [(Suffix + 'download', applygetter, (newmanifestpath, newdownloaddir, Getter))]
    if L[-1]:
        oldmanifestpath = datadir + 'Manifest_' + str(round) + '.tsv'
        newmanifestpath = datadir + 'Catalog.tsv'
        oldtotallinkpath = datadir + 'TotalLinks_' + str(round) + '.tsv'
        olddownloaddir = datadir + 'Downloads_' + str(round) + '/'
        Suffix = suffix + 'Final_'
        assert hasattr(L[-1], '__call__')
        Parser = L[-1]
        D += [(Suffix + 'parse', applyparser,
               (oldmanifestpath, oldtotallinkpath, olddownloaddir, newmanifestpath,
                None, Parser, None, None, 'final'))]
    if write:
        assert ipath, 'ipath must be specified'
        actualize(ipath, D)
    return D

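# A hedged sketch of the crawl spec `L` consumed above; the parser, splitter,
# and prefix names are hypothetical:
#
#   L = [parse_index,                                   # round 1: bare parser, modwget getter
#        {'Parser': parse_listing,                      # round 2: dict form
#         'Splitter': (split_by_state, ['AL', 'AK']),   # shard manifest by prefix
#         'Getter': WgetMultiple},
#        make_catalog]                                  # final parser -> Catalog.tsv
#   steps = hsuck('http://example.com/index.html', 'Data/', L, write=False)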