예제 #1
0
파일: dbutils.py 프로젝트: yamins81/ecc
def get_tags(func,args,kwargs):
    """Resolve the output step tags declared on *func*.

    ``func.steptags`` may be a plain value or a callable; a callable is
    invoked with *args*/*kwargs* to produce the tags.  A bare string is
    wrapped into a one-element tuple.  The result is asserted to be a
    tuple of string-like values and returned.
    """
    args = () if args is None else args
    kwargs = {} if kwargs is None else kwargs

    tags = func.steptags
    # callable steptags are evaluated against the call arguments
    if hasattr(tags, '__call__'):
        tags = tags(*args, **kwargs)
    if is_string_like(tags):
        tags = (tags,)

    assert isinstance(tags, tuple) and all([is_string_like(t) for t in tags])

    return tags
예제 #2
0
 def filter_source():
     """Materialize the enclosing scope's ``filter`` specification.

     Three accepted forms: an open file-like object (rewound and
     unpickled), a path string (file contents unpickled), or an
     already-constructed object (returned unchanged).
     """
     if hasattr(filter,'read'):
         filter.seek(0)
         return cPickle.loads(filter.read())
     elif is_string_like(filter):
         # BUG FIX: close the file handle instead of leaking it
         with open(filter) as fh:
             return cPickle.loads(fh.read())
     else:
         return filter
예제 #3
0
def cross(dbname, inroots, outroots, setup=None, cleanup=None):
    """Decorator factory tagging a function as a "cross" database operation.

    Bare-string roots are normalized to single-element lists.  The
    returned decorator annotates its target with the operation metadata
    and hands the target back unchanged.
    """
    inroots = [inroots] if is_string_like(inroots) else inroots
    outroots = [outroots] if is_string_like(outroots) else outroots

    def decorate(target):
        for attr, value in (('dbname', dbname),
                            ('meta_action', cross_op),
                            ('action_name', "cross"),
                            ('inroots', inroots),
                            ('outroots', outroots),
                            ('setup', setup),
                            ('cleanup', cleanup)):
            setattr(target, attr, value)
        return target

    return decorate
예제 #4
0
def aggregate(dbname, inroots, aggregate_on, outroots, setup=None, cleanup=None):
    """Decorator factory tagging a function as an "aggregate" operation.

    Like :func:`cross`, but also records the *aggregate_on* key the
    operation groups by.  Bare-string roots become one-element lists.
    """
    inroots = [inroots] if is_string_like(inroots) else inroots
    outroots = [outroots] if is_string_like(outroots) else outroots

    def decorate(target):
        for attr, value in (('dbname', dbname),
                            ('aggregate_on', aggregate_on),
                            ('meta_action', aggregate_op),
                            ('action_name', "aggregate"),
                            ('inroots', inroots),
                            ('outroots', outroots),
                            ('setup', setup),
                            ('cleanup', cleanup)):
            setattr(target, attr, value)
        return target

    return decorate
예제 #5
0
파일: bls.py 프로젝트: govdata/govdata-core
def WgetMultiple(link, fname, maxtries=10):
    link = link if is_string_like(link) else link['URL']
    opstring = '--user-agent="Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1.7) Gecko/20091221 Firefox/3.5.7"'
    time.sleep(5)
    for i in range(maxtries):
        wget(link, fname, opstring)
        F = open(fname,'r').read().strip()
        if F.startswith('<!DOCTYPE HTML'):
            return
        else:
            print 'download of ' + link + ' failed: ' + F[:20]
            time.sleep(15)
            
    print 'download of ' + link + ' failed after ' + str(maxtries) + ' attempts'
    return
예제 #6
0
def inject(dbname, outroots, generator, setup=None, cleanup=None, caches=None):
    """Decorator factory tagging a function as an "inject" operation.

    Inject operations have no real input roots (``inroots`` is fixed to
    ``[""]``) and draw their work items from *generator*.  The *caches*
    argument is accepted but not used by this decorator.
    """
    outroots = [outroots] if is_string_like(outroots) else outroots

    def decorate(target):
        for attr, value in (('meta_action', inject_op),
                            ('action_name', "inject"),
                            ('dbname', dbname),
                            ('inroots', [""]),
                            ('outroots', outroots),
                            ('generator', generator),
                            ('setup', setup),
                            ('cleanup', cleanup)):
            setattr(target, attr, value)
        return target

    return decorate
예제 #7
0
파일: de.py 프로젝트: winnerczr/StarFlow
 def location_format_check(self,path):
     """Return whether *path* is an acceptable (string-like) location."""
     # a location is valid exactly when it is string-like
     ok = is_string_like(path)
     return ok
예제 #8
0
파일: de.py 프로젝트: winnerczr/StarFlow
 def name_format_check(self,name):
     """Return whether *name* is an acceptable (string-like) name."""
     # a name is valid exactly when it is string-like
     ok = is_string_like(name)
     return ok
예제 #9
0
파일: dbutils.py 프로젝트: yamins81/ecc
def db_computation_handler(insteptags,func,inconfig_path,outconfig_path,filebuncher = None, args = None, kwargs = None,inquery = None, outquery = None):
    """Run *func* over bunched GridFS input records and store its outputs.

    Reads input collections named by certificates for *insteptags* under
    *inconfig_path*, groups the matching records with *filebuncher*,
    calls ``func(in_fhs, outconfig_path, *args, **kwargs)`` per group,
    and writes each result into the GridFS collection for the matching
    output tag.  Finally writes certificate dicts for the output tags.

    Raises NoInputCollectionError when an expected input collection is
    missing from the database.
    """

    # default bunching: each record in its own singleton group
    if filebuncher is None:
       filebuncher = lambda X : [[x] for x in X]

    conn = connect_to_db()
    db = conn[DATA_DB_NAME]

    if is_string_like(insteptags):
        insteptags = [insteptags]

    # load the input certificates to find the GridFS collection roots
    incertpaths = get_cert_paths(insteptags,inconfig_path,inquery)
    incertdicts =  [cPickle.load(open(incertpath)) for incertpath in incertpaths]
    incolroots = [incertdict['root'] for incertdict in incertdicts]
    incolnames = [incolroot + '.files' for incolroot in incolroots]

    if inquery is None:
        inquery = {}
    if outquery is None:
        outquery = {}
    # NOTE(review): dict "<=" is NOT subset semantics (and raises
    # TypeError on Python 3); presumably this intends "inquery is
    # narrower than or equal to outquery" -- TODO confirm.
    assert inquery <= outquery

    for incolname in incolnames:
        if incolname not in db.collection_names():
            raise NoInputCollectionError(incolname)
    incols = [db[incolname] for incolname in incolnames]
    # align the per-collection query results record-by-record, then pair
    # each record with its step tag
    inrecs = zip(*[list(incol.find(inquery)) for incol in incols])
    inrecs = [zip(insteptags,rec) for rec in inrecs]

    outsteptags = get_tags(func,args,kwargs)

    outcolroots = [get_col_root(steptag = steptag,config_path = outconfig_path,conn = conn) for steptag in outsteptags]

    recgroups = filebuncher(inrecs)

    # one GridFS handle per input/output tag
    in_fs = dict([(intag,gridfs.GridFS(db,collection = incolroot)) for (intag,incolroot) in zip(insteptags,incolroots)])
    out_fs = dict([(outtag,gridfs.GridFS(db,collection = outcolroot)) for (outtag,outcolroot) in zip(outsteptags,outcolroots)])

    if kwargs is None:
       kwargs = {}
    if args is None:
       args = ()

    for recs in recgroups:

        # (tag, open GridFS file) pairs for every record in the group
        in_fhs = [[(r[0],in_fs[r[0]].get(r[1]['_id'])) for r in rec] for rec in recs]
        results = func(in_fhs,outconfig_path,*args,**kwargs)
        # normalize: bare string -> tuple, bare tuple -> one-element list
        if is_string_like(results):
            results = (results,)
        if isinstance(results,tuple):
            results = [results]

        assert len(results) == len(recs), 'something wrong in your function'
        assert all([len(result) == len(outsteptags) for result in results])

        for (rec,result) in zip(recs,results):
            for (outtag,res) in zip(outsteptags,result):
                print(rec[0][1]['_id'])
                # replace any existing output for this _id, reusing the
                # first input record's metadata as the put() kwargs
                out_fs[outtag].delete(rec[0][1]['_id'])
                out_fs[outtag].put(res,**rec[0][1])

    # record certificates so downstream steps can locate the outputs
    outcertpaths = get_cert_paths(outsteptags,outconfig_path,outquery)
    for (outcertpath,outsteptag,outcolroot) in zip(outcertpaths,outsteptags,outcolroots):
        createCertificateDict(outcertpath,{'config_path': outconfig_path, 'steptag': outsteptag, 'root' : outcolroot,'query':outquery})
예제 #10
0
파일: dbutils.py 프로젝트: yamins81/ecc
def get_cert_paths(tags,config_path,query):
    """Return the certificate file paths for *tags* under *config_path*.

    A bare-string tag is treated as a one-element list.  A non-empty
    *query* is appended to each filename as ``'__' + repr(query)``.
    """
    if is_string_like(tags):
        tags = [tags]

    suffix = '__' + repr(query) if query else ''
    paths = []
    for tag in tags:
        root = get_col_root(config_path = config_path,steptag = tag)
        paths.append(os.path.join(CERTIFICATE_ROOT, root + suffix))
    return tuple(paths)
def SummarizeMetaData(X):
	"""Render the metadata dict *X* as an HTML summary string.

	Builds fragments (image thumbnail, title, author, description,
	signature, tabular stats, column list, frequent words, keywords,
	plus a catch-all "Other MetaData" section) and joins the non-empty
	ones with ``<br/><br/>``.

	NOTE(review): mutates *X* in place when it has no 'signature' key
	(sets ``X['signature'] = ''``) -- callers see that change.
	"""
	# image thumbnail: scale so neither side exceeds K pixels
	if 'image' in X.keys():
		from PIL import Image	
		K = 125
		try:
			x = Image.open('..' + X['image'])
		except:
			print 'Importing image', X['image'], 'failed.  here is the error:'
			print_exc()
			sizetag = ''
		else:
			(w,h) = x.size
			if w > K or h > K:
				r = float(max(w,h))
				w = int(w * K/r)
				h = int(h * K/r)
			sizetag = 'width="' + str(w) + '" height="' + str(h) + '"'

		image = '<img src="' + X['image'] + '" ' + sizetag + '/><br/>'
	else:
		image = ''

	if 'description' in X.keys():
		description = '<strong>Description: </strong>' + X['description'].replace('\n','<br/>')
	else:
		description = ''
	
	if 'author' in X.keys():
		author = '<strong>Author: </strong>' + X['author']
	else:
		author = ''
	
	if 'title' in X.keys():
		title = '<strong>Title: </strong>' + X['title']
	else:
		title = ''
	
	if 'keywords' in X.keys():
		keywords = '<strong>Keywords: </strong>' + X['keywords']
	else:
		keywords = ''
	
	# signature: a file-type tag, or the common type of a directory's files
	if 'signature' in X.keys():
		if X['signature'] != 'directory':
			signature = '<strong>Signature: </strong> This appears to be a ' + X['signature'] + ' file.'  
		elif 'DIR_signature' in X.keys():
			signature = '<strong>Signature: </strong> This is a directory consisting of ' + X['DIR_signature'] + ' files.'  
		else:
			signature = ''
	else:
		signature = ''
		X['signature'] = ''

	# tabular stats: first keys ending in 'nrows'/'ncols' supply counts
	nr = [x for x in X.keys() if x.endswith('nrows')]
	nc = [x for x in X.keys() if x.endswith('ncols')]
	preamble = 'It has' if X['signature'] == 'tabular' else 'Its constituent datasets commonly have' if X['signature'] == 'directory' and 'DIR_signature' in X.keys() and X['DIR_signature'] == 'tabular' else 'This data has'
	if len(nr) > 0 and len(nc) > 0:
		ending = str(X[nr[0]]) + ' rows and ' + str(X[nc[0]]) + ' columns.'
	elif len(nr) > 0:
		ending = str(X[nr[0]]) + ' rows.'
	elif len(nc) > 0:
		ending = str(X[nc[0]]) + ' columns.'
	else:
		ending = ''
	if ending != '':
		tabulartext = preamble + ' ' + ending
	else:
		tabulartext = ''
		
	# column listing: names, optional per-column types and descriptions
	nn = [x for x in X.keys() if x.endswith('colnames')]
	if len(nn) > 0:
		names = X[nn[0]]
		# only accept a coltypes key whose length matches the names
		nt = [x for x in X.keys() if x.endswith('coltypes') and len(X[x]) == len(X[nn[0]])]
		if len(nt):
			types = [' (' + t + ')' for t in X[nt[0]]]
		else:
			types = ['']*len(names)
			
		# only accept a coldescrs dict that mentions at least one name
		nd = [x for x in X.keys() if x.endswith('coldescrs') and isinstance(X[x],dict) and set(X[x].keys()).intersection(names)]

		if len(nd) > 0:			
			descrs = X[nd[0]]
			descrs = [': ' + descrs[n] if n in descrs.keys() else '' for n in names]
		else:
			descrs = ['']*len(names)
		
		coltext = 'The columns are:<br/>' + '<br/>'.join(['<strong>'+n+'</strong>' + t + d for (n,t,d) in zip(names,types,descrs)])
	else:
		coltext = ''
		nt = []
		nd = []

	#frequentwords
	if 'frequentwords' in X.keys():
		frequentwords = 'Frequent words in this data include: ' + repr(X['frequentwords']) + '.'
	else:
		frequentwords = ''
		
	text = '<br/><br/>'.join([x for x in [image,title,author,description,signature,tabulartext,coltext,frequentwords,keywords] if x != ''])
	
	# dump any keys not already rendered above
	OtherKeys = set(X.keys()).difference(['image','coloring','description','author','title','keywords','signature','frequentwords','colformats','nfiles','npaths'] + nr + nc + nn + nt + nd)
	if OtherKeys:
		text +=  '<br/><br/><strong>Other MetaData</strong>:' + '<br/><br/>'.join(['<strong>' + k + ': </strong>' + (X[k] if is_string_like(X[k]) else repr(X[k]))  for k in OtherKeys])
	
	return text
def ConsolidateSources(metapath,objname=None,extensions=None):
	"""Collapse multi-source metadata for *metapath* into a single dict.

	``CombineSources`` yields ``{key: {extension: value}}``; this merges
	string-valued fields across sources and, for structured fields,
	picks one value by source precedence
	(Automatic > Attached > Associated > Inherited).
	"""
	consolidated = {}
	if extensions is None:
		extensions = ['Attached','Associated','Automatic','Inherited']
	combined = CombineSources(metapath,extensions=extensions)

	if 'Resources' in combined.keys():
		consolidated['Resources'] = uniqify(ListUnion(combined['Resources'].values()))

	if 'image' in combined.keys():
		# image entries may be comma-separated strings or already lists
		consolidated['image'] = ListUnion([x.split(',') if is_string_like(x) else x for x in combined['image'].values()])

	if 'author' in combined.keys():
		consolidated['author'] = '; '.join(combined['author'].values())

	if 'title' in combined.keys():
		consolidated['title'] = '; '.join(combined['title'].values())

	# prefer explicit descriptions; fall back to 'Verbose' text
	if 'description' in combined.keys():
		descrs = combined['description'].items()
		if len(descrs) == 1:
			consolidated['description'] = descrs[0][1]
		else:
			consolidated['description'] = '\n\n'.join([e + ': ' + d for (e,d) in descrs])
	elif 'Verbose' in combined.keys():
		descrs = combined['Verbose'].items()
		if len(descrs) == 1:
			consolidated['description'] = descrs[0][1]
		else:
			consolidated['description'] = '\n\n'.join([e + ': ' + d for (e,d) in descrs])

	if 'keywords' in combined.keys():
		# normalize each source's keywords to a comma-separated string,
		# then dedupe across all sources
		for k in combined['keywords'].keys():
			if not is_string_like(combined['keywords'][k]):
				combined['keywords'][k] = ','.join(combined['keywords'][k])
		consolidated['keywords'] = ','.join([x.strip() for x in uniqify((','.join(combined['keywords'].values())).split(','))])

	if 'signature' in combined.keys():
		# keep a signature only when every source agrees on it
		s = uniqify(combined['signature'].values())
		if len(s) == 1:
			consolidated['signature'] = s[0]
		else:
			consolidated['signature'] = ''

	# structured keys: take the value from the highest-precedence source
	L = ['nrows','ncols','coloring','wordassoc','colformats','coltypes','colnames','wordassoc','frequentwords','nfiles','npaths']
	LL = L + [x for x in combined.keys() if x.startswith('DIR_')]
	for x in LL:
		if x in combined.keys():
			for ext in ('Automatic','Attached','Associated','Inherited'):
				if ext in combined[x].keys():
					# BUG FIX: the 'Attached' branch previously copied
					# combined[x]['Automatic'], which is wrong (and a
					# KeyError when only 'Attached' is present)
					consolidated[x] = combined[x][ext]
					break

	if 'coldescrs' in combined.keys():
		# merge per-source column descriptions; duplicates join by newline
		coldescrs = {}
		for c in combined['coldescrs'].values():
			if isinstance(c,dict):
				for k in c.keys():
					if k in coldescrs.keys():
						coldescrs[k] += (c[k],)
					else:
						coldescrs[k] = (c[k],)
		for k in coldescrs.keys():
			coldescrs[k] = '\n'.join(coldescrs[k])
		consolidated['coldescrs'] = coldescrs

	# anything not handled above: space-join string forms of all sources
	OtherKeys = set(combined.keys()).difference(consolidated.keys())
	for k in OtherKeys:
		consolidated[k] = ' '.join([x if is_string_like(x) else repr(x) for x in combined[k].values()])

	return consolidated
예제 #13
0
def hsuck(seed,datadir,L,suffix='',write=True,ipath=None,getfunc0=None):
	"""Build (and optionally actualize) a multi-round web-crawl pipeline.

	*seed* is a URL string or a list of records; *L* is a list whose
	items (callables, or dicts with 'Parser'/'Splitter'/'Getter' keys)
	define the crawl rounds, with the last item producing the final
	catalog.  Returns the list of (name, function, args) pipeline steps;
	when *write* is true, *ipath* must be given and the steps are passed
	to ``actualize``.
	"""
	if is_string_like(seed):
		seed = [('URL',seed)]

	if getfunc0 is None:
		getfunc0 = modwget

	if suffix and not suffix.endswith('_'):
		suffix = suffix + '_'

	if not datadir.endswith('/'):
		datadir += '/'

	D = [(suffix + 'initialize',hstart,(seed,datadir,getfunc0))]

	# BUG FIX: the round counter was undefined below when L had a single
	# entry (the loop over L[:-1] never ran), raising NameError; starting
	# at 0 makes the final stage read the round-0 manifest written by the
	# initialize step.  Renamed from 'round' to avoid shadowing the builtin.
	rnd = 0
	for (i,l) in enumerate(L[:-1]):
		rnd = i+1

		oldmanifestpath = datadir + 'Manifest_' + str(rnd-1) + '.tsv'
		newmanifestpath = datadir + 'Manifest_' + str(rnd) + '.tsv'
		oldtotallinkpath = datadir + 'TotalLinks_' + str(rnd-1) + '.tsv'
		newtotallinkpath = datadir + 'TotalLinks_' + str(rnd) + '.tsv'
		olddownloaddir = datadir + 'Downloads_' + str(rnd-1) + '/'
		newdownloaddir = datadir + 'Downloads_' + str(rnd) + '/'
		Suffix = suffix + 'Round' + str(rnd) + '_'

		# a bare callable is a parser with default getter and no splitter
		if hasattr(l,'__call__'):
			Parser = l
			Getter = modwget
			splitfunc = None
			prefixlist = None
		else:
			assert isinstance(l,dict) and 'Parser' in l.keys()
			Parser = l['Parser']
			if 'Splitter' in l.keys():
				(splitfunc, prefixlist) = l['Splitter']		
			else:
				(splitfunc, prefixlist) = (None, None)
			if 'Getter' in l.keys():
				Getter = l['Getter']
			else:
				Getter = modwget

		D += [(Suffix + 'parse',applyparser,(oldmanifestpath,oldtotallinkpath,olddownloaddir,newmanifestpath,newtotallinkpath,Parser,splitfunc,prefixlist,rnd))]

		# with a splitter, the manifest is split by prefix and each piece
		# is downloaded as its own named step
		if (splitfunc != None) and (prefixlist != None):
			assert all(['/' not in p for p in prefixlist])		
			splitdir  = datadir + 'SplitManifest_' + str(rnd) + '/'
			D += [(Suffix + 'splitmanifest',applysplitter,(newmanifestpath,splitdir))]
			D += [(Suffix + 'initializedownloads',MakeDir,(newdownloaddir,))]
			D += [(Suffix + 'download_' + pathprocessor([p]).replace('!','_').replace('-','_'),applygetter,(splitdir + 'Manifest_' + pathprocessor([p]) + '.tsv',newdownloaddir + pathprocessor([p]) + '/',Getter)) for p in prefixlist]			
		else:
			D += [(Suffix + 'download',applygetter,(newmanifestpath,newdownloaddir,Getter))]

	# final round: parse the last manifest into the catalog (no new links)
	if L[-1]:
		oldmanifestpath = datadir + 'Manifest_' + str(rnd) + '.tsv'
		newmanifestpath = datadir + 'Catalog.tsv'
		oldtotallinkpath = datadir + 'TotalLinks_' + str(rnd) + '.tsv'
		olddownloaddir = datadir + 'Downloads_' + str(rnd) + '/'
		Suffix = suffix + 'Final_'
		assert hasattr(L[-1],'__call__')
		Parser = L[-1]

		D += [(Suffix + 'parse',applyparser,(oldmanifestpath,oldtotallinkpath,olddownloaddir,newmanifestpath,None,Parser,None,None,'final'))]

	if write:
		assert ipath, 'ipath must be specified'
		actualize(ipath,D)

	return D