def updateMetacollection(iterator, metacollection, incremental, versionNumber, totalVariables, tcs, spcs, volumes, dimensions, times, locations, varFormats):
    """Write per-subcollection metadata records for one version of a collection.

    Builds the metadata documents held on ``iterator.metadata`` (the '' key is
    the whole-collection record), attaches derived fields (title, value/name
    processors, volumes, dimensions, common dates/locations), optionally merges
    in the previous version's metadata when running incrementally, and inserts
    one record per subcollection into ``metacollection``.

    Parameters
    ----------
    iterator : parser object exposing ``.metadata`` and ``.columnGroups``
    metacollection : PyMongo collection holding metadata records
    incremental : bool -- if True, merge with version ``versionNumber - 1``
    versionNumber : int -- version stamp written onto every record
    totalVariables : list of all column names for this version
    tcs, spcs : time / space column-name lists (currently unused here;
        kept for interface compatibility with the caller)
    volumes, dimensions, times, locations : per-subcollection stats dicts,
        keyed by subcollection name ('' = whole collection)
    varFormats : dict of per-variable format information
    """
    checkMetadata(iterator)
    metadata = iterator.metadata

    # Whole-collection record: record the column list and normalize the
    # source description to an ordered SON document.
    metadata['']['columns'] = totalVariables
    metadata['']['source'] = pm.son.SON(metadata['']['source'])
    metadata['']['title'] = metadata['']['source']['dataset']['name']
    metadata['']['shortTitle'] = metadata['']['source']['dataset']['shortName']

    # Client-side value processors: JS snippets evaluated by the front end to
    # render raw time/space values as human-readable phrases.
    value_processors = metadata[''].get('valueProcessors', {})
    if metadata['']['columnGroups'].get('timeColumns', None):
        value_processors['timeColumns'] = 'return require("timedate").phrase(value);'
    if metadata['']['columnGroups'].get('spaceColumns', None):
        value_processors['spaceColumns'] = 'return require("location").phrase(value);'
    metadata['']['valueProcessors'] = value_processors

    # Name processors: render time-valued column *names* using the dataset's
    # declared date format.  NOTE(review): assumes 'dateFormat' is present
    # whenever 'timeColNames' is -- confirm against parser contract.
    name_processors = metadata[''].get('nameProcessors', {})
    if metadata['']['columnGroups'].get('timeColNames', None):
        dateFormat = metadata['']['dateFormat']
        name_processors['timeColNames'] = 'return require("timedate").phrase(require("timedate").stringtomongo(value,"' + dateFormat + '"));'
    metadata['']['nameProcessors'] = name_processors

    # (name, versionNumber) uniquely identifies a metadata record.
    metacollection.ensure_index([('name', pm.DESCENDING), ('versionNumber', pm.DESCENDING)], unique=True)

    metadata['']['varFormats'] = varFormats

    # Attach per-subcollection statistics computed during parsing.
    for k in metadata.keys():
        metadata[k]['volume'] = volumes[k]
        metadata[k]['dimensions'] = dimensions[k]
        getCommonDatesLocations(iterator, metadata, times, locations, dimensions, k)

    if incremental:
        # Carry forward anything from the previous version that this
        # increment did not touch.
        previousMetadata = dict([(p["name"], p) for p in metacollection.find({'versionNumber': versionNumber - 1})])
        if previousMetadata:
            for x in previousMetadata.values():
                x.pop('_id')  # never re-insert the old Mongo ids
            # Whole subcollections missing from this increment.
            for k in previousMetadata.keys():
                if k not in metadata.keys():
                    metadata[k] = previousMetadata[k]
            # Top-level keys of the whole-collection record.
            for k in previousMetadata[''].keys():
                if k not in metadata[''].keys():
                    metadata[''][k] = previousMetadata[''][k]
            # Column groups: carry forward missing groups, extend existing
            # ones with the previous version's entries.
            for k in previousMetadata['']['columnGroups'].keys():
                if k not in metadata['']['columnGroups'].keys():
                    metadata['']['columnGroups'][k] = previousMetadata['']['columnGroups'][k]
                else:
                    metadata['']['columnGroups'][k] += previousMetadata['']['columnGroups'][k]

    # Stamp and insert one record per subcollection.
    for k in metadata.keys():
        x = metadata[k]
        x['name'] = k
        x['versionNumber'] = versionNumber
        # Was `id = metacollection.insert(...)`: the returned id was unused
        # and shadowed the builtin, so it is dropped.
        metacollection.insert(x, safe=True)
def updateCollection(download_dir,collectionName,parserClass,checkpath,certpath,parserArgs=None,parserKwargs=None,incremental=False): connection = pm.Connection(document_class=pm.son.SON) source_metadata = get_source_data(collectionName) db = connection['govdata'] assert not '__' in collectionName, 'collectionName must not contain consecutive underscores' metaCollectionName = '__' + collectionName + '__' versionName = '__' + collectionName + '__VERSIONS__' sliceDBName = '__' + collectionName + '__SLICES__' collection = db[collectionName] metacollection = db[metaCollectionName] versions = db[versionName] sliceDB = db[sliceDBName] if incremental: if versionName not in db.collection_names(): startInc = 0 else: startInc = get_max_increment_fromDB(versions) + 1 endInc = get_max_increment(download_dir) sources = [increment_format(download_dir,i) for i in range(startInc,endInc + 1)] else: sources = [download_dir] startInc = endInc = None if parserArgs == None: parserArgs = () if parserKwargs == None: parserKwargs = {} if sources: iterator = parserClass(sources[0],*parserArgs,**parserKwargs) iterator.set_source_metadata(source_metadata) uniqueIndexes = iterator.uniqueIndexes ColumnGroups = iterator.columnGroups sliceColTuples = getSliceColTuples(iterator.sliceCols) sliceColTuplesFlat = uniqify([tuple(sorted(uniqify(Flatten(sct)))) for sct in sliceColTuples]) sliceColList = uniqify(Flatten(ListUnion(sliceColTuples))) ContentCols = set(sliceColList + getContentCols(iterator)) if hasattr(iterator,'dateFormat'): TimeFormatter = td.mongotimeformatter(iterator.dateFormat) if collectionName in db.collection_names(): versionNumber = max(versions.distinct('versionNumber')) + 1 storedAllMetadata = metacollection.find_one({'name':'','versionNumber':versionNumber-1}) totalVariables = storedAllMetadata['columns'] VarMap = dict(zip(totalVariables,[str(x) for x in range(len(totalVariables))])) #check things are the same #and check consistent do so for all soruces else: versionNumber = 0 
IndexCols = uniqify([x for x in ['subcollections'] + sliceColList + ListUnion([ColGroupsFlatten(ColumnGroups,k) for k in ['indexColumns','labelColumns','timeColumns','spaceColumns']]) if x not in uniqueIndexes]) totalVariables = SPECIAL_KEYS + uniqueIndexes + IndexCols assert not any(['.' in x or ('__' in x and x not in SPECIAL_KEYS) or x in ColumnGroups.keys() for x in totalVariables]) VarMap = dict(zip(totalVariables,map(str,range(len(totalVariables))))) cols = zip([VarMap[c] for c in uniqueIndexes + ['__versionNumber__']],[pm.DESCENDING]*(len(uniqueIndexes) + 1)) collection.ensure_index(cols,unique=True,dropDups=True) for col in IndexCols: collection.ensure_index(VarMap[col]) sliceDB.ensure_index('slice',unique=True,dropDups=True) vNInd = VarMap['__versionNumber__'] retInd = VarMap['__retained__'] specialKeyInds = [VarMap[k] for k in SPECIAL_KEYS] if 'timeColumns' in iterator.columnGroups.keys(): tcs = iterator.columnGroups['timeColumns'] else: tcs = [] if 'spaceColumns' in iterator.columnGroups.keys(): spcs = iterator.columnGroups['spaceColumns'] else: spcs = [] toParse = ListUnion([RecursiveFileList(source + '__PARSE__') for source in sources]) oldc = None SpaceCache = {} volumes = {'':0} dimensions = {'':[]} times = {'':[]} locations = {'':[]} varFormats = {} for file in toParse: iterator.refresh(file) checkMetadata(iterator) tcs = iterator.columnGroups.get('timeColumns',[]) spcs = iterator.columnGroups.get('spaceColumns',[]) index = 0 for c in iterator: newVars = [x for x in c.keys() if not x in totalVariables] assert not any (['__' in x or '.' in x or x in ColumnGroups.keys() for x in newVars]) , '__ and . must not appear in key names.' 
totalVariables += newVars VarMap.update(dict(zip(newVars,map(str,range(len(totalVariables) - len(newVars),len(totalVariables)))))) for tc in tcs: #time handling if tc in c.keys(): c[tc] = TimeFormatter(c[tc]) if COMPLETE_SPACE: for spc in spcs: if spc in c.keys(): #space t = getT(c[spc]) if t in SpaceCache.keys(): c[spc] = SpaceCache[t].copy() else: c[spc] = loc.SpaceComplete(c[spc]) SpaceCache[t] = c[spc].copy() if index % 100 == 0: print 'At', index index += 1 sctf = processSct(sliceColTuplesFlat,oldc,c) processRecord(c,collection,VarMap,totalVariables,uniqueIndexes,versionNumber,specialKeyInds,incremental,sliceDB,sctf,ContentCols) incrementThings(c,volumes,dimensions,times,locations,varFormats,tcs,spcs) oldc = c any_deleted = False if incremental: collection.update({vNInd:{'$lte': versionNumber - 1}}, {'$set':{vNInd:versionNumber}}) sliceDB.update({},{'$set':{'version':versionNumber}}) else: deleted = collection.find({vNInd:versionNumber - 1, retInd : {'$exists':False}}) for d in deleted: any_deleted = True sliceDelete(d,collection,sliceColTuples,VarMap,sliceDB,version) if any_deleted: subColInd = str(totalVariables.index('Subcollections')) subcols = [''] + uniqify(ListUnion(collection.distinct(subColInd))) for sc in subcols: volumes[sc] = collection.find({subColInd:sc}).count() dimensions[sc] = [k for k in totalVariables if collection.find_one({subColInd:sc,str(totalVariables.index(k)):{'$exists':True}})] times[sc] = ListUnion([collection.find({subColInd:sc}).distinct(t) for t in tcs]) locations[sc] = ListUnion([collection.find({subColInd:sc}).distinct(t) for t in spcs]) updateMetacollection(iterator,metacollection,incremental,versionNumber,totalVariables,tcs,spcs,volumes,dimensions,times,locations,varFormats) updateAssociatedFiles(sources,collection) updateVersionHistory(versionNumber,versions,startInc,endInc) updateSourceDBFromCollections(collectionNames = [collectionName]) connection.disconnect() createCertificate(certpath,'Collection ' + collectionName + ' 
written to DB.')