def getCommonDatesLocations(iterator,metadata,times,locations,dimensions,k):
    vNInd = '0'
    overallDateFormat = iterator.overallDateFormat if hasattr(iterator,'overallDateFormat') else ''
    dateFormat = iterator.dateFormat if hasattr(iterator,'dateFormat') else ''
    overallDate = iterator.overallDate if hasattr(iterator,'overallDate') else ''

    #dates
    if overallDateFormat or dateFormat:
        DF = overallDateFormat + dateFormat
        F = td.mongotimeformatter(DF)
        T1 = [F(overallDate + x) for x in iterator.columnGroups['timeColNames'] if x in dimensions[k]]
        if overallDateFormat:
            reverseF = td.reverse(dateFormat)
            T2 = [F(overallDate + y) for y in map(reverseF,times[k])]
        else:
            T2 = times[k]
        mindate = min(T1 + T2)
        maxdate = max(T1 + T2)
        divisions = uniqify(ListUnion([td.getLowest(t) for t in T1 + T2]))
        metadata[k]['beginDate'] = mindate
        metadata[k]['endDate'] = maxdate
        metadata[k]['dateDivisions'] = divisions

    #locations
    if locations[k]:
        if hasattr(iterator,'overallLocation'):
            locs = [loc.integrate(iterator.overallLocation,l) for l in locations[k]]
        else:
            locs = locations[k]
        locs = locListUniqify(locs)
        metadata[k]['spatialDivisions'] = uniqify(ListUnion([loc.divisions(x) for x in locs]))
        metadata[k]['commonLocation'] = reduce(loc.intersect,locs)
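# A minimal usage sketch for getCommonDatesLocations, not part of the pipeline
# itself. The per-subcollection dicts below are hypothetical placeholders that
# illustrate the expected shapes (each is keyed by subcollection name, with ''
# denoting the whole collection); the column names 'Year' and 'State' are
# invented for the example, and the iterator is assumed to be a parser instance
# like the ones consumed by updateCollection below.
def _example_getCommonDatesLocations(iterator):
    metadata = {'': {}}                    # receives beginDate/endDate/dateDivisions etc.
    times = {'': []}                       # time values observed per subcollection
    locations = {'': []}                   # location records observed per subcollection
    dimensions = {'': ['Year','State']}    # hypothetical column names present in ''
    getCommonDatesLocations(iterator,metadata,times,locations,dimensions,'')
    return metadata['']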
def initialize_argdict(collection):
    d = {}
    ArgDict = {}

    #content columns: slice columns plus any declared contentCols
    sliceCols = uniqify(Flatten(collection.sliceCols))
    sliceColList = ListUnion([[x] if x.split('.')[0] in collection.columns else collection.columnGroups.get(x,[]) for x in sliceCols])
    if hasattr(collection,'contentCols'):
        contentColList = ListUnion([[x] if x.split('.')[0] in collection.columns else collection.columnGroups.get(x,[]) for x in collection.contentCols])
        contentCols = uniqify(contentColList + sliceColList)
    else:
        contentCols = sliceColList
    contentColNums = getStrs(collection,contentCols)
    ArgDict['contentColNums'] = contentColNums

    #date formats and formatters
    if hasattr(collection,'dateFormat'):
        dateFormat = collection.dateFormat
        ArgDict['overallDateFormat'] = dateFormat
        timeFormatter = td.mongotimeformatter(dateFormat)
        ArgDict['timeFormatter'] = timeFormatter
    else:
        dateFormat = ''
    if hasattr(collection,'overallDate'):
        od = collection.overallDate['date']
        odf = collection.overallDate['format']
        ArgDict['overallDate'] = od
        overallDateFormat = odf + dateFormat
        ArgDict['overallDateFormat'] = overallDateFormat
        timeFormatter = td.mongotimeformatter(overallDateFormat)
        ArgDict['timeFormatter'] = timeFormatter
        OD = timeFormatter(od + 'X'*len(dateFormat))
        ArgDict['dateDivisions'] = td.getLowest(OD)
        ArgDict['datePhrases'] = [td.phrase(OD)]
        ArgDict['mindate'] = OD
        ArgDict['maxdate'] = OD
        if dateFormat:
            reverseTimeFormatter = td.reverse(dateFormat)
            ArgDict['reverseTimeFormatter'] = reverseTimeFormatter
    else:
        od = ''

    if 'timeColNames' in collection.columnGroups.keys():
        timeColNamesInd = getNums(collection,collection.columnGroups['timeColNames'])
        tcs = [timeFormatter(od + t) for t in collection.columnGroups['timeColNames']]
        ArgDict['timeColNames'] = tcs
        ArgDict['timeColNameInds'] = timeColNamesInd
        ArgDict['timeColNameDivisions'] = [[td.TIME_DIVISIONS[x] for x in td.getLowest(tc)] for tc in tcs]
        ArgDict['timeColNamePhrases'] = [td.phrase(t) for t in tcs]
    if 'timeColumns' in collection.columnGroups.keys():
        ArgDict['timeColInds'] = getNums(collection,collection.columnGroups['timeColumns'])

    #overall location
    if hasattr(collection,'overallLocation'):
        ol = collection.overallLocation
        ArgDict['overallLocation'] = ol
    else:
        ol = None

    #get divisions and phrases from overallLocation and spaceColNames
    if 'spaceColNames' in collection.columnGroups.keys():
        spaceColNames = collection.columnGroups['spaceColNames']
        ArgDict['spaceColNames'] = [loc.integrate(ol,x) for x in spaceColNames]
    if 'spaceColumns' in collection.columnGroups.keys():
        ArgDict['spaceColInds'] = getNums(collection,collection.columnGroups['spaceColumns'])

    #source citation fields
    Source = collection.source
    SourceNameDict = son.SON([(k,Source[k]['name'] if isinstance(Source[k],dict) else Source[k]) for k in Source.keys()])
    SourceAbbrevDict = dict([(k,Source[k]['shortName']) for k in Source.keys() if isinstance(Source[k],dict) and 'shortName' in Source[k].keys()])
    d['sourceSpec'] = json.dumps(SourceNameDict,default=ju.default)
    d['agency'] = SourceNameDict['agency']
    d['subagency'] = SourceNameDict['subagency']
    d['dataset'] = SourceNameDict['dataset']
    for k in SourceNameDict.keys():
        d['source_' + str(k).lower()] = SourceNameDict[k]
    for k in SourceAbbrevDict.keys():
        d['source_' + str(k).lower() + '_acronym'] = SourceAbbrevDict[k]
    d['source'] = ' '.join(SourceNameDict.values() + SourceAbbrevDict.values())

    if 'subcollections' in collection.columns:
        ArgDict['subColInd'] = collection.columns.index('subcollections')

    #value processors
    value_processor_instructions = stringifyDictElements(collection.valueProcessors)
    vpcontext = commonjs.translatorContext(value_processor_instructions)
    ArgDict['valueProcessors'],ArgDict['valueProcessorsKey'] = get_processors(value_processor_instructions,collection,vpcontext,commonjs.js_call)

    return d, ArgDict
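# A hedged sketch of how initialize_argdict's two return values might be
# consumed: 'd' holds flat source/citation fields destined for a metadata
# record, while ArgDict carries the formatters, column indices, and divisions
# used during record processing. The key accesses mirror the hasattr guards
# above; nothing in this sketch is required by the function itself.
def _example_initialize_argdict(collection):
    d, ArgDict = initialize_argdict(collection)
    source_text = d['source']                   # space-joined source names + acronyms
    formatter = ArgDict.get('timeFormatter')    # present only if the collection is dated
    content_cols = ArgDict['contentColNums']    # always present
    return source_text, formatter, content_cols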
def updateCollection(download_dir,collectionName,parserClass,checkpath,certpath,parserArgs=None,parserKwargs=None,incremental=False):
    connection = pm.Connection(document_class=pm.son.SON)
    source_metadata = get_source_data(collectionName)
    db = connection['govdata']
    assert '__' not in collectionName, 'collectionName must not contain consecutive underscores'

    #reserved collection names for metadata, version history, and slices
    metaCollectionName = '__' + collectionName + '__'
    versionName = '__' + collectionName + '__VERSIONS__'
    sliceDBName = '__' + collectionName + '__SLICES__'
    collection = db[collectionName]
    metacollection = db[metaCollectionName]
    versions = db[versionName]
    sliceDB = db[sliceDBName]

    if incremental:
        if versionName not in db.collection_names():
            startInc = 0
        else:
            startInc = get_max_increment_fromDB(versions) + 1
        endInc = get_max_increment(download_dir)
        sources = [increment_format(download_dir,i) for i in range(startInc,endInc + 1)]
    else:
        sources = [download_dir]
        startInc = endInc = None

    if parserArgs is None:
        parserArgs = ()
    if parserKwargs is None:
        parserKwargs = {}

    if sources:
        iterator = parserClass(sources[0],*parserArgs,**parserKwargs)
        iterator.set_source_metadata(source_metadata)
        uniqueIndexes = iterator.uniqueIndexes
        ColumnGroups = iterator.columnGroups
        sliceColTuples = getSliceColTuples(iterator.sliceCols)
        sliceColTuplesFlat = uniqify([tuple(sorted(uniqify(Flatten(sct)))) for sct in sliceColTuples])
        sliceColList = uniqify(Flatten(ListUnion(sliceColTuples)))
        ContentCols = set(sliceColList + getContentCols(iterator))
        if hasattr(iterator,'dateFormat'):
            TimeFormatter = td.mongotimeformatter(iterator.dateFormat)

        if collectionName in db.collection_names():
            versionNumber = max(versions.distinct('versionNumber')) + 1
            storedAllMetadata = metacollection.find_one({'name':'','versionNumber':versionNumber - 1})
            totalVariables = storedAllMetadata['columns']
            VarMap = dict(zip(totalVariables,[str(x) for x in range(len(totalVariables))]))
            #TODO: check that the stored columns match the iterator's,
            #and check consistency across all sources
        else:
            versionNumber = 0
            IndexCols = uniqify([x for x in ['subcollections'] + sliceColList + ListUnion([ColGroupsFlatten(ColumnGroups,k) for k in ['indexColumns','labelColumns','timeColumns','spaceColumns']]) if x not in uniqueIndexes])
            totalVariables = SPECIAL_KEYS + uniqueIndexes + IndexCols
            assert not any(['.' in x or ('__' in x and x not in SPECIAL_KEYS) or x in ColumnGroups.keys() for x in totalVariables])
            VarMap = dict(zip(totalVariables,map(str,range(len(totalVariables)))))
            cols = zip([VarMap[c] for c in uniqueIndexes + ['__versionNumber__']],[pm.DESCENDING]*(len(uniqueIndexes) + 1))
            collection.ensure_index(cols,unique=True,dropDups=True)
            for col in IndexCols:
                collection.ensure_index(VarMap[col])
            sliceDB.ensure_index('slice',unique=True,dropDups=True)

        vNInd = VarMap['__versionNumber__']
        retInd = VarMap['__retained__']
        specialKeyInds = [VarMap[k] for k in SPECIAL_KEYS]

        if 'timeColumns' in iterator.columnGroups.keys():
            tcs = iterator.columnGroups['timeColumns']
        else:
            tcs = []
        if 'spaceColumns' in iterator.columnGroups.keys():
            spcs = iterator.columnGroups['spaceColumns']
        else:
            spcs = []

        toParse = ListUnion([RecursiveFileList(source + '__PARSE__') for source in sources])

        oldc = None
        SpaceCache = {}
        volumes = {'':0}
        dimensions = {'':[]}
        times = {'':[]}
        locations = {'':[]}
        varFormats = {}
        for fpath in toParse:
            iterator.refresh(fpath)
            checkMetadata(iterator)
            tcs = iterator.columnGroups.get('timeColumns',[])
            spcs = iterator.columnGroups.get('spaceColumns',[])
            index = 0
            for c in iterator:
                newVars = [x for x in c.keys() if not x in totalVariables]
                assert not any(['__' in x or '.' in x or x in ColumnGroups.keys() for x in newVars]), '__ and . must not appear in key names.'
                totalVariables += newVars
                VarMap.update(dict(zip(newVars,map(str,range(len(totalVariables) - len(newVars),len(totalVariables))))))

                #time handling
                for tc in tcs:
                    if tc in c.keys():
                        c[tc] = TimeFormatter(c[tc])

                #space handling, with caching
                if COMPLETE_SPACE:
                    for spc in spcs:
                        if spc in c.keys():
                            t = getT(c[spc])
                            if t in SpaceCache.keys():
                                c[spc] = SpaceCache[t].copy()
                            else:
                                c[spc] = loc.SpaceComplete(c[spc])
                                SpaceCache[t] = c[spc].copy()

                if index % 100 == 0:
                    print 'At', index
                index += 1

                sctf = processSct(sliceColTuplesFlat,oldc,c)
                processRecord(c,collection,VarMap,totalVariables,uniqueIndexes,versionNumber,specialKeyInds,incremental,sliceDB,sctf,ContentCols)
                incrementThings(c,volumes,dimensions,times,locations,varFormats,tcs,spcs)
                oldc = c

        any_deleted = False
        if incremental:
            collection.update({vNInd:{'$lte': versionNumber - 1}},{'$set':{vNInd:versionNumber}})
            sliceDB.update({},{'$set':{'version':versionNumber}})
        else:
            deleted = collection.find({vNInd:versionNumber - 1, retInd:{'$exists':False}})
            for d in deleted:
                any_deleted = True
                sliceDelete(d,collection,sliceColTuples,VarMap,sliceDB,versionNumber)

        if any_deleted:
            subColInd = str(totalVariables.index('subcollections'))
            subcols = [''] + uniqify(ListUnion(collection.distinct(subColInd)))
            for sc in subcols:
                volumes[sc] = collection.find({subColInd:sc}).count()
                dimensions[sc] = [k for k in totalVariables if collection.find_one({subColInd:sc,str(totalVariables.index(k)):{'$exists':True}})]
                times[sc] = ListUnion([collection.find({subColInd:sc}).distinct(t) for t in tcs])
                locations[sc] = ListUnion([collection.find({subColInd:sc}).distinct(t) for t in spcs])

        updateMetacollection(iterator,metacollection,incremental,versionNumber,totalVariables,tcs,spcs,volumes,dimensions,times,locations,varFormats)
        updateAssociatedFiles(sources,collection)
        updateVersionHistory(versionNumber,versions,startInc,endInc)
        updateSourceDBFromCollections(collectionNames=[collectionName])

    connection.disconnect()
    createCertificate(certpath,'Collection ' + collectionName + ' written to DB.')
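# A minimal invocation sketch for updateCollection. MyParser and the paths are
# hypothetical: the parser class is assumed to follow the interface used above
# (a constructor taking a source path, plus set_source_metadata, refresh,
# uniqueIndexes, columnGroups, sliceCols, and iteration over records).
if __name__ == '__main__':
    from my_parsers import MyParser                           # hypothetical module
    updateCollection('../Data/MyDataset/',                    # download directory
                     'MyDataset',                             # collection name (no '__')
                     MyParser,
                     checkpath='../Data/MyDataset/__CHECK__', # hypothetical paths
                     certpath='../Data/MyDataset/__CERT__',
                     incremental=False)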