Пример #1
0
def getCommonDatesLocations(iterator,metadata,times,locations,dimensions,k):
    """Store date-range and location summaries for key ``k`` in ``metadata[k]``.

    ``metadata[k]`` is modified in place; nothing is returned.

    Dates are processed only when ``iterator`` has a non-empty
    ``overallDateFormat`` or ``dateFormat`` attribute.  In that case the
    dates come from two places:

    * the names in ``iterator.columnGroups['timeColNames']`` that also
      appear in ``dimensions[k]`` (prefixed with ``iterator.overallDate``
      and run through the formatter), and
    * the values in ``times[k]`` (reversed through ``td.reverse`` and
      re-formatted when an overall date format is present, used as-is
      otherwise).

    If at least one date was found, ``beginDate``, ``endDate`` and
    ``dateDivisions`` are written.  With no dates at all the three keys are
    left untouched, which is the same state ``metadata[k]`` is in when the
    iterator carries no date format.

    Locations are processed only when ``locations[k]`` is non-empty; then
    ``spatialDivisions`` and ``commonLocation`` are written.
    """
    # ``reduce`` is a builtin only on Python 2; functools has it on both 2 and 3.
    from functools import reduce

    overallDateFormat = getattr(iterator,'overallDateFormat','')
    dateFormat = getattr(iterator,'dateFormat','')
    overallDate = getattr(iterator,'overallDate','')

    if overallDateFormat or dateFormat:
        formatter = td.mongotimeformatter(overallDateFormat + dateFormat)
        # The 'timeColNames' group is optional (other code in this file checks
        # for it before use), so a missing group means "no dated columns".
        timeColNames = iterator.columnGroups.get('timeColNames',[])
        colDates = [formatter(overallDate + x) for x in timeColNames if x in dimensions[k]]
        if overallDateFormat:
            reverseF = td.reverse(dateFormat)
            valueDates = [formatter(overallDate + y) for y in map(reverseF,times[k])]
        else:
            valueDates = times[k]
        allDates = colDates + list(valueDates)
        # min()/max() raise ValueError on an empty sequence; skip the date
        # summary instead of failing when nothing dated was seen.
        if allDates:
            mindate = min(allDates)
            maxdate = max(allDates)
            divisions = uniqify(ListUnion([td.getLowest(t) for t in allDates]))
            metadata[k]['beginDate'] = mindate
            metadata[k]['endDate'] = maxdate
            metadata[k]['dateDivisions'] = divisions

    # locations
    if locations[k]:
        if hasattr(iterator,'overallLocation'):
            locs = [loc.integrate(iterator.overallLocation,x) for x in locations[k]]
        else:
            locs = locations[k]
        locs = locListUniqify(locs)
        metadata[k]['spatialDivisions'] = uniqify(ListUnion([loc.divisions(x) for x in locs]))
        metadata[k]['commonLocation'] = reduce(loc.intersect,locs)
Пример #2
0
def initialize_argdict(collection):
    """Build the source-description dict and the argument dict for ``collection``.

    Returns the pair ``(d, ArgDict)``.

    ``d`` holds the source description: ``sourceSpec`` (JSON dump of the
    source names), ``agency``, ``subagency``, ``dataset``, one
    ``source_<key>`` entry per source key, one ``source_<key>_acronym`` entry
    per source key that has a ``shortName``, and ``source`` (all names and
    short names joined by spaces).

    ``ArgDict`` always holds ``contentColNums``, ``valueProcessors`` and
    ``valueProcessorsKey``.  Date, time-column, location and subcollection
    entries are added only when the matching attribute or column group
    exists on ``collection``.

    Raises ``KeyError`` when ``collection.source`` lacks ``agency``,
    ``subagency`` or ``dataset``.
    """
    d = {}
    ArgDict = {}

    def expandColumns(names):
        # A name whose first dotted component is an actual column stands for
        # itself; any other name is looked up as a column group (unknown
        # groups contribute nothing).
        return ListUnion([[x] if x.split('.')[0] in collection.columns else collection.columnGroups.get(x,[]) for x in names])

    sliceColList = expandColumns(uniqify(Flatten(collection.sliceCols)))

    if hasattr(collection,'contentCols'):
        contentCols = uniqify(expandColumns(collection.contentCols) + sliceColList)
    else:
        contentCols = sliceColList
    ArgDict['contentColNums'] = getStrs(collection,contentCols)

    if hasattr(collection,'dateFormat'):
        dateFormat = collection.dateFormat
        ArgDict['overallDateFormat'] = dateFormat
        timeFormatter = td.mongotimeformatter(dateFormat)
        ArgDict['timeFormatter'] = timeFormatter
    else:
        dateFormat = ''

    if hasattr(collection,'overallDate'):
        od = collection.overallDate['date']
        odf = collection.overallDate['format']
        ArgDict['overallDate'] = od
        # The combined format replaces whatever the dateFormat branch stored.
        overallDateFormat = odf + dateFormat
        ArgDict['overallDateFormat'] = overallDateFormat
        timeFormatter = td.mongotimeformatter(overallDateFormat)
        ArgDict['timeFormatter'] = timeFormatter

        # Pad the overall date with one 'X' per character of dateFormat so
        # its length matches the combined format.  (The original referenced
        # an unbound name ``overallDate`` here; ``od`` is the value meant.)
        OD = timeFormatter(od + 'X'*len(dateFormat))
        ArgDict['dateDivisions'] = td.getLowest(OD)
        ArgDict['datePhrases'] = [td.phrase(OD)]
        ArgDict['mindate'] = OD
        ArgDict['maxdate'] = OD

        if dateFormat:
            ArgDict['reverseTimeFormatter'] = td.reverse(dateFormat)
    else:
        od = ''

    if 'timeColNames' in collection.columnGroups:
        timeColNames = collection.columnGroups['timeColNames']
        timeColNamesInd = getNums(collection,timeColNames)
        # NOTE(review): timeFormatter is bound only when the collection has a
        # dateFormat or an overallDate; a collection with timeColNames but
        # neither of those fails here with NameError -- confirm that
        # combination cannot occur.
        tcs = [timeFormatter(od + t) for t in timeColNames]
        ArgDict['timeColNames'] = tcs
        ArgDict['timeColNameInds'] = timeColNamesInd
        ArgDict['timeColNameDivisions'] = [[td.TIME_DIVISIONS[x] for x in td.getLowest(tc)] for tc in tcs]
        ArgDict['timeColNamePhrases'] = [td.phrase(t) for t in tcs]

    if 'timeColumns' in collection.columnGroups:
        ArgDict['timeColInds'] = getNums(collection,collection.columnGroups['timeColumns'])

    # overall location
    if hasattr(collection,'overallLocation'):
        ol = collection.overallLocation
        ArgDict['overallLocation'] = ol
    else:
        ol = None

    # combine the overall location (None when absent) with each space column name
    if 'spaceColNames' in collection.columnGroups:
        spaceColNames = collection.columnGroups['spaceColNames']
        ArgDict['spaceColNames'] = [loc.integrate(ol,x) for x in spaceColNames]

    if 'spaceColumns' in collection.columnGroups:
        ArgDict['spaceColInds'] = getNums(collection,collection.columnGroups['spaceColumns'])

    # A source entry is either a plain value or a dict with a 'name' and an
    # optional 'shortName'.
    Source = collection.source
    SourceNameDict = son.SON([(k,Source[k]['name'] if isinstance(Source[k],dict) else Source[k]) for k in Source.keys()])
    SourceAbbrevDict = dict([(k,Source[k]['shortName']) for k in Source.keys() if isinstance(Source[k],dict) and 'shortName' in Source[k]])
    d['sourceSpec'] = json.dumps(SourceNameDict,default=ju.default)
    d['agency'] = SourceNameDict['agency']
    d['subagency'] = SourceNameDict['subagency']
    d['dataset'] = SourceNameDict['dataset']
    for k in SourceNameDict.keys():
        d['source_' + str(k).lower()] = SourceNameDict[k]
    for k in SourceAbbrevDict.keys():
        d['source_' + str(k).lower() + '_acronym'] = SourceAbbrevDict[k]
    # list() keeps the concatenation valid when values() returns a view
    # rather than a list.
    d['source'] = ' '.join(list(SourceNameDict.values()) + list(SourceAbbrevDict.values()))

    if 'subcollections' in collection.columns:
        ArgDict['subColInd'] = collection.columns.index('subcollections')

    value_processor_instructions = stringifyDictElements(collection.valueProcessors)
    vpcontext = commonjs.translatorContext(value_processor_instructions)
    ArgDict['valueProcessors'],ArgDict['valueProcessorsKey'] = get_processors(value_processor_instructions,collection, vpcontext ,commonjs.js_call)

    return d, ArgDict
Пример #3
0
def updateCollection(download_dir,collectionName,parserClass,checkpath,certpath,parserArgs=None,parserKwargs=None,incremental=False):
    """Parse downloaded source files into the ``govdata`` collection ``collectionName``.

    Four collections of the ``govdata`` database are used: the data
    collection ``collectionName`` itself, ``__<name>__`` (metadata),
    ``__<name>__VERSIONS__`` (version history) and ``__<name>__SLICES__``
    (slices).

    Parameters:
      download_dir   -- directory holding the downloaded data.  When
                        ``incremental`` is true the sources are the increment
                        directories from the first increment not yet in the
                        version history up to the highest one on disk;
                        otherwise ``download_dir`` is the only source.
      collectionName -- name of the data collection; must not contain ``__``.
      parserClass    -- called as ``parserClass(source, *parserArgs,
                        **parserKwargs)`` on the first source; the result is
                        refreshed with each file to parse and iterated for
                        records.
      checkpath      -- not used by this function.
      certpath       -- passed to ``createCertificate`` on success.
      parserArgs     -- extra positional arguments for ``parserClass``
                        (default: none).
      parserKwargs   -- extra keyword arguments for ``parserClass``
                        (default: none).
      incremental    -- selects incremental loading (see ``download_dir``).

    The files parsed are those returned by
    ``RecursiveFileList(source + '__PARSE__')`` for every source.  After
    loading, the metadata collection, associated files, version history and
    source DB are updated.  The database connection is released even when
    an error occurs; the certificate is written only on success.
    """
    connection =  pm.Connection(document_class=pm.son.SON)
    try:
        source_metadata = get_source_data(collectionName)

        db = connection['govdata']
        assert not '__' in collectionName, 'collectionName must not contain consecutive underscores'
        metaCollectionName = '__' + collectionName + '__'
        versionName = '__' + collectionName + '__VERSIONS__'
        sliceDBName =  '__' + collectionName + '__SLICES__'

        collection = db[collectionName]
        metacollection = db[metaCollectionName]
        versions = db[versionName]
        sliceDB = db[sliceDBName]

        if incremental:
            if versionName not in db.collection_names():
                startInc = 0
            else:
                startInc = get_max_increment_fromDB(versions) + 1
            endInc = get_max_increment(download_dir)
            sources = [increment_format(download_dir,i) for i in range(startInc,endInc + 1)]
        else:
            sources = [download_dir]
            startInc = endInc = None

        if parserArgs is None:
            parserArgs = ()
        if parserKwargs is None:
            parserKwargs = {}

        # An incremental run with no new increments has nothing to load.
        if sources:
            iterator = parserClass(sources[0],*parserArgs,**parserKwargs)
            iterator.set_source_metadata(source_metadata)

            uniqueIndexes = iterator.uniqueIndexes
            ColumnGroups = iterator.columnGroups

            sliceColTuples = getSliceColTuples(iterator.sliceCols)
            sliceColTuplesFlat = uniqify([tuple(sorted(uniqify(Flatten(sct)))) for sct in sliceColTuples])

            sliceColList = uniqify(Flatten(ListUnion(sliceColTuples)))
            ContentCols = set(sliceColList + getContentCols(iterator))

            # NOTE(review): TimeFormatter stays unbound when the iterator has
            # no dateFormat; a record carrying a time column then fails with
            # NameError in the loop below -- confirm that cannot happen.
            if hasattr(iterator,'dateFormat'):
                TimeFormatter = td.mongotimeformatter(iterator.dateFormat)

            if collectionName in db.collection_names():
                # Existing collection: continue from the stored column list.
                versionNumber = max(versions.distinct('versionNumber')) + 1
                storedAllMetadata = metacollection.find_one({'name':'','versionNumber':versionNumber-1})
                totalVariables = storedAllMetadata['columns']
                VarMap = dict(zip(totalVariables,[str(x) for x in range(len(totalVariables))]))

                #check things are the same
                #and check consistent  do so for all sources

            else:
                # New collection: lay out the columns and create the indexes.
                versionNumber = 0
                IndexCols = uniqify([x for x in ['subcollections'] + sliceColList + ListUnion([ColGroupsFlatten(ColumnGroups,k) for k in ['indexColumns','labelColumns','timeColumns','spaceColumns']]) if x not in uniqueIndexes])

                totalVariables = SPECIAL_KEYS + uniqueIndexes + IndexCols

                assert not any(['.' in x or ('__' in x and x not in SPECIAL_KEYS) or x in ColumnGroups.keys() for x in totalVariables])

                VarMap = dict(zip(totalVariables,map(str,range(len(totalVariables)))))

                cols = list(zip([VarMap[c] for c in uniqueIndexes + ['__versionNumber__']],[pm.DESCENDING]*(len(uniqueIndexes) + 1)))
                collection.ensure_index(cols,unique=True,dropDups=True)

                for col in IndexCols:
                    collection.ensure_index(VarMap[col])

                sliceDB.ensure_index('slice',unique=True,dropDups=True)

            # Fields are addressed by the stringified position of the column
            # name in totalVariables (VarMap), not by the name itself.
            vNInd = VarMap['__versionNumber__']
            retInd = VarMap['__retained__']

            specialKeyInds = [VarMap[k] for k in SPECIAL_KEYS]

            tcs = iterator.columnGroups.get('timeColumns',[])
            spcs = iterator.columnGroups.get('spaceColumns',[])

            toParse = ListUnion([RecursiveFileList(source + '__PARSE__') for source in sources])

            oldc = None
            SpaceCache = {}
            volumes = {'':0}
            dimensions = {'':[]}
            times = {'':[]}
            locations = {'':[]}
            varFormats = {}
            for fileName in toParse:
                iterator.refresh(fileName)
                checkMetadata(iterator)
                tcs = iterator.columnGroups.get('timeColumns',[])
                spcs = iterator.columnGroups.get('spaceColumns',[])
                index = 0
                for c in iterator:
                    # Register keys not seen before at the end of the column list.
                    newVars = [x for x in c.keys() if not x in totalVariables]
                    assert not any (['__' in x or '.' in x or x in ColumnGroups.keys() for x in newVars]) , '__ and . must not appear in key names.'
                    totalVariables += newVars
                    VarMap.update(dict(zip(newVars,map(str,range(len(totalVariables) - len(newVars),len(totalVariables))))))

                    for tc in tcs:   #time handling
                        if tc in c.keys():
                            c[tc] = TimeFormatter(c[tc])
                    if COMPLETE_SPACE:
                        for spc in spcs:
                            if spc in c.keys():   #space
                                # Completed locations are cached under getT(value).
                                t = getT(c[spc])
                                if t in SpaceCache.keys():
                                    c[spc] = SpaceCache[t].copy()
                                else:
                                    c[spc] = loc.SpaceComplete(c[spc])
                                    SpaceCache[t] = c[spc].copy()
                    if index % 100 == 0:
                        # Single-argument form prints the same text under both
                        # the print statement and the print function.
                        print('At %s' % index)
                    index += 1
                    sctf = processSct(sliceColTuplesFlat,oldc,c)
                    processRecord(c,collection,VarMap,totalVariables,uniqueIndexes,versionNumber,specialKeyInds,incremental,sliceDB,sctf,ContentCols)
                    incrementThings(c,volumes,dimensions,times,locations,varFormats,tcs,spcs)

                    oldc = c

            any_deleted = False
            if incremental:
                # NOTE(review): legacy pymongo update() changes only the first
                # matching document unless multi=True is passed; if every
                # older record and every slice is meant to be bumped to the
                # new version, both calls need multi=True -- confirm intent.
                collection.update({vNInd:{'$lte': versionNumber - 1}}, {'$set':{vNInd:versionNumber}})
                sliceDB.update({},{'$set':{'version':versionNumber}})

            else:
                # Records of the previous version without the retained marker
                # are removed from the slices.
                deleted = collection.find({vNInd:versionNumber - 1, retInd : {'$exists':False}})
                for d in deleted:
                    any_deleted = True
                    # The original passed an unbound name ``version`` here.
                    sliceDelete(d,collection,sliceColTuples,VarMap,sliceDB,versionNumber)

            if any_deleted:
                # Deletions invalidate the running totals; rebuild them from
                # the collection.  The column is registered in lower case
                # ('subcollections'), so that is the name to look up.
                subColInd = str(totalVariables.index('subcollections'))
                subcols = [''] + uniqify(ListUnion(collection.distinct(subColInd)))
                for sc in subcols:
                    volumes[sc] = collection.find({subColInd:sc}).count()
                    dimensions[sc] = [k for k in totalVariables if collection.find_one({subColInd:sc,str(totalVariables.index(k)):{'$exists':True}})]
                    # Query by column index like the line above; the original
                    # passed the raw column names to distinct().
                    times[sc] = ListUnion([collection.find({subColInd:sc}).distinct(str(totalVariables.index(t))) for t in tcs if t in totalVariables])
                    locations[sc] = ListUnion([collection.find({subColInd:sc}).distinct(str(totalVariables.index(t))) for t in spcs if t in totalVariables])

            updateMetacollection(iterator,metacollection,incremental,versionNumber,totalVariables,tcs,spcs,volumes,dimensions,times,locations,varFormats)

            updateAssociatedFiles(sources,collection)

            updateVersionHistory(versionNumber,versions,startInc,endInc)

        updateSourceDBFromCollections(collectionNames = [collectionName])
    finally:
        # Release the connection on failure as well as on success.
        connection.disconnect()
    createCertificate(certpath,'Collection ' + collectionName + ' written to DB.')