def getGridList(self):
    """Connect to MongoDB and fetch the grid list results.

    Returns:
        TYPE: false grid uid list, true grid object collection
    """
    conn, mdb = connectMongo(self.db['dbname'])
    grid = mdb[self.db['gridcolname']]
    falseuidlist = []
    truegridobj = {}
    gridlists = list(
        grid.find({}, {
            "properties.typevalid": 1,
            "properties.uid": 1,
            "properties.vec": 1
        }))
    for gridlist in gridlists:
        typevalid, uid, vec = gridlist['properties']['typevalid'], str(
            gridlist['properties']['uid']), gridlist['properties']['vec']
        if not typevalid:
            falseuidlist.append(uid)
        else:
            truegridobj[uid] = vec
    conn.close()
    return falseuidlist, truegridobj
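# Usage sketch (illustrative only; `alluserids` and the chunk size of 500 are
# assumptions, not taken from the project): the false/true grid split returned
# here is handed, together with chunked user lists, to sepFeatureTaskes for the
# parallel feature aggregation below.
#
# falseuidlist, truegridobj = citygrid.getGridList()
# userlist = [alluserids[i:i + 500] for i in xrange(0, len(alluserids), 500)]
# citygrid.sepFeatureTaskes(falseuidlist, truegridobj, userlist)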
def __init__(self, city, citylocs, defaultRadius, IP):
    super(CityGrid, self).__init__()
    self.city = city
    self.citylocs = citylocs
    self.defaultRadius = defaultRadius * 2
    self.maxQRadius = 500
    self.db = {
        'url': IP,
        'port': 27017,
        'dbname': 'tdnormal',
        'gridcolname': 'newgrids_%s' % city,
        'POIcolname': 'pois_%s' % city
    }
    # create attribute and geospatial indexes on the grid and POI collections
    # so that the $near queries used later stay fast
    conn, db = connectMongo(self.db['dbname'])
    grid = db[self.db['gridcolname']]
    POIs = db[self.db['POIcolname']]
    try:
        grid.create_index([
            ("properties.typevalid", pymongo.ASCENDING),
            ("properties.vec", pymongo.ASCENDING)
        ])
        grid.create_index([("properties.center", pymongo.GEOSPHERE)])
        POIs.create_index([("properties.center", pymongo.GEOSPHERE)])
    except Exception as e:
        raise e
    conn.close()
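# Example construction (illustrative values; the bounding box and the MongoDB
# host are assumptions, not taken from the project configuration). The citylocs
# dict uses the 'south'/'north'/'west'/'east' keys expected by gridGeneration.
#
# citylocs = {'south': 39.4, 'north': 41.1, 'west': 115.4, 'east': 117.6}
# citygrid = CityGrid('beijing', citylocs, 250, '127.0.0.1')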
def aggregateEntropy(type, dbName, collectionName, data):
    conn, db = connectMongo(dbName)
    rawData = list(db[collectionName].find({}, {'_id': 1, 'vec': 1}))
    # both aggregation modes are still placeholders
    if type == 'temporal':
        pass
    elif type == 'spatial':
        pass
    conn.close()
def sepFeatureTaskes(self, falseuidlist, truegridobj, userlist):
    """Separate the feature-extraction work into different subtasks.

    Args:
        falseuidlist (array): false grid uid list
        truegridobj (object): true grid object collection
        userlist (array): user list

    Returns:
        NULL: Description
    """
    ppservers = ()
    # remove all documents in the features collection before re-building it
    conn, mdb = connectMongo(self.db['dbname'])
    mdb[self.db['featurecolname']].remove({})
    conn.close()
    job_server = pp.Server(ppservers=ppservers)
    logging.info("pp can use %s worker cores" % job_server.get_ncpus())
    start_time = time.time()
    jobs, index = [], 0
    for sublist in userlist:
        jobs.append((index, job_server.submit(
            self.aggregateVector,
            (sublist, falseuidlist, truegridobj, ),
            (self.vecAdd, connectMongo, connectMYSQL, getCityLocs,
             initTimePeriods, judFeatureTP, ),
            ("os", "time", "pp", "CommonFunc", "numpy", "pymongo",
             "MySQLdb", "logging", "sys", "gc", ))))
        index += 1
    # calling each job object blocks until that subtask finishes
    for index, job in jobs:
        job()
    logging.info("Elapsed time with parallel workers: %ss" % str(time.time() - start_time))
    job_server.print_stats()
def upDotsBlongedDis(city, dic):
    # fetch the grids
    grids = []  # store results
    count, validcount = 0, 0
    conn, db = connectMongo('tdnormal')
    GRID = db['newgrids_%s' % city]
    # fetch all grid documents
    dbgrids = list(GRID.find({}, {
        "properties.uid": 1,
        "properties.center": 1
    }))
    # build the district fence array
    disobjs = []
    with open(os.path.join(dic, getAbbName(city) + '.json')) as f:
        data = json.load(f)
        features = data['features']
        for each in features:
            disobjs.append({
                'name': each['properties']['name'],
                'geo': shape(each['geometry'])
            })
    # match each grid center against the district fences
    for each in dbgrids:
        try:
            coords = each['properties']['center']['coordinates']
            point = Point(coords[0], coords[1])
            index = getDisIndex(point, disobjs)
            count += 1
            if index != -1:
                validcount += 1
                if validcount % 10000 == 0:
                    print 'Valid Grids num: %d, Total Grids num: %d' % (
                        validcount, count)
                grids.append([each['properties']['uid'], index])
        except Exception as e:
            print each
            print e
            continue
    conn.close()
    print "City %s owns %d valid district grids" % (city, len(grids))
    return grids
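# A plausible sketch (assumption, not the original CommonFunc implementation) of
# the getDisIndex helper used above: it returns the index of the first district
# fence containing the point, or -1 when the point falls outside every district.
# Each entry of disobjs carries a shapely geometry under 'geo', as built above.
def getDisIndexSketch(point, disobjs):
    for ind, each in enumerate(disobjs):
        if each['geo'].contains(point):
            return ind
    return -1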
def calUsersEntropy(dbName, collectionName):
    conn, db = connectMongo(dbName)
    # get all user features
    rawData = list(db[collectionName].find({}, {'pVec': 1}))
    print 'Query %s data in %s' % (str(len(rawData)), collectionName)
    # update entropy values with a bulk operation
    bulk = db[collectionName].initialize_ordered_bulk_op()
    for item in rawData:
        pVec = [each[:-1] for each in item['pVec']]
        # calculate sum results by columns and by rows
        # pVecSum = np.matrix(pVec).sum(dtype='float')
        pVecColSum, pVecRowSum = getMatrixSumbyDim(pVec, 'column'), getMatrixSumbyDim(pVec, 'row')
        if type(pVecColSum) is not int:
            # calculate entropy results by temporal or spatial mode
            colEntropy = sc.entropy(pVecColSum)
            rowEntropy = sc.entropy(pVecRowSum)
            bulk.find({'_id': int(item['_id'])}).update({'$set': {
                'entropy': {
                    'col': colEntropy,
                    'row': rowEntropy
                }
            }})
        else:
            bulk.find({'_id': int(item['_id'])}).update({'$set': {
                'entropy': {
                    'col': -1,
                    'row': -1
                }
            }})
    # execute all queued updates in one round trip
    result = bulk.execute()
    print result
    conn.close()
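# A minimal sketch (assumption) of the getMatrixSumbyDim helper referenced above:
# it sums a 2-D list by column or by row, and signals an empty or all-zero matrix
# by returning an int, which the caller detects with `type(...) is not int`.
import numpy as np

def getMatrixSumbyDimSketch(mat, dim):
    arr = np.array(mat, dtype='float')
    if arr.size == 0 or arr.sum() == 0:
        return -1
    axis = 0 if dim == 'column' else 1
    return arr.sum(axis=axis).tolist()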
def main(argv):
    try:
        opts, args = getopt.getopt(
            argv, "hc:d:n:m:",
            ["help", "city=", 'directory=', 'number=', 'mode='])
    except getopt.GetoptError as err:
        print str(err)
        usage()
        sys.exit(2)
    # parse the input arguments
    city, directory, number, mode = 'beijing', '/enigma/tao.jiang/datasets/JingJinJi', 999, 'all'
    for opt, arg in opts:
        if opt == '-h':
            usage()
            sys.exit()
        elif opt in ("-c", "--city"):
            city = arg
        elif opt in ("-d", "--directory"):
            directory = arg
        elif opt in ('-n', '--number'):
            number = int(arg)
        elif opt in ('-m', '--mode'):
            mode = arg
    STARTTIME = time.time()
    print "Start approach at %s" % STARTTIME
    # fetch grid information: the total grid count and the grids with valid POIs
    conn, db = connectMongo('tdnormal')
    GRIDSNUM = db['newgrids_%s' % city].count()
    gridsData, validIDs = getGridsFromMongo(city, db)
    conn.close()
    CITYDISIND, CITYDISNUM = getCityDisInfo(city)
    if mode == 'all':
        # run the processing with multiple processes
        manager = Manager()
        jobs = []
        for x in xrange(0, 20):
            # time.sleep(random.random()*2)
            PROP = {
                'INDEX': x,
                'DIRECTORY': directory,
                'GRIDSNUM': GRIDSNUM,
                'CITY': city,
                'CITYDISIND': CITYDISIND,
                'CITYDISNUM': CITYDISNUM,
                'FILENUM': number
            }
            DATA = {'gridsData': gridsData, 'validIDs': validIDs}
            jobs.append(Process(target=processTask, args=(PROP, DATA)))
            jobs[x].start()
        # wait for all processes to finish
        for job in jobs:
            job.join()
    # start to merge the result files
    MERGE = time.time()
    print "Start merge at %s" % MERGE
    mergeMatrixFiles(city, GRIDSNUM, directory)
    print "End merge in %s" % str(time.time() - MERGE)
    ENDTIME = time.time()
    print "End approach at %s" % ENDTIME
def gridGeneration(self, split, locs={}):
    """Generate the city grid set.

    Args:
        split (float): distance interval of lat and lng, the unit is degree
        locs (dict, optional): city grids' location region, in four directions

    Returns:
        NULL: Description
    """
    logging.info("CityGrid generation is starting...")
    if locs == {}:
        locs = self.citylocs
    count = 100000
    tmparray = []
    centerincrement = 0.0015  # round(split / 2.0, 4)
    latnum = int((locs['north'] - locs['south']) / split + 1)
    lngnum = int((locs['east'] - locs['west']) / split + 1)
    conn, db = connectMongo(self.db['dbname'])
    grid = db[self.db['gridcolname']]
    POIs = db[self.db['POIcolname']]
    for latind in xrange(0, latnum):
        for lngind in xrange(0, lngnum):
            lat = round(locs['south'] + latind * split, 3)
            lng = round(locs['west'] + lngind * split, 3)
            lnginc = round(lng + split, 3)
            latinc = round(lat + split, 3)
            lngcen = round(lng + centerincrement, 4)
            latcen = round(lat + centerincrement, 4)
            # a square GeoJSON polygon representing the boundary of the current grid cell
            coordsarr = [[lng, lat], [lnginc, lat], [lnginc, latinc],
                         [lng, latinc], [lng, lat]]
            featurelistarray = [0] * 11
            typevalid = False
            # query all the POIs within maxQRadius of the cell center
            nearPOIList = list(POIs.find({
                "properties.center": {
                    '$near': {
                        '$geometry': {
                            'type': "Point",
                            'coordinates': [lngcen, latcen]
                        },
                        '$minDistance': 0,
                        '$maxDistance': self.maxQRadius
                    }
                }
            }))
            # construct the feature vector with the POI type info
            poilen = len(nearPOIList)
            if poilen != 0:
                typevalid = True
                featurelistsum = 0
                # POI list is not null
                for each in nearPOIList:
                    cpoint = each["properties"]["center"]["coordinates"]
                    radius = each["properties"]["radius"]
                    sigma = self.defaultRadius
                    if radius > 0:
                        sigma = radius * 2.0
                    P = gaussian2D([lngcen, latcen], cpoint, sigma)
                    featurelistsum += P
                    curPInd = each["properties"]["ftype"] - 1
                    featurelistarray[curPInd] += P
                # normalize the feature vector
                if featurelistsum:
                    featurelistarray = [
                        each / featurelistsum for each in featurelistarray
                    ]
                else:
                    typevalid = False
                    print featurelistsum
            # single feature format
            # uid: locates the grid index according to its lat and lng
            # vec: feature type vector
            # center: center position of the current feature
            tmparray.append({
                "type": "Feature",
                "_id": "%s-%s-%s" % (self.city, str(lat), str(lng)),
                "properties": {
                    "id": "%s-%s-%s" % (self.city, str(lat), str(lng)),
                    "type": "Polygon",
                    "vecvalid": typevalid,
                    "center": {
                        "type": "Point",
                        "coordinates": [lngcen, latcen]
                    },
                    "uid": int(lngind + latind * lngnum),
                    "vec": featurelistarray,
                    'poinum': poilen
                },
                "geometry": {
                    "type": "Polygon",
                    "coordinates": [coordsarr]
                }
            })
            # insert in batches to bound memory usage
            if len(tmparray) == 100000:
                grid.insert(tmparray)
                tmparray = []
                gc.collect()
                logging.debug("100000 features have been inserted into mongoDB.")
    if len(tmparray) != 0:
        grid.insert(tmparray)
    logging.info("Grid generation complete!")
    conn.close()
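# A plausible sketch (assumption, not the original CommonFunc implementation) of
# the gaussian2D helper used above: an isotropic Gaussian weight of a POI located
# at `cpoint` relative to the grid center `center`, with `sigma` controlling the
# decay. The original version may differ, e.g. by converting degrees to metres
# or by including a normalization constant.
import math

def gaussian2DSketch(center, cpoint, sigma):
    dx = center[0] - cpoint[0]
    dy = center[1] - cpoint[1]
    return math.exp(-(dx * dx + dy * dy) / (2.0 * sigma * sigma))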
def getValidGrids(city, dic):
    conn, mdb = connectMongo('tdnormal')
    collectname = 'newgrids_%s' % city
    citynames = {
        'beijing': 'bj',
        'tianjin': 'tj',
        'zhangjiakou': 'zjk',
        'tangshan': 'ts'
    }
    # fetch the POI distribution of every valid grid
    poiDisRes = list(mdb[collectname].find({'properties.vecvalid': True}, {
        'properties.vec': 1,
        'properties.uid': 1
    }))
    conn.close()
    # fetch density information and administrative district information
    db, cur = connectMYSQL('tdnormal')
    cur.execute(
        "SELECT id, dis, wpnumber AS 'num' from %sEmatrix WHERE wpnumber > 0;"
        % citynames[city])
    denarr = {}
    disarr = []
    for each in cur.fetchall():
        dis = str(each[1])
        denarr[str(each[0])] = {'dis': dis, 'num': long(each[2])}
        if dis not in disarr:
            disarr.append(dis)
    cur.close()
    db.close()
    print "sql ready"
    # initialize the per-district vectors and the city-wide total vector
    res = {'total': [0.0] * 11}
    for each in disarr:
        res[each] = [0.0] * 11
    # enumerate the POI grids and update the vectors
    for each in poiDisRes:
        id = str(each['properties']['uid'])
        if id in denarr:
            for x in xrange(0, 11):
                increment = denarr[id]['num'] * each['properties']['vec'][x]
                dis = denarr[id]['dis']
                if dis in res:
                    res[dis][x] += increment
                else:
                    res[dis] = [0.0] * 11
                    res[dis][x] += increment
                res['total'][x] += increment
    # dump the result object into a file
    with open(
            os.path.join(
                '/home/joe/Documents/git/living-modes-visual-comparison/server/data/tmp',
                '%s_poidis.json' % citynames[city]), 'w+') as target:
        json.dump(res, target)
def matchUserRecords(self, split, jobID, userlist):
    """Enumerate users with all their records, match them with the grid and store them into mongoDB.

    Args:
        split (float): distance interval of lat and lng, the unit is degree
        jobID (int): job number
        userlist (array): userlist consists of many tdid strings

    Returns:
        NULL: Description
    """
    # initialize database connections and data structures
    db, cur = connectMYSQL(self.db['mysqldb'])
    conn, mdb = connectMongo(self.db['dbname'])
    users = mdb[self.db['usercolname']]
    userrecords, usernum = [], 0
    # enumerate each user in the userlist
    for user in userlist:
        usernum += 1
        # execute the query and fetch the record result
        cur.execute(
            "SELECT dayType, dateID, timeSegID, lat, lng FROM cbeijing WHERE tdid = %s",
            (user, ))
        res = cur.fetchall()
        tmprecords = []
        # each (tuple) format:
        #   0: dayType
        #   1: dateID
        #   2: timeSegID
        #   3: lat
        #   4: lng
        for each in res:
            # if the record is not inside the region grid, skip it as invalid
            dayType, dateID, timeSegID, lat, lng = each[0], int(
                each[1]), int(each[2]), float(each[3]), float(each[4])
            if lat < self.citylocs['south'] or lat >= self.citylocs[
                    'north'] or lng < self.citylocs[
                        'west'] or lng >= self.citylocs['east']:
                continue
            lngnum = int(
                (self.citylocs['east'] - self.citylocs['west']) / split + 1)
            latind = int((lat - self.citylocs['south']) / split)
            lngind = int((lng - self.citylocs['west']) / split)
            uid = int(lngind + latind * lngnum)
            tmprecords.append({
                '_id': '%s-%s-%s' % (user, dateID, timeSegID),
                'id': int(user),
                'geometry': {
                    'type': 'Point',
                    'coordinates': [lng, lat]
                },
                'type': 'Feature',
                'properties': {
                    'gridUID': uid,
                    'daytype': dayType,
                    'dateid': dateID,
                    'timesegid': timeSegID
                }
            })
        userrecords.extend(tmprecords)
        # bulk-insert into mongoDB every 300 users
        if usernum == 300:
            logging.info("Job %s inserted the records of 300 users into database." % jobID)
            users.insert(userrecords)
            userrecords = []
            usernum = 0
    # there are still some users left in the buffer
    if usernum != 0:
        users.insert(userrecords)
    cur.close()
    db.close()
    conn.close()
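# A small worked example (illustration only; the bounding box and split values are
# assumptions) of the grid-uid formula used above: cells are numbered row by row
# from the south-west corner, so uid = lngind + latind * lngnum.
def gridUID(lat, lng, citylocs, split):
    lngnum = int((citylocs['east'] - citylocs['west']) / split + 1)
    latind = int((lat - citylocs['south']) / split)
    lngind = int((lng - citylocs['west']) / split)
    return int(lngind + latind * lngnum)

# e.g. gridUID(39.95, 116.40,
#              {'south': 39.4, 'north': 41.1, 'west': 115.4, 'east': 117.6}, 0.005)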
def aggregateVector(self, userlist, invalidlist, validobj):
    """Aggregate user behavior vectors according to the given userlist.

    Args:
        userlist (array): user list
        invalidlist (array): false grid uid list
        validobj (object): true grid object collection

    Returns:
        NULL: Description
    """
    conn, mdb = connectMongo(self.db['dbname'])
    features, users = mdb[self.db['featurecolname']], mdb[
        self.db['usercolname']]
    userveclist = []  # used to store user vectors
    # enumerate users and aggregate each user's records with an averaging strategy
    for user in userlist:
        # used to store the current user's unaggregated sub-vector sets
        tpCol = initTimePeriods()
        tmpvecs = tpCol['tpVectors']
        typearr = tpCol['tpNames']
        # user: user ID
        user = int(user)
        reclists = list(
            users.find({"id": user}, {
                "properties.gridUID": 1,
                "properties.timesegid": 1,
                "properties.daytype": 1
            }))
        for x in reclists:
            uid = str(x['properties']['gridUID'])
            # if the grid is invalid, skip this record
            if uid in invalidlist:
                continue
            else:
                try:
                    vec = validobj[uid]
                    daytype = str(x['properties']['daytype'])
                    timesegid = int(x['properties']['timesegid'] / 10)
                    # judge which time-period feature the record belongs to
                    vecInd = judFeatureTP(daytype, timesegid)
                    for eachvecInd in vecInd:
                        tmpvecs[eachvecInd]['num'] += 1
                        tmpvecs[eachvecInd]['vec'] = self.vecAdd(
                            tmpvecs[eachvecInd]['vec'], vec)
                except Exception as e:
                    raise e
        # template for generating one user's feature vector
        vectmpl = {
            '_id': user,
            'pVec': [],
            'tpNumVec': [],
            'totalNum': len(reclists)
        }
        # aggregate the vector
        notallnull = False
        for x in typearr:
            if tmpvecs[x]['num'] != 0:
                notallnull = True
                vectmpl['pVec'].append([
                    tmpvecs[x]['vec'][i] / float(tmpvecs[x]['num'])
                    for i in xrange(len(tmpvecs[x]['vec']))
                ])
                vectmpl['tpNumVec'].append(int(tmpvecs[x]['num']))
            else:
                vectmpl['pVec'].append([0] * len(tmpvecs[x]['vec']))
                vectmpl['tpNumVec'].append(0)
        if notallnull:
            userveclist.append(vectmpl)
        # flush to mongoDB every 300 users to bound memory usage
        if len(userveclist) == 300:
            features.insert(userveclist)
            userveclist = []
            gc.collect()
    if len(userveclist) != 0:
        features.insert(userveclist)
    conn.close()
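# A minimal sketch (assumption) of the vecAdd helper that is passed to the pp
# subtasks above: plain element-wise addition of two equal-length feature vectors.
def vecAddSketch(veca, vecb):
    return [a + b for a, b in zip(veca, vecb)]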
def calGridAoiDis(city):
    # initialize the grids and build the POI category profile
    locs = getCityLocs(city)
    grids = []
    split = 0.05
    centerincrement = 0.025  # round(split / 2.0, 4)
    latnum = int((locs['north'] - locs['south']) / split + 1)
    lngnum = int((locs['east'] - locs['west']) / split + 1)
    conn, db = connectMongo('tdnormal')
    POIs = db['pois_%s' % city]
    # build the district fence array
    disobjs = []
    with open(
            os.path.join(
                '/home/taojiang/git/living-modes-visual-comparison/conf/data',
                getAbbName(city) + '.json')) as f:
        stream = json.load(f)
        features = stream['features']
        for each in features:
            disobjs.append({
                'name': each['properties']['name'],
                'geo': shape(each['geometry'])
            })
    # enumerate the grids, query the surrounding POIs and update the counts
    for latind in xrange(0, latnum):
        for lngind in xrange(0, lngnum):
            # elements 0-10 are per-category counts, element 11 is the POI total,
            # elements 12-13 hold the cell center, element 14 the district index
            tmpGrid = [0 for x in xrange(0, 15)]
            validGrid = False
            lat = round(locs['south'] + latind * split, 3)
            lng = round(locs['west'] + lngind * split, 3)
            lnginc = round(lng + split, 3)
            latinc = round(lat + split, 3)
            lngcen = round(lng + centerincrement, 4)
            latcen = round(lat + centerincrement, 4)
            # a square GeoJSON polygon representing the boundary of the current grid cell
            coordsarr = [[lng, lat], [lnginc, lat], [lnginc, latinc],
                         [lng, latinc], [lng, lat]]
            featurelistarray = [0] * 11
            typevalid = False
            # query all the POIs within the maximum query radius of the cell center
            nearPOIList = list(
                POIs.find(
                    {
                        "properties.center": {
                            '$near': {
                                '$geometry': {
                                    'type': "Point",
                                    'coordinates': [lngcen, latcen]
                                },
                                '$minDistance': 0,
                                '$maxDistance': 2500 * (2**0.5)
                            }
                        }
                    }, {'properties': 1}))
            # construct the count vector with the POI type info
            poilen = len(nearPOIList)
            if poilen != 0:
                validGrid = True
                # POI list is not null
                for each in nearPOIList:
                    curPInd = each['properties']['ftype'] - 1
                    tmpGrid[curPInd] += 1
                    tmpGrid[11] += 1
            if validGrid:
                tmpGrid[12] = lngcen
                tmpGrid[13] = latcen
                point = Point(lngcen, latcen)
                index = getDisIndex(point, disobjs)
                tmpGrid[14] = index
                if index == -1:
                    print 'Invalid Grid Found.'
                grids.append(tmpGrid)
    conn.close()
    print "%s City with valid grids %s" % (city, str(len(grids)))
    return grids
def main(argv):
    # input argument option list
    try:
        opts, args = getopt.getopt(argv, "hc:d:n:",
                                   ["help", "city=", 'directory=', 'number='])
    except getopt.GetoptError as err:
        print str(err)
        usage()
        sys.exit(2)
    # parse the input arguments
    city, directory, number = 'zhangjiakou', '/home/tao.jiang/datasets/JingJinJi', 999
    for opt, arg in opts:
        if opt == '-h':
            usage()
            sys.exit()
        elif opt in ("-c", "--city"):
            city = arg
        elif opt in ("-d", "--directory"):
            directory = arg
        elif opt in ('-n', '--number'):
            number = int(arg)
    STARTTIME = time.time()
    print "Start approach at %s" % STARTTIME
    # connect to the database and fetch grid information:
    # the total grid count and the grids holding valid POIs
    conn, db = connectMongo('tdnormal')
    GRIDSNUM = db['newgrids_%s' % city].count()
    gridsData, validIDs = getGridsFromMongo(city, db)
    conn.close()
    # get the city's starting district index and its number of districts
    CITYDISIND, CITYDISNUM = getCityDisInfo(city)
    # run the processing with multiple processes
    manager = Manager()
    jobs = []
    for x in xrange(0, 20):
        PROP = {
            'INDEX': x,
            'DIRECTORY': directory,
            'GRIDSNUM': GRIDSNUM,
            'CITY': city,
            'CITYDISIND': CITYDISIND,
            'CITYDISNUM': CITYDISNUM,
            'FILENUM': number
        }
        DATA = {'gridsData': gridsData, 'validIDs': validIDs}
        jobs.append(Process(target=processTask, args=(PROP, DATA)))
        jobs[x].start()
    # wait for all processes to finish
    for job in jobs:
        job.join()
    # merge the result files
    MERGE = time.time()
    print "Start merge at %s" % MERGE
    mergeMatrixFiles(city, GRIDSNUM, directory)
    mergeDistributionFiles(city, directory)
    print "End merge in %s" % str(time.time() - MERGE)
    # done
    ENDTIME = time.time()
    print "End approach at %s" % ENDTIME
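# Example invocation (illustrative; the script name and dataset directory are
# placeholders, replace them with your own paths):
#
#   python build_matrix.py --city tangshan --directory /path/to/JingJinJi --number 200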