def run(db_host_name, db_port, directory, db_name, collection_name, collection_name_dbstat, n_threads):
    print('GEO INDEXING :::::::::::::::::::::::::::')
    start_time = time.time()

    # Get the dataset file names
    file_list = getFilesList(directory)
    file_list.sort()
    print('dataset files : ')
    for f in file_list:
        print(str(f))
    print('')
    print('I\'m working...')

    # Never spawn more threads than there are files
    if len(file_list) < n_threads:
        n_threads = len(file_list)

    # Index each file in its own thread, throttling the number of active threads
    for file_name in file_list:
        if os.path.isfile(file_name):
            t = GeoIndexingThread(file_name, db_host_name, int(db_port), db_name,
                                  collection_name, collection_name_dbstat)
            t.start()
            while threading.active_count() > n_threads:
                time.sleep(0.1)

    # Wait until only the main thread is left
    while threading.active_count() > 1:
        time.sleep(0.1)

    # If the stats collection ended up with more than one row, merge them into one
    dao = Dao(db_host_name, int(db_port))
    dao.connect(db_name)
    generals_list = list(dao.query(collection_name_dbstat, ''))
    print(generals_list)
    db_stat = None
    if len(generals_list) > 1:
        for e in generals_list:
            if db_stat is None:
                db_stat = Db_stat(e['lat_max'], e['lon_max'], e['lat_min'], e['lon_min'])
            else:
                db_stat.merge(e)
        dao.removeAll(collection_name_dbstat)
        dao.addOne(collection_name_dbstat, db_stat.__dict__)
    dao.close()

    end_time = time.time()
    logs = {}
    logs['time'] = end_time - start_time
    return logs
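
# A minimal usage sketch of run(), not part of the original code: the host, port,
# directory and thread count below are hypothetical placeholders, while the collection
# names mirror the 'clicks' and 'globals' collections used elsewhere in this code.
# run() returns a dict whose 'time' entry holds the elapsed wall-clock seconds.
def example_run_geo_indexing():
    logs = run('localhost', 27017, './dataset', 'db_geo_index',
               'clicks', 'globals', n_threads=4)
    print('indexing took ' + str(logs['time']) + ' s')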

def getBoundaries(host, port, db_name):
    # Get maps coordinate
    dao = Dao(host, port)
    dao.connect(db_name)
    c_list = list(dao.query('globals', ''))
    c_dict = dict(c_list[0])
    dao.close()
    return (float(c_dict['lat_max']), float(c_dict['lon_max'])), (float(c_dict['lat_min']), float(c_dict['lon_min']))
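
# A small sketch (assumption, not original code) of how the boundary tuples returned by
# getBoundaries() are consumed: the main() scripts below unpack them as (max_loc, min_loc)
# and build the cell Matrix from them. Host, port and the grid size are placeholder values.
def example_boundaries_to_matrix():
    max_loc, min_loc = getBoundaries('localhost', 27017, 'db_geo_index')
    print('min corner : ' + str(min_loc))
    print('max corner : ' + str(max_loc))
    # s (here 10) is the grid-size parameter that main() reads from the command line
    matrix = Matrix(min_loc, max_loc, 10)
    matrix.toString()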

def plotMap(host, port, db_name):
    # Get maps coordinate
    dao = Dao(host, port)
    dao.connect(db_name)
    c_list = list(dao.query('globals', ''))
    c_dict = dict(c_list[0])
    dao.close()

    # Select the map
    m = Basemap(projection='mill',
                llcrnrlat=int(c_dict['lat_min']),
                urcrnrlat=int(c_dict['lat_max'] + 1),
                llcrnrlon=int(c_dict['lon_min']),
                urcrnrlon=int(c_dict['lon_max'] + 1),
                resolution='i')
    m.drawcoastlines()
    m.drawcountries()
    m.drawstates()
    m.fillcontinents(color='#04BAE3', lake_color='#FFFFFF')
    m.drawmapboundary(fill_color='#FFFFFF')

    # Plot up to 20000 click locations stored in the 'clicks' collection
    dao = Dao(host, port)
    dao.connect(db_name)
    it = dao.query('clicks', '')
    counter = 0
    hasNext = True
    while hasNext and counter < 20000:
        # next() with a default never raises StopIteration; a None result means the
        # cursor is exhausted
        url = next(it, None)
        if url is None:
            hasNext = False
        else:
            loc = url['loc']
            lat, lon = float(loc[0]), float(loc[1])
            x, y = m(lon, lat)
            m.plot(x, y, 'ro')
            counter += 1
            if counter % 1000 == 0:
                print(counter)
    dao.close()

    plt.title("Geo Plotting")
    plt.show()

def getPlotsMap(host, port, db_name, collection):
    # Get maps coordinate
    dao = Dao(host, port)
    dao.connect(db_name)
    c_list = list(dao.query(collection, ''))
    c_dict = dict(c_list[0])
    dao.close()
    # Select the map
    m = Basemap(projection='mill',
                llcrnrlat=int(c_dict['lat_min']),
                urcrnrlat=int(c_dict['lat_max'] + 1),
                llcrnrlon=int(c_dict['lon_min']),
                urcrnrlon=int(c_dict['lon_max'] + 1),
                resolution='i')
    m.drawcoastlines()
    m.drawcountries()
    m.drawstates()
    m.fillcontinents(color='#04BAE3', lake_color='#FFFFFF')
    m.drawmapboundary(fill_color='#FFFFFF')
    return m
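
# A hedged example (not part of the original code) of combining getPlotsMap() with a
# single marker, mirroring the plotting done in the main() scripts: project (lon, lat)
# through the returned Basemap and draw a red dot. The coordinates and connection
# values are placeholders.
def example_plot_single_point(lat=46.07, lon=11.12):
    m = getPlotsMap('localhost', 27017, 'db_geo_index', 'globals')
    x, y = m(lon, lat)
    m.plot(x, y, 'ro')
    plt.title("Geo Plotting")
    plt.show()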

def persist(filename, host_name, port, db_name, collection_name, collection_name_dbstat):
    # Definition of max lat and lon
    lat_max = None
    lat_min = None
    lon_max = None
    lon_min = None
    # Used like a hinge
    first_url = None

    # Only .txt files are processed
    if filename.endswith('.txt'):
        with open(filename, 'r') as f:
            text = f.read()

        # Open db
        dao = Dao(host_name, port)
        dao.connect(db_name)

        rows = text.split('\n')
        counter = 0
        size = len(rows)
        #print('# Number of rows in \"'+filename+'\": ' + str(size)+'\n')

        # Get the maximum coordinates from the db
        stat = list(dao.query(collection_name_dbstat, ''))
        l_db = None
        if len(stat) > 0:
            stat_dict = dict(stat[0])
            lat_max = stat_dict['lat_max']
            lat_min = stat_dict['lat_min']
            lon_max = stat_dict['lon_max']
            lon_min = stat_dict['lon_min']
            l_db = Db_stat(lat_max, lon_max, lat_min, lon_min)

        # Parse every row, store the url and keep the bounding box up to date
        for row in rows:
            url = parse_row(row)
            if url is not None:
                loc = url.getCoordinates()
                if l_db is None:
                    l_db = Db_stat(loc[0], loc[1], loc[0], loc[1])
                    l_db.setModified()
                else:
                    l_db.updateLat(loc[0])
                    l_db.updateLon(loc[1])
                dao.addOne(collection_name, url.__dict__)
                '''counter = counter + 1
                if counter % (size // 15) == 0:
                    print(str(100 // (size / counter)) + ' % Done of \"' + filename + '\"')'''

        # Update lat_max, lon_max, lat_min and lon_min in the db if they improved
        stat = list(dao.query(collection_name_dbstat, ''))
        if len(stat) > 0:
            stat_dict = dict(stat[0])
            if l_db.isModify():
                doc_id = stat_dict["_id"]
                if l_db.lat_max > float(stat_dict['lat_max']):
                    dao.updateOne(collection_name_dbstat, {'_id': doc_id}, {'$set': {"lat_max": l_db.lat_max}})
                if l_db.lat_min < float(stat_dict['lat_min']):
                    dao.updateOne(collection_name_dbstat, {'_id': doc_id}, {'$set': {"lat_min": l_db.lat_min}})
                if l_db.lon_max > float(stat_dict['lon_max']):
                    dao.updateOne(collection_name_dbstat, {'_id': doc_id}, {'$set': {"lon_max": l_db.lon_max}})
                if l_db.lon_min < float(stat_dict['lon_min']):
                    dao.updateOne(collection_name_dbstat, {'_id': doc_id}, {'$set': {"lon_min": l_db.lon_min}})
        elif l_db is not None:
            dao.addOne(collection_name_dbstat, l_db.__dict__)

        dao.close()
        print('100 % Done of \"' + filename + '\"')
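
# A sequential sketch (assumption, not original code) of how persist() could be driven
# for a whole directory; in run() above the per-file work is delegated to
# GeoIndexingThread, which presumably wraps a call equivalent to this one. The
# connection values and collection names are placeholders mirroring those used above.
def example_persist_directory(directory):
    for file_name in sorted(getFilesList(directory)):
        if os.path.isfile(file_name):
            persist(file_name, 'localhost', 27017, 'db_geo_index', 'clicks', 'globals')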

def main(args):
    if len(args) == 1 or args[1] == '--h':
        print('Parameters : [ hostname, port, s ]')
        return 0

    # Parameters for the db
    host = args[1]
    port = int(args[2])
    # Parameters for the matrix
    s = int(args[3])
    # Parameters for http requests
    max_waiting_time = 1    # 1s timeout for each request
    l_fails = []            # list containing the failed urls

    db_name = 'db_geo_index'
    # geoindex collection
    collection_name = 'topics'
    # topic collection
    collection_topics_name = 'topics_trentino_test_approximated'

    max_loc, min_loc = getBoundaries(host, port, db_name)
    matrix = Matrix(min_loc, max_loc, s)
    matrix.toString()
    print('')

    # connect to geo dao
    dao = GeoDao(host, port)
    dao.connect(db_name, collection_topics_name)

    # ===================================================================
    # Get the plot map
    m = getPlotsMap(host, port, db_name, 'globals')
    # ===================================================================

    empty_cell_counter = 0
    n_cells = 0
    while matrix.hasNext():
        # For the plotting
        cell_full = False

        locs = matrix.next()
        # actual position of the iterator
        current = matrix.current
        print('Current cell : ' + str(current), end='\r')

        bl = [locs[0], locs[1]]
        tr = [locs[2], locs[3]]
        result = dao.getUrlsByBox(bl, tr)

        # do something with result
        l_url = []
        l_res = list(result)
        if len(l_res) == 0:
            empty_cell_counter += 1
        else:
            # compute the coordinates for the center of the cell
            cluster_lon = bl[1] + (tr[1] - bl[1]) / 2
            cluster_lat = bl[0] + (tr[0] - bl[0]) / 2
            # For plotting
            cell_full = True

        # ===================================================================
        # Plot the center of every non-empty cell
        if cell_full:
            x, y = m(cluster_lon, cluster_lat)
            m.plot(x, y, 'ro')
        # ===================================================================

        n_cells += 1

    dao.close()
    print('')
    print('# cells : ' + str(n_cells))
    print('# empty : ' + str(empty_cell_counter))
    plt.title("Geo Plotting of the full cells")
    plt.show()
    return 0
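
# A hypothetical helper (not in the original code) that isolates the cell-centre
# arithmetic used in both main() scripts: given the bottom-left and top-right corners
# of a cell as [lat, lon] pairs, the cluster coordinates are the midpoints of each axis.
def example_cell_center(bl, tr):
    cluster_lat = bl[0] + (tr[0] - bl[0]) / 2
    cluster_lon = bl[1] + (tr[1] - bl[1]) / 2
    return cluster_lat, cluster_lon

# e.g. example_cell_center([46.0, 11.0], [46.5, 11.5]) -> (46.25, 11.25)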

def main(args):
    if len(args) == 1 or args[1] == '--h':
        print('Parameters : [ hostname, port, s ]')
        return 0

    # Parameters for the db
    host = args[1]
    port = int(args[2])
    # Parameters for the matrix
    s = int(args[3])
    # Parameters for http requests
    max_waiting_time = 1    # 1s timeout for each request
    l_fails = []            # list containing the failed urls

    db_name = 'db_geo_index'
    # geoindex collection
    collection_name = 'clicks'
    # topic collection
    collection_topics_name = 'topics_mini'

    max_loc, min_loc = getBoundaries(host, port, db_name)
    matrix = Matrix(min_loc, max_loc, s)
    matrix.toString()
    print('')

    # connect to geo dao
    dao = GeoDao(host, port)
    dao.connect(db_name, collection_name)

    # ===================================================================
    # Get the plot map
    m = getPlotsMap(host, port, db_name, 'globals')
    # ===================================================================

    empty_cell_counter = 0
    n_cells = 0
    while matrix.hasNext():
        # For the plotting
        cell_full = False

        locs = matrix.next()
        # actual position of the iterator
        current = matrix.current
        print('Current cell : ' + str(current), end='\r')

        bl = [locs[0], locs[1]]
        tr = [locs[2], locs[3]]
        result = dao.getUrlsByBox(bl, tr)

        # do something with result
        l_url = []
        l_res = list(result)
        if len(l_res) == 0:
            empty_cell_counter += 1
        else:
            # compute the coordinates for the center of the cell
            cluster_lon = bl[1] + (tr[1] - bl[1]) / 2
            cluster_lat = bl[0] + (tr[0] - bl[0]) / 2

            # extract the urls of the cell and put them in a single list
            for row in l_res:
                d_row = dict(row)
                urls = d_row['urls']
                for url in urls:
                    l_url.append(url)

            # Get the corpuses of all the urls in the cell
            http_ret = http.get_corpuses(l_url, max_waiting_time, l_fails)
            corpuses = http_ret[0]
            l_fails = list(set(l_fails + http_ret[1]))  # merge fails lists
            # remove empty sublists
            corpuses = [x for x in corpuses if x != []]

            if len(corpuses) > 0:
                # NOTE: the block below is disabled (commented out) in the original
                '''
                # ONLY FOR TEST : save all the corpus =============
                print('Saving corpuses on DB ...', end='\r')
                corpuses_collection_name = 'corpuses_mini'
                d_corpuses = {}
                d_corpuses['loc'] = [cluster_lat, cluster_lon]
                d_corpuses['corpuses'] = corpuses
                dao.addOne(corpuses_collection_name, d_corpuses)
                # =================================================

                # Make lda on the corpuses
                print('Doing LDA ...', end='\r')
                l_topics = tmpLda(corpuses)

                # Save the topic list into the db
                print('Saving topics on DB ...', end='\r')
                d_topics = {}
                d_topics['loc'] = [cluster_lat, cluster_lon]
                d_topics['topics'] = l_topics
                dao.addOne(collection_topics_name, d_topics)

                # For plotting
                cell_full = True
                '''

        # ===================================================================
        # Plot the center of every cell that produced topics
        if cell_full:
            x, y = m(cluster_lon, cluster_lat)
            m.plot(x, y, 'ro')
        # ===================================================================

        n_cells += 1

    dao.close()
    print('')
    print('# cells : ' + str(n_cells))
    print('# empty : ' + str(empty_cell_counter))
    plt.title("Geo Plotting of the full cells")
    plt.show()
    return 0
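
# A hedged sketch (assumption, not original code) of reading back the topic documents
# that the disabled block above would write: each document stores the cell centre under
# 'loc' and the LDA output under 'topics'. The connection values are placeholders and
# the collection name 'topics_mini' is taken from this script.
def example_print_saved_topics():
    dao = Dao('localhost', 27017)
    dao.connect('db_geo_index')
    for row in dao.query('topics_mini', ''):
        d_row = dict(row)
        print(str(d_row['loc']) + ' : ' + str(d_row['topics']))
    dao.close()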