Exemplo n.º 1
0
def run(db_host_name, db_port, directory, db_name, collection_name,
        collection_name_dbstat, n_threads):
    """Index every dataset file in ``directory`` and consolidate the bounds.

    One ``GeoIndexingThread`` is started for each regular file returned by
    ``getFilesList(directory)``, keeping at most ``n_threads`` of them alive
    at the same time. After every worker has finished, the documents stored
    in ``collection_name_dbstat`` are merged into a single document when
    more than one is present.

    Args:
        db_host_name: Database host, handed to the workers and to ``Dao``.
        db_port: Database port; converted with ``int()``.
        directory: Directory handed to ``getFilesList``.
        db_name: Name of the database to connect to.
        collection_name: Collection name handed to every worker.
        collection_name_dbstat: Collection holding the bounding-box
            documents (keys ``lat_max``, ``lon_max``, ``lat_min``,
            ``lon_min``).
        n_threads: Maximum number of concurrent workers. Values below 1
            are treated as 1.

    Returns:
        dict: ``{'time': elapsed_seconds}``.
    """
    print('GEO INDEXING :::::::::::::::::::::::::::')

    start_time = time.time()
    port = int(db_port)

    # get files names
    file_list = getFilesList(directory)
    file_list.sort()

    print('dataset files : ')
    for f in file_list:
        print(str(f))
    print('')
    print('I\'m working...')

    # Never more workers than files, and never fewer than one: a limit of
    # zero could never be satisfied and would stall the loop below forever.
    n_threads = max(1, min(n_threads, len(file_list)))

    # Only the workers started here are tracked. Counting every thread of
    # the process would let unrelated threads distort the limit or keep the
    # final wait from ever finishing.
    # NOTE(review): assumes GeoIndexingThread is a threading.Thread subclass
    # (is_alive()/join() available) - confirm.
    workers = []
    for file_name in file_list:
        if not os.path.isfile(file_name):
            continue

        t = GeoIndexingThread(file_name, db_host_name, port,
                              db_name, collection_name,
                              collection_name_dbstat)
        t.start()
        workers.append(t)

        # Wait for a free slot before starting the next worker.
        while True:
            workers = [w for w in workers if w.is_alive()]
            if len(workers) < n_threads:
                break
            time.sleep(0.1)

    for w in workers:
        w.join()

    # If there is more than one row in the stats collection, merge them.
    dao = Dao(db_host_name, port)
    dao.connect(db_name)
    try:
        generals_list = list(dao.query(collection_name_dbstat, ''))
        print(generals_list)
        if len(generals_list) > 1:
            first = generals_list[0]
            db_stat = Db_stat(first['lat_max'], first['lon_max'],
                              first['lat_min'], first['lon_min'])
            for e in generals_list[1:]:
                db_stat.merge(e)
            dao.removeAll(collection_name_dbstat)
            dao.addOne(collection_name_dbstat, db_stat.__dict__)
    finally:
        dao.close()

    end_time = time.time()

    return {'time': end_time - start_time}
Exemplo n.º 2
0
def getBoundaries(host, port, db_name):
    """Return the bounding box stored in the ``globals`` collection.

    Args:
        host: Database host handed to ``Dao``.
        port: Database port handed to ``Dao``.
        db_name: Name of the database to connect to.

    Returns:
        tuple: ``((lat_max, lon_max), (lat_min, lon_min))``, all floats,
        read from the first document of ``globals``.

    Raises:
        IndexError: If the ``globals`` collection holds no document.
    """
    dao = Dao(host, port)
    dao.connect(db_name)
    try:
        c_list = list(dao.query('globals', ''))
    finally:
        # Release the connection even when the query fails.
        dao.close()

    if not c_list:
        raise IndexError("collection 'globals' is empty: no boundaries stored")

    c_dict = dict(c_list[0])
    max_loc = (float(c_dict['lat_max']), float(c_dict['lon_max']))
    min_loc = (float(c_dict['lat_min']), float(c_dict['lon_min']))
    return max_loc, min_loc
Exemplo n.º 3
0
def plotMap(host, port, db_name, max_points=20000):
    """Plot the documents of the ``clicks`` collection on a map.

    The map extent is taken from the first document of the ``globals``
    collection (keys ``lat_min``, ``lat_max``, ``lon_min``, ``lon_max``).
    Each plotted document must expose ``loc`` as ``[lat, lon]``.

    Args:
        host: Database host handed to ``Dao``.
        port: Database port handed to ``Dao``.
        db_name: Name of the database to connect to.
        max_points: Maximum number of documents to plot (default 20000).

    Raises:
        IndexError: If the ``globals`` collection holds no document.
    """
    import math

    dao = Dao(host, port)
    dao.connect(db_name)
    try:
        # Get maps coordinate
        c_list = list(dao.query('globals', ''))
        c_dict = dict(c_list[0])

        # Select the map. The lower bounds are floored rather than truncated:
        # int() rounds toward zero, which would move a negative minimum
        # inside the data range and crop the map.
        m = Basemap(projection='mill',
                    llcrnrlat=math.floor(c_dict['lat_min']),
                    urcrnrlat=int(c_dict['lat_max'] + 1),
                    llcrnrlon=math.floor(c_dict['lon_min']),
                    urcrnrlon=int(c_dict['lon_max'] + 1),
                    resolution='i')

        m.drawcoastlines()
        m.drawcountries()
        m.drawstates()
        m.fillcontinents(color='#04BAE3', lake_color='#FFFFFF')
        m.drawmapboundary(fill_color='#FFFFFF')

        # Iterate the query result directly so that running out of documents
        # simply ends the loop instead of yielding a None document.
        counter = 0
        for url in dao.query('clicks', ''):
            if counter >= max_points:
                break

            loc = url['loc']
            lat, lon = float(loc[0]), float(loc[1])
            x, y = m(lon, lat)
            m.plot(x, y, 'ro')
            counter += 1

            # Progress feedback every 1000 plotted points.
            if counter % 1000 == 0:
                print(counter)
    finally:
        dao.close()

    plt.title("Geo Plotting")
    plt.show()
Exemplo n.º 4
0
def getPlotsMap(host, port, db_name, collection):
    """Build a ``Basemap`` covering the bounding box stored in ``collection``.

    The extent is read from the first document of ``collection`` (keys
    ``lat_min``, ``lat_max``, ``lon_min``, ``lon_max``). Coastlines,
    countries, states, continents and the map boundary are drawn before the
    map is returned.

    Args:
        host: Database host handed to ``Dao``.
        port: Database port handed to ``Dao``.
        db_name: Name of the database to connect to.
        collection: Name of the collection holding the bounding box.

    Returns:
        The configured ``Basemap`` instance.

    Raises:
        IndexError: If ``collection`` holds no document.
    """
    import math

    # Get maps coordinate
    dao = Dao(host, port)
    dao.connect(db_name)
    try:
        c_list = list(dao.query(collection, ''))
    finally:
        # Release the connection even when the query fails.
        dao.close()

    c_dict = dict(c_list[0])

    # Select the map. The lower bounds are floored rather than truncated:
    # int() rounds toward zero, which would move a negative minimum inside
    # the data range and crop the map.
    m = Basemap(projection='mill',
                llcrnrlat=math.floor(c_dict['lat_min']),
                urcrnrlat=int(c_dict['lat_max'] + 1),
                llcrnrlon=math.floor(c_dict['lon_min']),
                urcrnrlon=int(c_dict['lon_max'] + 1),
                resolution='i')

    m.drawcoastlines()
    m.drawcountries()
    m.drawstates()
    m.fillcontinents(color='#04BAE3', lake_color='#FFFFFF')
    m.drawmapboundary(fill_color='#FFFFFF')

    return m
Exemplo n.º 5
0
def persist(filename, host_name, port, db_name, collection_name,
            collection_name_dbstat):
    """Store the rows of a ``.txt`` file and update the stored bounding box.

    Every row that ``parse_row`` accepts is inserted into
    ``collection_name``. The coordinates of the inserted rows are tracked in
    a ``Db_stat`` object, which is then written to
    ``collection_name_dbstat``: inserted as a new document when the
    collection is empty, otherwise used to widen the bounds of the first
    stored document. Files whose name does not end in ``.txt`` are skipped.
    The completion message is printed in both cases.

    Args:
        filename: Path of the file to load.
        host_name: Database host handed to ``Dao``.
        port: Database port handed to ``Dao``.
        db_name: Name of the database to connect to.
        collection_name: Collection receiving the parsed rows.
        collection_name_dbstat: Collection holding the bounding box (keys
            ``lat_max``, ``lat_min``, ``lon_max``, ``lon_min``).
    """
    # Check if the file is a txt
    if filename.endswith('.txt'):
        with open(filename, 'r') as f:
            rows = f.read().split('\n')

        # Open db
        dao = Dao(host_name, port)
        dao.connect(db_name)
        try:
            # Start from the bounds already stored in the db, if any.
            l_db = None
            stat = list(dao.query(collection_name_dbstat, ''))
            if stat:
                stat_dict = dict(stat[0])
                l_db = Db_stat(stat_dict['lat_max'], stat_dict['lon_max'],
                               stat_dict['lat_min'], stat_dict['lon_min'])

            for row in rows:
                url = parse_row(row)
                if url is None:
                    continue

                loc = url.getCoordinates()
                if l_db is None:
                    # First coordinates ever seen: they are both min and max.
                    l_db = Db_stat(loc[0], loc[1], loc[0], loc[1])
                    l_db.setModified()
                else:
                    l_db.updateLat(loc[0])
                    l_db.updateLon(loc[1])

                dao.addOne(collection_name, url.__dict__)

            # l_db is still None when nothing was stored beforehand and no
            # row could be parsed; there is then no bounding box to write.
            if l_db is not None:
                _persist_bounds(dao, collection_name_dbstat, l_db)
        finally:
            dao.close()
    print('100 % Done of "' + filename + '"')


def _persist_bounds(dao, collection_name_dbstat, l_db):
    """Write ``l_db`` to the stats collection, widening existing bounds only."""
    # Re-read the collection; the stored document is compared against l_db
    # as it is at this moment.
    stat = list(dao.query(collection_name_dbstat, ''))
    if not stat:
        dao.addOne(collection_name_dbstat, l_db.__dict__)
        return

    if not l_db.isModify():
        return

    stat_dict = dict(stat[0])
    doc_id = stat_dict["_id"]

    # (field, True when the field is an upper bound)
    bounds = (('lat_max', True), ('lat_min', False),
              ('lon_max', True), ('lon_min', False))
    for key, is_upper in bounds:
        mine = getattr(l_db, key)
        stored = float(stat_dict[key])
        widens = mine > stored if is_upper else mine < stored
        if widens:
            dao.updateOne(collection_name_dbstat, {'_id': doc_id},
                          {'$set': {key: mine}})
Exemplo n.º 6
0
def main(args):
    """Plot the centre of every grid cell that contains at least one result.

    The bounding box returned by ``getBoundaries`` is split into cells by a
    ``Matrix`` built with parameter ``s``. Every cell is queried through
    ``GeoDao.getUrlsByBox``; a red dot is drawn at the centre of each cell
    whose query returns something.

    Args:
        args: Command-line style list ``[program, hostname, port, s]``.

    Returns:
        int: 0 on success or when help is shown (no parameters, or ``--h``
        as first parameter); 1 when too few parameters are given.
    """
    usage = 'Parameters : [ hostname, port, s ]'
    if len(args) <= 1 or args[1] == '--h':
        print(usage)
        return 0
    if len(args) < 4:
        # Report incomplete parameters instead of failing on a missing index.
        print(usage)
        return 1

    # Parameters for the db
    host = args[1]
    port = int(args[2])

    # Parameters for the matrix
    s = int(args[3])

    db_name = 'db_geo_index'

    # topic collection
    collection_topics_name = 'topics_trentino_test_approximated'

    max_loc, min_loc = getBoundaries(host, port, db_name)

    matrix = Matrix(min_loc, max_loc, s)
    matrix.toString()
    print('')

    # connect to geo dao
    dao = GeoDao(host, port)
    dao.connect(db_name, collection_topics_name)

    empty_cell_counter = 0
    n_cells = 0
    try:
        # Get the plot map
        m = getPlotsMap(host, port, db_name, 'globals')

        while matrix.hasNext():
            locs = matrix.next()

            # actual position of the iterator
            print('Current cell : ' + str(matrix.current), end='\r')

            # locs is indexed as [bl_lat, bl_lon, tr_lat, tr_lon]
            bl = [locs[0], locs[1]]
            tr = [locs[2], locs[3]]

            l_res = list(dao.getUrlsByBox(bl, tr))
            if not l_res:
                empty_cell_counter += 1
            else:
                # compute the coordinates for the center of the cell
                cluster_lon = bl[1] + (tr[1] - bl[1]) / 2
                cluster_lat = bl[0] + (tr[0] - bl[0]) / 2

                x, y = m(cluster_lon, cluster_lat)
                m.plot(x, y, 'ro')

            n_cells += 1
    finally:
        dao.close()

    print('')
    print('# cells : ' + str(n_cells))
    print('# empty : ' + str(empty_cell_counter))

    plt.title("Geo Plotting of the full cells")
    plt.show()

    return 0
Exemplo n.º 7
0
def main(args):
    """Walk the grid cells and fetch the corpuses of the URLs in each cell.

    The bounding box returned by ``getBoundaries`` is split into cells by a
    ``Matrix`` built with parameter ``s``. For every non-empty cell the
    ``urls`` of all returned rows are collected and handed to
    ``http.get_corpuses``. The corpus-saving / LDA / topic-persistence step
    that would consume the corpuses is disabled, so no cell is ever marked
    as full and nothing is plotted on the map.

    Args:
        args: Command-line style list ``[program, hostname, port, s]``.

    Returns:
        int: 0 on success or when help is shown (no parameters, or ``--h``
        as first parameter); 1 when too few parameters are given.
    """
    usage = 'Parameters : [ hostname, port, s ]'
    if len(args) <= 1 or args[1] == '--h':
        print(usage)
        return 0
    if len(args) < 4:
        # Report incomplete parameters instead of failing on a missing index.
        print(usage)
        return 1

    # Parameters for the db
    host = args[1]
    port = int(args[2])

    # Parameters for the matrix
    s = int(args[3])

    # Parameters for http requests
    max_waiting_time = 1  # 1s timeout for each request
    l_fails = []  # list containing the fails url

    db_name = 'db_geo_index'

    # geoindex collection
    collection_name = 'clicks'

    # topic collection (only referenced by the disabled step below)
    collection_topics_name = 'topics_mini'

    max_loc, min_loc = getBoundaries(host, port, db_name)

    matrix = Matrix(min_loc, max_loc, s)
    matrix.toString()
    print('')

    # connect to geo dao
    dao = GeoDao(host, port)
    dao.connect(db_name, collection_name)

    empty_cell_counter = 0
    n_cells = 0
    try:
        # Get the plot map
        m = getPlotsMap(host, port, db_name, 'globals')

        while matrix.hasNext():
            # For the plotting
            cell_full = False

            locs = matrix.next()

            # actual position of the iterator
            print('Current cell : ' + str(matrix.current), end='\r')

            # locs is indexed as [bl_lat, bl_lon, tr_lat, tr_lon]
            bl = [locs[0], locs[1]]
            tr = [locs[2], locs[3]]

            l_res = list(dao.getUrlsByBox(bl, tr))
            if not l_res:
                empty_cell_counter += 1
            else:
                # compute the coordinates for the center of the cell
                cluster_lon = bl[1] + (tr[1] - bl[1]) / 2
                cluster_lat = bl[0] + (tr[0] - bl[0]) / 2

                # extract url and put it in a list
                l_url = []
                for row in l_res:
                    l_url.extend(dict(row)['urls'])

                # Get the corpuses of all the urls of the cell with a single
                # request batch, issued once the whole list is built, so the
                # urls of earlier rows are not fetched again for every
                # following row.
                http_ret = http.get_corpuses(l_url, max_waiting_time, l_fails)
                l_fails = list(set(l_fails + http_ret[1]))  # merges fails list

                # remove empty sublist
                corpuses = [c for c in http_ret[0] if c != []]

                if corpuses:
                    # Disabled step, kept for reference. While it is
                    # disabled cell_full stays False and nothing is plotted.
                    #
                    # # ONLY FOR TEST : save all the corpus
                    # corpuses_collection_name = 'corpuses_mini'
                    # d_corpuses = {'loc': [cluster_lat, cluster_lon],
                    #               'corpuses': corpuses}
                    # dao.addOne(corpuses_collection_name, d_corpuses)
                    #
                    # # Make lda on the corpuses
                    # l_topics = tmpLda(corpuses)
                    #
                    # # Save the topic list into the db
                    # d_topics = {'loc': [cluster_lat, cluster_lon],
                    #             'topics': l_topics}
                    # dao.addOne(collection_topics_name, d_topics)
                    #
                    # # For plotting
                    # cell_full = True
                    pass

            # Plot the centre of the cell when it was marked as full.
            if cell_full:
                x, y = m(cluster_lon, cluster_lat)
                m.plot(x, y, 'ro')

            n_cells += 1
    finally:
        dao.close()

    print('')
    print('# cells : ' + str(n_cells))
    print('# empty : ' + str(empty_cell_counter))

    plt.title("Geo Plotting of the full cells")
    plt.show()

    return 0