def distance_based_sequence(self, band_size, outfile):

    obs = self._distance(band_size)

    # First step - get the categories for the observation ID
    obs_1 = obs.merge(self._pois[['categories']],
                      left_on='observation',
                      right_index=True).rename(columns={'categories': 'cat_observation'})

    # Second step - get the categories for the observed ID
    obs_2 = obs_1.merge(self._pois[['categories']],
                        left_on='observed',
                        right_index=True).rename(columns={'categories': 'cat_observed'})

    # Order by inverse of distance, which is not the real distance but the interaction value from PySAL:
    # the interaction among points decreases as the distance increases.
    obs_2.sort_values(by=['observation', 'distance'], ascending=False, inplace=True)

    # Third step - build the sequence joining the words. We keep sequences with at least 3 words.
    obs_3 = obs_2.groupby(['observation', 'cat_observation']).apply(
        lambda x: '\t'.join(x['cat_observed']) if len(x) > 2 else None
    ).reset_index().dropna().rename(columns={0: "sequence"})

    obs_3.loc[:, "complete"] = obs_3['cat_observation'] + "\t" + obs_3['sequence']

    # Fourth step - join the POIs dataframe with the sequences and save into a csv
    logger.info("Save sequences")
    self._pois[['categories', 'geometry']].merge(obs_3, left_index=True, right_on='observation')[
        ['categories', 'geometry', 'complete']].to_csv(
            outfile.split(".csv")[0] + "_check.csv", sep='\t', index=False)

    obs_3[['complete']].to_csv(outfile, index=False, header=False)
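
# A minimal sketch of the sequence-building step above, on a toy neighbour table.
# Column names mirror the intermediate dataframes (observation, cat_observation,
# cat_observed, distance); the values are made up for illustration only.
import pandas as pd

toy = pd.DataFrame({
    'observation':     [0, 0, 0, 1],
    'cat_observation': ['Cafe', 'Cafe', 'Cafe', 'Museum'],
    'cat_observed':    ['Bar', 'Museum', 'Park', 'Cafe'],
    'distance':        [0.9, 0.5, 0.2, 0.7],  # interaction values, higher = closer
})

# Sort by decreasing interaction and keep only observations with at least 3 neighbours,
# as in distance_based_sequence.
toy.sort_values(by=['observation', 'distance'], ascending=False, inplace=True)
sequences = toy.groupby(['observation', 'cat_observation']).apply(
    lambda x: '\t'.join(x['cat_observed']) if len(x) > 2 else None).dropna()
print(sequences)
# observation 0 yields "Bar\tMuseum\tPark"; observation 1 is dropped (fewer than 3 neighbours)
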
def _distance(self, band_size=100):

    logger.info("Building sequences for each point in the space")

    # Distance-band spatial weights: points within band_size are neighbours,
    # with non-binary weights (inverse-distance interaction values).
    wthresh = pysal.weights.DistanceBand.from_dataframe(
        self._pois, band_size, p=2, binary=False, ids=self._pois.index)

    ds = []
    for index, indexes in wthresh.neighbors.items():
        if len(indexes) == 0:
            # Isolated point: keep a self-referencing row with no distance
            d = {'observation': index, 'observed': index, 'distance': None}
            ds.append(d)
        else:
            for i in range(len(indexes)):
                d = {'observation': index,
                     'observed': indexes[i],
                     'distance': wthresh.weights[index][i]}
                ds.append(d)

    obs = pd.DataFrame(ds)
    return obs
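
# A minimal sketch of the DistanceBand weights used by _distance, on three toy points.
# It assumes the same pysal version as the method above, exposing
# weights.DistanceBand.from_dataframe; coordinates and the 100-unit threshold are illustrative.
import geopandas as gpd
import pysal
from shapely.geometry import Point

pts = gpd.GeoDataFrame(geometry=[Point(0, 0), Point(0, 50), Point(0, 500)])
w = pysal.weights.DistanceBand.from_dataframe(pts, 100, p=2, binary=False, ids=pts.index)

# neighbors maps each point to the points within the band; weights holds the
# inverse-distance interaction values in the same order.
print(w.neighbors)   # e.g. {0: [1], 1: [0], 2: []}
print(w.weights)     # e.g. {0: [0.02], 1: [0.02], 2: []}
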
def merge_features_targets(features_path, targets_path, merge_strategy):
    """
    Reads the Features and Targets DataFrames (Urban Atlas data)
    and merges them according to the provided merge strategy.
    """
    # load Targets DataFrame
    targets_df = pd.read_csv(targets_path)

    # load Features DataFrame
    features_df = pd.read_csv(features_path, sep="\t")

    # select only relevant columns
    targets_df_relevant = targets_df.loc[:, ['cellID', 'predominant']]
    features_df_relevant = features_df.loc[:, ['cellID']]
    features_df_relevant.drop_duplicates(subset=["cellID"], inplace=True)

    # Merge Features and Targets
    if merge_strategy not in [0, 1]:
        logger.info(
            "Please select a correct merge strategy. Options are (0) Left Join, (1) Inner Join.")

    if merge_strategy == 0:
        # Merge the UA and cellvector dataframes, keeping all target cells
        merged_df = targets_df_relevant.merge(features_df_relevant, on="cellID", how='left')
    else:
        # Merge the UA and cellvector dataframes, keeping only matching cells
        merged_df = targets_df_relevant.merge(features_df_relevant, on="cellID", how='inner')

    return merged_df
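
# A small, self-contained illustration of the two merge strategies above,
# using in-memory frames instead of the Urban Atlas csv files.
import pandas as pd

targets = pd.DataFrame({'cellID': [1, 2, 3], 'predominant': ['residential', 'industrial', 'green']})
features = pd.DataFrame({'cellID': [1, 2]})

left = targets.merge(features, on='cellID', how='left')     # strategy 0: keeps cell 3 as well
inner = targets.merge(features, on='cellID', how='inner')   # strategy 1: drops cell 3
print(len(left), len(inner))  # 3 2
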
def __init__(self, pois, w2v_model, binary=False):

    super(cell2vec, self).__init__()

    logger.info("Loading w2v model")

    self._pois = pois
    self._categories = pois["category"].drop_duplicates().values
    self._model = gensim.models.KeyedVectors.load_word2vec_format(w2v_model, binary=binary)
def alphabetically_sequence(self, outfile):

    if 'cellID' not in self._pois.columns:
        raise ValueError(
            "The input file with POIs must contain the column cellID.")

    logger.info("Build the sequences")

    # Sort the POIs alphabetically within each cell and keep only cells with at least 3 POIs
    self._pois.sort_values(by=["cellID", "categories"]).groupby('cellID')\
        .apply(lambda x: '\t'.join(x['categories']) if len(x) > 2 else None)\
        .dropna().to_csv(outfile, index=False, header=None)
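
# A toy run of the alphabetical strategy above: POIs are sorted by cellID and category,
# then joined with tabs; cells with fewer than 3 POIs are discarded.
import pandas as pd

pois = pd.DataFrame({
    'cellID': [7, 7, 7, 8, 8],
    'categories': ['Park', 'Bar', 'Cafe', 'Museum', 'Bar'],
})

seqs = pois.sort_values(by=['cellID', 'categories']).groupby('cellID').apply(
    lambda x: '\t'.join(x['categories']) if len(x) > 2 else None).dropna()
print(seqs.loc[7])  # "Bar\tCafe\tPark"; cell 8 is dropped
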
def merge_features_targets(features_path, targets_path, merge_strategy):
    """
    Reads the Features and Targets DataFrames (Urban Atlas data)
    and merges them according to the provided merge strategy.
    """
    # load Features DataFrame
    features_df = pd.read_csv(features_path, sep='\t', header=None)

    # rename columns: the first one is the cell identifier, the others are feature columns
    cols = [int(i) for i in features_df.columns]
    cols[0] = 'cellID'
    features_df.columns = cols
    features_df.columns = list(
        map(lambda x: 'f_fs_' + str(x) if x != "cellID" else x, features_df.columns))

    # load Targets DataFrame
    targets_df = pd.read_csv(targets_path)
    targets_df.columns = list(
        map(lambda x: 't_' + x if x != "cellID" else x, targets_df.columns))

    # select only relevant columns
    targets_df_relevant = targets_df.loc[:, ['cellID', 't_predominant']]

    # Merge Features and Targets
    if merge_strategy not in [1, 2, 3]:
        logger.info(
            "Please select a correct merge strategy. Options are (1) Left, (2) Right, (3) Inner Join.")

    if merge_strategy == 1:
        # Left join: keep all target cells
        merged_features_targets = targets_df_relevant.merge(features_df, on="cellID", how='left')
    elif merge_strategy == 2:
        # Right join: keep all feature cells
        merged_features_targets = targets_df_relevant.merge(features_df, on="cellID", how='right')
    else:
        # Inner join: keep only matching cells
        merged_features_targets = targets_df_relevant.merge(features_df, on="cellID", how='inner')

    # remove rows with missing values introduced by the join
    merged_features_targets.dropna(inplace=True)

    return merged_features_targets
def from_csv(cls, input, model, binary=False, sep='\t', category_column='categories', level=5):

    logger.info("Loading mapped POIs")

    # load the Foursquare dataset mapped onto a particular grid
    df = pd.read_csv(input, sep=sep)
    df[category_column] = df[category_column].astype(str)

    # assign a category to each record of the dataset
    df.loc[:, "category"] = utils.select_category(list(df[category_column]), level)

    # drop entries with an empty category
    df = df.loc[df["category"] != "nan"]

    return cls(df, model, binary=binary)
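
# Hypothetical usage of the constructor above (file names are placeholders): a mapped-POIs
# csv and a pre-trained word2vec model are enough to build the object; level selects the
# depth of the Foursquare category tree used for the 'category' column.
c2v = cell2vec.from_csv("rome_mapped_foursquare_pois.csv",
                        "GoogleNews-vectors-negative300.bin",
                        binary=True,
                        level=2)
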
def from_csv(cls, inputfile, sep='\t', crs=constants.default_crs):
    """
    Read a csv file with POI details, including latitude and longitude.

    :param inputfile: path to the csv file with the POIs
    :param sep: column separator of the csv file
    :param crs: coordinate reference system of the input coordinates
    :return: an instance of the class with the POIs projected to the universal CRS
    """
    # Read Foursquare POIs MAPPED onto the grid
    logger.info("Reading POIs dataset.")
    df = pd.read_csv(inputfile, sep=sep)

    # Create a GeoDataFrame from the read DataFrame
    logger.info("Create GeoDataFrame")
    geometry = [Point(xy) for xy in zip(df.longitude, df.latitude)]
    gdf = gpd.GeoDataFrame(df, index=df.index, geometry=geometry, crs={'init': crs})

    return cls(gdf.to_crs({'init': constants.universal_crs}))
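
# A minimal, self-contained sketch of the same GeoDataFrame construction on toy data;
# the EPSG codes stand in for constants.default_crs and constants.universal_crs, and the
# {'init': ...} style mirrors the geopandas API used above.
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

df = pd.DataFrame({'name': ['a', 'b'], 'latitude': [41.9, 41.8], 'longitude': [12.5, 12.4]})
geometry = [Point(xy) for xy in zip(df.longitude, df.latitude)]
gdf = gpd.GeoDataFrame(df, geometry=geometry, crs={'init': 'epsg:4326'})
projected = gdf.to_crs({'init': 'epsg:3857'})  # metric CRS, e.g. for distance computations
print(projected.geometry.head())
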
def start(self, grid, output, restart=None):

    # Store the output path on the instance
    self.output = output

    # Initialize Foursquare client authentication
    fs_client = foursquare.Foursquare(self.client_id, self.client_secret)

    start_point = 0

    # Remove the output file if it already exists, unless we are restarting
    if restart is None:
        try:
            os.remove(output)
        except OSError:
            pass
    else:
        start_point = restart

    logger.info("Calls to do: " + str(len(grid) - start_point))

    # Iterate over the spatial grid cells. For each cell call the Foursquare API
    for ind in range(start_point, len(grid)):

        # Set the bounding box for the request from the cell polygon (WKT)
        row = grid.iloc[ind]
        g = str(row.geometry)
        g_parse = g.split("((")[1].split("))")[0].split(", ")
        sw = g_parse[0].split(" ")  # South-West corner
        ne = g_parse[2].split(" ")  # North-East corner

        logger.info(str(ind) + " - " + str(sw[1]) + ", " + str(sw[0]) +
                    ", " + str(ne[1]) + ", " + str(ne[0]))

        # Setup parameters for calling the venue search API
        params = dict(sw=sw[1] + ", " + sw[0],
                      ne=ne[1] + ", " + ne[0],
                      intent="browse")

        self.get_venues_search(fs_client, params)
        self.write_file()

    # Sanity check and removal of duplicates
    logger.info("Sanity check and removing duplicates.")
    df = pd.read_csv(self.output)
    df.drop_duplicates(['name', 'latitude', 'longitude'], inplace=True)
    df.to_csv(self.output, encoding='utf-8', index=False)
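
# A toy illustration of the bounding-box extraction above. The grid cells are assumed
# to be rectangles whose WKT lists the south-west corner first and the north-east
# corner third; the coordinates below are made up.
wkt = "POLYGON ((12.40 41.80, 12.45 41.80, 12.45 41.85, 12.40 41.85, 12.40 41.80))"
g_parse = wkt.split("((")[1].split("))")[0].split(", ")
sw = g_parse[0].split(" ")  # ['12.40', '41.80'] -> lon, lat of the south-west corner
ne = g_parse[2].split(" ")  # ['12.45', '41.85'] -> lon, lat of the north-east corner
params = dict(sw=sw[1] + ", " + sw[0], ne=ne[1] + ", " + ne[0], intent="browse")
print(params)  # {'sw': '41.80, 12.40', 'ne': '41.85, 12.45', 'intent': 'browse'}
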
def nearest_based_sequence(self, outfile, inputgrid):

    logger.info("Load the grid.")

    # Load the input grid
    g = Grid.from_file(inputgrid)
    grid = g.grid.to_crs({'init': constants.universal_crs})
    grid.loc[:, 'centroid'] = grid.centroid

    df = self._pois.copy()
    df = df.merge(grid[['cellID', 'centroid']], on='cellID')

    logger.info("Compute centroid for cells and build the sequences")
    df.loc[:, 'distance'] = df.apply(self._centroid_distance, axis=1)
    df.sort_values(by=['cellID', 'distance'], inplace=True, ascending=True)

    logger.info("Save sequences")
    df.groupby('cellID').apply(self._nearest).dropna().to_csv(outfile, index=False, header=None)
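
# A self-contained sketch of the nearest-based ordering: POIs are ranked by their
# distance to the centroid of their cell, then the categories are joined into a sequence.
# The helpers _centroid_distance and _nearest are not shown above, so plain shapely
# distances and a tab-join stand in for them here.
import pandas as pd
from shapely.geometry import Point

centroid = Point(0, 0)
pois = pd.DataFrame({
    'cellID': [1, 1, 1],
    'categories': ['Bar', 'Cafe', 'Park'],
    'geometry': [Point(0, 3), Point(0, 1), Point(0, 2)],
})
pois['distance'] = pois['geometry'].apply(lambda p: p.distance(centroid))
pois.sort_values(by=['cellID', 'distance'], inplace=True)
print('\t'.join(pois['categories']))  # "Cafe\tPark\tBar"
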
def get_venues_search(self, fs_client, params):

    call_flag = False

    self.request_counter += 1
    logger.info("# Requests " + str(self.request_counter))

    url = "https://api.foursquare.com/v2/venues/search"
    logger.info(url)

    # ------------ start request ---------------
    while call_flag is False:
        try:
            data = fs_client.venues.search(params)
            call_flag = True
        except foursquare.RateLimitExceeded:
            waiting_time = 3600
            logger.info("wait %s", waiting_time)
            self.write_file()
            time.sleep(waiting_time)
        except Exception as exc:
            logger.error("ERROR: {0}".format(exc))
    # ----------- end request ------------------

    tot = data['venues']
    logger.info("Number of venues: " + str(len(tot)))

    # Iterate over venues
    for glob in range(0, len(tot)):

        current_cat = data['venues'][glob]['categories']
        if len(current_cat) == 0:
            continue

        checkin = data['venues'][glob]['stats']['checkinsCount']
        user = data['venues'][glob]['stats']['usersCount']
        name = data['venues'][glob]['name']

        current_loc = data['venues'][glob]['location']
        lat = current_loc['lat']
        lon = current_loc['lng']

        # Check presence of address and cross street
        if 'address' not in current_loc:
            address = ""
        else:
            address = current_loc['address']

        if 'crossStreet' not in current_loc:
            crossStreet = ""
        else:
            crossStreet = current_loc['crossStreet']

        # Get categories
        if 'pluralName' in current_cat[0]:
            current_cat = current_cat[0]['pluralName']
        else:
            current_cat = current_cat[0]['name']

        if current_cat not in self.cat.index:
            continue

        cat_name = [
            self.cat.loc[current_cat][e]
            for e in self.cat.loc[current_cat].index
            if e.endswith('name') and self.cat.loc[current_cat][e] != "-"
        ]

        # append the venue data
        self.foursquare_data = self.foursquare_data.append(
            {
                "name": name,
                "address": address,
                "crossStreet": crossStreet,
                "categories": ':'.join(cat_name),
                "checkin": checkin,
                "usercount": user,
                "latitude": lat,
                "longitude": lon
            },
            ignore_index=True)

    # Check if there is still rate remaining to call the API
    if int(fs_client.rate_remaining) <= 100 and int(fs_client.rate_limit) > 0:
        waiting_time = 3600
        logger.info("wait %s", waiting_time)
        self.write_file()
        time.sleep(waiting_time)

    # Recurse if there are at least 10 places and the bounding-box diagonal is at least 20 metres:
    # split the bounding box into four quadrants and query each of them
    if len(tot) >= 10 and great_circle(params['ne'], params['sw']).meters >= 20:

        x1, y1 = params['ne'].split(',')
        x2, y2 = params['sw'].split(',')
        x12 = str((float(x1) + float(x2)) / 2.0)
        y12 = str((float(y1) + float(y2)) / 2.0)

        new_params = [
            dict(ne=x12 + ", " + y1, sw=x2 + ", " + y12, intent="browse"),
            dict(ne=x1 + ", " + y1, sw=x12 + ", " + y12, intent="browse"),
            dict(ne=x12 + ", " + y12, sw=x2 + ", " + y2, intent="browse"),
            dict(ne=x1 + ", " + y12, sw=x12 + ", " + y2, intent="browse"),
        ]

        for param in new_params:
            self.get_venues_search(fs_client, param)
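
# A toy trace of the quadrant split used in the recursion above: a "lat, lon" bounding
# box is cut into four sub-boxes whose corners reuse the mid-point coordinates, so the
# four sub-boxes tile the original one. The coordinates are illustrative.
ne, sw = "41.85, 12.45", "41.80, 12.40"
x1, y1 = ne.split(',')
x2, y2 = sw.split(',')
x12 = str((float(x1) + float(x2)) / 2.0)
y12 = str((float(y1) + float(y2)) / 2.0)
quadrants = [
    dict(ne=x12 + ", " + y1, sw=x2 + ", " + y12),
    dict(ne=x1 + ", " + y1, sw=x12 + ", " + y12),
    dict(ne=x12 + ", " + y12, sw=x2 + ", " + y2),
    dict(ne=x1 + ", " + y12, sw=x12 + ", " + y2),
]
for q in quadrants:
    print(q)
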
def main(argv):

    parser = argparse.ArgumentParser(
        'Build the category embedding matrix from a word2vec model.')

    parser.add_argument('-o', '--outputfolder',
                        help='Output folder where to save the matrix.',
                        action='store',
                        dest='outputfolder',
                        required=True,
                        type=str)

    parser.add_argument('-i', '--input',
                        help='Input word2vec model (a gensim .model file or a binary word2vec file).',
                        action='store',
                        dest='inputfile',
                        required=True,
                        type=str)

    parser.add_argument('-a', '--area',
                        action='store',
                        dest='area',
                        help='Area name',
                        default=None,
                        type=str)

    parser.add_argument('-s', '--size',
                        action='store',
                        dest='size',
                        help='Word2Vec vector size. Used when employing the Google News model.',
                        default=None,
                        type=str)

    parser.add_argument('-v', '--verbose',
                        help='Level of output verbosity.',
                        action='store',
                        dest='verbosity',
                        default=0,
                        type=int,
                        nargs="?")

    args = parser.parse_args()

    if args.verbosity == 1:
        logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
    elif args.verbosity == 2:
        logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.DEBUG)

    logger.info("Loading w2v model.")

    model = None
    ext = (".biz", ".bin")
    if args.inputfile.endswith(ext):
        model = gensim.models.KeyedVectors.load_word2vec_format(args.inputfile, binary=True)
    else:
        model = gensim.models.Word2Vec.load(args.inputfile)

    # Collect the category names from all levels of the Foursquare category tree
    tree = pd.read_csv(pkg_resources.resource_filename('geol', '/resources/category_tree.csv'),
                       encoding='iso-8859-1')

    words = tree['level1_name'].dropna().drop_duplicates().tolist() + \
        tree['level2_name'].dropna().drop_duplicates().tolist() + \
        tree['level3_name'].dropna().drop_duplicates().tolist() + \
        tree['level4_name'].dropna().drop_duplicates().tolist()

    # Infer the vector size either from the command line or from the model file name
    m = re.search('_s([0-9]+)_', args.inputfile)
    if args.size:
        size = args.size
    elif m:
        size = m.group(1)

    m = re.search('.+/(.+).model', args.inputfile)
    if m:
        model_details = m.group(1)
    else:
        model_details = 'gnews'

    outputfile = os.path.abspath(
        os.path.join(args.outputfolder, "matrix_" + args.area + "_" + model_details + ".txt"))

    f = open(outputfile, 'w', encoding='utf-8')

    for word in words:

        word = utils.normalize_word(word)
        w = word.split(' ')
        v = [0] * int(size)

        if len(w) > 1:
            # For multi-word categories, average the vectors of the single words
            tmp_w2v = []
            for e in w:
                if e in model:
                    tmp_w2v.append(model[e])
            if len(tmp_w2v) > 0:
                v = np.mean(tmp_w2v, axis=0)
        elif word in model:
            v = model[word]

        s = ','.join(map(str, v))
        f.write(word.replace(" ", "_") + "::n" + "\t1.0\t0\t" + s + "\n")

    f.close()
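
# A small, self-contained sketch of the multi-word averaging above: for a category such
# as "art museum", the vector is the mean of the vectors of its single words. A tiny
# Word2Vec model trained on toy sentences stands in for the real pre-trained model, and
# the pre-4.0 gensim API used in the script (size=, model[word], word in model) is assumed.
import numpy as np
import gensim

sentences = [["art", "museum", "park"], ["museum", "art", "bar"]] * 50
model = gensim.models.Word2Vec(sentences, size=10, min_count=1, seed=1)

words = "art museum".split(' ')
vectors = [model[w] for w in words if w in model]
v = np.mean(vectors, axis=0) if vectors else np.zeros(10)
print(','.join(map(str, v)))
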
def main(argv):

    parser = argparse.ArgumentParser('Foursquare mapping to a spatial grid.')

    parser.add_argument('-i', '--input',
                        help='POIs file with relative coordinates.',
                        action='store',
                        dest='input',
                        required=True,
                        type=str)

    parser.add_argument('-p', '--prefix',
                        action='store',
                        dest='prefix',
                        help='Prefix for the filename specifying the city name.',
                        required=True,
                        type=str)

    parser.add_argument('-g', '--grid',
                        help='Input grid for the mapping. If the crs is not WGS84, specify it with the param -c.',
                        action='store',
                        dest='grid',
                        required=True,
                        type=str)

    parser.add_argument('-c', '--crs',
                        help='Coordinate Reference System of the input grid. Required only if it differs from WGS84.',
                        action='store',
                        dest='crs',
                        default='epsg:4326',
                        type=str)

    parser.add_argument('-o', '--outputfolder',
                        help='Output folder where to save the mapped file.',
                        action='store',
                        dest='outputfolder',
                        required=True,
                        type=str)

    parser.add_argument('-lat', '--latitude',
                        help='Latitude column name.',
                        action='store',
                        dest='latitude',
                        default='latitude',
                        type=str)

    parser.add_argument('-long', '--longitude',
                        help='Longitude column name.',
                        action='store',
                        dest='longitude',
                        default='longitude',
                        type=str)

    parser.add_argument('-v', '--verbose',
                        help='Level of output verbosity.',
                        action='store',
                        dest='verbosity',
                        default=0,
                        type=int,
                        nargs="?")

    args = parser.parse_args()

    latitude = args.latitude
    longitude = args.longitude

    if args.verbosity == 1:
        logger.setLevel(logging.INFO)
    elif args.verbosity == 2:
        logger.setLevel(logging.DEBUG)

    # Load the grid
    logger.info("Load the grid")
    gdf = gpd.GeoDataFrame.from_file(args.grid)
    gdf.crs = {'init': args.crs}

    if args.crs != 'epsg:4326':
        gdf = gdf.to_crs({'init': 'epsg:4326'})

    # Load POIs
    logger.info("Load POIs")
    df = pd.read_csv(args.input, sep=",", low_memory=False)

    # Create Points from latitude, longitude pairs and build a GeoDataFrame
    logger.info("Build geometry")
    geometry = [Point(xy) for xy in zip(df[longitude], df[latitude])]
    data = gpd.GeoDataFrame(df, crs={'init': 'epsg:4326'}, geometry=geometry)
    data.to_crs(gdf.crs, inplace=True)

    # Check geometry validity and drop invalid geometries
    ans = data.geometry.is_valid
    invalid = ans[ans == False]
    data.drop(invalid.index, axis=0, inplace=True)

    # Spatial join with the grid to associate each entry with the related cell ('within') - LEFT join
    join = gpd.sjoin(gdf[['cellID', 'geometry']], data, how='left', op='within')

    # Remove additional columns
    join.drop(['index_right', 'geometry'], axis=1, inplace=True)

    # Save output
    logger.info("Save output file")
    outputfile = os.path.abspath(
        os.path.join(args.outputfolder, args.prefix + "_mapped_foursquare_pois.csv"))
    join.to_csv(outputfile, index=False, sep='\t', float_format='%.6f')
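
# A minimal, self-contained illustration of the spatial join performed above: each POI
# point is attached to the grid cell that contains it, keeping the grid on the left so
# empty cells survive the left join. The predicate is written as 'contains' here (cell
# contains point); CRS handling is omitted, and the older geopandas 'op' keyword used in
# the script is assumed.
import geopandas as gpd
from shapely.geometry import Point, box

grid = gpd.GeoDataFrame({'cellID': [1, 2]},
                        geometry=[box(0, 0, 1, 1), box(1, 0, 2, 1)])
pois = gpd.GeoDataFrame({'name': ['bar', 'cafe']},
                        geometry=[Point(0.5, 0.5), Point(1.5, 0.5)])
join = gpd.sjoin(grid[['cellID', 'geometry']], pois, how='left', op='contains')
print(join[['cellID', 'name']])
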