def write_tweets_to_mongo():
    """Load the tweet dataset, run it through the grid, and store it in MongoDB.

    This is the initial step of the pipeline: every later db-specific
    operation works on the dataset inserted here, so it is meant to be
    run only once for the entire pipeline.

    Steps, in order:
      1. Read tweets from the raw data file via ``load_tweets``.
      2. Build a ``GridDB`` from the KML grid files.
      3. Pass the tweets to ``GridDB.check_and_add`` (decides the grid of
         each tweet, per the original description).
      4. Connect to MongoDB, call ``drop()``, and insert the tweets
         returned by ``GridDB.get_tweets`` as dicts.
    """
    tweets_path = "data/sandy_all.txt"
    # NOTE: an alternative grid source used previously was
    # 'data/nyc_cb_sea.kml' (single file).
    grid_paths = ["data/nj_ct.kml", "data/nyc_ct_sea.kml"]

    loaded_tweets = load_tweets(tweets_path)

    grid = GridDB()
    for kml_path in grid_paths:
        grid.load_grid_from_file(kml_path)
    # NOTE: grid.write_grids_to_json('shapefile.json') can be called here
    # to dump the loaded grids; it was disabled in the original.
    grid.check_and_add(loaded_tweets)

    # ---- index tweets into MongoDB ----
    mongo = MongoDB()
    mongo.connect()
    print("connected...")
    # NOTE(review): presumably clears previously stored data before the
    # fresh insert -- confirm against the MongoDB wrapper.
    mongo.drop()

    # Convert each tweet object to a plain dict before insertion.
    tweet_docs = [tweet.to_dict() for tweet in grid.get_tweets()]

    print("inserting...")
    mongo.insert_tweets(tweet_docs)