def geopycheck():
    """Interactively configure the geopy geocoder scheme for PyWeather.

    Offers a manual setup (user types 'http' or 'https') and an automatic
    setup (probe Google's geocoder over HTTPS, falling back to HTTP).
    The chosen scheme is written to config['GEOCODER']['scheme'].
    """
    print("In version 0.6.2 beta and above, your geocoder scheme needs to get set, based on your OS.",
          "PyWeather can automatically do this now, or you can manually define your scheme.",
          "Type in 'automaticsetup' for the automatic setup, and 'manualsetup' for manual setup",
          "in the prompt below.", sep="\n")
    setupmethod = input("Input here: ").lower()
    if setupmethod == "manualsetup":
        print("Geopy's Google geocoder can work in HTTPS-enabled mode on 95% of platforms,",
              "but has a tendancy to fail on OS X, or other platforms. In the prompt below,",
              "enter 'https' for geopy to work in https mode, or 'http' for http mode.",
              "Please note: Your settings will not be validated!", sep="\n")
        geopymode = input("Input here: ").lower()
        if geopymode == "https":
            config['GEOCODER']['scheme'] = 'https'
            print("Changes saved.")
        else:
            # BUG FIX: this branch previously wrote 'https' even though the
            # user asked for (or was being defaulted to) 'http'.
            config['GEOCODER']['scheme'] = 'http'
            if geopymode == "http":
                print("Changes saved.")
            else:
                print("Couldn't understand your input. Defaulting to 'http'.")
        return
    # Anything other than 'manualsetup' falls through to automatic setup.
    if setupmethod == "automaticsetup":
        print("Starting automatic setup.")
    else:
        print("Couldn't understand your input. Defaulting to automatic setup.")
    import geopy
    from geopy import GoogleV3
    geocoder = GoogleV3(scheme='https')
    # Warm-up geocode: the first request's outcome is deliberately ignored.
    try:
        geocoder.geocode("123 5th Avenue, New York, NY")
    except Exception:
        # Was a bare `except:` that only set a throwaway flag variable;
        # narrowed so KeyboardInterrupt/SystemExit still propagate.
        pass
    try:
        geocoder.geocode("123 5th Avenue, New York, NY")
        print("The geocoder can operate with HTTPS enabled on your OS. Saving these changes...")
        config['GEOCODER']['scheme'] = 'https'
        print("Changes saved.")
    except geopy.exc.GeocoderServiceError:
        print("Geopy probably can't run without HTTPS (or your internet went down). Trying HTTP as the scheme...")
        geocoder = GoogleV3(scheme='http')
        try:
            geocoder.geocode("123 5th Avenue, New York, NY")
            print("The geocoder can operate, but without HTTPS enabled on your OS. Saving these changes...")
            config['GEOCODER']['scheme'] = 'http'
            print("Changes saved.")
        except geopy.exc.GeocoderServiceError:
            print("You probably don't have an internet connection, as HTTPS and HTTP validation both failed.",
                  "Defaulting to HTTP as the geopy scheme...", sep="\n")
            config['GEOCODER']['scheme'] = 'http'
            print("Changes saved.")
def create_afs_house(soup, url, bedrooms):
    """Build an Accommodation from a parsed AFS listing page.

    Extracts the weekly price (e.g. "£123pw"), converts it to a monthly
    figure, finds a UK postcode in the page text, geocodes it with
    GoogleV3, and returns a populated accommodation.Accommodation.

    Parameters
    ----------
    soup : BeautifulSoup parse tree of the listing page.
    url : listing URL stored on the returned house.
    bedrooms : bedroom count stored on the returned house.
    """
    price = soup.find("div", class_="style12x")
    price = price.text
    price = re.search('£(.*)pw', price)  # Find price per week
    price = price.group(1)
    price = int(price)
    # Weekly -> monthly: 52 weeks spread across 12 months.
    price = int((price * 52) / 12)
    print(price)
    # UK postcode pattern.  FIX: raw string — '\s' in a plain string is an
    # invalid escape sequence (DeprecationWarning, error in future Pythons).
    pattern = re.compile(r'[A-Z]{1,2}[0-9][0-9A-Z]?\s?[0-9][A-Z]{2}')
    location_string = soup.find(text=pattern)
    location_string = str(location_string.string)
    # Strip any parenthesised annotation before geocoding
    # (the old comment said "Remove whitespace", which was wrong).
    location_string = re.sub(r'\([^)]*\)', '', location_string)
    geolocator = GoogleV3()
    location = geolocator.geocode(location_string)
    print(location.latitude)
    print(location.longitude)
    # NOTE(review): looked up but never used — kept for behavioral parity.
    furnished = soup.find(text=re.compile("Furnished"))
    house = accommodation.Accommodation(price, bedrooms, "UNSURE",
                                        location.address, 1, url)
    house.lat = location.latitude
    house.long = location.longitude
    return house
def geocode(address, attempt=0):
    """Geocode an address dict ({'address': ..., 'locality': ...}) with Google.

    Retries up to 3 times on timeout. Returns the geopy point, or None
    when Google gives up, returns nothing, or the retry budget is spent.
    Raises GeocoderQuotaExceeded unchanged so callers can stop the batch.
    """
    # Google is free up to 2500 requests per day, then 0.50€ per 1000. We don't use
    # Nominatim because it doesn't like bulk requests. Other services cost money.
    service = GoogleV3(api_key=GOOGLE_API_KEY)
    query = '{address}, {locality}'.format(**address)
    point = None
    if attempt > 2:
        warning('Google timed out 3 times. Giving up on %s', query)
    else:
        try:
            point = service.geocode(query)
            if not point:
                raise GeopyError('Google returned empty object')
            if 'partial_match' in point.raw.keys():
                warning('Google partly matched %s', query)
            else:
                debug('Google matched %s', query)
        except GeocoderQuotaExceeded:
            raise
        except GeocoderTimedOut:
            # BUG FIX: the recursive retry's result was previously discarded,
            # so a timed-out call always returned None even when the retry
            # succeeded. Capture it so the caller gets the retried point.
            point = geocode(address, attempt=attempt + 1)
        except GeopyError as e:
            warning('Error geocoding %s (%s)', address['address'], e)
    return point
def coordinates(address):
    """We leverage GoogleV3 to geocode your specified location. Your
    location is used to determine the correct civil twilight, which
    helps with camera configuration."""
    geocoded = GoogleV3().geocode(address)
    return (geocoded.latitude, geocoded.longitude)
def save(self, *args, **kwargs):
    """Fill in lat_long_points from the zipcode (Google geocoding) if it
    is not already set, then delegate to the normal model save."""
    if not self.lat_long_points:
        match = GoogleV3().geocode(self.zipcode.encode('utf-8'))
        if match:
            # Point expects (longitude, latitude) ordering.
            self.lat_long_points = Point((match.longitude, match.latitude,))
    return super(Instructors, self).save(*args, **kwargs)
def obtain_localisation(location):
    """Geocode an address assembled from the parts in *location* (an array
    of strings) and return (latitude, longitude), or (nan, nan) when the
    geocoder finds nothing. Requests go through a RateLimiter."""
    full_address = ", ".join(location.tolist())
    client = GoogleV3(user_agent="estate", api_key=config["API_KEY"])
    rate_limited_geocode = RateLimiter(client.geocode)
    match = rate_limited_geocode(full_address)
    if not match:
        return np.nan, np.nan
    return match.latitude, match.longitude
def _collect_label_meta(self):
    # Derive a human-readable name and a size for every location-cluster
    # label by reverse-geocoding a small random sample of each cluster's
    # points and keeping the most common place name.
    # NOTE(review): Python 2 code (iterkeys/iteritems, print statement).
    # Runs only when data, locations, a GOOGLE API key and no cached
    # metadata are all present; otherwise just reports failure.
    if (self._data and not (self.locations is None) and os.getenv('GOOGLE')
            and not self._label_meta):
        # Group locations into lists by label (label -1 means "unclustered"
        # and is skipped).
        labels = {}
        for i, val in enumerate(self._data):
            if val.location.label == -1:
                continue
            labels.setdefault(int(val.location.label), list()).append(val.location)
        # Randomly select 1-5 locations / label.
        # NOTE(review): randrange sampling is with replacement, so the same
        # point may be picked twice — presumably acceptable; confirm.
        sample_labels = {}
        for key in labels.iterkeys():
            limit = min(5, len(labels[key]))
            indices = [
                randrange(0, len(labels[key])) for i in range(limit)
            ]
            sample_labels[key] = [labels[key][j] for j in indices]
        # Get City/Place names for each label's locations via Google
        # reverse geocoding (one request per sampled point).
        geocoder = GoogleV3(api_key=os.getenv('GOOGLE'))
        name_lists = {}
        # i = 0
        for key in sample_labels.iterkeys():
            for location in sample_labels[key]:
                name_lists.setdefault(key, list()).append(
                    SocialExplorer._reverse_geocode(
                        geocoder, 'PlaceName',
                        location.latitude,
                        location.longitude))
        # Associate the label with the most common name (simple running
        # max over a frequency dict; first name wins ties).
        label_names = {}
        for key in labels.iterkeys():
            place_counts = {}
            max_count = 0
            max_name = None
            for name in name_lists[key]:
                place_counts[name] = place_counts.setdefault(name, 0) + 1
                if place_counts[name] > max_count:
                    max_count = place_counts[name]
                    max_name = name
            label_names[key] = max_name
        # Cache the result: {label: {"name": ..., "count": cluster size}}.
        self._label_meta = {}
        for key, val in label_names.iteritems():
            self._label_meta[key] = {
                "name": val,
                "count": len(labels[key])
            }
    else:
        print 'Cannot get label names'
def geocode_all(addresses):
    """Yield (name, address, latitude, longitude) for each geocodable address.

    *addresses* is an iterable of (name, address) pairs. On quota errors
    the request is retried with exponential backoff (0.1s .. 6.4s); any
    other failure is reported and the address skipped.
    """
    geo = GoogleV3()
    for name, address in addresses:
        for t in [.1, .2, .4, .8, 1.6, 3.2, 6.4]:
            try:
                loc = geo.geocode(address.encode('ascii', 'ignore'))
                yield (name, address, loc.latitude, loc.longitude)
                break
            except GeocoderQuotaExceeded:
                # Over quota: wait and retry with a longer delay.
                sleep(t)
            except Exception:
                # FIX: was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit. Narrowed to Exception;
                # a failed lookup (including a None result) skips the entry.
                print('Couldn\'t find %s - %s' % (name, repr(address)))
                break
def search_street(street, lang='en'):
    """Geocode *street*, trying the free Nominatim service first and
    falling back to GoogleV3 only when Nominatim raises.

    Returns the geopy location, or None if both attempts fail (best-effort
    by design — callers must handle None).
    """
    location = None
    try:
        geolocator = Nominatim()
        location = geolocator.geocode(street, language=lang)
    except Exception:
        # FIX: was a bare `except:` (swallowed KeyboardInterrupt too).
        try:
            geolocator = GoogleV3(api_key=config.GOOGLE_API_KEY)
            location = geolocator.geocode(street, language=lang)
        except Exception:
            # Deliberate best-effort: both geocoders failed, return None.
            pass
    return location
def get(self, request, *args, **kwargs):
    """Geocode the 'location' query parameter and return nearby
    instructors plus the resolved origin coordinates."""
    params = request.GET
    searched_address = params.get('location')
    search_distance = params.get('distance')
    geocoded = GoogleV3().geocode(searched_address.encode('utf-8'), timeout=10)
    nearby = Instructors.objects.get_near_instructors(
        geocoded.latitude, geocoded.longitude, distance=search_distance)
    data = {
        'instructors': [self.prepare_response_data(one) for one in nearby],
        'origin': {'lat': geocoded.latitude, 'lon': geocoded.longitude},
    }
    return Response(data)
def create_rightmove_house(item):
    """Build an Accommodation from a Rightmove listing card.

    Parses the page URL, monthly price, bedroom count, address (geocoded
    via GoogleV3) and furnished state out of the parsed HTML *item*.
    """
    page_url = item.find("meta", property="og:url")
    page_url = page_url["content"]
    # Monthly rent, e.g. "£1,000 pcm". Left as None when absent.
    price = item.find(id="propertyHeaderPrice")
    if price is not None:
        price = price.text
        price = re.search('£(.*) pcm', price)  # Get price per month
        price = price.group(1)
        price = int(price)
    # "N bedroom ..." -> N
    bedrooms = item.find(string=re.compile("bedroom"))
    bedrooms = str(bedrooms.string)
    print(page_url)
    print(bedrooms)
    bedrooms = bedrooms.rsplit("bedroom", 1)[0]
    bedrooms = bedrooms.strip()
    bedrooms = int(bedrooms)
    address = item.find("address", class_="pad-0 fs-16 grid-25")
    address_string = str(address.string)
    geolocator = GoogleV3()
    address_string = geolocator.geocode(address_string)
    if address_string is None:
        address = ""
        latitude = ""
        longitude = ""
    else:
        address = address_string.address
        latitude = address_string.latitude
        # BUG FIX: longitude was previously copied from .latitude,
        # producing points on the lat==lon diagonal.
        longitude = address_string.longitude
    furnished_type = item.find(id="furnishedType")
    furnished_string = str(furnished_type.string)
    is_furnished = 0
    if furnished_string == "Furnished":
        is_furnished = 1
    house = accommodation.Accommodation(price, bedrooms, "UNSURE", address,
                                        is_furnished, page_url)
    house.lat = latitude
    house.long = longitude
    return house
def location(self):
    """
    Get the location, but as a geopy location object

    Returns
    -------
    Location
    """
    # A string means "look it up with Google"; an iterable means an
    # explicit (latitude, longitude) pair; anything else is rejected.
    if isinstance(self._location, str):
        return GoogleV3().geocode(self._location)
    if hasattr(self._location, '__iter__'):
        lat, lon = self._location
        point = Point(latitude=lat, longitude=lon)
        return Location(point=point)
    raise ValueError('Invalid location')
def tz(self):
    """
    Get the local timezone of the requested location

    Returns
    -------
    pytz.timezone
    """
    # Priority: explicit override, then timezone embedded in existing
    # forecasts, then a Google timezone lookup for the location.
    if self._tz is not None:
        zone_name = self._tz
    elif self._forecasts:
        zone_name = self._lookup_timezone()
    else:
        lat, lon, _alt = self.location.point
        zone_name = GoogleV3().timezone(location=(lat, lon)).zone
    # Always hand back a pytz object regardless of the source.
    return pytz.timezone(zone_name)
def save(self, *args, **kwargs):
    """Synchronise the profile's Location set with the comma-separated
    'locations' form field.

    New names are created (and geocoded once via GoogleV3 on first
    creation); names no longer listed are detached from the profile.
    """
    my_locations = [l.name for l in self.profile.locations.all()]
    locations = self.cleaned_data.get('locations').split(',')
    should_delete = [x for x in my_locations if x not in locations]
    should_add = [x for x in locations if x not in my_locations]
    geolocator = None  # built lazily: only needed when a Location is new
    for name in should_add:
        if name:
            loc, created = Location.objects.get_or_create(name=name.lower())
            if created:
                if not geolocator:
                    geolocator = GoogleV3()
                # ROBUSTNESS FIX: geocode() returns None for unresolvable
                # names; unpacking that raised TypeError and aborted the
                # whole save. Now the Location is simply left uncoded.
                result = geolocator.geocode(name)
                if result:
                    address, (la, lo) = result
                    loc.latitude = la
                    loc.longitude = lo
            loc.profiles.add(self.profile)
            loc.save()
    for name in should_delete:
        if name:
            loc, _ = Location.objects.get_or_create(name=name.lower())
            self.profile.locations.remove(loc)
    self.profile.save()
print("RESULT 1:", ins.getsource(object)) # source code print("RESULT 2:", ins.getmodule(object)) # module in which defined print("RESULT 3:", ins.currentframe().f_lineno) # own line number lines, lnum = ins.getsourcelines(object) print("RESULT 4:", ''.join(lines)) # %% Join / concatenate a list or tuple of strings together ''.join(['a', 'b', 'c']) ''.join(('a', 'b', 'c')) # %% # conda install -c conda-forge geopy from geopy import GoogleV3 place = "221b Baker Street, London" key = 'enter API key' # From Google Cloud location = GoogleV3(api_key=key).geocode(place) print(location.address) print(location.point) location.raw # %% Look inside python object x = {'a': 1, 'b': 2} dir(x) dir(dir) # %% Import features from future versions of python from __future__ import print_function print("Hello World!") # %% any, all and not x = [True, True, False]
# Reads Wine mag csv and appends geocoded lat/lng coords ######################################################## import http.client import json import time import sys import collections import csv from geopy import GoogleV3 from geopy.exc import GeopyError csv_name = 'winemag-data-185k-03272019.csv' api_key = str(sys.argv[1]) geocoder = GoogleV3(api_key=api_key) request_count = 0 request_limit = 100000 country_index = 5 province_index = 8 region_index = 6 header_row = [] rows = [] location_cache = {} def read_csv(): global header_row with open(csv_name, encoding='utf8') as csv_file:
# Geocode every address in a text file (one per line) and write a
# locations.csv with Address/Latitude/Longitude columns.
# Usage: script.py TARGET_FOLDER ADDRESS_TEXT_PATH
import csv
from geopy import GoogleV3
from invisibleroads_macros.disk import make_folder
from os.path import join
from sys import argv

target_folder, address_text_path = argv[1:]
geocode = GoogleV3().geocode
location_table_path = join(make_folder(target_folder), 'locations.csv')
# FIX: both the output CSV and the input text file were opened without
# ever being closed; context managers guarantee flush/close even on error.
with open(location_table_path, 'w') as location_table_file, \
        open(address_text_path) as address_text_file:
    csv_writer = csv.writer(location_table_file)
    csv_writer.writerow(['Address', 'Latitude', 'Longitude'])
    for address in address_text_file:
        location = geocode(address)
        csv_writer.writerow([
            address.strip(), location.latitude, location.longitude])
print('location_table_path = ' + location_table_path)
def update_lat(max_row):
    # Backfill lat/long for up to *max_row* NSW residential/house-land ads:
    # first try the local tbl_address_lat cache (saved_address_num), then a
    # suburb-level fallback (state + postcode), and only then spend a Google
    # geocoding request — capped at max_query_number per day.
    # NOTE(review): Python 2 code (print statements).
    # NOTE(review): the Google API key is hard-coded below — should come
    # from configuration/secrets.
    max_query_number = 2450
    # How many Google requests were already made today (cached rows carry
    # their create_date).
    with sqlite3.connect(db) as conn:
        cur = conn.cursor()
        cur.execute(
            "SELECT count(*) FROM tbl_address_lat WHERE create_date > date('now')"
        )
        today_query_number = cur.fetchone()[0]
    with sqlite3.connect(db) as conn:
        cur = conn.cursor()
        cur_update = conn.cursor()
        cur.execute(
            "SELECT address, id, state, postcode FROM tbl_property_ad "
            # "WHERE state = 'NSW' AND type = 'residential'")
            "WHERE lat is NULL and state = 'NSW' AND (type = 'residential' or type = 'house land package') LIMIT ?",
            (max_row, ))
        # rs = cur.fetchall()
        # it = iter(rs)
        # property_ = it.next()
        retry_num = 0
        while True:
            property_ = cur.fetchone()
            if not property_:
                break
            try:
                address_text = property_[0]
                property_id = property_[1]
                state = property_[2]
                postcode = property_[3]
                normalized_address = ""
                lat_ = None
                lng_ = None
                if address_text:
                    ADDRESS_TEXT = address_text.strip().upper()
                    # Cache lookup: returns the stored raw Google response.
                    dict_geo = saved_address_num(ADDRESS_TEXT)
                    if isinstance(dict_geo, dict):
                        try:
                            lat_ = dict_geo['geometry']['location']['lat']
                            lng_ = dict_geo['geometry']['location']['lng']
                            normalized_address = dict_geo['formatted_address']
                        except KeyError:
                            # empty dict_geo, so go for suburb location
                            lat_ = None
                            lng_ = None
                            # set saved address to suburb
                            ADDRESS_TEXT = state + " " + str(postcode)
                            normalized_address = ADDRESS_TEXT
                            dict_geo = saved_address_num(ADDRESS_TEXT)
                            if isinstance(dict_geo, dict):
                                try:
                                    lat_ = dict_geo['geometry']['location'][
                                        'lat']
                                    lng_ = dict_geo['geometry']['location'][
                                        'lng']
                                except KeyError:
                                    lat_ = None
                                    lng_ = None
                    if not lat_:
                        # if not found in db, go for geopy (one live Google
                        # request, throttled and counted against the cap)
                        if today_query_number < max_query_number:
                            geo = GoogleV3(
                                api_key=
                                "AIzaSyALRQvXf8IwBIU6HI8btqv4TtSMarfm-98",
                                timeout=20)
                            location = geo.geocode(ADDRESS_TEXT)
                            time.sleep(0.2)
                            today_query_number += 1
                            now_ = str(datetime.now())
                            print "Geocode quotation:", today_query_number, ":", address_text
                            if location:
                                lat_ = location.latitude
                                lng_ = location.longitude
                                if not normalized_address:
                                    normalized_address = location.address
                                # Cache the hit, raw response included.
                                cur_update.execute(
                                    "INSERT INTO tbl_address_lat (address_text, lat, long, api_string, create_date) "
                                    "VALUES (?, ?, ?, ?, ?)",
                                    (ADDRESS_TEXT, lat_, lng_,
                                     json.dumps(location.raw), now_))
                                conn.commit()
                            else:
                                # Cache the miss too, so it is not retried;
                                # 0/0 marks "looked up, not found" on the ad.
                                cur_update.execute(
                                    "INSERT INTO tbl_address_lat (address_text, lat, long, api_string, create_date) "
                                    "VALUES (?, ?, ?, ?, ?)",
                                    (ADDRESS_TEXT, None, None, None, now_))
                                conn.commit()
                                lat_ = 0
                                lng_ = 0
                        else:
                            # Daily quota exhausted: skip this ad for today.
                            continue
                # update back to tbl_property_ad
                # NOTE(review): the two string fragments concatenate to
                # "... address_normalized = ?WHERE id = ?" (no space before
                # WHERE) — presumably tolerated by SQLite's tokenizer, but
                # worth confirming/fixing upstream.
                cur_update.execute(
                    "UPDATE tbl_property_ad SET lat = ?, long = ?, address_normalized = ?"
                    "WHERE id = ?",
                    (lat_, lng_, normalized_address, property_id))
                conn.commit()
            except Exception as err:
                # Back off on repeated failures; give up sooner when the
                # quota is nearly spent (likely quota errors, not transient).
                print err
                retry_num += 1
                if ((retry_num > 5) and
                    (today_query_number > max_query_number - 200)) or (
                        retry_num > 15):
                    print input("Enter to exit:")
                    break
                time.sleep(60)
            else:
                retry_num = 0
'postcode': '10132' }, { 'country': 'China', 'city': 'Shenzhen', 'postcode': '518012' }, { 'country': 'China', 'city': 'Fuzhou', 'postcode': '350022' }, { 'country': 'Russia', 'city': 'Moscow', 'postcode': '105122' }] geo_locator = GoogleV3(api_key=GOOGLE_API_KEY) for location in postcodes: geo_location = geo_locator.geocode( components={ 'country': location.get('country'), 'locality': location.get('city'), 'postal_code': location.get('postcode') }) if geo_location: #print(geo_location.raw) print('https://www.google.com/maps/place/?q=place_id:{}'.format( geo_location.raw.get('place_id', ))) else: print('Location {country}, {city}, {postcode} not found'.format( country=location.get('country'), city=location.get('city'),
# Author/source: Koosha Golmohammadi # Convert a list of addresses from an input file to geocodes import sys from geopy import GoogleV3 geolocator = GoogleV3() input_file = open(sys.argv[1],'r') output_file = open(sys.argv[2],'w') for line in input_file: print line address, (latitude, longitude) = geolocator.geocode(line, timeout=10) output_line = '%s,%s,%s\n' % (address, latitude, longitude) output_file.write(output_line) input_file.close() output_file.close() #return 0
#address is always on the next line address = lines[index + 1] #some address (below) is on the html of every single page and it obviously does not belong (Jamaica) if address == " 14692 Guy R Brewer Blvd<br>Jamaica, NY 11434": pass else: address = address.replace(" ", "") address = address.replace("<br>", ", ") addresses.append(address) haveNameLFaddress = False #found address for previous name of supermarket print(address) #Map all addresses coords = [] NoneType = [] geolocator = GoogleV3() # Nominatim() for i in addresses: try: try: location = geolocator.geocode(i) if type(location) != type(None): coords.append([location.latitude, location.longitude]) print("Valid count: %d, address: %s" % (len(coords), i)) time.sleep(1) #Improper formatting of addresses cannot be mapped (thanks Yelp) else: NoneType.append(i) print("Invalid count: %d, address: %s" % (len(NoneType), i)) time.sleep(1) except GeocoderTimedOut as e:
def __init__(self):
    """Build the Google geocoder from the key stored in secrets."""
    google_key = secrets['api-key']['google']
    self.geocoder = GoogleV3(api_key=google_key)
def get_zoopla_houses(location, bedrooms, price, bills_inc):
    """Query the Zoopla listings API and return a list of Accommodation.

    New listings are geocoded (for a nicer address), stored in houses.db
    and returned; listings already in the database are rebuilt from the
    stored row instead.

    Parameters
    ----------
    location : area to search.
    bedrooms : exact bedroom count (min == max).
    price : maximum monthly price; Zoopla filters per-week, so converted.
    bills_inc : bills-included flag stored on each house.
    """
    house_list = []
    monthly_price = int(price)
    # Zoopla's maximum_price is per-week: 12 months spread over 52 weeks.
    weekly_price = int((monthly_price * 12) / 52)
    parameters = {
        'area': location,
        'radius': 5,
        'listing_status': 'rent',
        'maximum_price': weekly_price,
        'minimum_beds': bedrooms,
        'maximum_beds': bedrooms,
        # NOTE(review): API key hard-coded — should live in configuration.
        'api_key': 'zwqrekb5d6zawqmxud9bnpte'
    }
    r = requests.get('http://api.zoopla.co.uk/api/v1/property_listings.js',
                     params=parameters)
    result = r.json()
    for item in result['listing']:
        print(item['details_url'])
        conn = sqlite3.connect("houses.db")
        c = conn.cursor()
        house_url = item['details_url']
        house_bedrooms = int(item['num_bedrooms'])
        house_price = item['rental_prices']['per_month']
        house_bills = bills_inc
        house_lat = item['latitude']
        house_long = item['longitude']
        geolocator = GoogleV3()
        house_location = geolocator.geocode(item['displayable_address'])
        if house_location is None:
            house = accommodation.Accommodation(house_price, house_bedrooms,
                                                house_bills,
                                                item['displayable_address'],
                                                "UNSURE", house_url)
        else:
            house = accommodation.Accommodation(house_price, house_bedrooms,
                                                house_bills,
                                                house_location.address,
                                                "UNSURE", house_url)
        house.lat = house_lat
        house.long = house_long
        c.execute('''SELECT * FROM accommodations WHERE url=?''',
                  (house_url, ))
        row = c.fetchone()
        if row is None:  # Check if house is not already in database
            database.add_house_to_db(house)
            house_list.append(house)
        else:
            # Rebuild from stored columns:
            # 0=url 1=price 2=bedrooms 3=bills 4=lat 5=long 6=address 7=furnished
            house = accommodation.Accommodation(row[1], row[2], row[3],
                                                row[6], row[7], row[0])
            house.lat = row[4]
            # BUG FIX: longitude was read from column 4 (the latitude
            # column); column 5 holds the stored longitude.
            house.long = row[5]
            house_list.append(house)
    return house_list
def create_app(
        schema_path='schemas/',
        rest_subdir='indexes',
        template_subdir='templates',
):
    # type: (str, str, str) -> flask.app.Flask
    """Application factory: wire up Flask with Elasticsearch, the SQL
    database, a Google geocoder, OAuth, and Swagger-schema-driven
    endpoints, then return the configured app.

    Parameters mirror the on-disk layout: *schema_path* is the schema
    root, *rest_subdir* holds the REST'ish index schemas, and
    *template_subdir* (under rest_subdir) holds ES index templates —
    templates are loaded before indexes on purpose.
    """
    logging.basicConfig(level=logging.DEBUG)
    app = Flask(__name__)
    # Configure
    configure_app(app)

    @app.errorhandler(Exception)
    def jsonify_exceptions(error):
        # type: (Exception) -> flask.app.Response
        # Catch-all boundary: log, then answer with a JSON 500.
        # TODO put in sentry/rollbar/airbrake
        app.logger.exception('Unhandled error: %s', error)
        try:
            # ES exception — Elasticsearch errors carry .info with a reason.
            problem = error.info['error']['reason']
        except AttributeError:
            # NOTE(review): `.message` is a Python-2-era attribute; most
            # Python 3 exceptions do not define it — confirm intended.
            problem = error.message
        return views.error_response(500, problem)

    # Connect to Elasticsearch
    es = configure_elasticsearch(app)
    app.cluster = ClusterClient(es)
    app.datastore = DataStore(es)
    # Connect to Database (imported here to avoid a circular import at
    # module load time)
    from .database import db
    db.init_app(app)
    app.db = db
    with app.app_context():
        # HSTORE must exist before create_all builds tables that use it.
        db.engine.execute('CREATE EXTENSION IF NOT EXISTS HSTORE')
        db.create_all()
    # Create Geocoder — exactly_one=False so lookups return every match.
    app.geocode = partial(GoogleV3().geocode, exactly_one=False)
    # Setup auth
    app.register_blueprint(auth)
    app.before_first_request(create_oauth_flow)
    app.before_request(authenticate_user)
    # Setup URL rules
    configure_endpoints(app)
    # Load schemas
    app.schemastore = SwaggerSchemaStore()
    schema_dir = Path(schema_path)
    rest_dir = schema_dir.joinpath(rest_subdir)
    template_dir = schema_dir.joinpath(rest_subdir, template_subdir)
    # Non-REST endpoints
    for json_file in schema_dir.glob('*.json'):
        _add_schema(app, json_file)
    # Load templates before configuring indexes
    for json_file in template_dir.glob('*.json'):
        _add_template(app, json_file)
    # REST'ish indexes — schemas here are security-enforced.
    for json_file in rest_dir.glob('*.json'):
        index, swagger_spec = _add_schema(app, json_file,
                                          force_security=True)
        configure_index(
            index, swagger_spec,
            app.config['ELASTICSEARCH_NON_RESETTABLE_INDEX_SETTINGS'], es)
        configure_mappings(index, swagger_spec, es)
    _list_routes(app)
    app.logger.info('RelES reporting for duty...')
    return app
# Quick manual check: geocode 'Eraclea' with Italian-language results.
from geopy import GoogleV3

import config

google_geocoder = GoogleV3(api_key=config.GOOGLE_API_KEY)
result = google_geocoder.geocode('Eraclea', language='it')
print(result)
# Minimal GoogleV3 geocoding example.
from geopy import GoogleV3

place = "221b Baker Street, London"
location = GoogleV3().geocode(place)
print(location.address)
# BUG FIX: geopy's Location has no `.location` attribute (AttributeError);
# the coordinates live in `.point` (latitude, longitude, altitude).
print(location.point)
# There’s also a useful distance class. It calculates the distance between two locations in your favorite unit of measurement.
def __init__(self):
    """Create the Google geocoder client from the key in Django settings."""
    api_key = settings.GOOGLE_API_KEY
    self.geo_locator = GoogleV3(api_key=api_key)
# seperating city and state from address to do further fact-based quantitative analysis
address = fullTableDf["Address"].str.split(", ", n = 1, expand = True)
fullTableDf["City"]= address[0]
fullTableDf["State"]= address[1]
fullTableDf = fullTableDf.loc[:, ["Symbol", "Address", 'City', 'State', 'DateAdded', 'Sector']]

# GENERATING THE MAP OF OUR HEADQUARTERS
# creating a smaller dataframe here because the complete dataframe is huge and my API times out before iterating over all entries
# 150 rows are chosen fom the beginning to plot on the map
# results will be similar on the complete dataset if we upgrade our API
# FIX: .copy() — assigning new columns to a bare .iloc slice mutates a view
# of fullTableDf and triggers pandas' SettingWithCopyWarning.
smallDf = fullTableDf.iloc[:150].copy()
print("\nHead of the smaller dataframe for plotting the map: ")
print(smallDf.head())
# NOTE(review): geocoding API key hard-coded — move to configuration.
geocode = GoogleV3('AIzaSyDNqc0tWzXHx_wIp1w75-XTcCk4BSphB5w').geocode
addresses = smallDf['Address'].tolist()
# generating latitudes and longitudes of all the HQ addresses in the dataset
print("\n\nPlease wait... generating coordinates for our map! It takes some time.\n")
latitudes = []
longitudes = []
for address in addresses:
    x = geocode(address)
    latitudes.append(x.latitude)
    longitudes.append(x.longitude)
smallDf['Latitude'] = pd.DataFrame({'Latitude': latitudes})
smallDf['Longitude'] = pd.DataFrame({'Longitude': longitudes})
print("\nColumns in the small dataframe after adding the coordinates: ")
print(smallDf.columns)
def demo():
    """End-to-end pipeline demo: fetch COVID-related tweets, enrich them
    with geocoded locations, train/compare two text classifiers, predict
    the 'denier' label on a test set, exercise the filter/sort/group
    helpers, and render per-country/continent visualisations."""
    ##########################
    # 1. GET NEW DATASET     #
    # 2. ADD LOCATIONS       #
    # 3. TRAIN CLASSIFIERS   #
    # 4. MAKE PREDICTIONS    #
    # 5. FILTER, SORT, GROUP #
    # 6. VISUALIZE           #
    ##########################
    print()
    ######################
    # 1. GET NEW DATASET #
    ######################
    print('\n1. GET NEW DATASET')
    # read Twitter tokens
    consumer_key, consumer_secret, access_token, access_token_secret = read_twitter_tokens('tokens/twitter_tokens.txt')
    # connect with the Twitter API
    twitter_api: tweepy.API = connect_to_twitter_api(consumer_key, consumer_secret, access_token, access_token_secret)
    # define keywords (larger candidate lists kept below for reference)
    # COVID_KEYWORDS: List[str] = [
    #     'corona', 'covid', 'quaranteen', 'home', 'stay', 'inside', 'virology', 'doctor', 'nurse', 'virus', 'grandma',
    #     'vaccin', 'sars', 'alone', 'strongtogether', 'elbow', 'mouth mask', 'protective equipment', 'hospitalization',
    #     'increas', 'death', 'dead', 'impact', 'ICU', 'intensive care', 'applause', 'stay healthy', 'take care', 'risk',
    #     'risk group', 'environment',
    #     'U+1F637',  # Medical Mask Emoji
    #     'U+1F691',  # Amublance Emoji
    #     'U+1F92E',  # Vomiting Emoji
    #     'U+1F912',  # Thermometer Emoji
    # ]
    # COVID_FAKE_KEYWORDS: List[str] = [
    #     'coronascam', 'fakecorona', 'fake', 'coronahoax', 'hoaxcorona', 'gooutside', 'donotstayhome''fuckvirology',
    #     'donttrustvirologists', 'coronadoesntexist', 'chinesevirushoax',
    # ]
    keywords: Dict[str, int] = {
        'covid': 100,  # get 100 tweets with 'covid' in it
        'corona': 100,  # get 100 tweet with 'corona' in it
        'coronahoax': 100,  # get tweets 100 with 'coronahoax' in it
    }
    # get new dataset
    new_dataset: List[Tweet] = get_new_tweets(twitter_api, keywords)
    print(f'First tweet:\n{new_dataset[0]}')
    # save new dataset
    save_tweets(new_dataset, 'tweets/new_dataset.pickle')
    ####################
    # 2. ADD LOCATIONS #
    ####################
    print('\n2. ADD LOCATION TO THOSE TWEETS')
    # read Google token
    geocoding_api_key: str = read_google_token('tokens/google_token.txt')
    # initialize Google API
    google_api: GoogleV3 = GoogleV3(api_key=geocoding_api_key)
    # add location to tweets when possible; the before/after counters
    # measure how many tweets gained a usable location
    num_tweets_with_location_before: int = 0
    num_tweets_with_location_after: int = 0
    for tweet in new_dataset:
        if tweet.country_code is not None and tweet.continent is not None:
            num_tweets_with_location_before += 1
        tweet.add_location(google_api)
        if tweet.country_code is not None and tweet.continent is not None:
            num_tweets_with_location_after += 1
    print(f'Number of tweets with location before: {num_tweets_with_location_before}')
    print(f'Number of tweets with location after: {num_tweets_with_location_after}')
    # save new dataset with locations included
    save_tweets(new_dataset, 'tweets/new_dataset.pickle')
    ########################
    # 3. TRAIN CLASSIFIERS #
    ########################
    print('\n3. TRAIN CLASSIFIERS')
    # load train dataset
    train_dataset = load_tweets('tweets/train_dataset.pickle')
    # pre-process train dataset
    X: List[str] = [tweet.text for tweet in train_dataset]
    X: List[str] = preprocess_corpus(X)
    labels: List[bool] = [tweet.denier for tweet in train_dataset]
    # train on part of the data: train/validation split
    X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2)
    # vectorize (bag-of-words counts)
    vectorizer: CountVectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    # create Complement Naive Bayes classifier
    naive_bayes_classifier = ComplementNB()
    # train Complement Naive Bayes classifier
    naive_bayes_classifier = naive_bayes_classifier.fit(X_train, y_train)
    # validate Complement Naive Bayes classifier
    naive_bayes_accuracy: float = naive_bayes_classifier.score(X_test, y_test)
    print(f'Naive Bayes accuracy:\t{naive_bayes_accuracy * 100:>3.2f}%')
    # save Naive Bayes classifier
    save_model(naive_bayes_classifier, 'models/naive_bayes.pickle')
    # create Decision Tree classifier
    decision_tree_classifier = DecisionTreeClassifier()
    # train Decision Tree classifier
    decision_tree_classifier = decision_tree_classifier.fit(X_train, y_train)
    # validate Decision Tree classifier
    decision_tree_accuracy: float = decision_tree_classifier.score(X_test, y_test)
    print(f'Decision Tree accuracy:\t{decision_tree_accuracy * 100:>3.2f}%')
    # save Decision Tree classifier
    save_model(decision_tree_classifier, 'models/decision_tree.pickle')
    # retrain best model on all of the data
    # vectorize — NOTE(review): annotated List[str] but fit_transform
    # returns a sparse matrix; annotation kept as-is.
    vectorizer: CountVectorizer = CountVectorizer()
    X: List[str] = vectorizer.fit_transform(X)
    best_model = ComplementNB().fit(X, labels) \
        if naive_bayes_accuracy >= decision_tree_accuracy \
        else DecisionTreeClassifier().fit(X, labels)
    # save best mode
    save_model(best_model, 'models/best_model.pickle')
    #######################
    # 4. MAKE PREDICTIONS #
    #######################
    print('\n4. USE CLASSIFIERS')
    # load test dataset
    test_dataset = load_tweets('tweets/test_dataset.pickle')
    # pre-processing
    X: List[str] = [tweet.text for tweet in test_dataset]
    X: List[str] = preprocess_corpus(X)
    # vectorize with the vocabulary fitted above
    X = vectorizer.transform(X)
    # make predictions
    y = best_model.predict(X)
    # add predictions to tweet
    for tweet, label in zip(test_dataset, y):
        tweet.denier = label
    ##########################
    # 5. FILTER, SORT, GROUP #
    ##########################
    print('\n5. USE VARIOUS FILTERS')
    # use filters (each helper returns a view of test_dataset; results are
    # computed here purely to demonstrate the API)
    tweets_filtered_by_hashtag: List[Tweet] = filter_by_hashtag(test_dataset, '#coronahoax')
    tweets_filtered_by_hashtags_all: List[Tweet] = filter_by_hashtags_all(test_dataset, ['#corona', '#coronahoax'])
    tweets_filtered_by_hashtags_any: List[Tweet] = filter_by_hashtags_any(test_dataset, ['#corona', '#coronahoax', '#coronavirus', '#covid19'])
    tweets_filtered_before: List[Tweet] = filter_before(test_dataset, datetime(2020, 4, 19, 18, 58, 46))
    tweets_filtered_at: List[Tweet] = filter_at(test_dataset, datetime(2020, 4, 19, 18, 58, 46))
    tweets_filtered_after: List[Tweet] = filter_after(test_dataset, datetime(2020, 4, 19, 18, 58, 46))
    tweets_filtered_between: List[Tweet] = filter_between(test_dataset, datetime(2020, 4, 19, 18, 0, 0), datetime(2020, 4, 19, 19, 0, 0))
    tweets_filtered_by_country_code: List[Tweet] = filter_by_country_code(test_dataset, 'US')
    tweets_filtered_by_country_codes: List[Tweet] = filter_by_country_codes(test_dataset, ['US', 'GB'])
    tweets_filtered_by_continent: List[Tweet] = filter_by_continent(test_dataset, 'Europe')
    tweets_filtered_by_continents: List[Tweet] = filter_by_continents(test_dataset, ['Europe', 'North America'])
    tweets_sorted_by_date_ascending: List[Tweet] = sort_by_date_ascending(test_dataset)
    tweets_sorted_by_date_descending: List[Tweet] = sort_by_date_descending(test_dataset)
    tweets_grouped_by_country_code: defaultdict = group_by_country_code(test_dataset)
    tweets_grouped_by_continent: defaultdict = group_by_continent(test_dataset)
    ################
    # 6. VISUALIZE #
    ################
    print('\n6. VISUALIZE')
    # continents: display name -> identifier used by the plotting backend
    CONTINENTS: Dict[str, str] = {
        'Asia': 'asia',
        'Europe': 'europe',
        'Africa': 'africa',
        'North America': 'north_america',
        'South America': 'south_america',
        'Oceania': 'oceania',
        'Antarctica': 'antartica',
    }
    # create series to plot (absolute tweet counts keyed by map region)
    num_tweets_per_country_per_continent_absolute = defaultdict(lambda: defaultdict(int))
    num_tweets_per_country_absolute = defaultdict(lambda: defaultdict(int))
    num_tweets_per_continent_absolute = defaultdict(lambda: defaultdict(int))
    for tweet in test_dataset:
        if tweet.has_location():
            country_code: str = tweet.country_code.lower()
            continent: str = CONTINENTS[tweet.continent]
            num_tweets_per_country_per_continent_absolute[tweet.continent][country_code] += 1
            num_tweets_per_country_absolute['World'][country_code] += 1
            num_tweets_per_continent_absolute['World'][continent] += 1
    # visualize plots
    title = 'Absolute number of tweets per country and per continent'
    series = num_tweets_per_country_per_continent_absolute
    filename = 'num_tweets_per_country_per_continent_absolute'
    visualize(title, series, filename, per_continent=False)
    title = 'Absolute number of tweets per country'
    series = num_tweets_per_country_absolute
    filename = 'num_tweets_per_country_absolute'
    visualize(title, series, filename, per_continent=False)
    title = 'Absolute number of tweets per continent'
    series = num_tweets_per_continent_absolute
    filename = 'num_tweets_per_continent_absolute'
    visualize(title, series, filename, per_continent=True)
# Begin to determine geocoder scheme print("Attempting to detect a geocoder scheme for your system, this should only take a few moments.") try: import geopy geopy_installed = True except ImportError: print("Failed to import geopy. It's recommended that you install geopy for PyWeather to work.", "Skipping geocoder scheme detection and defaulting to 'http' as the scheme.", sep="\n") geopy_installed = False if geopy_installed is True: # HTTPS validation from geopy import GoogleV3 geocoder = GoogleV3(scheme='https') # I've found that one "warm up request", and then waiting ~15 seconds somehow helps determine if a platform is HTTP/HTTPS compatible. try: geocoder.geocode("123 5th Avenue, New York, NY") except: didthewarmupgeocodefail = "you bet" print("A warmup geocode has just been completed which helps with determining which scheme will work", "on your OS. Waiting 5 seconds before making another geocode requests (to prevent rate limiting).", sep="\n") time.sleep(5) try: geocoder.geocode("123 5th Avenue, New York, NY") print("The geocoder can operate with HTTPS enabled on your OS. Saving these changes...") config['GEOCODER']['scheme'] = 'https' print("Changes saved.")