def test_geocode_pass(self):
    code, (lat, lon, text) = geocoder.geocode('Audubon Park')
    self.assertEqual(code, 'OK')
    code, (lat, lon, text) = geocoder.geocode('Superdome')
    self.assertEqual(code, 'OK')
    code, (lat, lon, text) = geocoder.geocode('643 Magazine Street')
    self.assertEqual(code, 'OK')
def match_phrase(phrase, verbose=False):
    G = None
    # First step: attempt to map nicknames to standard place names
    if phrase in nicknames:
        if verbose:
            print "Resolved nickname \"%s\" as \"%s\"" % (repr(phrase), repr(nicknames[phrase]))
        phrase = nicknames[phrase]
    # Second step: attempt to match mapped string in Nominatim
    try:
        G = geocoder.geocode(phrase, site='osm')
        if G['countrycode']:
            return G
    except ValueError:
        pass
    # Third step: attempt to run query, removing any (parenthesized) expressions
    paren_regex = r"\(.+?\)"
    noparens = re.sub(paren_regex, '', phrase)
    if noparens != phrase:
        if verbose:
            print "* Removing parenthetical expression: \"%s\" -> \"%s\"" % (repr(phrase), repr(noparens))
        try:
            G = geocoder.geocode(noparens, site='osm')
            if G['countrycode']:
                if verbose:
                    print "** Resolved \"%s\" as \"%s\"" % (repr(phrase), repr(noparens))
                return G
        except ValueError:
            pass
    # Fall through: return the last (possibly partial) result, or None if no
    # query succeeded.
    return G
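# Usage sketch for match_phrase above (assumptions for illustration: a
# module-level `nicknames` dict and a `geocoder` module whose
# geocode(..., site='osm') call returns a dict carrying a 'countrycode' key,
# as the function body implies).
nicknames = {'the dome': 'Superdome (New Orleans)'}

result = match_phrase('the dome', verbose=True)
if result is not None:
    print "Matched place with country code:", result['countrycode']
else:
    print "No match found"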
def main():
    p = optparse.OptionParser()
    p.add_option('-x', help='Specify number of random names to generate.',
                 type='int', dest='times', default=None)
    p.add_option('-n', help='Specify filename that contains names.', dest='names')
    p.add_option('-a', help='Specify filename that contains addresses.', dest='addresses')
    (opts, args) = p.parse_args()
    generatenames(opts.times, opts.names, opts.addresses)
    print '=' * 40
    print 'Generating Geocodes.'
    print '=' * 40
    geocode('geocodes.csv')
    print """
def test_geocode_fail(self):
    code, (lat, lon, text) = geocoder.geocode('Audubon Park')
    self.assertEqual(code, 'OK')
    code, (lat, lon, text) = geocoder.geocode('Superdome')
    self.assertEqual(code, 'OK')
    code, (lat, lon, text) = geocoder.geocode('643 Magazine Street')
    self.assertEqual(code, 'OK')
    # not specific
    code, result = geocoder.geocode('New Orleans, LA')
    self.assertNotEqual(code, 'OK')
    self.assertIsNone(result)
    # ambiguous results
    code, result = geocoder.geocode('Starbucks')
    self.assertNotEqual(code, 'OK')
    self.assertIsNone(result)
def add_ad():
    form = AdForm()
    db_sess = db_session.create_session()
    categories = db_sess.query(Categories).all()
    if form.submit.data:
        from random import choice
        # Build a random 100-digit identifier used as the photo name.
        n = ''
        for _ in range(100):
            n += choice('1234567890')
        if geocode(form.address.data):
            ad = Ad(
                photo=n,
                address=form.address.data,
                name=form.name.data,
                category=db_sess.query(Categories).filter(
                    Categories.id == form.category.data).first().title,
                description=form.description.data,
                number=form.number.data,
                user_id=db_sess.query(User).filter(
                    User.login == flask_login.current_user.login).first().id)
        else:
            # "нет такого города" = "no such city"
            return render_template('add_ad.html', form=form, cate=categories,
                                   m="нет такого города")
        db_sess.add(ad)
        db_sess.commit()
        return redirect(f'/add_photo/{ad.id}/{n}')
    return render_template('add_ad.html', form=form, cate=categories)
def trySearch(line, place, woetype):
    woeTypes = [woetype]
    # town, admin3, suburb
    townWoeTypes = ['7', '10', '22']
    if woetype in townWoeTypes:
        woeTypes = townWoeTypes
    try:
        g = geocoder.geocode(place, {
            'woeRestrict': ','.join(woeTypes),
            'allowedSources': 'geonameid'
        })
        if g and g.geonameid() and g.isFull():
            return GeocodeSuccess(u'\t'.join([
                unicode(g.geonameid()),
                unicode(g.woeType()),
                unicode(g.lat()),
                unicode(g.lng()),
                g.matchedName(),
                line.decode('utf-8')
            ]))
        else:
            return GeocodeFailure(line.decode('utf-8'))
    except:
        print 'timeout'
        return GeocodeTimeout(line.decode('utf-8'))
def SHOw(self):
    # "Адрес" = "Address"; the prompt asks the user to enter an address,
    # e.g. "Москва Гурьянова 2" ("Moscow, Guryanova 2").
    adres, ok_pressed = QInputDialog.getText(
        self, "Адрес", 'введите адрес: (Москва Гурьянова 2)')
    if ok_pressed:
        if adres:
            try:
                self.coords_pt = list(get_coordinates(adres))
            except Exception:
                pass
            if self.coords_pt:
                # Look the address up once and reuse the formatted result.
                formatted = geocode(adres)['metaDataProperty'][
                    'GeocoderMetaData']['Address']['formatted']
                self.adress = formatted
                self.adressPt.setText(formatted)
                self.map.setPixmap(self.get_map())
def match_gps(locstr, verbose=False):
    suffix = locstr.split(':')[-1]
    (lat, lon) = [float(x) for x in suffix.strip('( )').split(',')]
    reverse = True
    query = "%s,%s" % (lat, lon)
    if verbose:
        print "Trying to match string \"%s\" as lat/lon..." % repr(query)
    return geocoder.geocode(query, site='osm')
def GET(self):
    if 'mode' not in web.input():
        return render.app(timemode='now', time=datetime.today().strftime("%H:%M"))
    tvars = dict(web.input())
    tvars['error'] = None
    fromplace = getattr(web.input(), 'from')
    toplace = web.input().to
    if not fromplace or not toplace:
        tvars['error'] = 'Please enter an address or landmark for From and To'
        return render.app(**tvars)
    from_result, fromgeo = geocoder.geocode(fromplace)
    if from_result != 'OK':
        tvars['error'] = 'Unable to find address for %s' % fromplace
        return render.app(**tvars)
    tvars['fromgeo'] = fromgeo
    to_result, togeo = geocoder.geocode(toplace)
    if to_result != 'OK':
        tvars['error'] = 'Unable to find address for %s' % toplace
        return render.app(**tvars)
    tvars['togeo'] = togeo
    timemode = web.input().get('timemode')
    if timemode == 'now':
        result = otp.plan(fromgeo[0:2], togeo[0:2], web.input().mode)
    else:
        try:
            time = dateparser.parse_time(web.input().time)
        except ValueError:
            tvars['error'] = "Invalid time format"
            return render.app(**tvars)
        result = otp.plan(fromgeo[0:2], togeo[0:2], web.input().mode, time, timemode)
    if 'plan' in result:
        tvars['result'] = result
    else:
        # no itinerary found - rare but possible
        tvars['error'] = result['error']['msg']
    return render.app(**tvars)
def directions(dirfrom, dirto, mode, at, atmode):
    if not mode:
        mode = 'ANY'
    from_result, from_place = geocode(dirfrom)
    if from_result != 'OK':
        return error_for_geocode(from_result, dirfrom)
    to_result, to_place = geocode(dirto)
    if to_result != 'OK':
        return error_for_geocode(to_result, dirto)
    date = None
    datemode = None
    if at:
        date = dateparser.parse_time(at)
        if atmode == 'arrive':
            datemode = 'arrive'
        else:
            datemode = 'depart'
        print "date chosen %s,mode=%s" % (date, datemode)
    plan = otp.plan(from_place[0:2], to_place[0:2], mode.upper(), date, datemode)
    return plan_instructions(plan)
def set_location(self):
    logging.info("Set Location for Address: %s" % self.address)
    if self.address:
        r = geocoder.geocode(self.address)
        if not r:
            raise ValueError("Cannot find geolocation for address: %s" % self.address)
        pts = r.split(",")
        logging.info("Geocode Lookup Results: %s" % pts)
        # The geocoder response is comma-separated with lat/lng in fields 2
        # and 3, so require at least four fields before indexing.
        if pts and len(pts) >= 4:
            self.location = db.GeoPt(pts[2], pts[3])
            logging.info("Geo Point: %s" % self.location)
            self.update_location()
def trySearchHelper(line, place, woeTypes):
    try:
        g = geocoder.geocode(place, {
            'woeRestrict': ','.join(woeTypes),
            'allowedSources': 'geonameid'
        })
        if g and g.geonameid() and g.isFull():
            return GeocodeSuccess(u'\t'.join([unicode(g.geonameid()),
                                              unicode(g.woeType()),
                                              unicode(g.lat()),
                                              unicode(g.lng()),
                                              g.matchedName(),
                                              line.decode('utf-8')]))
        else:
            return GeocodeFailure(line.decode('utf-8'))
    except:
        traceback.print_exc()
        print 'timeout'
        return GeocodeTimeout(line.decode('utf-8'))
def indPt(self):
    # Toggle showing the postal code after the address.
    self.flag = not self.flag
    try:
        ind = geocode(self.adress)['metaDataProperty']['GeocoderMetaData'][
            'Address']['postal_code']
    except Exception:
        ind = '(Нет почтового индекса)'  # "(No postal code)"
    if self.flag:
        if self.adressPt.text() == self.adress:
            self.adressPt.setText(f'{self.adressPt.text()} {ind}')
    else:
        if self.adressPt.text() == f'{self.adress} {ind}':
            self.adressPt.setText(self.adress)
size = width, height = 650, 500
screen = pygame.display.set_mode(size)
pygame.init()
toponym_to_find = get_toponym()
ll, spn = geocoder.get_ll_span(toponym_to_find)
finded_place = ll + "," + "pmgnm"
q = 0
mapp = button(q)
button2()
button3((255, 0, 0), 't')
u = 0
get_image(ll, spn, mapp, finded_place)
while 1:
    address_to_out = geocoder.geocode(
        toponym_to_find)['metaDataProperty']['GeocoderMetaData']['text']
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            pygame.quit()
            raise SystemExit
        elif event.type == pygame.MOUSEBUTTONDOWN:
            x, y = event.pos
            if y < 50 and x < 300:
                q += 1
                mapp = button(q)
                get_image(ll, spn, mapp, finded_place)
            if y < 50 and x > 300 and x < 600:
                toponym_to_find = get_toponym()
                ll, spn = geocoder.get_ll_span(toponym_to_find)
                finded_place = ll + "," + "pmgnm"
                q = 0
def main():
    args = parser.parse_args()
    config = configparser.ConfigParser()
    config.optionxform = str  # to preserve case
    config.read(args.config_file)
    logging.basicConfig(
        format='%(asctime)s %(filename)s:%(lineno)d %(message)s',
        filename='cleanup.log',
        filemode="w",
        level=logging.INFO)
    sheets = get_GoogleSheets(config)
    for_github = []

    # Load geocoder early so that invalid tsv path errors are caught early on.
    geocoder = csv_geocoder.CSVGeocoder(
        config['GEOCODING'].get('TSV_PATH'),
        arcgis)
    for s in sheets:
        logging.info("Processing sheet %s", s.name)

        ### Clean Private Sheet Entries. ###
        # note: the private sheet gets updated on the fly and redownloaded to
        # ensure continuity between fixes (granted it's slower).
        range_ = f'{s.name}!A:AG'
        data = values2dataframe(s.read_values(range_))

        # Expand aggregated cases into one row each.
        logging.info("Rows before expansion: %d", len(data))
        if len(data) > 150000:
            logging.warning("Sheet %s has more than 150K rows, it should be split soon", s.name)
        data.aggregated_num_cases = pd.to_numeric(data.aggregated_num_cases, errors='coerce')
        data = duplicate_rows_per_column(data, "aggregated_num_cases")
        logging.info("Rows after expansion: %d", len(data))

        # Generate IDs for each row sequentially following the sheet_id-inc_int pattern.
        data['ID'] = s.base_id + "-" + pd.Series(range(1, len(data) + 1)).astype(str)

        # Remove whitespace.
        data = trim_df(data)

        # Fix columns that can be fixed easily.
        data.sex = fix_sex(data.sex)

        # fix N/A => NA
        for col in data.select_dtypes("string"):
            data[col] = fix_na(data[col])

        # Regex fixes
        fixable, non_fixable = generate_error_tables(data)
        if len(fixable) > 0:
            logging.info('fixing %d regexps', len(fixable))
            s.fix_cells(fixable)
            data = values2dataframe(s.read_values(range_))

        # ~ negates, here clean = data with IDs not in non_fixable IDs.
        clean = data[~data.ID.isin(non_fixable.ID)]
        clean = clean.drop('row', axis=1)
        clean = clean.sort_values(by='ID')
        s.data = clean
        non_fixable = non_fixable.sort_values(by='ID')

        # Save error reports. These are separated by sheet.
        logging.info('Saving error reports')
        directory = config['FILES']['ERRORS']
        file_name = f'{s.name}.error-report.csv'
        error_file = os.path.join(directory, file_name)
        non_fixable.to_csv(error_file, index=False, header=True, encoding="utf-8")
        for_github.append(error_file)

    # Combine data from all sheets into a single datafile.
    all_data = []
    for s in sheets:
        logging.info("sheet %s had %d rows", s.name, len(s.data))
        all_data.append(s.data)
    all_data = pd.concat(all_data, ignore_index=True)
    all_data = all_data.sort_values(by='ID')
    logging.info("all_data has %d rows", len(all_data))

    # Fill geo columns.
    geocode_matched = 0
    for i, row in all_data.iterrows():
        geocode = geocoder.geocode(row.city, row.province, row.country)
        if not geocode:
            continue
        geocode_matched += 1
        all_data.at[i, 'latitude'] = geocode.lat
        all_data.at[i, 'longitude'] = geocode.lng
        all_data.at[i, 'geo_resolution'] = geocode.geo_resolution
        all_data.at[i, 'location'] = geocode.location
        all_data.at[i, 'admin3'] = geocode.admin3
        all_data.at[i, 'admin2'] = geocode.admin2
        all_data.at[i, 'admin1'] = geocode.admin1
        all_data.at[i, 'admin_id'] = geocode.admin_id
        all_data.at[i, 'country_new'] = geocode.country_new
    logging.info("Geocode matched %d/%d", geocode_matched, len(all_data))
    logging.info("Top 10 geocode misses: %s", geocoder.misses.most_common(10))
    with open("geocode_misses.csv", "w") as f:
        geocoder.write_misses_to_csv(f)
    logging.info("Wrote all geocode misses to geocode_misses.csv")
    if len(geocoder.new_geocodes) > 0:
        logging.info("Appending new geocodes to geo_admin.tsv")
        with open(config['GEOCODING'].get('TSV_PATH'), "a") as f:
            geocoder.append_new_geocodes_to_init_file(f)
        for_github.append(config['GEOCODING'].get('TSV_PATH'))

    # Reorganize csv columns so that they are in the same order as when we
    # used to have those geolocations within the spreadsheet.
    # This is to avoid breaking latestdata.csv consumers.
    all_data = all_data[["ID", "age", "sex", "city", "province", "country",
                         "latitude", "longitude", "geo_resolution",
                         "date_onset_symptoms", "date_admission_hospital",
                         "date_confirmation", "symptoms", "lives_in_Wuhan",
                         "travel_history_dates", "travel_history_location",
                         "reported_market_exposure", "additional_information",
                         "chronic_disease_binary", "chronic_disease", "source",
                         "sequence_available", "outcome",
                         "date_death_or_discharge", "notes_for_discussion",
                         "location", "admin3", "admin2", "admin1",
                         "country_new", "admin_id", "data_moderator_initials",
                         "travel_history_binary"]]

    # Ensure the new data has at least as many rows as the last one.
    latest_name = os.path.join(config['FILES']['LATEST'], 'latestdata.csv')
    line_diff = len(all_data) - len(pd.read_csv(latest_name))
    if line_diff >= 0:
        logging.info(f"Line check passed, {line_diff} new lines")
    else:
        logging.info("Line check failed")
        return

    # Save.
    logging.info("Saving files to disk")
    dt = datetime.now().strftime('%Y-%m-%dT%H%M%S')
    file_name = config['FILES']['DATA'].replace('TIMESTAMP', dt)
    all_data.to_csv(file_name, index=False, encoding="utf-8")
    all_data.to_csv(latest_name, index=False, encoding="utf-8")
    logging.info("Wrote %s, %s", file_name, latest_name)

    if args.push_to_git:
        logging.info("Pushing to github")
        # Create script for uploading to github.
        for_github.extend([file_name, latest_name])
        script = 'set -e\n'
        script += 'cd {}\n'.format(config['GIT']['REPO'])
        script += 'git pull origin master\n'
        for g in for_github:
            script += f'git add {g}\n'
        script += 'git commit -m "data update"\n'
        script += 'git push origin master\n'
        script += f'cd {os.getcwd()}\n'
        print(script)
        os.system(script)
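# Minimal sketch of the record shape the geocode-filling loop above expects
# from csv_geocoder.CSVGeocoder.geocode(city, province, country). The field
# names mirror the attributes read in the loop; the namedtuple and the sample
# values are assumptions for illustration, not the library's actual types.
from collections import namedtuple

Geocode = namedtuple("Geocode", [
    "lat", "lng", "geo_resolution", "location",
    "admin3", "admin2", "admin1", "admin_id", "country_new",
])

example = Geocode(lat=30.59, lng=114.30, geo_resolution="admin2",
                  location="Wuhan, Hubei, China", admin3="", admin2="Wuhan",
                  admin1="Hubei", admin_id="example-id", country_new="China")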
def test_geocode_latlon(self):
    code, (lat, lon, text) = geocoder.geocode('29.949803,-90.068858')
    self.assertEqual(code, 'OK')
    self.assertAlmostEqual(lat, 29.949803)
    self.assertAlmostEqual(lon, -90.068858)
def scrape(source="dcn", provided_url_key=False, limit=False,
           since="last_record", until="now", test=False):
    """Extract new certificates by scraping CSP websites and write them to the
    web_certificates table in the database.

    Parameters:
    - `source` (str): Specifies the source website being scraped for CSP's. Can be
      `dcn` for Daily Commercial News, `ocn` for Ontario Construction News, or
      `l2b` for Link2Build.
    - `provided_url_key` (str or False): url_key that is to be scraped. False by default.
    - `limit` (int): Specifies a limit for the amount of certificates to be scraped.
      Default is no limit.
    - `since` (str): Specifies date from when to begin looking for new CSP's. Can be
      either `last_record` or `yyyy-mm-dd` string format.
    - `until` (str): Specifies date for when to end the search for new CSP's. Can be
      either `now` or `yyyy-mm-dd` string format.
    - `test` (bool): Set to True to cancel writing to the database and return a
      DataFrame of scraped certificates instead.

    Returns:
    - `True` if 1 or more certificates were scraped
    - `False` if no certificates were scraped
    - a Pandas DataFrame containing new certificates if test=True
    """
    # Initialize strings and lambda functions based on source:
    def get_details(entry):
        entry = base_url + entry
        url_key = entry.split(base_aug_url)[1]
        while True:
            try:
                response = requests.get(entry)
                break
            except requests.exceptions.ConnectionError:
                sleep(1)
                continue
        if response.status_code == 404:
            return
        html = response.content
        entry_soup = BeautifulSoup(html, "html.parser")
        if source == "dcn":
            pub_date = entry_soup.find("time").get_text()
            cert_type = entry_soup.find("h1").get_text()
            if cert_type == "Certificates and Notices":
                # old style -> assume csp by default even if it might not be true
                cert_type = "csp"
                city = entry_soup.find(
                    "div", {"class": "content-left"}).find("h4").get_text()
                address = entry_soup.find(
                    "p", {"class": "print-visible"}).get_text()
                title = entry_soup.find_all(
                    "section", {"class": "content"})[3].find("p").get_text()
            else:
                cert_type = ("csp"
                             if cert_type == "Certificate of Substantial Performance"
                             else cert_type)
                city = entry_soup.find_all("dl")[0].find("dt").get_text()
                address = entry_soup.find_all("dl")[1].find("dt").get_text()
                title = entry_soup.find_all("dl")[2].find("dd").get_text()
                if address.startswith("This is to certify"):
                    # no address available - change sequence going forward
                    address = ""
                    title = entry_soup.find_all("dl")[1].find("dd").get_text()
            company_results = {
                key.get_text(): value.get_text()
                for key, value in zip(entry_soup.find_all("dt"),
                                      entry_soup.find_all("dd"))
            }
            owner = company_results.get(
                "Name of owner:", company_results.get("Name of Owner", np.nan))
            contractor = company_results.get(
                "Name of contractor:",
                company_results.get("Name of Contractor", np.nan))
            engineer = company_results.get(
                "Name of payment certifier:",
                company_results.get(
                    "Name of Certifier",
                    company_results.get("Name of certifier:", np.nan),
                ),
            )
        elif source == "ocn":
            if "Non-Payment" in entry_soup.find(
                    "h1", {"class": "entry-title"}).get_text():
                cert_type = "np"
            else:
                try:
                    header = entry_soup.find(
                        "h2", {"class": "ocn-heading"}
                    ).find_next_sibling("p").get_text()
                except AttributeError:
                    header = ' '
                if "Notice of Termination" in header:
                    cert_type = "term"
                else:
                    cert_type = "csp"
            pub_date = str(dateutil.parser.parse(
                entry_soup.find("date").get_text()).date())
            try:
                city = entry_soup.find(
                    "h2", {"class": "ocn-subheading"}).get_text().split(":")[0]
            except AttributeError:
                city = ''
            if cert_type == "csp":
                address = entry_soup.find(
                    "div", {"class": "ocn-certificate"}).find("p").get_text()
                try:
                    title = entry_soup.find(
                        "h2", {"class": "ocn-heading"}
                    ).find_next_sibling("p").get_text()
                except AttributeError:
                    title = ''
                company_soup = entry_soup.find(
                    "div", {"class": "ocn-participant-wrap"})
                company_results = {
                    key.get_text(): value.get_text()
                    for key, value in zip(
                        company_soup.find_all(
                            "div", {"class": "participant-type"})[::2],
                        company_soup.find_all(
                            "div", {"class": "participant-name-wrap"}),
                    )
                }
                owner = company_results.get("Name of Owner", np.nan)
                contractor = company_results.get("Name of Contractor", np.nan)
                engineer = company_results.get("Name of Payment Certifier", np.nan)
            elif cert_type == "np":
                address = entry_soup.find(
                    "h4", {"class": "ocn-subheading"}).find_next("p").get_text()
                title = address  # temporary until we see more of these
                for x in entry_soup.find_all("strong"):
                    try:
                        if x.get_text() == "Name of owner:":
                            owner = x.find_parent().get_text().split(": ")[1]
                        if x.get_text() == "Name of contractor:":
                            contractor = x.find_parent().get_text().split(": ")[1]
                    except AttributeError:
                        pass
                engineer = np.nan
            elif cert_type == "term":
                address = entry_soup.find(
                    "h1", {"class": "entry-title"}).get_text()
                title = address  # temporary until we see more of these
                for x in entry_soup.find_all("strong"):
                    try:
                        if x.get_text() == "Name of owner:":
                            owner = x.find_parent().get_text().split(": ")[1]
                        if x.get_text() == "Name of contractor:":
                            contractor = x.find_parent().get_text().split(": ")[1]
                    except AttributeError:
                        pass
                engineer = np.nan
        elif source == "l2b":
            cert_type_text = entry_soup.find("h2").get_text()
            # cert_type = ("csp" if "Form 9" in cert_type_text else cert_type_text)
            if "Form 9" in cert_type_text:
                cert_type = "csp"
            elif "Form 10" in cert_type_text:
                cert_type = "ccs"
            else:
                cert_type = cert_type_text
            attr_pairs = {}
            fields = entry_soup.find_all('p', {'class': 'mb-25'})
            for field in fields:
                try:
                    attr_pair = [
                        s for s in re.findall('[^\t^\n^\r]*', field.get_text())
                        if s
                    ]
                    attr_pairs.update({attr_pair[0]: attr_pair[1]})
                except IndexError:
                    pass
            retry_count = 0
            while True:
                try:
                    response = requests.get(base_url)
                    break
                except requests.exceptions.ConnectionError:
                    logger.info(
                        f"L2B not responding again ({retry_count}). "
                        "waiting 2 seconds and retrying...")
                    retry_count += 1
                    sleep(2)
            html = response.content
            soup = BeautifulSoup(html, "html.parser")
            pub_date = [
                str(parse_date(entry.find_all('td')[1].get_text()).date())
                for entry in soup.find('tbody').find_all('tr')
                if url_key in str(entry)
            ][0]
            if cert_type == 'ccs':
                city = attr_pairs.get('Of premises at', np.nan)
                address = attr_pairs.get('Of premises at', np.nan)
                title = ' '.join((
                    attr_pairs.get(
                        'The subcontract provided for the supply of the '
                        'following services or materials', ''),
                    attr_pairs.get('To the following improvement', '')))
                title = np.nan if title in ('', ' ') else title
            else:
                city = attr_pairs.get('Where the Premises is Situated', np.nan)
                address = attr_pairs.get('Where the Premises is Located', np.nan)
                title = attr_pairs.get(
                    'This is to certify that the contract for the following '
                    'improvement', np.nan)
            owner = attr_pairs.get('Name of Owner', np.nan)
            contractor = attr_pairs.get('Name of Contractor', np.nan)
            engineer = attr_pairs.get('Name of Payment Certifier', np.nan)
        return (
            pub_date,
            city,
            address,
            title,
            owner,
            contractor,
            engineer,
            url_key,
            cert_type,
            source,
        )

    pub_date, city, address, title, owner, contractor, engineer, url_key, cert_type = [
        [] for _ in range(9)
    ]
    if until == "now":
        until = datetime.datetime.now().date()
    else:
        try:
            until = re.findall(r"\d{4}-\d{2}-\d{2}", until)[0]
        except IndexError:
            raise ValueError(
                "`until` parameter should be in the format yyyy-mm-dd if not a key_word"
            )
    if since == "last_record":
        hist_query = """
            SELECT pub_date
            FROM web_certificates
            WHERE source=%s
            ORDER BY pub_date DESC
            LIMIT 1
        """
        with create_connection() as conn:
            cur = conn.cursor()
            cur.execute(hist_query, [source])
            last_date = cur.fetchone()[0]
        ld_year = int(last_date[:4])
        ld_month = int(last_date[5:7])
        ld_day = int(last_date[8:])
        since = datetime.datetime(ld_year, ld_month, ld_day).date()
    else:
        valid_since_date = re.search(r"\d{4}-\d{2}-\d{2}", since)
        if not valid_since_date:
            raise ValueError(
                "`since` parameter should be in the format yyyy-mm-dd if not a "
                "predefined term.")
    if source == "dcn":
        base_url = "https://canada.constructconnect.com"
        base_aug_url = (
            "https://canada.constructconnect.com/dcn/certificates-and-notices/"
        )
        base_search_url = (
            "https://canada.constructconnect.com/dcn/certificates-and-notices"
            "?perpage=1000&phrase=&sort=publish_date&owner=&contractor=")
        custom_param_url = "&date=custom&date_from={}&date_to={}#results"
        get_number_of_matches = lambda soup: int(
            re.compile(r"\d\d*").findall(
                soup.find("span", {"class": "search-results__total"}).get_text())[0])
        get_entries = lambda soup: [
            x.find("a").get("href")
            for x in soup.find_all("article", {"class": "cards-item"})
        ]
    elif source == "ocn":
        base_url = ""
        base_aug_url = "https://ontarioconstructionnews.com/certificates/"
        base_search_url = (
            "https://ontarioconstructionnews.com/certificates/?"
            "per_page=1000&certificates_page=1&search=&form_id=&owner_name_like"
            "=&contractor_name_like=")
        custom_param_url = (
            "&date_published=custom&date_published_from={}&date_published_to={}"
        )
        get_number_of_matches = lambda soup: int(
            soup.find_all("span", {"class": "items-found"})[1].get_text().split(" of ")[1])
        get_entries = lambda soup: [
            x.find("a").get("href")
            for x in soup.find_all("td", {"class": "col-location"})
        ]
    elif source == "l2b":
        base_url = "https://certificates.link2build.ca/"
        base_aug_url = "Search/Detail/"
        base_search_url = "https://certificates.link2build.ca/"
        custom_param_url = ""
        since = str(since)
        until = str(until)
        get_entries = lambda soup: [
            entry.find('a').get('href')
            for entry in soup.find('tbody').find_all('tr')
            if parse_date(since) <= parse_date(
                entry.find_all('td')[1].get_text()) <= parse_date(until)
        ]
        get_number_of_matches = lambda soup: len(get_entries(soup))
    else:
        raise ValueError("Must specify CSP source.")
    if provided_url_key:
        details = get_details(provided_url_key)
        return pd.DataFrame(
            data={
                "pub_date": details[0],
                "city": details[1],
                "address": details[2],
                "title": details[3],
                "owner": details[4],
                "contractor": details[5],
                "engineer": details[6],
                "url_key": details[7],
                "cert_type": details[8],
                "source": [source] * len(details[0]),
            })
    date_param_url = custom_param_url.format(since, until)
    response = requests.get(base_search_url + date_param_url)
    html = response.content
    soup = BeautifulSoup(html, "html.parser")
    number_of_matches = get_number_of_matches(soup)
    if not number_of_matches:
        logger.info(
            "Nothing new to scrape in timeframe specified - exiting scrape function.")
        return False  # signaling that scrape returned nothing
    logger.info(
        f"scraping all of {number_of_matches} new certificates since {since}...")
    bar = progressbar.ProgressBar(
        maxval=number_of_matches + 1,
        widgets=[progressbar.Bar("=", "[", "]"), " ", progressbar.Percentage()],
    )
    bar.start()
    logged_key_query = """
        SELECT url_key
        FROM web_certificates
        WHERE source=%s
    """
    with create_connection() as conn:
        logged_url_keys = list(
            pd.read_sql(logged_key_query, conn, params=[source]).url_key)
    entries = get_entries(soup)
    for i, entry in enumerate(entries, 1):
        check_url_key = (base_url + entry).split(base_aug_url)[1]
        if not test and check_url_key in logged_url_keys:
            logger.info(
                f"entry for {check_url_key} was already logged - "
                "continuing with the next one (if any)...")
            continue
        details = get_details(entry)
        # print(entry)
        if not details:
            logger.info(
                f"entry for {check_url_key} was a 404 page - "
                "continuing with the next one (if any)...")
            continue
        # `details` also carries `source` as its last element; it is not
        # accumulated here because it is added as a constant column below.
        for cumulative, item in zip(
            [
                pub_date,
                city,
                address,
                title,
                owner,
                contractor,
                engineer,
                url_key,
                cert_type,
            ],
            details,
        ):
            cumulative.append(item)
        if limit and (i >= limit):
            logger.info("limit reached - breaking out of loop.")
            break
        bar.update(i + 1)
    bar.finish()
    with create_connection() as conn:
        last_cert_id = (pd.read_sql(
            "SELECT * from web_certificates ORDER BY cert_id DESC LIMIT 1",
            conn).iloc[0].cert_id)
    df_web = pd.DataFrame(
        data={
            "pub_date": pub_date,
            "city": city,
            "address": address,
            "title": title,
            "owner": owner,
            "contractor": contractor,
            "engineer": engineer,
            "url_key": url_key,
            "cert_type": cert_type,
            "source": [source] * len(pub_date),
        })
    if not len(df_web):
        return False
    df_web = df_web.sort_values("pub_date", ascending=True)
    df_web["cert_id"] = [
        int(x) for x in range(last_cert_id + 1, last_cert_id + 1 + len(df_web))
    ]
    # make date into actual datetime object
    df_web["pub_date"] = df_web.pub_date.apply(
        lambda x: str(parse_date(str(x)).date()) if (x and str(x) != 'nan') else np.nan)
    logger.info("Fetching geocode information...")
    df_web = geocode(df_web)
    if test:
        return df_web
    attrs = [
        "cert_id",
        "pub_date",
        "city",
        "address_lat",
        "address_lng",
        "city_lat",
        "city_lng",
        "city_size",
        "address",
        "title",
        "owner",
        "contractor",
        "engineer",
        "url_key",
        "cert_type",
        "source",
    ]
    query = f"""
        INSERT INTO web_certificates ({', '.join(attrs)})
        VALUES ({','.join(['%s'] * len(attrs))})
    """
    new_certs = [[row[attr] for attr in attrs] for _, row in df_web.iterrows()]
    with create_connection() as conn:
        conn.cursor().executemany(query, new_certs)
        conn.commit()
    return True  # signaling that the scrape did return some results
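# Usage sketch for scrape() above. It assumes the surrounding module already
# provides the helpers it references (create_connection, geocode, logger,
# parse_date, etc.) and a populated web_certificates table; the source, date
# range, and limit below are illustrative values only. With test=True nothing
# is written to the database and the scraped rows come back as a DataFrame.
if __name__ == "__main__":
    df_new = scrape(source="dcn", since="2019-10-01", until="2019-10-31",
                    limit=10, test=True)
    if df_new is not False:
        print(df_new[["pub_date", "city", "title", "url_key"]].head())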
#!/usr/bin/python
# -*- coding: utf-8 -*-
###############################################################################
# Query the longitude and latitude for all departments and regions in
# Ivory Coast.
###############################################################################
from geocoder import geocode

infile = open("cities_departmentID.txt", "r")
all_lines = infile.readlines()
infile.close()

place_geo = {}
for line in all_lines:
    query = line.split()[-1]
    if query != 'NA' and query not in place_geo.keys():
        geo = geocode(query, site='bing')
        lon = str(geo['longitude'])
        lat = str(geo['latitude'])
        place_geo[query] = lon + ',' + lat
        print query, geo

write_lines = ['department\tlongitude\tlatitude\n']
for place, geo in place_geo.items():
    lon, lat = geo.split(',')
    newline = place + '\t' + lon + '\t' + lat + '\n'
    write_lines.append(newline)

outfile = open('ivc_dept_geo1.txt', 'w')
outfile.writelines(write_lines)
outfile.close()
from argparse import ArgumentParser
from geocoder import geocode
import json

if __name__ == '__main__':
    parser = ArgumentParser(description='Use Mapzen to geocode a location')
    parser.add_argument('location', type=str,
                        help='A human-readable description of your location/address')
    parser.add_argument('api_key', type=str, help='Your Mapzen API key')
    args = parser.parse_args()
    mapzen_result = geocode(api_key=args.api_key, location_name=args.location)
    if not mapzen_result:
        print("Sorry, could not geocode the location:", args.location)
    else:
        # print dictionary as prettified JSON
        txt = json.dumps(mapzen_result, indent=2)
        print(txt)
map_file = "map.png"
with open(map_file, "wb") as file:
    file.write(response.content)
for event in pygame.event.get():
    if event.type == pygame.QUIT:
        running = False
    if event.type == pygame.KEYDOWN:
        delta = (18 - int(parms['z'])) // 2
        if active:
            if event.key == pygame.K_RETURN:
                active = False
                parms['ll'] = str(get_coordinates(text)[0]) + ',' + str(get_coordinates(text)[1])
                parms['pt'] = parms['ll'] + ',flag'
                print(geocode(text))
                text = geocode(text)['metaDataProperty']['GeocoderMetaData']['Address']['formatted']
                index = f(
                    f"http://geocode-maps.yandex.ru/1.x/?apikey=40d1649f-0493-4b70-98ba-98533de7710b&geocode={text}&format=json")
                text += index
            elif event.key == pygame.K_BACKSPACE:
                text = text[:-1]
            else:
                text += event.unicode
        if event.key == pygame.K_PAGEUP and int(parms['z']) < 22:
            parms['z'] = str(int(parms['z']) + 1)
        if event.key == pygame.K_PAGEDOWN and 0 < int(parms['z']):
            parms['z'] = str(int(parms['z']) - 1)
        if event.key == pygame.KMOD_SHIFT:
            if index not in text:
import csv

# `geocode` is assumed to come from a project-local helper module that takes
# an address and a city and returns a dict with 'latitude'/'longitude' keys.
SOURCE_FILENAME = './static/data/AdultResidentialFacilities06052016.csv'
OUTPUT_FILENAME = 'static/data/geocoded_facilities.csv'

# Open the old data
with open(SOURCE_FILENAME, 'r') as f:
    facilities = []
    for row in csv.DictReader(f):
        facilities.append(row)

xcount = 0
# now we geocode
for f in facilities:
    xcount += 1
    address = f['Facility Address']
    city = f['Facility City']
    coordinates = geocode(address, city)
    f['latitude'] = coordinates['latitude']
    f['longitude'] = coordinates['longitude']
    print(xcount, address, city, coordinates)
print("Geocoding all done!")

the_headers = list(facilities[0].keys())
with open(OUTPUT_FILENAME, 'w') as wfile:
    c = csv.DictWriter(wfile, fieldnames=the_headers)
    c.writeheader()
    c.writerows(facilities)