def process(self):
    """Download and parse the BENU pharmacy JSON feed and insert the POIs.

    Fix: the website slice no longer runs when 'description' is missing —
    the original unconditionally evaluated ``website[19:]`` and raised
    TypeError whenever ``website`` was None.
    """
    soup = save_downloaded_soup(
        '{}'.format(self.link),
        os.path.join(self.download_cache, self.filename))
    insert_data = []
    if soup is not None:
        text = json.loads(soup.get_text())
        for poi_data in text:
            street, housenumber, conscriptionnumber = \
                extract_street_housenumber_better_2(poi_data['street'])
            if 'BENU Gyógyszertár' not in poi_data['title']:
                # Independent pharmacy hosted on the BENU site.
                name = poi_data['title'].strip()
                branch = None
            else:
                name = 'Benu gyógyszertár'
                branch = poi_data['title'].strip()
            code = 'hubenupha'
            website = poi_data['description'].strip() \
                if poi_data['description'] is not None else None
            # Strip the fixed-length prefix from the description (presumably
            # a URL scheme/host fragment — TODO confirm against the feed);
            # only slice when a description was actually present.
            if website is not None:
                website = website[19:]
            nonstop = None
            mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = None
            mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = None
            city = clean_city(poi_data['city'])
            postcode = poi_data['postal_code'].strip()
            lat, lon = check_hu_boundary(poi_data['lat'], poi_data['lng'])
            geom = check_geom(lat, lon)
            postcode = query_postcode_osm_external(
                self.prefer_osm_postcode, self.session, lat, lon, postcode)
            original = poi_data['street']
            ref = None
            if 'phone' in poi_data and poi_data['phone'] != '':
                phone = clean_phone(poi_data['phone'])
            else:
                phone = None
            email = None
            insert_data.append([
                code, postcode, city, name, branch, website, original,
                street, housenumber, conscriptionnumber, ref, phone, email,
                geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o,
                mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c
            ])
    if len(insert_data) < 1:
        logging.warning('Resultset is empty. Skipping ...')
    else:
        df = pd.DataFrame(insert_data)
        df.columns = POI_COLS
        insert_poi_dataframe(self.session, df)
def process(self):
    """Scrape the Aldi store-locator HTML table and insert the store POIs."""
    soup = save_downloaded_soup(
        '{}'.format(self.link),
        os.path.join(self.download_cache, self.filename))
    data = []
    insert_data = []
    if soup is not None:
        # Collect the trimmed cell texts of every row in the locator table.
        table = soup.find('table', attrs={'class': 'contenttable is-header-top'})
        table_body = table.find('tbody')
        for row in table_body.find_all('tr'):
            data.append([cell.text.strip() for cell in row.find_all('td')])
        # Columns: 0 = postcode, 1 = city, 2 = street address.
        for poi_data in data:
            street, housenumber, conscriptionnumber = \
                extract_street_housenumber_better_2(poi_data[2])
            name = 'Aldi'
            code = 'hualdisup'
            postcode = poi_data[0].strip()
            city = clean_city(poi_data[1])
            branch = None
            website = None
            nonstop = None
            mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = None
            mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = None
            original = poi_data[2]
            geom = None
            ref = None
            phone = None
            email = None
            insert_data.append([
                code, postcode, city, name, branch, website, original,
                street, housenumber, conscriptionnumber, ref, phone, email,
                geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o,
                mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c
            ])
    if len(insert_data) < 1:
        logging.warning('Resultset is empty. Skipping ...')
    else:
        df = pd.DataFrame(insert_data)
        df.columns = POI_COLS
        insert_poi_dataframe(self.session, df)
def process(self):
    """Parse the MOL Bubi bike-share station XML feed and insert the POIs.

    Station names are formatted '<ref> - <branch>'; both halves are kept.
    Fixes: the extracted ``ref`` is no longer clobbered by a later
    boilerplate ``ref = None``, the leftover debug ``print`` of the whole
    resultset was removed, and the unused ``capacity`` local was dropped.
    """
    xml = save_downloaded_xml(
        '{}'.format(self.link),
        os.path.join(self.download_cache, self.filename))
    insert_data = []
    root = etree.fromstring(xml)
    for e in root.iter('place'):
        name = 'MOL Bubi'
        code = 'hububibir'
        housenumber = None
        conscriptionnumber = None
        street = None
        city = 'Budapest'
        # '<ref> - <branch>' — assumes the name always contains a dash;
        # TODO confirm against the feed.
        branch = e.attrib['name'].split('-')[1].strip() \
            if e.attrib['name'] is not None else None
        ref = e.attrib['name'].split('-')[0].strip() \
            if e.attrib['name'] is not None else None
        website = None
        nonstop = True
        mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = None
        mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = None
        # Coordinates use decimal commas in the feed.
        lat, lon = check_hu_boundary(e.attrib['lat'].replace(',', '.'),
                                     e.attrib['lng'].replace(',', '.'))
        geom = check_geom(lat, lon)
        postcode = query_postcode_osm_external(self.prefer_osm_postcode,
                                               self.session, lat, lon, None)
        original = None
        phone = None
        email = None
        insert_data.append([
            code, postcode, city, name, branch, website, original,
            street, housenumber, conscriptionnumber, ref, phone, email,
            geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o,
            mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c
        ])
    if len(insert_data) < 1:
        logging.warning('Resultset is empty. Skipping ...')
    else:
        df = pd.DataFrame(insert_data)
        df.columns = POI_COLS
        insert_poi_dataframe(self.session, df)
def process(self):
    """Load the CIB bank/ATM JSON file referenced by self.link and insert POIs."""
    if self.link:
        with open(self.link, 'r') as f:
            insert_data = []
            payload = json.load(f)
            for poi_data in payload['results']:
                # Each result is a one-key dict wrapping the POI record.
                first_element = next(iter(poi_data))
                if self.name == 'CIB bank':
                    name, code = 'CIB bank', 'hucibbank'
                else:
                    name, code = 'CIB', 'hucibatm'
                postcode, city, street, housenumber, conscriptionnumber = \
                    extract_all_address(poi_data[first_element]['address'])
                branch = None
                website = None
                nonstop = None
                mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = None
                mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = None
                lat, lon = check_hu_boundary(
                    poi_data[first_element]['latitude'],
                    poi_data[first_element]['longitude'])
                geom = check_geom(lat, lon)
                postcode = query_postcode_osm_external(
                    self.prefer_osm_postcode, self.session, lat, lon, postcode)
                original = poi_data[first_element]['address']
                ref = None
                phone = None
                email = None
                insert_data.append([
                    code, postcode, city, name, branch, website, original,
                    street, housenumber, conscriptionnumber, ref, phone, email,
                    geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o,
                    mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c
                ])
            if len(insert_data) < 1:
                logging.warning('Resultset is empty. Skipping ...')
            else:
                df = pd.DataFrame(insert_data)
                df.columns = POI_COLS
                insert_poi_dataframe(self.session, df)
def process(self):
    """Parse the MOL fuel-station JSON feed (fetched via POST) and insert POIs."""
    soup = save_downloaded_soup(
        '{}'.format(self.link),
        os.path.join(self.download_cache, self.filename), POST_DATA)
    insert_data = []
    if soup is not None:
        stations = json.loads(soup.get_text())
        for poi_data in stations:
            name = 'MOL'
            code = 'humolfu'
            postcode = poi_data['postcode'].strip()
            street, housenumber, conscriptionnumber = \
                extract_street_housenumber_better_2(poi_data['address'])
            city = clean_city(poi_data['city'])
            branch = None
            website = None
            nonstop = None
            mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = None
            mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = None
            original = poi_data['address']
            ref = None
            lat, lon = check_hu_boundary(poi_data['lat'], poi_data['lng'])
            geom = check_geom(lat, lon)
            postcode = query_postcode_osm_external(
                self.prefer_osm_postcode, self.session, lat, lon, postcode)
            phone = None
            email = None
            insert_data.append([
                code, postcode, city, name, branch, website, original,
                street, housenumber, conscriptionnumber, ref, phone, email,
                geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o,
                mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c
            ])
    if len(insert_data) < 1:
        logging.warning('Resultset is empty. Skipping ...')
    else:
        df = pd.DataFrame(insert_data)
        df.columns = POI_COLS
        insert_poi_dataframe(self.session, df)
def process(self):
    """Load the cached K&H Bank / K&H ATM JSON file and insert the POIs."""
    try:
        if self.link:
            with open(self.link, 'r') as f:
                text = json.load(f)
                data = POIDataset()
                for poi_data in text['results']:
                    # Each result is a one-key dict; the key wraps the
                    # actual POI record.
                    first_element = next(iter(poi_data))
                    if self.name == 'K&H Bank':
                        data.name = 'K&H Bank'
                        data.code = 'hukhbank'
                        data.public_holiday_open = False
                    elif self.name == 'K&H Bank ATM':
                        data.name = 'K&H Bank ATM'
                        data.code = 'hukhatm'
                        data.public_holiday_open = True
                    # ATMs are treated as available around the clock.
                    if data.code == 'hukhatm':
                        data.nonstop = True
                    else:
                        data.nonstop = False
                    data.lat, data.lon = check_hu_boundary(
                        poi_data.get(first_element)['latitude'],
                        poi_data.get(first_element)['longitude'])
                    if poi_data.get(first_element)['address'] is not None and \
                            poi_data.get(first_element)['address'] != '':
                        data.postcode, data.city, data.street, data.housenumber, data.conscriptionnumber = \
                            extract_all_address(
                                poi_data.get(first_element)['address'])
                    data.original = poi_data.get(
                        first_element)['address']
                    # NOTE(review): the phone number is read from the outer
                    # one-key dict rather than the wrapped record — confirm
                    # against the feed format.
                    if poi_data.get(
                            'phoneNumber'
                    ) is not None and poi_data.get('phoneNumber') != '':
                        data.phone = clean_phone_to_str(
                            poi_data.get('phoneNumber'))
                    else:
                        data.phone = None
                    data.add()
        if data is None or data.lenght() < 1:
            logging.warning('Resultset is empty. Skipping ...')
        else:
            insert_poi_dataframe(self.session, data.process())
    except Exception as e:
        # Broad boundary handler: log the failure and the record being
        # processed when it happened.
        logging.exception('Exception occurred')
        logging.error(e)
        logging.error(poi_data)
def process(self):
    """Parse the Hungarian Post service-point JSON feed into a POIDataset."""
    soup = save_downloaded_soup(
        '{}'.format(self.link),
        os.path.join(self.download_cache, self.filename), self.filetype)
    if soup is not None:
        payload = json.loads(soup)
        data = POIDataset()
        # name / code / public-holiday flag for the non-'posta' types.
        simple_types = {
            'csekkautomata': ('Posta csekkautomata', 'hupostacse', True),
            'postamachine': ('Posta csomagautomata', 'hupostacso', True),
            'postapoint': ('PostaPont', 'hupostapp', False),
        }
        for item in payload['items']:
            kind = item['type']
            if kind == 'posta':
                if 'mobilposta' in item['name']:
                    data.name, data.code = 'Mobilposta', 'hupostamp'
                else:
                    data.name, data.code = 'Posta', 'hupostapo'
                data.public_holiday_open = False
            elif kind in simple_types:
                data.name, data.code, data.public_holiday_open = \
                    simple_types[kind]
            else:
                logging.error('Non existing Posta type.')
            data.postcode = item['zipCode'].strip()
            data.city = clean_city(item['city'])
            data.branch = item['name']
            data.lat = item['lat']
            data.lon = item['lng']
            data.street, data.housenumber, data.conscriptionnumber = \
                extract_street_housenumber_better_2(item['address'])
            data.original = item['address']
            data.add()
        if data is None or data.lenght() < 1:
            logging.warning('Resultset is empty. Skipping ...')
        else:
            insert_poi_dataframe(self.session, data.process())
def process(self):
    """Load the cached CIB Bank / CIB ATM JSON file and insert the POIs."""
    try:
        if self.link:
            with open(self.link, 'r') as f:
                text = json.load(f)
                data = POIDataset()
                for poi_data in text['availableLocations']:
                    # Only import branches/ATMs that are currently in service.
                    if 'locationStatus' in poi_data and poi_data[
                            'locationStatus'] == 'IN_SERVICE':
                        if self.name == 'CIB Bank':
                            data.name = 'CIB Bank'
                            data.code = 'hucibbank'
                            data.public_holiday_open = False
                        else:
                            data.name = 'CIB Bank ATM'
                            data.code = 'hucibatm'
                            data.public_holiday_open = True
                        data.lat, data.lon = check_hu_boundary(
                            poi_data['location']['lat'],
                            poi_data['location']['lon'])
                        data.city = clean_city(poi_data['city'])
                        data.postcode = poi_data.get('zip').strip()
                        data.housenumber = poi_data['streetNo'].strip()
                        data.street = poi_data['streetName'].strip()
                        data.branch = poi_data['name']
                        if 'phone' in poi_data and poi_data['phone'] != '':
                            data.phone = clean_phone_to_str(
                                poi_data['phone'])
                        if 'email' in poi_data and poi_data['email'] != '':
                            data.email = poi_data['email'].strip()
                        data.original = poi_data['fullAddress']
                        data.add()
        if data is None or data.lenght() < 1:
            logging.warning('Resultset is empty. Skipping ...')
        else:
            insert_poi_dataframe(self.session, data.process())
    except Exception as e:
        # Broad boundary handler: log the failure and the record being
        # processed when it happened.
        logging.exception('Exception occurred')
        logging.error(e)
        logging.error(poi_data)
def process(self):
    """Parse the Spar store JSON feed and insert supermarket POIs."""
    soup = save_downloaded_soup(
        '{}'.format(self.link),
        os.path.join(self.download_cache, self.filename))
    insert_data = []
    if soup is not None:
        stores = json.loads(soup.get_text())
        for poi_data in stores:
            street, housenumber, conscriptionnumber = \
                extract_street_housenumber_better_2(poi_data['address'])
            # Brand variant is decided on the raw (pre-normalization) name.
            if 'xpres' in poi_data['name']:
                name, code = 'Spar Expressz', 'husparexp'
            elif 'INTER' in poi_data['name']:
                name, code = 'Interspar', 'husparint'
            else:
                # 'market' stores and anything else are plain supermarkets.
                name, code = 'Spar', 'husparsup'
            # Normalize the casing of the store name for the branch field.
            poi_data['name'] = poi_data['name'].replace('INTERSPAR', 'Interspar')
            poi_data['name'] = poi_data['name'].replace('SPAR', 'Spar')
            ref_match = PATTERN_SPAR_REF.search(poi_data['name'])
            ref = ref_match.group(1).strip() if ref_match is not None else None
            city = clean_city(poi_data['city'])
            postcode = poi_data['zipCode'].strip()
            branch = poi_data['name'].split('(')[0].strip()
            website = poi_data['pageUrl'].strip()
            nonstop = None
            mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = None
            mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = None
            lat, lon = check_hu_boundary(poi_data['latitude'],
                                         poi_data['longitude'])
            geom = check_geom(lat, lon)
            postcode = query_postcode_osm_external(
                self.prefer_osm_postcode, self.session, lat, lon, postcode)
            original = poi_data['address']
            phone = None
            email = None
            insert_data.append([
                code, postcode, city, name, branch, website, original,
                street, housenumber, conscriptionnumber, ref, phone, email,
                geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o,
                mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c
            ])
    if len(insert_data) < 1:
        logging.warning('Resultset is empty. Skipping ...')
    else:
        df = pd.DataFrame(insert_data)
        df.columns = POI_COLS
        insert_poi_dataframe(self.session, df)
def process(self):
    """Parse the Rossmann shop list embedded as a JS variable and insert POIs."""
    soup = save_downloaded_soup('{}'.format(self.link),
                                os.path.join(self.download_cache,
                                             self.filename),
                                None, self.verify_link)
    insert_data = []
    if soup is not None:
        # The shop list is embedded in the page as the JavaScript
        # 'places' variable.
        pattern = re.compile('^\s*var\s*places.*')
        script = soup.find('script', text=pattern)
        m = pattern.match(script.get_text())
        raw = clean_javascript_variable(m.group(0), 'places')
        text = json.loads(raw)
        for poi_data in text:
            poi_data = poi_data['addresses'][0]
            street, housenumber, conscriptionnumber = \
                extract_street_housenumber_better_2(poi_data['address'])
            name = 'Rossmann'
            code = 'hurossmche'
            city = clean_city(poi_data['city'])
            postcode = poi_data['zip'].strip()
            branch = None
            website = None
            nonstop = False
            # Per-day opening hours; the day keys below map onto the
            # project-wide column order (mo, th, we, tu, fr, sa, su).
            hours = []
            for day_key in ('monday', 'tuesday', 'wednesday', 'thursday',
                            'friday', 'saturday', 'sunday'):
                raw_hours = poi_data['business_hours'][day_key]
                hours.append(clean_opening_hours(raw_hours)
                             if raw_hours is not None else (None, None))
            (mo_o, mo_c), (th_o, th_c), (we_o, we_c), (tu_o, tu_c), \
                (fr_o, fr_c), (sa_o, sa_c), (su_o, su_c) = hours
            lat, lon = check_hu_boundary(poi_data['position'][0],
                                         poi_data['position'][1])
            geom = check_geom(lat, lon)
            postcode = query_postcode_osm_external(
                self.prefer_osm_postcode, self.session, lat, lon, postcode)
            original = poi_data['address']
            ref = None
            phone = None
            email = None
            insert_data.append([
                code, postcode, city, name, branch, website, original,
                street, housenumber, conscriptionnumber, ref, phone, email,
                geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o,
                mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c
            ])
    if len(insert_data) < 1:
        logging.warning('Resultset is empty. Skipping ...')
    else:
        df = pd.DataFrame(insert_data)
        df.columns = POI_COLS
        insert_poi_dataframe(self.session, df)
def process(self):
    """Scrape the Tesco store-locator HTML table and insert the store POIs.

    Fix: the inserted row was missing the ``phone`` and ``email`` columns,
    so its length did not match the project-wide POI_COLS layout (used by
    every other processor) and the ``df.columns = POI_COLS`` assignment
    would fail. Both are now included as None placeholders.
    """
    soup = save_downloaded_soup(
        '{}'.format(self.link),
        os.path.join(self.download_cache, self.filename))
    data = []
    insert_data = []
    if soup is not None:
        table = soup.find('table', attrs={'class': 'tescoce-table'})
        table_body = table.find('tbody')
        rows = table_body.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            # Keep the store detail-page link from the first cell.
            link = cols[0].find('a').get(
                'href') if cols[0].find('a') is not None else []
            cols = [element.text.strip() for element in cols]
            cols[0] = cols[0].split('\n')[0]
            # Drop the last two (non-address) cells and append the link.
            del cols[-1]
            del cols[-1]
            cols.append(link)
            data.append(cols)
        # Hoisted out of the loop: the pattern is loop-invariant.
        tesco_replace = re.compile('(expressz{0,1})', re.IGNORECASE)
        for poi_data in data:
            street, housenumber, conscriptionnumber = \
                extract_street_housenumber_better_2(poi_data[3])
            poi_data[0] = tesco_replace.sub('Expressz', poi_data[0])
            if 'xpres' in poi_data[0]:
                name = 'Tesco Expressz'
                code = 'hutescoexp'
            elif 'xtra' in poi_data[0]:
                name = 'Tesco Extra'
                code = 'hutescoext'
            else:
                name = 'Tesco'
                code = 'hutescosup'
            poi_data[0] = poi_data[0].replace('TESCO', 'Tesco')
            poi_data[0] = poi_data[0].replace('Bp.', 'Budapest')
            postcode = poi_data[1].strip()
            city = clean_city(poi_data[2].split(',')[0])
            branch = poi_data[0]
            website = poi_data[4]
            nonstop = None
            mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = None
            mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = None
            original = poi_data[3]
            geom = None
            ref = None
            phone = None  # FIX: previously missing from the inserted row
            email = None  # FIX: previously missing from the inserted row
            insert_data.append([
                code, postcode, city, name, branch, website, original,
                street, housenumber, conscriptionnumber, ref, phone, email,
                geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o,
                mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c
            ])
    if len(insert_data) < 1:
        logging.warning('Resultset is empty. Skipping ...')
    else:
        df = pd.DataFrame(insert_data)
        df.columns = POI_COLS
        insert_poi_dataframe(self.session, df)
def process(self):
    """Parse the Shell / Mobil Petrol fuel-station CSV and insert POIs.

    Fixes:
    - Rows with an unknown 'Brand' are now skipped with a warning; the
      original left ``name``/``code`` unbound (NameError) on the first such
      row, or silently reused the previous row's values.
    - Missing phone numbers: NaNs are filled with '0' and cast to int, so
      the old ``!= ''`` check was always true and ``clean_phone('0')`` was
      called for missing numbers. The sentinel 0 now maps to None.
    """
    csv = save_downloaded_pd(
        '{}'.format(self.link),
        os.path.join(self.download_cache, self.filename))
    if csv is not None:
        # Normalize NaN cells so the fields can be processed uniformly.
        csv[['Post code']] = csv[['Post code']].fillna('0000')
        csv[['Post code']] = csv[['Post code']].astype(int)
        csv[['Telephone']] = csv[['Telephone']].fillna('0')
        csv[['Telephone']] = csv[['Telephone']].astype(int)
        csv[['City']] = csv[['City']].fillna('')
        csv[['Name']] = csv[['Name']].fillna('')
        insert_data = []
        poi_dict = csv.to_dict('records')
        for poi_data in poi_dict:
            if poi_data['Brand'] == 'Shell':
                name = 'Shell'
                code = 'hushellfu'
            elif poi_data['Brand'] == 'Mobilpetrol':
                name = 'Mobil Petrol'
                code = 'humobpefu'
            else:
                # FIX: skip unknown brands instead of reusing stale values.
                logging.warning('Unknown brand: %s. Skipping record ...',
                                poi_data['Brand'])
                continue
            postcode = poi_data['Post code']
            # Title-case the address except the last two tokens (presumably
            # street type and house number — TODO confirm).
            street_tmp = poi_data['Address'].lower().split()
            for i in range(0, len(street_tmp) - 2):
                street_tmp[i] = street_tmp[i].capitalize()
            street_tmp = ' '.join(street_tmp)
            street, housenumber, conscriptionnumber = \
                extract_street_housenumber_better_2(street_tmp)
            # Fall back to the station name when the city column is empty.
            if poi_data['City'] != '':
                city = clean_city(poi_data['City'].title())
            elif poi_data['Name'] != '':
                city = clean_city(poi_data['Name'].title())
            else:
                city = None
            branch = poi_data['Name'].strip()
            website = None
            if poi_data['24 Hour'] == True:
                nonstop = True
                mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = None
                mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = None
            else:
                # Default staffed hours for non-24h stations.
                nonstop = False
                mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = '06:00'
                mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = '22:00'
            original = poi_data['Address']
            ref = None
            lat, lon = check_hu_boundary(poi_data['GPS Latitude'],
                                         poi_data['GPS Longitude'])
            geom = check_geom(lat, lon)
            postcode = query_postcode_osm_external(
                self.prefer_osm_postcode, self.session, lat, lon, postcode)
            # FIX: after fillna('0').astype(int) a missing phone is the
            # integer 0 — compare against 0, not the empty string.
            if 'Telephone' in poi_data and poi_data['Telephone'] != 0:
                phone = clean_phone(str(poi_data['Telephone']))
            else:
                phone = None
            email = None
            insert_data.append([
                code, postcode, city, name, branch, website, original,
                street, housenumber, conscriptionnumber, ref, phone, email,
                geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o,
                mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c
            ])
        if len(insert_data) < 1:
            logging.warning('Resultset is empty. Skipping ...')
        else:
            df = pd.DataFrame(insert_data)
            df.columns = POI_COLS
            insert_poi_dataframe(self.session, df)
def process(self):
    """Parse the Avia station list embedded as a JS variable and insert POIs."""
    soup = save_downloaded_soup(
        '{}'.format(self.link),
        os.path.join(self.download_cache, self.filename))
    insert_data = []
    if soup is not None:
        # The station list lives in the JavaScript 'markers' variable.
        pattern = re.compile('var\s*markers\s*=\s*((.*\n)*\]\;)', re.MULTILINE)
        script = soup.find('script', text=pattern)
        m = pattern.search(script.get_text())
        raw = m.group(0).replace("'", '"')
        raw = clean_javascript_variable(raw, 'markers')
        text = json.loads(raw)
        for poi_data in text:
            # Skip entries without an address string.
            if poi_data['cim'] is None or poi_data['cim'] == '':
                continue
            postcode, city, street, housenumber, conscriptionnumber = \
                extract_all_address(poi_data['cim'])
            name = 'Avia'
            code = 'huaviafu'
            branch = None
            if city is None:
                city = poi_data['title']
            kutid = poi_data['kutid']
            has_kutid = kutid is not None and kutid != ''
            ref = kutid if has_kutid else None
            lat, lon = check_hu_boundary(poi_data['lat'], poi_data['lng'])
            geom = check_geom(lat, lon)
            postcode = query_postcode_osm_external(
                self.prefer_osm_postcode, self.session, lat, lon, postcode)
            website = '/toltoallomas/?id={}'.format(str(kutid)) \
                if has_kutid else None
            nonstop = None
            mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = None
            mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = None
            original = poi_data['cim']
            if 'tel' in poi_data and poi_data['tel'] != '':
                phone = clean_phone(poi_data['tel'])
            else:
                phone = None
            if 'email' in poi_data and poi_data['email'] != '':
                email = clean_email(poi_data['email'])
            else:
                email = None
            insert_data.append([
                code, postcode, city, name, branch, website, original,
                street, housenumber, conscriptionnumber, ref, phone, email,
                geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o,
                mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c
            ])
    if len(insert_data) < 1:
        logging.warning('Resultset is empty. Skipping ...')
    else:
        df = pd.DataFrame(insert_data)
        df.columns = POI_COLS
        insert_poi_dataframe(self.session, df)
def process(self):
    """Parse the CBA / Príma store list embedded in a JS variable and insert POIs."""
    soup = save_downloaded_soup(
        '{}'.format(self.link),
        os.path.join(self.download_cache, self.filename))
    insert_data = []
    if soup is not None:
        # The store list is embedded as the JavaScript 'boltok_nyers'
        # variable.
        pattern = re.compile('^\s*var\s*boltok_nyers.*')
        script = soup.find('script', text=pattern)
        m = pattern.match(script.get_text())
        raw = clean_javascript_variable(m.group(0), 'boltok_nyers')
        text = json.loads(raw)
        for poi_data in text:
            street, housenumber, conscriptionnumber = \
                extract_street_housenumber_better_2(poi_data['A_CIM'])
            city = clean_city(poi_data['A_VAROS'])
            postcode = poi_data['A_IRSZ'].strip()
            branch = poi_data['P_NAME'].strip()
            if 'Príma' in branch:
                name, code = 'Príma', 'huprimacon'
            else:
                name, code = 'CBA', 'hucbacon'
            website = None
            nonstop = None
            # PS_OPEN_FROM_1..7 / PS_OPEN_TO_1..7 run Monday..Sunday; the
            # target variable order mirrors the project-wide column order.
            mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o = [
                clean_opening_hours_2(poi_data[key])
                if poi_data[key] is not None else None
                for key in ('PS_OPEN_FROM_1', 'PS_OPEN_FROM_2',
                            'PS_OPEN_FROM_3', 'PS_OPEN_FROM_4',
                            'PS_OPEN_FROM_5', 'PS_OPEN_FROM_6',
                            'PS_OPEN_FROM_7')]
            mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c = [
                clean_opening_hours_2(poi_data[key])
                if poi_data[key] is not None else None
                for key in ('PS_OPEN_TO_1', 'PS_OPEN_TO_2', 'PS_OPEN_TO_3',
                            'PS_OPEN_TO_4', 'PS_OPEN_TO_5', 'PS_OPEN_TO_6',
                            'PS_OPEN_TO_7')]
            original = poi_data['A_CIM']
            lat, lon = check_hu_boundary(poi_data['PS_GPS_COORDS_LAT'],
                                         poi_data['PS_GPS_COORDS_LNG'])
            geom = check_geom(lat, lon)
            postcode = query_postcode_osm_external(
                self.prefer_osm_postcode, self.session, lat, lon, postcode)
            ref = None
            if 'PS_PUBLIC_TEL' in poi_data and poi_data['PS_PUBLIC_TEL'] != '':
                phone = clean_phone(poi_data['PS_PUBLIC_TEL'])
            else:
                phone = None
            if 'PS_PUBLIC_EMAIL' in poi_data and \
                    poi_data['PS_PUBLIC_EMAIL'] != '':
                email = poi_data['PS_PUBLIC_EMAIL']
            else:
                email = None
            insert_data.append([
                code, postcode, city, name, branch, website, original,
                street, housenumber, conscriptionnumber, ref, phone, email,
                geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o,
                mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c
            ])
    if len(insert_data) < 1:
        logging.warning('Resultset is empty. Skipping ...')
    else:
        df = pd.DataFrame(insert_data)
        df.columns = POI_COLS
        insert_poi_dataframe(self.session, df)
def process(self):
    """Parse the Foxpost parcel-locker JSON feed and insert the POIs."""
    soup = save_downloaded_soup(
        '{}'.format(self.link),
        os.path.join(self.download_cache, self.filename))
    insert_data = []
    if soup is not None:
        lockers = json.loads(soup.get_text())
        for poi_data in lockers:
            name = 'Foxpost'
            code = 'hufoxpocso'
            postcode = poi_data['zip'].strip()
            street, housenumber, conscriptionnumber = \
                extract_street_housenumber_better_2(poi_data['street'])
            city = clean_city(poi_data['city'])
            branch = poi_data['name']
            website = None
            nonstop = None
            # Per-day opening hours keyed by Hungarian day names,
            # Monday..Sunday, mapped onto the project-wide column order.
            hours = []
            for day_key in ('hetfo', 'kedd', 'szerda', 'csutortok',
                            'pentek', 'szombat', 'vasarnap'):
                raw = poi_data['open'][day_key]
                hours.append(clean_opening_hours(raw)
                             if raw is not None else (None, None))
            (mo_o, mo_c), (th_o, th_c), (we_o, we_c), (tu_o, tu_c), \
                (fr_o, fr_c), (sa_o, sa_c), (su_o, su_c) = hours
            original = poi_data['address']
            ref = None
            lat, lon = check_hu_boundary(poi_data['geolat'],
                                         poi_data['geolng'])
            geom = check_geom(lat, lon)
            postcode = query_postcode_osm_external(
                self.prefer_osm_postcode, self.session, lat, lon, postcode)
            phone = None
            email = None
            insert_data.append([
                code, postcode, city, name, branch, website, original,
                street, housenumber, conscriptionnumber, ref, phone, email,
                geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o,
                mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c
            ])
    if len(insert_data) < 1:
        logging.warning('Resultset is empty. Skipping ...')
    else:
        df = pd.DataFrame(insert_data)
        df.columns = POI_COLS
        insert_poi_dataframe(self.session, df)
def export_list(self):
    """Insert the collected dataset into the database, or warn when empty."""
    dataset = self.data
    # 'lenght' is the (misspelled) length accessor of the project's
    # POIDataset API.
    if dataset is not None and dataset.lenght() >= 1:
        insert_poi_dataframe(self.session, dataset.process())
    else:
        logging.warning('Resultset is empty. Skipping ...')
def process(self):
    """Parse the locally cached Kulcs patika JSON dump and insert POIs.

    The download step is currently disabled; the cached file in
    ``self.download_cache`` is read directly.
    """
    with open(os.path.join(self.download_cache, self.filename), 'r') as f:
        insert_data = []
        pharmacies = json.load(f)
        for poi_data in pharmacies:
            street, housenumber, conscriptionnumber = \
                extract_street_housenumber_better_2(poi_data['cim'])
            if 'Kulcs patika' in poi_data['nev']:
                name = 'Kulcs patika'
                branch = poi_data['nev'].strip()
            else:
                # Independent pharmacy listed in the same feed.
                name = poi_data['nev'].strip()
                branch = None
            code = 'hukulcspha'
            website = poi_data['link'].strip() \
                if poi_data['link'] is not None else None
            nonstop = None
            mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = None
            mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = None
            city = clean_city(poi_data['helyseg'])
            lat, lon = check_hu_boundary(
                poi_data['marker_position']['latitude'],
                poi_data['marker_position']['longitude'])
            geom = check_geom(lat, lon)
            postcode = query_postcode_osm_external(
                self.prefer_osm_postcode, self.session, lat, lon,
                poi_data['irsz'].strip())
            original = poi_data['cim']
            ref = None
            phone = None
            email = None
            insert_data.append([
                code, postcode, city, name, branch, website, original,
                street, housenumber, conscriptionnumber, ref, phone, email,
                geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o,
                mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c
            ])
        if len(insert_data) < 1:
            logging.warning('Resultset is empty. Skipping ...')
        else:
            df = pd.DataFrame(insert_data)
            df.columns = POI_COLS
            insert_poi_dataframe(self.session, df)
def process(self):
    """Parse the Hungarian Post service-point XML feed and insert POIs."""
    xml = save_downloaded_xml(
        '{}'.format(self.link),
        os.path.join(self.download_cache, self.filename))
    insert_data = []
    root = etree.fromstring(xml)
    for post in root.findall('post'):
        sp_type = post.find('ServicePointType').text
        if sp_type == 'PM':
            name, code = 'Posta', 'hupostapo'
        elif sp_type == 'CS':
            name, code = 'Posta csomagautomata', 'hupostacso'
        elif sp_type == 'PP':
            name, code = 'PostaPont', 'hupostapp'
        else:
            # Unknown type: log and fall through (matching the historical
            # behaviour of not skipping the record).
            logging.error('Non existing Posta type.')
        postcode = post.get('zipCode')
        street_name = post.find('street/name').text.strip() \
            if post.find('street/name').text is not None else None
        street_type = post.find('street/type').text.strip() \
            if post.find('street/type').text is not None else None
        # Combine the street name with its type suffix when both exist.
        if street_name is None:
            street = None
        elif street_type is None:
            street = street_name
        else:
            street = '{} {}'.format(street_name, street_type)
        housenumber = post.find('street/houseNumber').text.strip().lower() \
            if post.find('street/houseNumber').text is not None else None
        conscriptionnumber = None
        city = clean_city(post.find('city').text)
        branch = post.find('name').text \
            if post.find('name').text is not None else None
        website = None
        nonstop = None
        mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = None
        mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = None
        # Coordinates use decimal commas in the feed.
        lat, lon = check_hu_boundary(
            post.find('gpsData/WGSLat').text.replace(',', '.'),
            post.find('gpsData/WGSLon').text.replace(',', '.'))
        geom = check_geom(lat, lon)
        postcode = query_postcode_osm_external(self.prefer_osm_postcode,
                                               self.session, lat, lon,
                                               postcode)
        original = None
        ref = None
        phone = None
        email = None
        insert_data.append([
            code, postcode, city, name, branch, website, original,
            street, housenumber, conscriptionnumber, ref, phone, email,
            geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o,
            mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c
        ])
    if len(insert_data) < 1:
        logging.warning('Resultset is empty. Skipping ...')
    else:
        df = pd.DataFrame(insert_data)
        df.columns = POI_COLS
        insert_poi_dataframe(self.session, df)
def process(self):
    """Parse the Hungarian Post JSON feed (list-based variant) and insert POIs."""
    soup = save_downloaded_soup(
        '{}'.format(self.link),
        os.path.join(self.download_cache, self.filename))
    insert_data = []
    if soup is not None:
        payload = json.loads(soup.get_text())
        # name / code for the non-'posta' service-point types.
        simple_types = {
            'csekkautomata': ('Posta csekkautomata', 'hupostacse'),
            'postamachine': ('Posta csomagautomata', 'hupostacso'),
            'postapoint': ('PostaPont', 'hupostapp'),
        }
        for poi_data in payload['items']:
            kind = poi_data['type']
            if kind == 'posta':
                if 'mobilposta' in poi_data['name']:
                    name, code = 'Mobilposta', 'hupostamp'
                else:
                    name, code = 'Posta', 'hupostapo'
            elif kind in simple_types:
                name, code = simple_types[kind]
            else:
                logging.error('Non existing Posta type.')
            postcode = poi_data['zipCode'].strip()
            street, housenumber, conscriptionnumber = \
                extract_street_housenumber_better_2(poi_data['address'])
            city = clean_city(poi_data['city'])
            branch = poi_data['name']
            website = None
            nonstop = None
            mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = None
            mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = None
            # NOTE(review): unlike the other processors, the coordinates are
            # not passed through check_hu_boundary() and the postcode is not
            # refined via query_postcode_osm_external() — confirm intended.
            geom = check_geom(poi_data['lat'], poi_data['lng'])
            original = poi_data['address']
            ref = None
            phone = None
            email = None
            insert_data.append([
                code, postcode, city, name, branch, website, original,
                street, housenumber, conscriptionnumber, ref, phone, email,
                geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o,
                mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c
            ])
    if len(insert_data) < 1:
        logging.warning('Resultset is empty. Skipping ...')
    else:
        df = pd.DataFrame(insert_data)
        df.columns = POI_COLS
        insert_poi_dataframe(self.session, df)
def process(self):
    """Download OMV fuel-station data (POST request, JSON body) and import it.

    Opening hours apply uniformly to all seven days; a 00:00-24:00 range is
    normalized to ``nonstop = True`` with the per-day hours cleared.
    """
    soup = save_downloaded_soup(
        '{}'.format(self.link),
        os.path.join(self.download_cache, self.filename), POST_DATA)
    insert_data = []
    # Idiom fix: `is not None` instead of `!= None`.
    if soup is not None:
        text = json.loads(soup.get_text())
        for poi_data in text['results']:
            name = 'OMV'
            code = 'huomvfu'
            postcode = poi_data['postcode'].strip()
            street, housenumber, conscriptionnumber = extract_street_housenumber_better_2(
                poi_data['address_l'])
            city = clean_city(poi_data['town_l'])
            branch = None
            website = None
            nonstop = None
            oho, ohc = None, None
            if poi_data['open_hours'] is not None:
                oho, ohc = clean_opening_hours(poi_data['open_hours'])
                # A 00:00-24:00 range means nonstop; drop the explicit
                # hours in that case.
                if oho == '00:00' and ohc == '24:00':
                    nonstop = True
                    oho, ohc = None, None
            # The same opening/closing time applies to every weekday.
            mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = oho
            mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = ohc
            original = poi_data['address_l']
            ref = None
            # Feed stores latitude in 'y' and longitude in 'x'.
            lat, lon = check_hu_boundary(poi_data['y'], poi_data['x'])
            geom = check_geom(lat, lon)
            postcode = query_postcode_osm_external(
                self.prefer_osm_postcode, self.session, lat, lon, postcode)
            if 'telnr' in poi_data and poi_data['telnr'] != '':
                phone = clean_phone(poi_data['telnr'])
            else:
                phone = None
            email = None
            insert_data.append([
                code, postcode, city, name, branch, website, original,
                street, housenumber, conscriptionnumber, ref, phone, email,
                geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o,
                mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c
            ])
    if len(insert_data) < 1:
        logging.warning('Resultset is empty. Skipping ...')
    else:
        df = pd.DataFrame(insert_data)
        df.columns = POI_COLS
        insert_poi_dataframe(self.session, df)
def process(self):
    """Scrape Tesco store data embedded in the page's ``data-stores`` attribute.

    The store list is a JSON blob inside an HTML attribute; each store's
    name determines the Tesco sub-brand (Expressz / Extra / plain Tesco).
    """
    soup = save_downloaded_soup(
        '{}'.format(self.link),
        os.path.join(self.download_cache, self.filename))
    insert_data = []
    # Idiom fix: `is not None` instead of `!= None`.
    if soup is not None:
        # The store list lives in a JSON-encoded 'data-stores' attribute.
        script = soup.find(attrs={'data-stores': True})
        text = json.loads(script['data-stores'])
        for poi_data in text:
            street, housenumber, conscriptionnumber = extract_street_housenumber_better_2(
                poi_data['address'])
            city = clean_city(poi_data['city'])
            branch = poi_data['name']
            # Sub-brand is inferred from the store name ('Expressz'/'Extra'
            # matched case-insensitively on their distinctive substrings).
            if 'xpres' in poi_data['name']:
                name = 'Tesco Expressz'
                code = 'hutescoexp'
            elif 'xtra' in poi_data['name']:
                name = 'Tesco Extra'
                code = 'hutescoext'
            else:
                name = 'Tesco'
                code = 'hutescosup'
            website = poi_data['url']
            nonstop = None
            # Per-day opening hours: each key maps to an [open, close] pair.
            # NOTE(review): keys appear to be 0=Sunday, 1=Monday, ... and the
            # th/tu variable ordering matches the file-wide column convention;
            # preserved as-is — confirm against POI_COLS.
            opening = json.loads(poi_data['opening'])
            mo_o, mo_c = opening['1'][0], opening['1'][1]
            th_o, th_c = opening['2'][0], opening['2'][1]
            we_o, we_c = opening['3'][0], opening['3'][1]
            tu_o, tu_c = opening['4'][0], opening['4'][1]
            fr_o, fr_c = opening['5'][0], opening['5'][1]
            sa_o, sa_c = opening['6'][0], opening['6'][1]
            su_o, su_c = opening['0'][0], opening['0'][1]
            lat, lon = check_hu_boundary(poi_data['gpslat'],
                                         poi_data['gpslng'])
            geom = check_geom(lat, lon)
            postcode = query_postcode_osm_external(
                self.prefer_osm_postcode, self.session, lat, lon, None)
            original = poi_data['address']
            ref = None
            if 'phone' in poi_data and poi_data['phone'] != '':
                phone = clean_phone(poi_data['phone'])
            else:
                phone = None
            email = None
            insert_data.append([
                code, postcode, city, name, branch, website, original,
                street, housenumber, conscriptionnumber, ref, phone, email,
                geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o,
                mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c
            ])
    if len(insert_data) < 1:
        logging.warning('Resultset is empty. Skipping ...')
    else:
        df = pd.DataFrame(insert_data)
        df.columns = POI_COLS
        insert_poi_dataframe(self.session, df)