def test_tune_parameter(f, numbers):
    """
    :param f: the category, given as the first letter of the seat number,
        e.g. 'A' for std. 10, 'B' for science, etc.
    :param numbers: the remaining digits of the seat numbers, all provided in a
        list, e.g. [3413535, 1355151, 3153153, ...]
    :return: None. This function is only used to tune the values in the main
        function so we can get the indexes of marks etc., as the table
        structure may change.
    """
    goto('http://www.gseb.org/')
    for loop in numbers:
        driver.find_element_by_xpath(
            "//select[@name='drpInitChar']/option[text()='" + f + "']").click()
        username = driver.find_element_by_name("studentnumber")
        seat_no = loop
        username.send_keys(seat_no)
        go = driver.find_element_by_name("go")
        go.click()
        time.sleep(1)
        driver.switch_to.frame("marksheet")
        soup = BeautifulSoup(driver.page_source)
        table = soup.find("table", attrs={"class": "maintbl"})
        extractor = Extractor(table)
        extractor.parse()
        l = extractor.return_list()
        print(l)
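# Usage sketch (not part of the original source): the category letter and seat
# numbers are made-up values in the format described by the docstring above;
# a live Selenium `driver` and the `goto` helper are assumed to exist.
test_tune_parameter('A', [3413535, 1355151, 3153153])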
def get_gseb_results_10(numbers, output_file_path, subjects):
    goto('http://gseb.org/indexssc.html')
    df = pd.DataFrame()
    for loop in numbers:
        detail = {}
        username = driver.find_element_by_name("studentnumber")
        seat_no = loop
        detail['seat_no'] = seat_no
        username.send_keys(seat_no)
        go = driver.find_element_by_name("go")
        go.click()
        time.sleep(1)
        driver.switch_to.frame("marksheet")
        soup = BeautifulSoup(driver.page_source)
        table = soup.find("table", attrs={"class": "maintbl"})
        extractor = Extractor(table)
        extractor.parse()
        l = extractor.return_list()
        detail['Name'] = find(l[0][0], "Name: ")
        detail['result'] = find(l[1][0], "Result: ").strip('School Index: 55.224')
        for i in range(len(subjects)):
            detail[subjects[i] + ' in external'] = l[i + 3][1]
            detail[subjects[i] + ' in internal'] = l[i + 3][2]
            detail[subjects[i] + ' in total'] = l[i + 3][3]
            detail[subjects[i] + ' grade'] = l[i + 3][4]
        detail['Total'] = int(l[9][1].split()[0])
        detail['Overall Grade'] = l[17][0]
        detail['Percentile Rank'] = l[17][2]
        df = df.append(detail, ignore_index=True)
        driver.switch_to_default_content()
        driver.find_element_by_name("studentnumber").clear()
    print(df)
    df.to_csv(output_file_path, index=False)
    driver.close()
class TestComplexExtractor(unittest.TestCase):
    def setUp(self):
        html = """
        <table>
            <tr>
                <td rowspan=2>1</td>
                <td>2</td>
                <td>3</td>
            </tr>
            <tr>
                <td colspan=2>4</td>
            </tr>
            <tr>
                <td colspan=3>5</td>
            </tr>
        </table>
        """
        self.extractor = Extractor(html)
        self.extractor.parse()

    def test_return_list(self):
        self.assertEqual(
            self.extractor.return_list(),
            [[u'1', u'2', u'3'], [u'1', u'4', u'4'], [u'5', u'5', u'5']]
        )
@mock.patch('csv.writer')
# NOTE: the patch target below is an assumption; the original decorators were
# lost in this snippet, but write_to_csv opens the output file itself, so the
# module-level `open` used by html_table_extractor is mocked here.
@mock.patch('html_table_extractor.extractor.open', new_callable=mock.mock_open)
class TestWriteToCsv(unittest.TestCase):
    def setUp(self):
        html = """
        <table>
            <tr>
                <td>1</td>
                <td>2</td>
            </tr>
            <tr>
                <td>3</td>
                <td>4</td>
            </tr>
        </table>
        """
        self.extractor = Extractor(html)
        self.extractor.parse()

    def test_write_to_csv_default(self, csv_mock, _):
        self.extractor.write_to_csv()
        csv_mock.assert_called_with('./output.csv', 'w')

    def test_write_to_csv_custom_path_and_filename(self, csv_mock, _):
        self.extractor.write_to_csv(path='/test/path', filename='testfile.csv')
        csv_mock.assert_called_with('/test/path/testfile.csv', 'w')
def _fetch_air_quality_routine(self, day: datetime):
    """
    Populate the air quality of the provinces.

    Fetches data from `http://www.arpab.it/aria/qa.asp`

    :param day: The day for which the air quality should be fetched
        (instance of `~datetime`)
    """
    super()._fetch_air_quality_routine(day)
    res = requests.get('http://www.arpab.it/aria/qa.asp', params=[
        ('giorno', day.strftime('%d/%m/%Y'))
    ])
    soup = BeautifulSoup(res.text, 'html.parser')
    table = soup.select_one('.tabellenav')
    if table is not None:
        extractor = Extractor(table)
        extractor.parse()
        table_data = extractor.return_list()[1:]
        for province in self.provinces:
            province_rows = [x for idx, x in enumerate(table_data)
                             if idx in self.province_stations[province.short_name]]
            for indicator, key in self.indicator_map.items():
                values = [self.extract_float(x[key]) for x in province_rows
                          if self.extract_float(x[key]) is not None]
                if len(values) > 0:
                    setattr(province.quality, indicator, round(mean(values), 2))
    if self.on_quality_fetched is not None:
        self.on_quality_fetched(self)
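# Usage sketch (assumption, not part of the original source): `fetcher` stands
# for an instance of the fetcher class this method belongs to (the class name
# is not shown in the snippet); only a `datetime` for the day of interest is needed.
from datetime import datetime

fetcher._fetch_air_quality_routine(datetime(2019, 5, 1))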
def get_text_data(html):
    extractor = Extractor(html)
    extractor.parse()
    table = extractor.return_list()
    del table[0]
    for i in range(len(table)):
        a = table[i]
        del a[0]
        del a[3]
    return table
def parseTable(self, table):
    self.parseHeader(table)
    # skip if header not found
    if self.headers == []:
        return []

    # parse the table, split merged cells and fill in values
    extractor = Extractor(str(table), transformer=unicode)
    extractor.parse()
    raw_data = extractor.return_list()

    # fill each empty cell with the value from the previous row
    for i in xrange(0, len(raw_data)):
        for j in xrange(0, len(raw_data[i])):
            if raw_data[i][j].strip() == '':
                raw_data[i][j] = raw_data[i - 1][j]

    # keep only rows in which a time qualifier appears
    raw_data = [
        a for a in raw_data
        if (''.join(a)).find(u'上午') != -1 or (''.join(a)).find(u'下午') != -1
    ]

    # assign columns, with the header as key
    data = []
    # printJson(self.headers)
    for i in xrange(0, len(raw_data)):
        data.append({})
        raw_data[i] = self.unique(raw_data[i])  # handle colspan here
        if len(self.headers) != len(raw_data[i]):
            # skip unformatted row, likely to be the ending row
            continue
        for j in xrange(0, len(raw_data[i])):
            data[i][self.headers[j]] = raw_data[i][j]

    # merge cell contents if the case no. is the same
    content = []
    for i in xrange(0, len(data)):
        if not (u'案件號碼' in data[i] or u'案件編號' in data[i]):
            continue
        if content == [] or \
                (u'案件號碼' in data[i]
                 and content[-1][u'案件號碼'].strip() != data[i][u'案件號碼'].strip()
                 and content[-1][u'案件號碼'].strip().strip(u'─') != '') or \
                (u'案件編號' in data[i]
                 and content[-1][u'案件編號'].strip() != data[i][u'案件編號'].strip()
                 and content[-1][u'案件編號'].strip().strip(u'─') != ''):
            content.append(data[i])
        else:
            # print json.dumps(content[-1], ensure_ascii=False, indent=4)
            # print json.dumps(data[i], ensure_ascii=False, indent=4)
            for k, v in data[i].iteritems():
                if content[-1][k] != data[i][k]:
                    content[-1][k] += data[i][k]

    # done
    return content
def parseHeader(self, table):
    if "案件編號" in str(table) or "案件號碼" in str(table):
        extractor = Extractor(str(table), transformer=unicode)
        extractor.parse()
        headerTable = extractor.return_list()
        for i in xrange(0, len(headerTable)):
            if (''.join(headerTable[i])).find(u'案件號碼') != -1 or \
                    (''.join(headerTable[i])).find(u'案件編號') != -1:
                self.headers = headerTable[i]
                self.headers = self.unique(self.headers)
                for i in xrange(0, len(self.headers)):
                    self.headers[i] = re.sub('[A-Za-z]', '', self.headers[i]).strip()
def get_gseb_results_12(f, seat_numbers, output_file_path, subjects):
    """
    :param f: The category the student belongs to, given as the first letter of
        the seat number, e.g. 'A' for std. 10
    :param seat_numbers: The list of seat numbers of all students,
        e.g. [3513611, 3136144, 7724523]
    :param output_file_path: file path where the output result sheet is to be
        stored, e.g. "D:/10_results/result.csv"
    :param subjects: the list of subjects, e.g. ['English', 'Maths', ...]
    :return: Nothing, but a csv is saved at the given path
    """
    goto('http://www.gseb.org/')
    df = pd.DataFrame()
    temp = []
    for loop in seat_numbers:
        detail = {}
        driver.find_element_by_xpath(
            "//select[@name='drpInitChar']/option[text()='" + f + "']").click()
        username = driver.find_element_by_name("studentnumber")
        seat_no = loop
        detail['seat_no'] = seat_no
        username.send_keys(seat_no)
        go = driver.find_element_by_name("go")
        go.click()
        time.sleep(1)
        driver.switch_to.frame("marksheet")
        soup = BeautifulSoup(driver.page_source)
        table = soup.find("table", attrs={"class": "maintbl"})
        extractor = Extractor(table)
        extractor.parse()
        l = extractor.return_list()
        detail['Name'] = find(l[0][0], "Name: ")
        detail['result'] = find(l[2][0], "Result: ").strip('School Index: 27.109')
        for i in range(len(subjects)):
            ttt = 5 - int(l[4 + i][0][-5:].find(
                re.findall(r"[A-Z]", l[4 + i][0][-5:])[0]))
            detail[subjects[i] + ' in total'] = l[4 + i][0][-5:-ttt]
            detail[subjects[i] + ' grade'] = l[4 + i][0][-ttt:]
        detail['Total'] = int(l[11][0][-3:])
        detail['Overall Grade'] = find(l[1][0], 'Grade: ')
        detail['Percentile Rank'] = find(
            l[1][0], 'Percentile: ').split()[0].strip('Grade:')
        temp.append(detail)
        driver.switch_to_default_content()
        driver.find_element_by_name("studentnumber").clear()
    df = df.append(temp, ignore_index=True)
    print(df)
    df.to_csv(output_file_path, index=False)
    driver.close()
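# Usage sketch (not part of the original source): seat numbers, output path and
# subject list are made-up values matching the formats given in the docstring.
get_gseb_results_12(
    'A',
    [3513611, 3136144, 7724523],
    'D:/10_results/result.csv',
    ['English', 'Maths'],
)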
def get_low_currency_price_banks(currency):
    url = f'https://www.findrate.tw/{currency}/#.XRjSkVMkveR'
    result = requests.get(url)
    result.encoding = 'UTF-8'
    html = BeautifulSoup(result.text, 'html.parser')
    tables = html.findAll('table')
    extractor = Extractor(tables[1])
    extractor.parse()
    currency_table = extractor.return_list()
    columns = currency_table.pop(0)[1:]
    indices = [data.pop(0).rstrip() for data in currency_table]
    df = pandas.DataFrame(currency_table, columns=columns, index=indices)
    top8_cash_sell = df.sort_values(['現鈔賣出']).iloc[:8]
    top8_spot_sell = df.sort_values(['即期賣出']).iloc[:8]
    return (top8_cash_sell, top8_spot_sell)
class TestExtractorTransformer(unittest.TestCase):
    def setUp(self):
        html = """
        <table>
            <tr>
                <td>1</td>
                <td>2</td>
            </tr>
            <tr>
                <td>3</td>
                <td>4</td>
            </tr>
        </table>
        """
        self.extractor = Extractor(html, transformer=int)
        self.extractor.parse()

    def test_config_transformer(self):
        self.assertEqual(self.extractor.return_list(), [[1, 2], [3, 4]])
class TestSimpleExtractor(unittest.TestCase):
    def setUp(self):
        html = """
        <table>
            <tr>
                <td>1</td>
                <td>2</td>
            </tr>
            <tr>
                <td>3</td>
                <td>4</td>
            </tr>
        </table>
        """
        self.extractor = Extractor(html)
        self.extractor.parse()

    def test_return_list(self):
        self.assertEqual(self.extractor.return_list(), [[u'1', u'2'], [u'3', u'4']])
def _fetch_air_quality_routine(self, day: datetime):
    """
    Populate the air quality of the provinces.

    Fetches data from `http://www.arpa.veneto.it/arpavinforma/bollettini/aria/aria_dati_validati_storico.php`

    :param day: The day for which the air quality should be fetched
        (instance of `~datetime`)
    """
    super()._fetch_air_quality_routine(day)
    for province in self.provinces:
        data = {
            'provincia': province.name.lower(),
            'giorno': day.strftime('%d'),
            'mese': day.strftime('%m'),
            'anno': day.strftime('%Y'),
            'Vai': 'Visualizza il bollettino'
        }
        response = requests.post(
            'http://www.arpa.veneto.it/arpavinforma/bollettini/aria/aria_dati_validati_storico.php',
            data=data)
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.select_one('#ariadativalidati table')
        if table:
            extractor = Extractor(table)
            extractor.parse()
            table_data = extractor.return_list()[3:]
            province.quality.co = self.indicator_value(table_data, 'co')
            province.quality.so2 = self.indicator_value(table_data, 'so2')
            province.quality.no2 = self.indicator_value(table_data, 'no2')
            province.quality.o3 = self.indicator_value(table_data, 'o3')
            province.quality.pm10 = self.indicator_value(table_data, 'pm10')
            province.quality.pm25 = self.indicator_value(table_data, 'pm25')
            province.quality.c6h6 = self.indicator_value(table_data, 'c6h6')
    if self.on_quality_fetched is not None:
        self.on_quality_fetched(self)
def scores(indx):
    table = all_tables[indx]  # grab the table at the given index
    extractor = Extractor(table)
    extractor.parse()
    sias_list = extractor.return_list()
    sias_list = sias_list[:-3]
    sias_len = len(sias_list)
    sias_names = sias_list[sias_len - 1]
    sias_names = sias_names[1:]
    num_item = len(sias_names)
    sias_final = [[0] * num_item for _ in range(num_item)]
    for i in range(sias_len - 1):
        row = sias_list[i]
        row = row[1:]
        for j in range(len(row)):
            sias_final[i][j] = row[j]
    return sias_final
def extract_table(href: str) -> tuple:
    # From the given reference, download all tables with class .table and
    # return them in a list, together with the district name.

    def get_name(html_local):
        # the only way found to get the location name; couldn't find a better
        # way to pass the BeautifulSoup object in directly
        soup_local = BeautifulSoup(html_local, "html.parser")
        vse = soup_local.find_all('h3')
        for item in vse:
            a = item.text.split(':')
            if 'Obec' in a[0]:
                obec = a[1].strip(' "\n"')
                return obec

    html = stahni_html(href)
    district_name = get_name(html)
    soup = BeautifulSoup(html, "html.parser")
    tables = []
    # use Extractor to get tables from the html code
    for a_elem in soup.select('.table'):
        extractor = Extractor(a_elem)
        extractor.parse()
        tables.append(extractor.return_list())
    return tables, district_name
def get_tables(tables):
    res = []
    for t in tables:
        labels = t.find_all('span', attrs={'class': 'label'})
        captions = t.find_all('span', attrs={'class': 'captions'})
        legend = t.find_all('p', attrs={'class': 'legend'})
        footnotes = t.find_all('dl', attrs={'class': 'footnotes'})
        for t2 in t.find_all('table'):
            extractor = Extractor(t2)
            extractor.parse()
            content = extractor.return_list()
            tab = {
                'label': labels[0].text if labels else '',
                'caption': captions[0].text.replace(labels[0].text, '') if captions else '',
                'legend': [x.text for x in legend] if legend else '',
                'footnote': [x.text for x in footnotes] if footnotes else '',
                'content': content
            }
            res.append(tab)
    return res
def data_table(s):
    """Extract the first table of the parsed page into a pandas DataFrame.

    Args:
        s: a BeautifulSoup document containing at least one <table>.
    """
    table_doc = s.find_all('table')
    extractor = Extractor(table_doc[0])
    extractor.parse()
    tabla = extractor.return_list()
    tabla_columns = tabla[0]
    tabla_datos = tabla[1:]
    final = []
    for fila in tabla_datos:
        # strip whitespace from every cell of the row
        a = [w.strip() for w in fila]
        final.append(a)
    df = pd.DataFrame(final, columns=tabla_columns)
    return df
def html_table_converter(self, offer_table):
    extractor = Extractor(offer_table)
    extractor.parse()
    table_list = extractor.return_list()
    return table_list
def destination():
    error = request.args['error']
    text = request.args['text']
    number_input = request.args['number_input']
    req = requests.get('https://www.google.com/search?q=' + text,
                       headers={'User-Agent': 'Mozilla/5.0'})
    req.raise_for_status()
    soup_doc = BeautifulSoup(req.text, 'html.parser')
    linkelements = soup_doc.select('.r a')
    page = Request("https://google.com" + linkelements[int(number_input) - 1].get('href'),
                   headers={'User-Agent': 'Mozilla/5.0'})
    page_response = urlopen(page)
    url = "https://google.com" + linkelements[int(number_input) - 1].get('href')
    html_read = page_response.read()
    soup = BeautifulSoup(html_read, 'html.parser')
    if error == '0' and request.method != 'POST':
        sample = 'Press enter to search for HTML attributes. 1 to navigate back, 2 to navigate to home page'
        myobj = gTTS(text=sample, lang='en', slow=False)
        myobj.save("text.mp3")
        os.system("mpg321 text.mp3")
    if request.method == 'POST':
        if request.form['btn'] == 'Home':
            return redirect(url_for('index', error=1))
        if request.form['btn'] == 'Back':
            return redirect(url_for('link_number', text=text, error=1))
        if request.form['btn'] == 'search audio':
            if error == '0':
                sample = 'Speak html attributes like title, paragraph, bold, links, tables, etcetera.'
                myobj = gTTS(text=sample, lang='en', slow=False)
                myobj.save("text1.mp3")
                os.system("mpg321 text1.mp3")
                error = 1
            r = sr.Recognizer()
            with sr.Microphone() as source:
                print("Say something or else say 'exit' to exit!")
                r.adjust_for_ambient_noise(source, duration=1)
                audio = r.listen(source)
            try:
                text1 = r.recognize_google(audio)
                print(text1)
                if text1 == 'title':
                    sample = 'Title of the given page is ' + soup.title.string
                    myobj = gTTS(text=sample, lang='en', slow=False)
                    myobj.save("text.mp3")
                    os.system("mpg321 text.mp3")
                elif text1 == 'bold':
                    sample = 'Bold attributes of the given page are'
                    for x in soup.find_all('b'):
                        sample = sample + ", " + x.string
                    myobj = gTTS(text=sample, lang='en', slow=False)
                    myobj.save("welcome.mp3")
                    os.system("mpg321 welcome.mp3")
                elif text1 == 'links':
                    sample = 'Links to other pages are'
                    for x in soup.find_all('a'):
                        sample = sample + ", " + x.string
                    myobj = gTTS(text=sample, lang='en', slow=False)
                    myobj.save("welcome.mp3")
                    os.system("mpg321 welcome.mp3")
                elif text1 == 'para':
                    sample = 'Paragraphs of the given page are'
                    for x in soup.find_all('p'):
                        sample = sample + ", " + x.string
                    myobj = gTTS(text=sample, lang='en', slow=False)
                    myobj.save("welcome.mp3")
                    os.system("mpg321 welcome.mp3")
                elif text1 == 'tables' or text1 == 'table':
                    sample = 'Tables of the given page are: '
                    extractor = Extractor(soup)
                    extractor.parse()
                    table_list = extractor.return_list()
                    print(len(table_list))
                    for rows in range(len(table_list)):
                        if rows > 0:
                            if len(table_list[rows]) != len(table_list[0]):
                                break
                            for columns in range(len(table_list[rows])):
                                sample += table_list[0][columns] + " is " + table_list[rows][columns] + ", "
                    myobj = gTTS(text=sample, lang='en', slow=False)
                    myobj.save("welcome.mp3")
                    os.system("mpg321 welcome.mp3")
                else:
                    sample = 'Unable to recognize HTML attribute, press enter and speak again'
                    myobj = gTTS(text=sample, lang='en', slow=False)
                    myobj.save("text.mp3")
                    os.system("mpg321 text.mp3")
                return redirect(url_for('destination', text=text, number_input=number_input, error=1))
            except:
                sample = 'Unable to recognize you, press enter and speak again'
                myobj = gTTS(text=sample, lang='en', slow=False)
                myobj.save("text.mp3")
                os.system("mpg321 text.mp3")
                return redirect(url_for('destination', text=text, number_input=number_input, error=1))
    return render_template('destination.html', url=url)
def _fetch_air_quality_routine(self, day: datetime):
    """
    Populate the air quality of the provinces.

    Data is fetched from https://www.arpae.it/qualita-aria/bollettino-qa/{date}
    where {date} is the date of interest in the format YYYYMMDD

    :param day: The day for which the air quality should be fetched
        (instance of `~datetime`)
    """
    super()._fetch_air_quality_routine(day)
    date_fmt = day.strftime('%Y%m%d')
    res = requests.get(
        f'https://www.arpae.it/qualita-aria/bollettino-qa/{date_fmt}')
    if res.status_code == 200:
        soup = BeautifulSoup(res.text, 'html.parser')
        table_rows = '\n'.join(
            [str(x) for x in soup.select('.tabella table tbody tr')])
        big_table = f'<table>{table_rows}</table>'
        extractor = Extractor(big_table)
        extractor.parse()
        table = extractor.return_list()
        for province in self.provinces:
            province_rows = [
                x for x in table if x[0] == province.short_name
            ]
            so2 = [
                self.extract_float(x[9]) for x in province_rows
                if self.extract_float(x[9]) is not None
            ]
            no2 = [
                self.extract_float(x[4]) for x in province_rows
                if self.extract_float(x[4]) is not None
            ]
            co = [
                self.extract_float(x[8]) for x in province_rows
                if self.extract_float(x[8]) is not None
            ]
            pm10 = [
                self.extract_float(x[2]) for x in province_rows
                if self.extract_float(x[2]) is not None
            ]
            pm25 = [
                self.extract_float(x[3]) for x in province_rows
                if self.extract_float(x[3]) is not None
            ]
            o3 = [
                self.extract_float(x[6]) for x in province_rows
                if self.extract_float(x[6]) is not None
            ]
            c6h6 = [
                self.extract_float(x[7]) for x in province_rows
                if self.extract_float(x[7]) is not None
            ]
            if len(so2) > 0:
                province.quality.so2 = round(mean(so2), 2)
            if len(no2) > 0:
                province.quality.no2 = round(mean(no2), 2)
            if len(co) > 0:
                province.quality.co = round(mean(co), 2)
            if len(pm10) > 0:
                province.quality.pm10 = round(mean(pm10), 2)
            if len(pm25) > 0:
                province.quality.pm25 = round(mean(pm25), 2)
            if len(o3) > 0:
                province.quality.o3 = round(mean(o3), 2)
            if len(c6h6) > 0:
                province.quality.c6h6 = round(mean(c6h6), 2)
    if self.on_quality_fetched is not None:
        self.on_quality_fetched(self)
def read_shifts_from_html_pages(rawtoi1, rawtoi2, teamid1, teamid2, season, game):
    """
    Aggregates information from two html pages given into a dataframe with one row
    per second and one col per player.

    :param rawtoi1: str, html page of shift log for teamid1
    :param rawtoi2: str, html page of shift log for teamid2
    :param teamid1: int, team id corresponding to rawtoi1
    :param teamid2: int, team id corresponding to rawtoi2
    :param season: int, the season
    :param game: int, the game

    :return: dataframe
    """
    from html_table_extractor.extractor import Extractor

    dflst = []
    for rawtoi, teamid in zip((rawtoi1, rawtoi2), (teamid1, teamid2)):
        extractor = Extractor(rawtoi)
        extractor.parse()
        tables = extractor.return_list()

        ids = []
        periods = []
        starts = []
        ends = []
        durationtime = []
        teams = []
        i = 0
        while i < len(tables):
            # A convenient artefact of this package: search for [p, p, p, p, p, p, p, p]
            if len(tables[i]) == 8 and helpers.check_number_last_first_format(tables[i][0]):
                pname = helpers.remove_leading_number(tables[i][0])
                pname = helpers.flip_first_last(pname)
                pid = players.player_as_id(pname)
                i += 2  # skip the header row
                while re.match(r'\d{1,2}', tables[i][0]):  # First entry is shift number
                    # print(tables[i])
                    shiftnum, per, start, end, dur, ev = tables[i]
                    # print(pname, pid, shiftnum, per, start, end)
                    ids.append(pid)
                    periods.append(int(per))
                    starts.append(start[:start.index('/')].strip())
                    ends.append(end[:end.index('/')].strip())
                    durationtime.append(helpers.mmss_to_secs(dur))
                    teams.append(teamid)
                    i += 1
                i += 1
            else:
                i += 1

        startmin = [x[:x.index(':')] for x in starts]
        startsec = [x[x.index(':') + 1:] for x in starts]
        starttimes = [
            1200 * (p - 1) + 60 * int(m) + int(s) + 1
            for p, m, s in zip(periods, startmin, startsec)
        ]
        # starttimes = [0 if x == 1 else x for x in starttimes]
        endmin = [x[:x.index(':')] for x in ends]
        endsec = [x[x.index(':') + 1:] for x in ends]
        # There is an extra -1 in endtimes to avoid overlapping start/end
        endtimes = [
            1200 * (p - 1) + 60 * int(m) + int(s)
            for p, m, s in zip(periods, endmin, endsec)
        ]

        durationtime = [e - s for s, e in zip(starttimes, endtimes)]

        df = pd.DataFrame({
            'PlayerID': ids,
            'Period': periods,
            'Start': starttimes,
            'End': endtimes,
            'Team': teams,
            'Duration': durationtime
        })
        dflst.append(df)

    return _finish_toidf_manipulations(pd.concat(dflst), season, game)
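# Usage sketch (assumption, not part of the original source): the shift-log
# pages would normally come from downloaded NHL HTML reports; the file names,
# team ids, season and game number below are illustrative placeholders only.
with open('TH020001.HTM') as f_home, open('TV020001.HTM') as f_road:
    toi_df = read_shifts_from_html_pages(f_home.read(), f_road.read(), 10, 6, 2017, 20001)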
def get_lists(soup: BeautifulSoup) -> List[List[str]]:
    extractor = Extractor(soup)
    extractor.parse()
    return extractor.return_list()
for i in range(starting_roll_number, ending_roll_number + 1):
    action = ActionChains(driver)
    textbox = driver.find_element_by_xpath('//*[@id="studentnumber"]')
    submit_button = driver.find_element_by_xpath(
        '//*[@id="middle"]/div/div/div[1]/center[1]/input[2]')
    action.click(textbox).send_keys(i).click(submit_button).perform()
    driver.switch_to_frame('marksheet')
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table = soup.findAll('table')
    try:
        extra = Extractor(table[0])
        extra.parse()
        text_file.write(f"""\n{i}-->
{extra.return_list()[0][0]}
{extra.return_list()[2][0]}
{extra.return_list()[3][0]}
{extra.return_list()[7][0]}
{extra.return_list()[9][0]}
""")
    except IndexError:
        print('Student number has problem.')
    driver.refresh()
text_file.close()
issPassUrl = "https://heavens-above.com/PassSummary.aspx?satid=25544&lat=55.2323&lng=-2.616&loc=Kielder&alt=378&tz=GMT"
issPage = http.request('GET', issPassUrl)
issSoup = BeautifulSoup(issPage.data.decode('utf-8'), "html.parser")
print('ISS Page Downloaded')

passes = issSoup.find("table", "standardTable")
if passes is None:
    f = open('output.csv', 'w')
    f.close()
else:
    # Separate table elements onto new lines for Extractor
    passes = str(passes).replace("><", ">\n<")
    extractor = Extractor(passes)
    extractor.parse()
    extractor.write_to_csv(path='.')
    print('CSV Written')
    links = issSoup.find_all("tr", "clickableRow")
    print(links[0])

# Python CSV tutorial at https://realpython.com/python-csv/
with open('output.csv', 'w+', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    line = 0
    passlist = {}
    passlist['passes'] = []
    for isspass in reader:
        if line <= 1:  # Skip the first two lines of header data
            line += 1
def _fetch_air_quality_routine(self, day: datetime):
    """
    Populate the air quality of the provinces.

    Data is fetched from http://www.arpa.umbria.it/monitoraggi/aria/Default.aspx

    :param day: The day for which the air quality should be fetched
        (instance of `~datetime`)
    """
    super()._fetch_air_quality_routine(day)
    date_fmt = day.strftime('%d/%m/%Y')
    data = {
        '__EVENTTARGET': 'ctl00$Content$txtData',
        '__EVENTARGUMENT': '',
        '__LASTFOCUS': '',
        '__VIEWSTATE': '/wEPDwUKMTUzNjEyNDUzNw9kFgJmD2QWAgIBD2QWAmYPZBYEAgsPZBYEAgEPFgIeC18hSXRlbUNvdW50AgMWBmYPZBYEAgEPDxYCHgdWaXNpYmxlaGQWAmYPFQEIMDkvMDQvMThkAgIPFQEZJm5ic3A7PC9wPg0KPHA+Jm5ic3A7PC9wPmQCAQ9kFgQCAQ9kFgJmDxUBCDA1LzA1LzE5ZAICDxUBwgFOZWxsYSBnaW9ybmF0YSBvZGllcm5hIGNpIHNvbm8gc3RhdGUgZGVsbGUgZGlmZmljb2x0JmFncmF2ZTsgdGVjbmljaGUgaW4gbWVyaXRvIGFsbGEgcHViYmxpY2F6aW9uZSBhdXRvbWF0aWNhIGRlaSBkYXRpIGRpIHNhYmF0byA0LiBMJ2luY29udmVuaWVudGUgdmVyciZhZ3JhdmU7IHJpc29sdG8gYWwgcGkmdWdyYXZlOyBwcmVzdG8uPC9wPmQCAg9kFgQCAQ9kFgJmDxUBCDE5LzAyLzE5ZAICDxUBhwM8c3Ryb25nPk1hbnV0ZW56aW9uZSBzdHJ1bWVudGF6aW9uZSAyMDE5PC9zdHJvbmc+PGJyIC8+RGFsIDE4IGZlYmJyYWlvIGFsIHByaW1vIG1hcnpvIHNvbm8gcHJldmlzdGUgbGUgb3BlcmF6aW9uaSBkaSBtYW51dGVuemlvbmUgcGVyaW9kaWNoZSAoYW5udWFsaSkgZGVsbGEgc3RydW1lbnRhemlvbmUgaW5zdGFsbGF0YSBuZWxsYSByZXRlIGRpIG1vbml0b3JhZ2dpby4gUGVyIHF1ZXN0byBtb3Rpdm8gcG90cmViYmVybyB2ZXJpZmljYXJzaSBkZWxsZSBpbnRlcnJ1emlvbmkgbmVsIHJpbGV2YW1lbnRvIGRlaSBkYXRpIHJlbGF0aXZpIGFnbGkgc3RydW1lbnRpIGluIG1hbnV0ZW56aW9uZS4mbmJzcDs8L3A+DQo8cD4mbmJzcDs8L3A+DQo8cD4mbmJzcDs8L3A+DQo8cD4mbmJzcDs8L3A+ZAIDDw8WBB4LUG9zdEJhY2tVcmwFK2FyY2hpdmlvTm90aXppZS5hc3B4P2NvZGljZVBhZ2luYT1SUk0mem9uYT0fAWdkZAIPD2QWAmYPZBYCAgEPEA8WBh4NRGF0YVRleHRGaWVsZAUETm9tZR4ORGF0YVZhbHVlRmllbGQFAklkHgtfIURhdGFCb3VuZGdkEBUPGVBlcnVnaWEgLSBQYXJjbyBDb3J0b25lc2UcUGVydWdpYSAtIFBvbnRlIFNhbiBHaW92YW5uaRRQZXJ1Z2lhIC0gRm9udGl2ZWdnZSBDaXR0w6AgZGkgQ2FzdGVsbG8gLSBDLiBDYXN0ZWxsbxpHdWJiaW8gLSBQaWF6emEgNDAgTWFydGlyaRFNYWdpb25lIC0gTWFnaW9uZRZGb2xpZ25vIC0gUG9ydGEgUm9tYW5hEFRvcmdpYW5vIC0gQnJ1ZmEZU3BvbGV0byAtIFBpYXp6YSBWaXR0b3JpYRJUZXJuaSAtIEJvcmdvIFJpdm8PVGVybmkgLSBDYXJyYXJhEVRlcm5pIC0gTGUgR3JhemllD0FtZWxpYSAtIEFtZWxpYRNOYXJuaSAtIE5hcm5pIFNjYWxvE09ydmlldG8gLSBDaWNvbmlhIDIVDwMzXzEDM18yBDNfNjkDM183AzNfMwMzXzYDM180AzNfNQUzXzIwNQM3XzEDN18yAzdfMwM3XzUDN180AzdfNhQrAw9nZ2dnZ2dnZ2dnZ2dnZ2dkZGT1g28Bzs2KuJM0nGhoW/nLrR4W/HpnjtjYCY1FCtl6eA==',
        '__VIEWSTATEGENERATOR': 'A373F38E',
        '__PREVIOUSPAGE': '5rDzdOLdhSojgNkWU0aySKgUcCP-WXzqaXaRNPbAb-Ekcs1vVl_yJf9liwnKWXEk15jl_Z8YIAJ86zswapmkHfDz2MMg9vQnDDQypfObingUmLuVVTMztw73FN9-55lI0',
        '__EVENTVALIDATION': '/wEdABshO2HSLC4Irl9HO+xCVg8wb8C3weGBaOLrENr46Y99cTPW5fmNeTa451MZa8LXyblcbg/Uqmez9yXP+xSTfXC/S9OqRU0oWDv+cbRkqcKtAqcsJFHEnZTzh0X+kVeLa7e4rr9jBld/uVqJpfp464tKRYmvyX4i1bjLFIfxIkw0G+o0YQNlnq4u76x5pwotKnDgEO4xErwMzPYvPwScdqOGIUgWeFC3y966dlr8RsY+JYzWFz2lgCufNhmaoE94Y/QiRS7TDGhtA/xOb3OYxEB522qpZQfWwl21Nv1xVarGgMm6hUuJGOA6Q4Ko1E4M+sQ9CZ53jxit2DF58lu5QFtr6x1PlqI+jgkEbNYTNUujYRbbFs2N4TjG5zEZ4xduFBkrD27kcj09V7bJX/igStyEnNJs5SuXPSKM2cTNsffB6XcH17ma9zwqai6CNsf9Og0ZPzjdX2zFoASErgXLJvie8NzsH8t7duXHZk9hbS9Vs21a/4yX1BpSDSioiW1gxr+tUHjFeS1m0yjnOD9kwBYX4jCmBywb7GNFZX8+9J5ux+74SyM4niEhJdJF38T+LG4OdFP/T/wCCiwNou/IvjveW95PGaK16TIOdZz/XYSt3Q==',
        'ctl00$Content$txtData': date_fmt,
        'ctl00$Content$Grafico1$cboStazioni': '3_1',
        'ctl00$Content$Grafico1$cboInquinante': 'SO224H'
    }
    res = requests.post(
        'http://www.arpa.umbria.it/monitoraggi/aria/Default.aspx', data=data)
    if res.status_code == 200:
        soup = BeautifulSoup(res.text, 'html.parser')
        html_table = soup.select_one('#ctl00_Content_TabellaDati')
        extractor = Extractor(html_table)
        extractor.parse()
        table = extractor.return_list()[2:]
        html_table = soup.select_one('#ctl00_Content_TabellaDatiAltreStazioni')
        extractor = Extractor(html_table)
        extractor.parse()
        table.extend(extractor.return_list()[2:])
        for province in self.provinces:
            province_rows = [
                x for x in table
                if x[0].split(' - ')[0].lower() == province.name.lower()
            ]
            so2 = [
                self.extract_float(x[1]) for x in province_rows
                if self.extract_float(x[1]) is not None
            ]
            no2 = [
                self.extract_float(x[3]) for x in province_rows
                if self.extract_float(x[3]) is not None
            ]
            co = [
                self.extract_float(x[4]) for x in province_rows
                if self.extract_float(x[4]) is not None
            ]
            pm10 = [
                self.extract_float(x[7]) for x in province_rows
                if self.extract_float(x[7]) is not None
            ]
            pm25 = [
                self.extract_float(x[9]) for x in province_rows
                if self.extract_float(x[9]) is not None
            ]
            o3 = [
                self.extract_float(x[5]) for x in province_rows
                if self.extract_float(x[5]) is not None
            ]
            c6h6 = [
                self.extract_float(x[7]) for x in province_rows
                if self.extract_float(x[7]) is not None
            ]
            if len(so2) > 0:
                province.quality.so2 = round(mean(so2), 2)
            if len(no2) > 0:
                province.quality.no2 = round(mean(no2), 2)
            if len(co) > 0:
                province.quality.co = round(mean(co), 2)
            if len(pm10) > 0:
                province.quality.pm10 = round(mean(pm10), 2)
            if len(pm25) > 0:
                province.quality.pm25 = round(mean(pm25), 2)
            if len(o3) > 0:
                province.quality.o3 = round(mean(o3), 2)
            if len(c6h6) > 0:
                province.quality.c6h6 = round(mean(c6h6), 2)
    if self.on_quality_fetched is not None:
        self.on_quality_fetched(self)
def sync_conference(conf_id=3):
    conf = ConferenceInterface.objects.get(pk=conf_id)
    route = "forms/1/entries"
    expires = arrow.utcnow().replace(minutes=+10).timestamp
    string_to_sign = str("{}:{}:{}:{}".format(conf.api_key, "GET", route, expires))
    sig = _calculate_signature(string_to_sign, conf.private_key)
    req = requests.get(
        conf.url,
        params={"api_key": conf.api_key, "signature": sig, "expires": expires}
    )
    data = json.loads(req.content)
    for entry in data.get("response", {}).get("entries", []):
        html = requests.get(
            "{}/wp-json/conference/v1/entry/{}/{}".format(
                settings.WP_URL, entry.get("form_id"), entry.get("id")
            ),
            auth=(settings.WP_BASIC_AUTH_USER, settings.WP_BASIC_AUTH_PASS),
        )
        html = html.json().get("html")
        table = BeautifulSoup(html, "html.parser").find_all(
            "table", class_="entry-products"
        )
        extractor = Extractor(table[0], transformer=unicode)
        extractor.parse()
        table_data = extractor.return_list()

        products = []
        total_amount = ""
        for item in table_data:
            product_name, amount, price, total = item
            if price == "Total":
                total_amount = total
                continue
            product_name = product_name.strip()
            amount = int(amount)
            price = float(price.replace(u"$", "").strip().replace(",", "."))
            products.append({"name": product_name, "amount": amount, "price": price})

        doc = pq(html)
        billing_html = ""
        for val in doc(".entry-view-field-name"):
            if val.text == "Billing address details":
                doc(val).parents("tr").next_all().find("a").remove()
                billing_html = doc(val).parents("tr").next_all().find("td").html()
        if entry.get("34"):
            billing_html += "<br/>" + entry.get("34")
        billing_html = (
            billing_html.replace("<br/>", "\n").replace("<p>", "").replace("</p>", "")
        )

        if "wire" in entry.get("21"):
            payment_type = "wire"
        else:
            payment_type = "group"

        registration, is_created = ConferenceRegistration.objects.get_or_create(
            interface=conf,
            form_id=entry.get("form_id"),
            entry_id=entry.get("id"),
            defaults={
                "ticket_type": entry.get("9"),
                "payment_type": payment_type,
                "source_url": entry.get("source_url"),
                "entry_created": arrow.get(
                    entry.get("date_created").replace(" ", "T")
                ).datetime,
                "name": u"{} {}".format(entry.get("1.3"), entry.get("1.6")),
                "email": entry.get("2"),
                "organization": entry.get("6"),
                "billing_html": billing_html,
                "total_amount": total_amount,
                "products": products,
            },
        )
def sync_conference(conf_id=3):
    conf = ConferenceInterface.objects.get(pk=conf_id)
    route = "forms/1/entries"
    expires = arrow.utcnow().replace(minute=+10).timestamp
    string_to_sign = str("{}:{}:{}:{}".format(conf.api_key, "GET", route, expires))
    sig = _calculate_signature(string_to_sign, conf.private_key)
    req = requests.get(
        conf.url,
        params={
            "api_key": conf.api_key,
            "signature": sig,
            "expires": expires,
            "paging[page_size]": 20,
        },
    )
    data = json.loads(req.content)
    for entry in data.get("response", {}).get("entries", []):
        if entry["status"] == "trash":
            continue
        html = requests.get(
            "{}/wp-json/conference/v1/entry/{}/{}".format(
                settings.WP_URL, entry.get("form_id"), entry.get("id")),
            auth=(settings.WP_BASIC_AUTH_USER, settings.WP_BASIC_AUTH_PASS),
        )
        html = html.json().get("html")
        table = BeautifulSoup(html, "html.parser").find_all("table", class_="entry-products")
        extractor = Extractor(table[0], transformer=unicode)
        extractor.parse()
        table_data = extractor.return_list()

        products = []
        total_amount = ""
        for item in table_data:
            product_name, amount, price, total = item
            if price == "Total":
                total_amount = total
                continue
            product_name = product_name.strip()
            amount = int(amount)
            price = float(price.replace(u"$", "").strip().replace(",", "."))
            products.append({
                "name": product_name,
                "amount": amount,
                "price": price
            })

        doc = pq(html)
        billing_html = ""
        for val in doc(".entry-view-field-name"):
            if val.text == "Billing address details":
                doc(val).parents("tr").next_all().find("a").remove()
                billing_html = doc(val).parents("tr").next_all().find("td").html()
        if entry.get("34"):
            billing_html += "<br/>" + entry.get("34")
        billing_html = (
            billing_html.replace("<br/>", "\n").replace("<p>", "").replace("</p>", "")
        )

        if "wire" in entry.get("21"):
            payment_type = "wire"
        else:
            payment_type = "group"

        if entry.get("53"):
            is_group = True
        else:
            is_group = False

        registration, is_created = ConferenceRegistration.objects.get_or_create(
            interface=conf,
            form_id=entry.get("form_id"),
            entry_id=entry.get("id"),
            defaults={
                "ticket_type": entry.get("9"),
                "payment_type": payment_type,
                "source_url": entry.get("source_url"),
                "entry_created": arrow.get(entry.get("date_created").replace(" ", "T")).datetime,
                "name": u"{} {}".format(entry.get("1.3"), entry.get("1.6")),
                "email": entry.get("2"),
                "organization": entry.get("6"),
                "billing_html": billing_html,
                "total_amount": total_amount,
                "products": products,
                "is_group": is_group,
            },
        )
def set_indicator_value(self, day: datetime, indicator: str) -> None:
    """
    Populates the specified indicator for the provinces.

    Data is fetched from `http://www.cartografiarl.regione.liguria.it/SiraQualAria/script/Pub3AccessoDatiAria.asp?Tipo=DatiGiorno`

    :param day: The day of interest
    :param indicator: The indicator of interest
    """
    if indicator not in self.indicator_map:
        return
    data = {
        'Giorni': day.strftime('%d'),
        'Mesi': day.strftime('%m'),
        'Anni': day.strftime('%Y'),
        'TipoTema': 'SENSORI',
        'Tipo': 'DatiGiorno',
        'Anno': day.strftime('%Y'),
        'Mese': day.strftime('%m'),
        'Giorno': day.strftime('%d'),
        'DataIniz': day.strftime('%d/%m/%Y'),
        'CodTema': 'SENSORI'
    }
    res = requests.post('http://www.cartografiarl.regione.liguria.it/SiraQualAria/script/Pub3AccessoDatiAria13.asp',
                        data=data)
    soup = BeautifulSoup(res.text, 'html.parser')
    # a unique id needs to be provided when a request is made; it is sent to
    # the user in the form of a hidden field
    try:
        id_richiesta = soup.find_all('input', {'name': 'Id_Richiesta'})[0]['value']
    except:
        # data for the selected day not available
        return
    map_data = self.indicator_map[indicator]
    res = requests.get('http://www.cartografiarl.regione.liguria.it/SiraQualAria/script/Pub3AccessoDatiAria131.asp',
                       params=(
                           ('Anno', day.strftime('%Y')),
                           ('CodParam', map_data['CodParam']),
                           ('SiglaParam', map_data['SiglaParam']),
                           ('Azione', 'LISTA_STAZIONI'),
                           ('CodTema', 'SENSORI'),
                           ('DataIniz', day.strftime('%d/%m/%Y')),
                           ('Id_Richiesta', id_richiesta)
                       ))
    t = '</TR><TR>'.join(res.text.split('</TR>'))
    soup = BeautifulSoup(t, 'html.parser')
    table = soup.select('table')[0]
    extractor = Extractor(table)
    extractor.parse()
    # remove header
    table_data = extractor.return_list()[1:]
    if len(table_data) > 0:
        # remove any row after the first blank
        table_data = table_data[:next(idx for idx, y in enumerate(table_data) if len(y) == 0)]
        for province in self.provinces:
            values = list()
            for x in table_data:
                if province.short_name in x[1]:
                    try:
                        values.append(float(x[map_data['table_idx']].strip()))
                    except:
                        pass
            if len(values) != 0:
                setattr(province.quality, indicator, round(float(sum(values) / len(values)), 2))
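# Usage sketch (assumption, not part of the original source): `fetcher` stands
# for an instance of the class this method belongs to, and 'pm10' is assumed to
# be one of the keys of its `indicator_map`.
from datetime import datetime

fetcher.set_indicator_value(datetime(2019, 5, 1), 'pm10')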
def table2lists(table):
    extractor = Extractor(table)
    extractor.parse()
    return extractor.return_list()
def parse_html_table(html):
    extractor = Extractor(html)
    extractor.parse()
    return extractor.return_list()
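# Usage sketch (not part of the original source): a minimal call to the wrapper
# above; the expected row-major output mirrors the simple-extractor test earlier
# in this file.
rows = parse_html_table(
    "<table><tr><td>1</td><td>2</td></tr><tr><td>3</td><td>4</td></tr></table>")
print(rows)  # [['1', '2'], ['3', '4']]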