from pattern import web from pattern.web import URL, Element import re from datetime import datetime, date, timedelta url = URL( 'https://docs.google.com/spreadsheets/d/1J2I40hglES63YZHROcOL3oAjDPqiiKLRPE_ikAWsR-Q/pubhtml?gid=1267634591' ).read() dom = Element(url) dom = dom.by_tag('tbody')[0] #date Get the date from the header today = date = dom.by_class('s0')[1].content #places Read the place from available class='s4' inside <td> places = [] for ix in dom.by_class('s4'): places.append(ix.content) reading_row = [4, 10, 16, 22] pol_reading = [] def cleanhtml(raw_html): cleanr = re.compile('<.*?>') cleantext = re.sub(cleanr, '', raw_html) cleantext = cleantext.strip("Highest: ") return cleantext def cleandate(txt):
from pattern import web from pattern.web import URL, Element url = URL('https://docs.google.com/spreadsheets/d/1J2I40hglES63YZHROcOL3oAjDPqiiKLRPE_ikAWsR-Q/pubhtml?gid=1267634591').read() dom = Element(url) dom = dom.by_tag('tbody')[0] #date Get the date from the header date = dom.by_class('s0')[1].content #places Read the place from available class='s4' inside <td> places = [] for ix in dom.by_class('s4'): places.append(ix.content) try: reading_row = [4, 10, 16, 22] pol_reading = [] for row in reading_row: reading = dom.by_tag('tr')[row] reading = reading.by_tag('td') for i in reading: if len(i) >= 1: pol_reading.append(i.content) pol_updated_row = [5, 11, 17, 23] pol_updated = []