from bs4 import BeautifulSoup


# Variant parsing the 'text_psinormal' table layout.
def get_datalist(html_output, current_url_date):
    data_list = []
    location_columns = []
    soup = BeautifulSoup(html_output, "lxml")
    table_html = soup.find('table', attrs={'class': 'text_psinormal'})
    # grab all tr elements in the 'text_psinormal' table
    rows = table_html.find_all('tr')
    # rows[0] is a useless header
    # rows[1], the second row in the table, contains the location names
    locations = rows[1].find_all('th')
    for each_location in locations:
        location_columns.append(each_location.text.strip())
    # rows[2] to rows[25] hold the 24 hourly readings :)
    for count in range(2, 26):
        # grab all columns from the row
        cols = rows[count].find_all('td')
        # cols[0] is the time
        spans = cols[0].find_all('span')
        time = spans[0].text
        # cols[1] to cols[5] hold the actual reading data (remaining columns are ignored)
        for nested_count in range(1, 6):
            data = ApiReading(
                current_url_date,
                "Singapore",                         # State
                location_columns[nested_count - 1],  # Region/Town, etc.
                time,                                # Time -- taken from the first column of the row; modified before insertion into the DB
                strip_html(cols[nested_count].text.strip()))  # the reading itself -- modified before insertion into the DB
            # print_ApiReading(data)
            data_list.append(data)
    return data_list
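# The variants in this file rely on helpers defined elsewhere in the project
# (ApiReading, strip_html, print_ApiReading). A minimal sketch of what they
# might look like -- these definitions are assumptions for illustration, not
# the project's actual code:

import re
from collections import namedtuple

# Hypothetical record type: one reading per (date, state, region, time).
ApiReading = namedtuple('ApiReading',
                        ['url_date', 'state', 'region', 'time', 'reading'])


def strip_html(text):
    # Hypothetical helper: strip any leftover markup from a cell value.
    return re.sub(r'<[^>]+>', '', text).strip()


def print_ApiReading(reading):
    # Hypothetical debug helper matching the commented-out call above.
    print(reading)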
# Variant parsing the 'table1' layout, where the first two columns of each
# data row carry the state and region, and the header row carries the times.
def get_datalist(html_output, current_url_date):
    class_of_table = "table1"  # class of the table that holds the data
    data_list = []
    soup = BeautifulSoup(html_output, "lxml")
    table_html = soup.find('table', attrs={'class': class_of_table})
    # grab all rows in the table
    rows = table_html.find_all('tr')
    is_header_row = True  # the first row of the table is the header row
    # loop through all the rows
    for tr in rows:
        if is_header_row:
            headers = getHeaders(tr)  # get the headers from the first row
            is_header_row = False     # every following row is a data row
        else:
            cols = tr.find_all('td')
            location_columns = []
            column_count = 0
            for td in cols:
                if column_count < 2:
                    location_columns.append(td.text.strip())
                else:
                    data = ApiReading(
                        current_url_date,
                        location_columns[0],          # State
                        location_columns[1],          # Region/Town, etc.
                        str(headers[column_count]),   # Time -- taken from the column header; modified before insertion into the DB
                        strip_html(td.text.strip()))  # the reading itself -- modified before insertion into the DB
                    data_list.append(data)
                column_count += 1
    return data_list
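# The variant above calls getHeaders(tr) on the first table row. The real
# helper lives elsewhere in the project; a plausible minimal sketch, assuming
# the header cells are <th> or <td> elements whose text lines up
# index-for-index with the data columns -- an assumption, not the actual
# implementation:

def getHeaders(header_row):
    # Collect the stripped text of every header cell so that headers[n]
    # corresponds to column n of each data row.
    return [cell.text.strip() for cell in header_row.find_all(['th', 'td'])]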
# Variant parsing the same 'text_psinormal' layout as the first, with an
# explicit lxml parser and each reading passed through check_values().
def get_datalist(html_output, current_url_date):
    data_list = []
    location_columns = []
    soup = BeautifulSoup(html_output, "lxml")
    table_html = soup.find('table', attrs={'class': 'text_psinormal'})
    # grab all tr elements in the 'text_psinormal' table
    rows = table_html.find_all('tr')
    # rows[0] is a useless header
    # rows[1], the second row in the table, contains the location names
    locations = rows[1].find_all('th')
    for each_location in locations:
        location_columns.append(each_location.text.strip())
    # rows[2] to rows[25] hold the 24 hourly readings :)
    for count in range(2, 26):
        # grab all columns from the row
        cols = rows[count].find_all('td')
        # cols[0] is the time
        spans = cols[0].find_all('span')
        time = spans[0].text
        # cols[1] to cols[5] hold the actual reading data (remaining columns are ignored)
        for nested_count in range(1, 6):
            data = ApiReading(
                current_url_date,
                "Singapore",                         # State
                location_columns[nested_count - 1],  # Region/Town, etc.
                time,                                # Time -- taken from the first column of the row; modified before insertion into the DB
                check_values(strip_html(cols[nested_count].text.strip())))  # the reading itself -- validated before insertion into the DB
            # print_ApiReading(data)
            data_list.append(data)
    return data_list
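# The variant above also runs each reading through check_values(), defined
# elsewhere in the project. A minimal sketch of a validator plus a usage
# example -- the check_values logic, the fetch_readings helper, and the URL
# below are assumptions for illustration, not the project's actual code:

import requests


def check_values(reading):
    # Hypothetical sanity check: pass numeric readings through, map anything
    # else (e.g. '-' placeholders) to None.
    return reading if reading.replace('.', '', 1).isdigit() else None


def fetch_readings(url, url_date):
    # Download the page and hand the raw HTML to the parser above.
    response = requests.get(url)
    response.raise_for_status()
    return get_datalist(response.text, url_date)

# Example (placeholder URL):
# readings = fetch_readings("https://example.com/psi?date=20130621", "20130621")
# for r in readings:
#     print_ApiReading(r)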