Example #1
from bs4 import BeautifulSoup

# ApiReading, strip_html and print_ApiReading are project helpers defined elsewhere.
def get_datalist(html_output, current_url_date):
    data_list = []
    location_columns = []
    soup = BeautifulSoup(html_output, "html.parser")
    table_html = soup.find('table', attrs={'class': 'text_psinormal'})
    # grab every tr in the 'text_psinormal' table
    rows = table_html.find_all('tr')
    # rows[0] is a useless header

    # rows[1], the second row in the table, contains the location names
    locations = rows[1].find_all('th')
    for each_location in locations:
        location_columns.append(each_location.text.strip())

    # rows[2] to rows[25] are the 24 hourly readings
    for count in range(2, 26):
        # grab all columns from the current row
        cols = rows[count].find_all('td')
        # cols[0] holds the time
        spans = cols[0].find_all('span')
        time = spans[0].text
        # cols[1] to cols[5] hold the reading data (later columns are ignored)
        for nested_count in range(1, 6):
            data = ApiReading(current_url_date,
                              "Singapore",  # State
                              location_columns[nested_count - 1],  # Region/Town, etc.
                              time,  # Time -- taken from the first column of the row; modified before insertion into the DB
                              strip_html(cols[nested_count].text.strip()))  # the reading itself; modified before insertion into the DB
            # print_ApiReading(data)
            data_list.append(data)

    return data_list
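
All three examples construct ApiReading objects from the scraped cells. The class itself is not shown on this page, so the sketch below is only a guess at its shape, with field names inferred from the inline comments.

# A hypothetical sketch of the ApiReading container assumed above; the real
# project class is not shown here, and these field names are inferred guesses.
from dataclasses import dataclass

@dataclass
class ApiReading:
    url_date: str  # date the scraped page refers to
    state: str     # e.g. "Singapore"
    region: str    # Region/Town from the location columns
    time: str      # time label for the reading
    reading: str   # the reading value, already run through strip_html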
Example #2
from bs4 import BeautifulSoup

# ApiReading, strip_html and getHeaders are project helpers defined elsewhere.
def get_datalist(html_output, current_url_date):
    class_of_table = "table1"  # class of the table that holds the data
    data_list = []
    soup = BeautifulSoup(html_output, "html.parser")
    table_html = soup.find('table', attrs={'class': class_of_table})
    # grab every row in the table
    rows = table_html.find_all('tr')
    is_header_row = True  # the first row is the header row
    # loop through all the rows
    for tr in rows:
        if is_header_row:
            headers = getHeaders(tr)  # get the headers from the first row
            is_header_row = False  # only the first row is the header row
        else:
            cols = tr.find_all('td')
            location_columns = []

            for column_count, td in enumerate(cols):
                if column_count < 2:
                    # the first two columns name the location
                    location_columns.append(td.text.strip())
                else:
                    data = ApiReading(current_url_date,
                                      location_columns[0],  # State
                                      location_columns[1],  # Region/Town, etc.
                                      str(headers[column_count]),  # Time -- taken from the column header; modified before insertion into the DB
                                      strip_html(td.text.strip()))  # the reading itself; modified before insertion into the DB
                    data_list.append(data)
    return data_list
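
Example #2 relies on a getHeaders helper that is not shown here. Assuming the header row uses th cells, it could look like the minimal sketch below; this is an assumption, not the project's actual code.

# A minimal sketch of what getHeaders might do; the project's real helper is
# not shown on this page, so treat this as an assumption.
def getHeaders(header_row):
    # collect the stripped text of every header cell in the first row
    return [th.text.strip() for th in header_row.find_all('th')]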
Example #3
from bs4 import BeautifulSoup

# ApiReading, strip_html and check_values are project helpers defined elsewhere.
def get_datalist(html_output, current_url_date):
    data_list = []
    location_columns = []
    soup = BeautifulSoup(html_output, "lxml")
    table_html = soup.find('table', attrs={'class': 'text_psinormal'})
    # grab every tr in the 'text_psinormal' table
    rows = table_html.find_all('tr')
    # rows[0] is a useless header

    # rows[1], the second row in the table, contains the location names
    locations = rows[1].find_all('th')
    for each_location in locations:
        location_columns.append(each_location.text.strip())

    # rows[2] to rows[25] are the 24 hourly readings
    for count in range(2, 26):
        # grab all columns from the current row
        cols = rows[count].find_all('td')
        # cols[0] holds the time
        spans = cols[0].find_all('span')
        time = spans[0].text
        # cols[1] to cols[5] hold the reading data (later columns are ignored)
        for nested_count in range(1, 6):
            data = ApiReading(current_url_date,
                              "Singapore",  # State
                              location_columns[nested_count - 1],  # Region/Town, etc.
                              time,  # Time -- taken from the first column of the row; modified before insertion into the DB
                              check_values(strip_html(cols[nested_count].text.strip())))  # the reading itself; validated and modified before insertion into the DB
            data_list.append(data)

    return data_list
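
A minimal way to drive any of these functions, assuming the readings page is fetched over HTTP; the URL and the date string below are placeholders, not values from the original project.

# A usage sketch; the URL and date format are placeholders.
import urllib.request

url = "https://example.com/psi/readings"  # hypothetical endpoint
with urllib.request.urlopen(url) as response:
    html_output = response.read()

for reading in get_datalist(html_output, "2014-01-01"):
    print(reading)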