def parse_detail(self, id, html):
    """Parse a series detail page into a nested series/seasons/episodes dict.

    Args:
        id: Identifier of the series. Not used by this parser.
        html: Markup of the detail page.

    Returns:
        A dict with two keys:

        * ``title``: text of the first ``h1 a`` element.
        * ``seasons``: list of ``{'number': int, 'episodes': list}`` dicts,
          in page order. Every episode is a dict with the keys ``epid``,
          ``season`` and ``number`` (ints), ``production_number`` (string),
          ``air_date`` (``'YYYY-MM-DD'`` string) and ``title`` (string).

    Raises:
        IndexError: if the page has no ``h1 a`` or ``#eplist pre`` element.
        ValueError: presumably, when a matched air date is rejected by
            ``strptime`` (assumes ``time.strptime`` -- TODO confirm import).
    """
    # Compile once; both patterns are reused for every section/line below.
    season_header = re.compile(r'Season (\d+)')
    episode_line = re.compile(
        r'^\s*(\d+)\s+(\d+)-(\d+)\s+(\w+)\s+(\d{2}\/\w{3}\/\d{2})\s+<a.*?>(.*?)<\/a>')

    soup = BeautifulSoup(html)
    series = {'title': select(soup, 'h1 a')[0].text}
    pre_text = unicode(select(soup, '#eplist pre')[0])

    seasons = []
    # Sections are separated by a bullet character; whatever precedes the
    # first bullet is not a season, hence the [1:].
    # NOTE(review): pre_text is unicode, so the bullet literal must be a
    # unicode literal too (e.g. via unicode_literals) -- confirm.
    for season_html in pre_text.split('•')[1:]:
        header = season_header.search(season_html)
        if header is None:
            # A bulleted section without a "Season N" header used to abort
            # the whole parse with AttributeError; skip it instead.
            continue

        episodes = []
        # Ignore the "Season" line.
        for episode_html in season_html.split('\n')[1:]:
            m = episode_line.match(episode_html)
            if m is None:
                # Not an episode row (blank line, notes, ...).
                continue
            epid, season_no, number, production_number, aired, title = m.groups()
            air_date = strptime(aired, '%d/%b/%y')
            episodes.append({
                'epid': int(epid, 10),
                'season': int(season_no, 10),
                'number': int(number, 10),
                'production_number': production_number,
                'air_date': strftime('%Y-%m-%d', air_date),
                'title': title,
            })

        seasons.append({
            'number': int(header.group(1)),
            'episodes': episodes,
        })

    series['seasons'] = seasons
    return series
def parse_list(self, html):
    """Parse a search-result listing page into a list of estate dicts.

    Every ``tr`` below ``.result-liste`` except the first one is turned into
    one dict. The first row is skipped (presumably a header row -- TODO
    confirm against the page markup).

    Args:
        html: Markup of the listing page.

    Returns:
        A list of dicts, one per row, in page order. ``id``,
        ``estate_type``, ``zip_code`` and ``city`` are always set;
        ``price``, ``area`` and ``bedrooms`` are handed to
        ``conditional_set`` (presumably only stored when a value could be
        parsed -- verify against that helper).

    Raises:
        IndexError: if a row has fewer than five ``td`` cells or contains
            no ``a`` element.
    """
    soup = BeautifulSoup(html)
    estates = []

    for estate_row in select(soup, '.result-liste tr')[1:]:
        cells = select(estate_row, 'td')
        estate = {}

        # Cell 0: price
        conditional_set(estate, 'price', parse_price(cells[0].string))

        # Cell 1: type + link (the id is derived from the row's first link)
        estate_link = select(estate_row, 'a')[0]['href']
        estate['id'] = parse_immoweb_link(estate_link)
        estate['estate_type'] = parse_estate_type(deep_contents(cells[1]))

        # Cell 2: area
        conditional_set(estate, 'area', parse_first_number(cells[2].string))

        # Cell 3: bedrooms
        conditional_set(estate, 'bedrooms', parse_first_number(cells[3].string))

        # Cell 4: zip_code + city, both parsed from the same cell text
        city_text = cells[4].string
        estate['zip_code'] = parse_zip_code(city_text)
        estate['city'] = parse_city(city_text)

        estates.append(estate)

    return estates
def parse_detail(self, id, html):
    """Parse a song page into a dict with artist, title and weighted tags.

    Args:
        id: Sequence whose first item is the artist and second the title.
        html: Markup of the song page.

    Returns:
        A dict with the keys ``artist`` and ``title`` (taken from ``id``)
        and ``tags``: a list of ``(weight, text)`` tuples, one per
        ``#tagcloud .cloudItem`` element in page order. ``weight`` is the
        pixel value of the ``font-size`` in the element's inline style, or
        ``-1`` when the element has no style attribute or no such value.
    """
    song = {'artist': id[0], 'title': id[1]}

    # last.fm puts some strange tag in its code that caused an error when it
    # was left in, so strip it before handing the markup to the parser.
    endash = unichr(0x2013)
    html = unicode(html.replace('<!%s[if IE]><![endif]%s>' % (endash, endash), ''))

    soup = BeautifulSoup(html)
    font_size = re.compile(r'font-size: (\d+)px')

    tags = []
    for cloud_item in select(soup, '#tagcloud .cloudItem'):
        # .get() instead of ['style']: an item without an inline style gets
        # the same -1 fallback as one without a font-size, rather than
        # raising KeyError and losing every tag on the page.
        m = font_size.search(cloud_item.get('style', ''))
        weight = int(m.group(1)) if m is not None else -1
        tags.append((weight, cloud_item.text))

    song['tags'] = tags
    return song