def parse_html_page(link, publication_year, publication_month, publication_day, publisher, num_pages, image_url):
    """Scrape a Goodreads book page for publication details.

    The incoming arguments act as defaults; each is overwritten only when
    the corresponding datum is found on the page.

    Args:
        link: URL of the Goodreads book page.
        publication_year/month/day, publisher, num_pages, image_url:
            fallback values, returned unchanged when the page lacks them.

    Returns:
        dict with keys publication_year, publication_month,
        publication_day, publisher, num_pages, image_url.
    """
    soup = goodreads.get_html_page(link)

    info_tag = soup.find(itemprop='numberOfPages')
    if info_tag is not None:  # was `!= None` — identity check is the Python idiom
        # Tag text looks like "336 pages" -> keep only the leading number.
        num_pages = info_tag.contents[0].split(' ')[0]
        # Sibling row carries text like "Published May 1st 2003 by Penguin".
        detail = info_tag.parent.next_sibling.next_sibling.contents[0]
        p_date = detail.split('Published')[1].split('by')[0].strip()
        p_arr = p_date.split(' ')
        if len(p_arr) == 3:
            # "Month Day Year" — day may carry an ordinal suffix ("1st").
            publication_year = p_arr[2]
            # Strip a trailing ordinal suffix if present; the original
            # `[:-2]` produced "" for a plain numeric day such as "1".
            publication_day = p_arr[1].rstrip('stndrh')
            publication_month = int(
                datetime.strptime(p_arr[0], '%B').strftime('%m'))
        elif len(p_arr) == 2:
            # "Month Year" — no day component on the page.
            publication_year = p_arr[1]
            publication_month = int(
                datetime.strptime(p_arr[0], '%B').strftime('%m'))
        elif len(p_arr) == 1:
            # "Year" only.
            publication_year = p_arr[0]
        # Publisher follows the first "by"; split once instead of twice.
        by_parts = detail.split('by')
        publisher = by_parts[1].strip() if len(by_parts) > 1 else 'Unknown'

    image_tag = soup.find(id='coverImage')
    if image_tag is not None:
        image_url = image_tag.get('src')

    return dict(publication_year=publication_year,
                publication_month=publication_month,
                publication_day=publication_day,
                publisher=publisher,
                num_pages=num_pages,
                image_url=image_url)
def parse_html_page(link, born_at):
    """Scrape a Goodreads author page for the birth date.

    NOTE(review): this redefines ``parse_html_page`` from earlier in the
    file — the later definition shadows the former when both live in one
    module; consider giving them distinct names.

    Args:
        link: URL of the Goodreads author page.
        born_at: fallback value, returned unchanged when the page has no
            itemprop="birthDate" tag.

    Returns:
        Birth date formatted as 'YYYY/MM/DD', or the original ``born_at``.
    """
    soup = goodreads.get_html_page(link)
    birth_date = soup.find(itemprop='birthDate')
    if birth_date:
        # Bug fix: the original called .strip() on the *list* returned by
        # split(' '), which raises AttributeError. Collapse all runs of
        # whitespace so strptime sees exactly "Month Day Year".
        bd = ' '.join(birth_date.string.split())
        born_at = datetime.strptime(bd, '%B %d %Y').strftime('%Y/%m/%d')
        print('===========from Goodreads page', born_at)
    return born_at