def get_availablity(raw_data):
    """Return the formatted availability text found in ``raw_data``.

    Two locations are probed, in order: the ``div`` with id ``availability``
    and the ``p`` with class
    ``a-spacing-micro a-color-secondary a-text-bold``.  The first one that is
    present is passed through ``text_format`` and returned.

    :param raw_data: parsed page object exposing ``find``.
    :return: the formatted text, or ``'not_available'`` when neither
        location exists.
    """
    candidates = (
        raw_data.find('div', {'id': 'availability'}),
        raw_data.find(
            'p', {'class': 'a-spacing-micro a-color-secondary a-text-bold'}),
    )
    for node in candidates:
        if node:
            return text_format(node)
    return 'not_available'
def get_title(raw_data):
    """Return the formatted product title found in ``raw_data``.

    The ``span`` with id ``productTitle`` is preferred; the ``span`` with id
    ``ebooksProductTitle`` is used when the first one is absent.

    :param raw_data: parsed page object exposing ``find``.
    :return: the title passed through ``text_format``, or
        ``'not_available'`` when neither span exists.
    """
    primary = raw_data.find('span', {'id': 'productTitle'})
    ebook = raw_data.find('span', {'id': 'ebooksProductTitle'})
    title_node = primary or ebook
    if not title_node:
        return 'not_available'
    return text_format(title_node)
def get_product_manufacturer(response):
    """Collect the images and text of the "from the manufacturer" section.

    The section is the ``div`` with class ``aplus-v2 desktop celwidget``.
    Every ``img`` source, every ``span.a-list-item`` text and every ``p``
    text inside it is gathered (spans first, then paragraphs).

    Unlike the previous branch-per-combination version, this always returns
    a dict, keeps paragraph text even when the section has no images, and
    skips ``img`` tags that carry no ``src`` attribute instead of raising.

    :param response: parsed page object exposing ``find``.
    :return: ``{'img': [...], 'text': [...]}``.  When nothing at all could be
        collected both lists hold the single entry ``'not_available'``.
    """
    from_manufracture = {'img': [], 'text': []}
    section = response.find('div', {'class': 'aplus-v2 desktop celwidget'})

    if section:
        for image in section.findAll('img'):
            source = image.get('src')
            if source:
                from_manufracture['img'].append(source)

        list_items = section.findAll('span', {'class': 'a-list-item'})
        paragraphs = section.findAll('p')
        from_manufracture['text'].extend(
            text_format(node) for node in list_items)
        from_manufracture['text'].extend(
            text_format(node) for node in paragraphs)

    # Only fall back to the sentinel when neither kind of content was found,
    # so an image-only or text-only section keeps its empty counterpart list.
    if not from_manufracture['img'] and not from_manufracture['text']:
        from_manufracture['img'].append('not_available')
        from_manufracture['text'].append('not_available')
    return from_manufracture
def get_brand(raw_data):
    """Return the brand name (or brand logo URL) found in ``raw_data``.

    Lookup order:

    1. the ``a`` with id ``bylineInfo`` -- its formatted text is returned;
    2. the ``a`` with id ``brand`` -- its formatted text is returned, unless
       that text is empty and the link contains an ``img``, in which case
       the image ``src`` is returned instead.

    :param raw_data: parsed page object exposing ``find``.
    :return: brand text or logo URL, or ``'not_available'`` when neither
        link exists.  (The sentinel was previously misspelled
        ``'not_availabe'`` here, unlike every other getter in this file.)
    """
    byline = raw_data.find('a', {'id': 'bylineInfo'})
    brand_link = raw_data.find('a', {'id': 'brand'})

    if byline:
        return text_format(byline)
    if not brand_link:
        return 'not_available'

    brand_name = text_format(brand_link)
    if len(brand_name) == 0:
        # A text-less brand link is usually just a logo; use its URL.
        logo = brand_link.find('img')
        if logo:
            return logo['src']
    return brand_name
def get_seller_description(raw_data):
    """Return the formatted seller description found in ``raw_data``.

    The ``span`` with id ``about-seller-text`` takes precedence over the
    ``div`` with id ``about-seller``; only the first one present is read.

    :param raw_data: parsed page object exposing ``find``.
    :return: the text passed through ``text_format``, or ``'not_available'``
        when no container exists or the formatted text is empty.
    """
    text_span = raw_data.find('span', {'id': 'about-seller-text'})
    section = raw_data.find('div', {'id': 'about-seller'})

    if text_span:
        chosen = text_span
    elif section:
        chosen = section
    else:
        return 'not_available'

    description = text_format(chosen)
    return description if len(description) != 0 else 'not_available'
def check_and_get_seller_data(raw_data):
    """Resolve seller details starting from the seller element ``raw_data``.

    When ``raw_data`` contains a link, the link text is the seller name and
    its ``href`` (normalised by ``url_format``) is fetched through
    ``response_getter.get_content``; a successful fetch is handed to
    ``get_seller_info``.  Otherwise the formatted text of ``raw_data`` itself
    is used as the seller name.

    :param raw_data: parsed element exposing ``find``.
    :return: whatever ``get_seller_info`` returns when the seller page could
        be fetched; otherwise a 7-tuple whose first item is the seller name
        (or ``'not_available'``) followed by six ``'not_available'`` entries.
    """
    missing = 'not_available'
    link_tag = raw_data.find('a')

    if not link_tag:
        plain_name = text_format(raw_data)
        return (plain_name if plain_name else missing,) + (missing,) * 6

    seller_name = text_format(link_tag)
    profile_url = url_format(link_tag['href'])
    profile_page = response_getter.get_content(profile_url)
    if profile_page:
        return get_seller_info(seller_name, profile_page)
    return (seller_name,) + (missing,) * 6
def get_seller_positive_rating(raw_data):
    """Return the seller's positive-feedback figure found in ``raw_data``.

    The value is the formatted text of the ``<b>`` child of the ``a`` with
    class ``a-link-normal feedback-detail-description``.

    :param raw_data: parsed page object exposing ``find``.
    :return: the text passed through ``text_format``, or ``'not_available'``
        when the link is missing or has no ``<b>`` child (previously the
        missing child was handed to ``text_format`` as ``None``).
    """
    feedback_link = raw_data.find(
        'a', {'class': 'a-link-normal feedback-detail-description'})
    if not feedback_link:
        return 'not_available'

    emphasised = feedback_link.b
    if emphasised is None:
        return 'not_available'
    return text_format(emphasised)
def get_rating_details(raw_data):
    """Return the per-star rating breakdown found in ``raw_data``.

    Each row of the ``table`` with id ``histogramTable`` is reduced to a list
    holding, per ``td``, the formatted text of the cell's link or ``'0'``
    when the cell has no link.  The first entry is used as the key and the
    last entry as the value.

    :param raw_data: parsed page object exposing ``find``.
    :return: dict pre-filled with ``'5_star'`` .. ``'1_star'`` mapped to
        ``'not_available'`` and updated with whatever the table provides.
    """
    rating_types = dict.fromkeys(
        ('5_star', '4_star', '3_star', '2_star', '1_star'), 'not_available')

    histogram = raw_data.find('table', {'id': 'histogramTable'})
    if not histogram:
        return rating_types

    for row in histogram.findAll('tr'):
        cells = row.findAll('td')
        if not cells:
            continue
        labels = [text_format(cell.a) if cell.a else '0' for cell in cells]
        rating_types[labels[0]] = labels[-1]
    return rating_types
def get_seller_rating(raw_data):
    """Return the seller rating text found in ``raw_data``.

    The rating is read from the first ``span`` with class ``a-icon-alt``.

    :param raw_data: parsed page object exposing ``find``.
    :return: the text passed through ``text_format``; ``'no_feedback'`` when
        that text contains ``'template-formatted'``; ``'not_available'`` when
        the span is missing.
    """
    icon_text = raw_data.find('span', {'class': 'a-icon-alt'})
    if not icon_text:
        return 'not_available'

    rating = text_format(icon_text)
    return 'no_feedback' if 'template-formatted' in rating else rating
def get_product_description(raw_data):
    """Return the product description found in ``raw_data``.

    Inside the ``div`` with id ``productDescription`` the nested ``div`` with
    class ``a-expander-content a-expander-partial-collapse-content`` is
    preferred; when it is absent the whole container is formatted instead.

    :param raw_data: parsed page object exposing ``find``.
    :return: ``{'text': <formatted text>}``, or
        ``{'text': 'not_available'}`` when the container is missing.
    """
    container = raw_data.find('div', {'id': 'productDescription'})
    if not container:
        return {'text': 'not_available'}

    expander = container.find(
        'div',
        {'class': 'a-expander-content a-expander-partial-collapse-content'})
    source = expander if expander else container
    return {'text': text_format(source)}
def get_quantity(raw_data):
    """Return the last option of the quantity selector in ``raw_data``.

    The selector is the ``select`` element named ``quantity``; the formatted
    text of its final ``option`` is returned.

    :param raw_data: parsed page object exposing ``find``.
    :return: the text passed through ``text_format``, or ``'not_available'``
        when the selector is missing or has no options.
    """
    selector = raw_data.find('select', {'name': 'quantity'})
    options = selector.findAll('option') if selector else []
    if len(options) == 0:
        return 'not_available'
    return text_format(options[-1])
def get_other_format_books_price(raw_data):
    """Return the prices of the other available formats of a book.

    Every ``span.a-list-item`` inside the ``div`` with id ``tmmSwatches``
    that holds both a link and a ``span.a-color-secondary`` contributes one
    entry: the formatted text of the link's inner ``span`` is the key, the
    formatted price with all ``'_'`` characters removed is the value.

    A dict is now returned on every path (one path used to fall off the end
    of the function and yield ``None``), and entries whose link has no inner
    ``span`` are skipped instead of being passed on as ``None``.

    :param raw_data: parsed page object exposing ``find``.
    :return: dict that always contains ``'default': 'not_available'`` plus
        one ``format -> price`` entry per usable swatch.
    """
    other_prices = {'default': 'not_available'}

    swatches = raw_data.find('div', {'id': 'tmmSwatches'})
    if not swatches:
        return other_prices

    for entry in swatches.findAll('span', {'class': 'a-list-item'}):
        format_link = entry.find('a')
        price_tag = entry.find('span', {'class': 'a-color-secondary'})
        if not (format_link and price_tag):
            continue
        format_label = format_link.span
        if format_label is None:
            continue
        book_type = text_format(format_label)
        price = text_format(price_tag)
        other_prices[book_type] = price.replace('_', '')
    return other_prices
def get_product_rating(raw_data):
    """Return the product's average rating text found in ``raw_data``.

    Two holders are inspected, in this order: the ``div`` with class
    ``a-spacing-none`` and id ``averageCustomerReviews``, then the ``a`` with
    id ``cmrsSummary-popover``.  In each, the ``span.a-icon-alt`` is
    formatted and everything before the first ``'stars'`` is returned.

    :param raw_data: parsed page object exposing ``find``.
    :return: the leading part of the rating text, or ``'not_available'``
        when no holder yields a non-empty rating.
    """
    popover = raw_data.find('a', {'id': 'cmrsSummary-popover'})
    reviews_block = raw_data.find('div', {
        'class': 'a-spacing-none',
        'id': 'averageCustomerReviews'
    })

    for holder in (reviews_block, popover):
        if not holder:
            continue
        icon = holder.find('span', {'class': 'a-icon-alt'})
        if not icon:
            continue
        rating = text_format(icon)
        if rating:
            return rating.split('stars')[0]
    return 'not_available'
def get_specail_promotion(raw_data):
    """Return the special-promotion lines found in ``raw_data``.

    The ``li`` elements of the ``div`` with id ``quickPromoBucketContent``
    are formatted with ``text_format``; empty results are dropped.

    :param raw_data: parsed page object exposing ``find``.
    :return: ``{'text': [...]}`` with the non-empty lines when the container
        has at least one ``li``; otherwise ``{'text': ['not_available']}``.
    """
    bucket = raw_data.find('div', {'id': 'quickPromoBucketContent'})
    items = bucket.find_all('li') if bucket else []

    if len(items):
        formatted = (text_format(item) for item in items)
        return {'text': [entry for entry in formatted if entry]}
    return {'text': ['not_available']}
def get_author_name(raw_data):
    """Return the author name(s) found in ``raw_data``.

    The ``a`` with class ``a-link-normal contributorNameID`` is preferred and
    returned through ``text_format``.  Otherwise every ``span`` with class
    ``author notFaded`` is cleaned (``"Author"``, ``"()"``, commas and
    newlines removed, then stripped) and the results are concatenated, each
    one followed by ``"|"``.

    :param raw_data: parsed page object exposing ``find`` and ``find_all``.
    :return: the author string, or ``'not_available'`` when nothing matches.
    """
    contributor_link = raw_data.find(
        'a', {'class': 'a-link-normal contributorNameID'})
    author_spans = raw_data.find_all("span", {'class': 'author notFaded'})

    if contributor_link:
        return text_format(contributor_link)
    if not author_spans:
        return 'not_available'

    pieces = []
    for span in author_spans:
        cleaned = span.text
        for noise in ("Author", "()", ",", "\n"):
            cleaned = cleaned.replace(noise, "")
        # Each name keeps its own trailing separator.
        pieces.append(cleaned.strip() + "|")
    return ''.join(pieces)
def get_seller_overall_rating(page_soup):
    """Return the seller feedback summary per time period.

    Rows of the ``table`` with id ``feedback-summary-table`` are read,
    skipping the first row.  In each row the first formatted cell,
    lower-cased, is the metric name and cells 1-4 are its values for
    ``'30_days'``, ``'90_days'``, ``'120_days'`` and ``'life_time'``.

    :param page_soup: parsed page object exposing ``find``.
    :return: dict keyed by period; every period maps ``'positive'``,
        ``'negative'``, ``'neutral'`` and ``'count'`` (plus any other metric
        name found) to a value, defaulting to ``'not_available'``.
    """
    periods = ('30_days', '90_days', '120_days', 'life_time')
    metrics = ('positive', 'negative', 'neutral', 'count')
    seller_raing = {
        period: {metric: 'not_available' for metric in metrics}
        for period in periods
    }

    summary_table = page_soup.find('table', {'id': 'feedback-summary-table'})
    if not summary_table:
        return seller_raing

    for row in summary_table.findAll('tr')[1:]:
        cells = [text_format(cell) for cell in row.findAll('td')]
        if not cells:
            continue
        for column, period in enumerate(periods, start=1):
            seller_raing[period][cells[0].lower()] = cells[column]
    return seller_raing
def get_color_variantes(raw_data):
    """Return the colour variants found in ``raw_data`` joined by ``'|'``.

    Each ``li`` of the ``div`` with id ``variation_color_name`` contributes
    the ``alt`` attribute of its ``img`` or, when it has no image, the
    formatted text of its ``div`` with class ``twisterTextDiv text``.

    :param raw_data: parsed page object exposing ``find``.
    :return: the joined colour names, or ``'not_available'`` when the
        container is missing or has no ``li`` elements.
    """
    variation_block = raw_data.find('div', {'id': 'variation_color_name'})
    if not variation_block:
        return 'not_available'

    entries = variation_block.findAll('li')
    if not len(entries):
        return 'not_available'

    colors = []
    for entry in entries:
        swatch = entry.find('img')
        label = entry.find('div', {'class': 'twisterTextDiv text'})
        if swatch:
            colors.append(swatch['alt'])
        elif label:
            colors.append(text_format(label))
    return '|'.join(colors)
def get_style_variants(raw_data):
    """Return the style variants found in ``raw_data`` joined by ``'|'``.

    Every ``li`` inside the ``div`` with id ``twisterContainer`` is checked
    for a ``span`` with class ``a-size-base``; its formatted text is kept
    when it is non-empty.

    An ``li`` without a usable label is now skipped.  Previously such an
    entry aborted the scan and discarded the labels already collected.

    :param raw_data: parsed page object exposing ``find``.
    :return: the joined labels, or ``'not_available'`` when the container is
        missing or no label could be collected.
    """
    container = raw_data.find('div', {'id': 'twisterContainer'})
    if not container:
        return 'not_available'

    styles = []
    for item in container.findAll('li'):
        label_tag = item.find('span', {'class': 'a-size-base'})
        if not label_tag:
            continue
        label = text_format(label_tag)
        if label:
            styles.append(label)

    # Never hand back an empty string: callers expect a value or the sentinel.
    return '|'.join(styles) if styles else 'not_available'
def get_highlights(raw_data):
    """Return the highlight texts found in ``raw_data``.

    * When the ``div`` with id ``bookDescription_feature_div`` exists, the
      formatted text of its ``noscript`` child is the single highlight.
    * Otherwise, when the ``div`` with id ``feature-bullets`` exists, the
      stripped text of the ``li`` elements is taken from the ``ul`` with
      class ``a-unordered-list a-vertical a-spacing-none`` or, failing that,
      from the ``span`` with class ``a-color-base technicalData``.

    :param raw_data: parsed page object exposing ``find``.
    :return: ``{'text': [...]}``; the list is ``['not_available']`` when no
        highlight could be extracted.
    """
    book_section = raw_data.find('div', {'id': 'bookDescription_feature_div'})
    bullet_section = raw_data.find('div', {'id': 'feature-bullets'})
    highlights = {'text': []}

    if book_section:
        fallback_markup = book_section.find('noscript')
        if fallback_markup:
            highlights['text'].append(text_format(fallback_markup))
            return highlights
    elif bullet_section:
        bullet_list = bullet_section.find(
            'ul', {'class': 'a-unordered-list a-vertical a-spacing-none'})
        technical_data = bullet_section.find(
            'span', {'class': 'a-color-base technicalData'})
        for holder in (bullet_list, technical_data):
            if not holder:
                continue
            items = holder.findAll('li')
            if len(items) != 0:
                highlights['text'].extend(
                    item.text.strip() for item in items)
                return highlights

    highlights['text'].append('not_available')
    return highlights
def get_seller_detailed_information(raw_data):
    """Return the detailed seller information found in ``raw_data``.

    Every ``div`` with class ``a-column a-span6`` is searched for a ``ul``
    with class ``a-unordered-list a-nostyle a-vertical``.  Each of its
    ``span.a-list-item`` entries is handled in one of two ways:

    * an entry holding a nested list of the same class maps the formatted
      text of the entry's first ``span`` to the list of its ``li`` texts;
    * any other entry is read as ``label: value`` and split at the FIRST
      colon only, so values that contain colons themselves (URLs, times)
      are kept whole.  Entries without a colon are ignored.

    :param raw_data: parsed page object exposing ``findAll``.
    :return: dict that always contains the key ``'default'``; its list is
        ``['not_available']`` when no matching column exists and empty
        otherwise.
    """
    information = {'default': []}

    columns = raw_data.findAll('div', {'class': 'a-column a-span6'})
    if len(columns) == 0:
        information['default'].append('not_available')
        return information

    list_selector = ('ul', {'class': 'a-unordered-list a-nostyle a-vertical'})
    for column in columns:
        info_list = column.find(*list_selector)
        if not info_list:
            continue
        for entry in info_list.findAll('span', {'class': 'a-list-item'}):
            nested = entry.find(*list_selector)
            if nested:
                key = text_format(entry.span)
                information[key] = [li.text for li in nested.findAll('li')]
                continue

            raw_text = entry.text
            if not raw_text:
                continue
            label, separator, value = raw_text.partition(':')
            if separator:
                information[label] = value
    return information
def _add_colon_separated_specs(items, specifications):
    """Store every ``label: value`` item of ``items`` in ``specifications``."""
    for item in items:
        # Split at the first colon only so values containing ':' stay whole;
        # items without any colon carry no key/value pair and are skipped.
        label, separator, value = text_format(item).partition(':')
        if separator:
            specifications[label] = value


def _add_header_row_specs(rows, specifications):
    """Store ``th``/``td`` table rows, formatting both cells with text_format."""
    for row in rows:
        if not row.th:
            continue
        key = text_format(row.th)
        # NOTE(review): this is a substring test against the literal, kept
        # unchanged for compatibility; it also skips rows whose formatted
        # header is empty.  Confirm whether equality was intended.
        if key in 'Customer Reviews':
            continue
        specifications[key] = text_format(row.td)


def get_product_specifications(raw_data):
    """Return the product specification table found in ``raw_data``.

    Several page layouts are supported.  All known containers are located
    first and the first matching layout, in the order below, is parsed:

    1. ``table#productDetailsTable`` together with
       ``table#product-specification-table``;
    2. ``div.a-section.a-spacing-large.pzr-features-containers`` together
       with ``div#detail-bullets``;
    3. ``div#detail_bullets_id``;
    4. ``div#technicalSpecifications_feature_div``;
    5. ``div#detail-bullets``;
    6. ``div#prodDetails``;
    7. ``table#productDetailsTable``;
    8. ``div#detailBullets_feature_div``.

    List-style layouts hold ``label: value`` items, which are split at the
    first colon only (values containing colons used to be truncated).
    Table-style layouts map header cells to data cells.

    :param raw_data: parsed page object exposing ``find``.
    :return: dict that always contains ``'default': 'not_available'`` plus
        one entry per specification found.
    """
    product_details_1 = raw_data.find('div', {'id': 'prodDetails'})
    product_details_2 = raw_data.find('table', {'id': 'productDetailsTable'})
    product_details_3 = raw_data.find('div',
                                      {'id': 'detailBullets_feature_div'})
    product_details_4 = raw_data.find(
        'div', {'class': 'a-section a-spacing-large pzr-features-containers'})
    product_details_5 = raw_data.find('div', {'id': 'detail-bullets'})
    product_details_6 = raw_data.find('table',
                                      {'id': 'product-specification-table'})
    product_details_7 = raw_data.find(
        'div', {'id': 'technicalSpecifications_feature_div'})
    product_details_8 = raw_data.find('div', {'id': 'detail_bullets_id'})

    specifications = {'default': 'not_available'}

    if product_details_2 and product_details_6:
        bucket = product_details_2.find('td', {'class': 'bucket'})
        if bucket:
            _add_colon_separated_specs(bucket.findAll('li'), specifications)
        _add_header_row_specs(product_details_6.findAll('tr'), specifications)

    elif product_details_4 and product_details_5:
        _add_colon_separated_specs(
            product_details_5.find_all('li'), specifications)
        for table in product_details_4.findAll('table'):
            _add_header_row_specs(table.findAll('tr'), specifications)

    elif product_details_8:
        _add_colon_separated_specs(
            product_details_8.find_all('li'), specifications)

    elif product_details_7:
        table = product_details_7.find(
            'table', {'id': 'technicalSpecifications_section_1'})
        if table:
            for row in table.findAll('tr'):
                # Only rows carrying both a header and a data cell are usable.
                if row.th and row.td:
                    specifications[row.th.text.strip()] = row.td.text.strip()

    elif product_details_5:
        _add_colon_separated_specs(
            product_details_5.find_all('li'), specifications)

    elif product_details_1:
        for table in product_details_1.findAll('table'):
            for row in table.findAll('tr'):
                if row.th:
                    key = text_format(row.th)
                    # NOTE(review): substring test kept as-is (see helper).
                    if key in 'Customer Reviews':
                        continue
                    if row.td is None:
                        # Header without a data cell: nothing to record.
                        continue
                    specifications[key] = row.td.text.strip()
                else:
                    # Header-less rows use the first two cells as key/value.
                    cells = row.findAll('td')
                    if len(cells) >= 2:
                        specifications[text_format(cells[0])] = text_format(
                            cells[1])

    elif product_details_2:
        bucket = product_details_2.find('td', {'class': 'bucket'})
        if bucket:
            _add_colon_separated_specs(bucket.findAll('li'), specifications)

    elif product_details_3:
        _add_colon_separated_specs(
            product_details_3.find_all('span', {'class': 'a-list-item'}),
            specifications)

    return specifications