def crawling_page(window_arrow, is_yesterday=False):
    """Scrape the CJmall weekly TV schedule and return products for one day.

    Args:
        window_arrow: arrow-like date object; must support ``.format('YYYYMMDD')``
            and ``.format('YYYY/MM/DD')``.
        is_yesterday: when True, keep only the early-morning slots
            (hour 0-5), which belong to the previous broadcast day;
            when False, skip those slots.

    Returns:
        list of ProductInfo, one per product parsed from the day's column.
    """
    start_date = window_arrow.format('YYYYMMDD')
    url = "http://www.cjmall.com/etv/broad/schedule_list_week_iframe.jsp?start_date=" + start_date
    page_source = build_soup(url)
    table_source = page_source.findAll('tr')

    # The header row (table_source[0]) lists the dates of the week; find the
    # column index matching the requested date. If no cell matches, the index
    # ends up at len(week_days), preserving the original fall-through value.
    table_date = window_arrow.format('YYYY/MM/DD')
    week_days = table_source[0].findAll('td')
    index_for_the_day = len(week_days)
    for idx, week_day in enumerate(week_days):  # idiom: enumerate, not a manual counter
        if table_date in str(week_day):
            index_for_the_day = idx
            break

    product_list = []
    for column in table_source[1:]:
        hour = int(column.findAll('th')[0].text)
        # 00:00-05:59 slots are attributed to the previous broadcast day.
        if not is_yesterday and 0 <= hour <= 5:
            continue
        if is_yesterday and hour > 5:
            continue
        item = column.findAll('td')[index_for_the_day]
        # idiom: extend with a generator instead of append-in-a-loop
        product_list.extend(
            ProductInfo(
                name=parsed_name,
                start_time=the_time,
                category=category,
            )
            for category, the_time, parsed_name in parsing_item(item))
    return product_list
def home_and_shopping(window_arrow):
    """Scrape the HNSmall TV schedule for the date given by *window_arrow*.

    Args:
        window_arrow: arrow-like date object; must support
            ``.format('YYYY/MM/DD')``.

    Returns:
        list of ProductInfo rows produced by ``parse_table``.
    """
    print('HNSMALL')
    # The date contains '/' characters, so percent-encode it fully
    # (safe='' encodes the slashes too) before embedding it in the query.
    url = "http://www.hnsmall.com/display/tvtable.do?from_date={0}".format(
        urllib.parse.quote(window_arrow.format('YYYY/MM/DD'), safe=''))
    soup = build_soup(url)
    rows = soup.find('table').find('tbody').find_all('tr')
    # idiom: materialize the generator directly instead of a manual append loop
    return list(parse_table(rows, window_arrow))
def hyundai_home_shopping(window_arrow):
    """Scrape the Hyundai Hmall TV schedule for the date given by *window_arrow*.

    Args:
        window_arrow: arrow-like date object; must support
            ``.format('YYYYMMDD')``.

    Returns:
        list of ProductInfo rows produced by ``parse_table``.
    """
    print('H MALL')
    url = 'http://www.hyundaihmall.com/front/bmc/brodPordPbdv.do?cnt=0&date={0}'.format(
        window_arrow.format('YYYYMMDD'))
    soup = build_soup(url)
    table = soup.find('table')
    rows = table.find('tbody').find_all('tr')
    # idiom: materialize the generator directly instead of a manual append loop
    return list(parse_table(rows))
def gs_shop(window_arrow):
    """Scrape the GS Shop TV schedule for the date given by *window_arrow*.

    Args:
        window_arrow: arrow-like date object; must support
            ``.format('YYYYMMDD')``.

    Returns:
        list of ProductInfo, one per schedule row that has a category,
        a product name and a product id.
    """
    print('GS MALL')
    url = 'http://with.gsshop.com/tv/tvScheduleMain.gs?lseq=397357&selectDate={0}'.format(
        window_arrow.format('YYYYMMDD'))
    soup = build_soup(url)
    tables = soup.findAll('table')
    product_list = []
    for table in tables:
        rows = table.findAll('tr')
        the_time = ''
        for row in rows:
            column_times = row.find_all('td', {'class': 'time'})
            column_descs = row.find_all('td', {'class': 'desc'})
            column_prices = row.find_all('td', {'class': 'price'})
            column_pics = row.find_all('td', {'class': 'pic'})
            # A time cell only appears on the first row of a slot; keep the
            # previous slot's time for the following rows.
            the_time = parsing_td_time(column_times) or the_time
            category, product = parsing_td_desc(column_descs)
            price = parsing_td_price(column_prices)
            the_id = parsing_td_pic(column_pics)
            if not category or not product:
                continue
            # Fix (dead code): the original computed image/detail URLs with
            # `... if the_id else ''` and only THEN skipped id-less rows, so
            # the empty-string branches were unreachable. Guard first instead;
            # the helpers are still only called when the_id is truthy.
            if not the_id:
                continue
            image_url = get_image_url_by_prod_id(the_id)
            prod_detail_url = get_product_detail_url_by_prod_id(the_id)
            # Split the "HH:MM-HH:MM" range once instead of twice.
            time_parts = the_time.split('-')
            product_list.append(
                ProductInfo(
                    name=product,
                    start_time=time_parts[0],
                    end_time=time_parts[1],
                    shop_code='7',
                    ch_no='8',
                    category=category,
                    price=price,
                    image=image_url,
                    product_id='000711' + the_id,
                    shop_prod_id=the_id,
                    detail_product_url=prod_detail_url,
                ))
    return product_list