def get_lxml_by_url(url):
    """Download ``url`` with the Daum session and return it parsed by lxml.

    The request is sent through ``framework.common.daum.session`` with that
    module's ``headers`` and the cookies returned by
    ``SystemLogicSite.get_daum_cookies()``.

    Returns:
        The result of ``lxml.html.fromstring`` on the response body. When
        anything raises, the error and traceback are logged and the function
        falls through, returning ``None``.
    """
    try:
        # The imports sit inside the try block, so an import failure is
        # logged like any other error instead of propagating.
        from framework.common.daum import headers, session
        from system.logic_site import SystemLogicSite

        response = session.get(
            url,
            headers=headers,
            cookies=SystemLogicSite.get_daum_cookies())
        return lxml.html.fromstring(response.content)
    except Exception as exc:
        logger.error('Exception:%s', exc)
        logger.error(traceback.format_exc())
def get_html(url):
    """Download ``url`` with the Daum session and return the raw body.

    The request is sent through ``framework.common.daum.session`` with that
    module's ``headers`` and the cookies returned by
    ``SystemLogicSite.get_daum_cookies()``.

    Returns:
        ``response.content`` of the GET request. When anything raises, the
        error and traceback are logged and the function falls through,
        returning ``None``.
    """
    try:
        # The imports sit inside the try block, so an import failure is
        # logged like any other error instead of propagating.
        from framework.common.daum import headers, session
        from system.logic_site import SystemLogicSite

        response = session.get(
            url,
            headers=headers,
            cookies=SystemLogicSite.get_daum_cookies())
        return response.content
    except Exception as exc:
        logger.error('Exception:%s', exc)
        logger.error(traceback.format_exc())
def daum_get_ratings_list(keyword):
    """Search Daum for ``keyword`` and scrape the program list it shows.

    ``keyword`` is UTF-8 encoded and URL-quoted into a Daum total-search URL.
    Notes left in the original code list Korean weekday drama keywords and
    per-weekday entertainment-show keywords as example inputs.

    Returns:
        A list with one dict per ``<li>`` of the
        ``ol.list_program.item_cont`` element, with the keys ``title``,
        ``air_time``, ``provider``, ``image``, ``scheduled`` and ``ratings``.
        ``image`` falls back to a placeholder URL; ``scheduled`` and
        ``ratings`` stay as empty lists when the page has no such text.
        When anything raises (including a missing title, air time or
        provider on any item) the error is logged and ``None`` is returned.
    """
    try:
        from framework.common.daum import headers, session
        from system.logic_site import SystemLogicSite

        quoted = py_urllib.quote(keyword.encode('utf8'))
        url = 'https://search.daum.net/search?w=tot&q=%s' % quoted
        response = session.get(
            url,
            headers=headers,
            cookies=SystemLogicSite.get_daum_cookies())
        root = lxml.html.fromstring(response.content)
        program_nodes = root.xpath('//ol[@class="list_program item_cont"]/li')

        placeholder = 'http://www.okbible.com/data/skin/okbible_1/images/common/noimage.gif'
        programs = []
        for node in program_nodes:
            entry = {}
            # These three are mandatory: a missing one raises IndexError and
            # aborts the whole scrape through the outer handler.
            entry['title'] = node.xpath('./div/strong/a/text()')[0]
            entry['air_time'] = node.xpath('./div/span[1]/text()')[0]
            entry['provider'] = node.xpath(
                './div/span[@class="txt_subinfo"][2]/text()')[0]

            images = node.xpath('./a/img/@src')
            scheduled = node.xpath(
                './div/span[@class="txt_subinfo"]/span[@class="txt_subinfo"]/text()')
            ratings = node.xpath(
                './div/span[@class="txt_subinfo"][2]/span[@class="f_red"]/text()')

            entry['image'] = images[0] if images else placeholder
            # Optional values: first hit when present, otherwise the empty
            # xpath result list is stored unchanged.
            entry['scheduled'] = scheduled[0] if scheduled else scheduled
            entry['ratings'] = ratings[0] if ratings else ratings
            programs.append(entry)
        return programs
    except Exception as exc:
        logger.error('Exception:%s', exc)
        logger.error(traceback.format_exc())
def search_movie_web(movie_list, movie_name, movie_year):
    """Collect Daum movie candidates for a title/year and return them ranked.

    Three best-effort stages run in order. Each stage handles its own
    failures, so a later stage still runs when an earlier one breaks.

    1. Suggest API. Every suggestion is added with score 95 (cleaned title
       and year both match), 91 (year equal or at most one year apart), or
       ``85 - 5 * index`` with a floor of 10.
    2. Search home page via ``MovieSearch.get_movie_info_from_home``. The
       featured movie scores 100 when its year equals ``movie_year`` and 90
       otherwise. Only in the latter case are the same-name and series links
       of that page added (score 80); a link matching both name and year is
       fetched and added with score 100.
    3. Detail JSON. The top-scored candidate is enriched in place with the
       title, year, country and a fresh ``more`` dict from the movie detail
       endpoint.

    Args:
        movie_list: Candidate list; entries are added through
            ``MovieSearch.movie_append``.
        movie_name: Unicode title to search for.
        movie_year: Year to match; compared both as-is and via ``int()``.

    Returns:
        A new list holding the candidates sorted by descending ``score``.

    NOTE(review): block nesting was reconstructed from a copy of this
    function whose indentation had been lost. Placing the series section
    under ``need_another_search``, running the detail lookup even when the
    home page yields nothing, and assigning the detail title unconditionally
    are assumptions to confirm.
    """

    def _home_entry(info, score):
        # Candidate dict for a MovieSearch.get_movie_info_from_home result.
        return {
            'id': info['daum_id'],
            'title': info['title'],
            'year': info['year'],
            'score': score,
            'country': info['country'],
            'more': info['more']
        }

    # Stage 1: suggest API.
    try:
        url = 'https://suggest-bar.daum.net/suggest?id=movie&cate=movie&multiple=1&mod=json&code=utf_in_out&q=%s' % (
            urllib.quote(movie_name.encode('utf8')))
        from framework.common.daum import headers, session
        from system.logic_site import SystemLogicSite
        res = session.get(url,
                          headers=headers,
                          cookies=SystemLogicSite.get_daum_cookies())
        data = res.json()
        movie_cmp = re.sub('[\\/:*?"<>|]', '', movie_name)
        for index, item in enumerate(data['items']['movie']):
            # Pipe-separated record; fields 0, 1 and 3 are used as
            # title, id and year.
            tmps = item.split('|')
            tmps[0] = re.sub('[\\/:*?"<>|]', '', tmps[0])
            if tmps[0] == movie_cmp and int(tmps[3]) == int(movie_year):
                score = 95
            elif tmps[3] == movie_year or abs(
                    int(tmps[3]) - int(movie_year)) <= 1:
                score = 85 + 6
            else:
                # Later suggestions rank lower, but never below 10.
                score = max(10, 85 - index * 5)
            MovieSearch.movie_append(
                movie_list, {
                    'id': tmps[1],
                    'title': tmps[0],
                    'year': tmps[3],
                    'score': score
                })
    except Exception as e:
        logger.error('Exception:%s', e)
        logger.error(traceback.format_exc())

    try:
        # Stage 2: search home page. The fixed prefix is a URL-encoded
        # search term that precedes the quoted movie name.
        url = 'https://search.daum.net/search?nil_suggest=btn&w=tot&DA=SBC&q=%s%s' % (
            '%EC%98%81%ED%99%94+', urllib.quote(movie_name.encode('utf8')))
        ret = MovieSearch.get_movie_info_from_home(url)
        if ret is not None:
            if ret['year'] == movie_year:
                score = 100
                need_another_search = False
            else:
                # Fixed: this used to be the no-op comparison `score >= 90`,
                # so the featured movie reused a stale suggest-loop score
                # (or raised NameError when that loop never ran).
                score = 90
                need_another_search = True
            MovieSearch.movie_append(movie_list, _home_entry(ret, score))
            logger.debug('need_another_search : %s' % need_another_search)
            movie = ret['movie']

            if need_another_search:
                # Same-name movies, rendered as "title(year)" links.
                tmp = movie.find('div[@class="coll_etc"]')
                if tmp is not None:
                    title_year = re.compile('(.*?)\\((.*?)\\)')
                    first_url = None
                    for tag in tmp.findall('.//a'):
                        match = title_year.search(tag.text_content())
                        if not match:
                            continue
                        daum_id = tag.attrib['href'].split('||')[1]
                        if match.group(1) == movie_name and match.group(
                                2) == movie_year:
                            first_url = 'https://search.daum.net/search?%s' % tag.attrib[
                                'href']
                        elif match.group(
                                2) == movie_year and first_url is not None:
                            # NOTE(review): as written, a year-only match can
                            # only replace an earlier exact match; `is None`
                            # may have been intended - confirm.
                            first_url = 'https://search.daum.net/search?%s' % tag.attrib[
                                'href']
                        MovieSearch.movie_append(
                            movie_list, {
                                'id': daum_id,
                                'title': match.group(1),
                                'year': match.group(2),
                                'score': 80
                            })
                    logger.debug('first_url : %s' % first_url)
                    if first_url is not None:
                        new_ret = MovieSearch.get_movie_info_from_home(
                            first_url)
                        # The helper returns None on failure; skip instead of
                        # aborting the remaining stages with a TypeError.
                        if new_ret is not None:
                            MovieSearch.movie_append(
                                movie_list, _home_entry(new_ret, 100))

                # Series entries.
                tmp = movie.find('.//ul[@class="list_thumb list_few"]')
                logger.debug('SERIES:%s' % tmp)
                if tmp is not None:
                    first_url = None
                    for tag in tmp.findall('.//div[@class="wrap_cont"]'):
                        a_tag = tag.find('a')
                        daum_id = a_tag.attrib['href'].split('||')[1]
                        daum_name = a_tag.text_content()
                        year = tag.find('span').text_content()
                        logger.debug('daum_id:%s %s %s' %
                                     (daum_id, year, daum_name))
                        if daum_name == movie_name and year == movie_year:
                            first_url = 'https://search.daum.net/search?%s' % a_tag.attrib[
                                'href']
                        elif year == movie_year and first_url is not None:
                            # Fixed: read href from the link (a_tag), as the
                            # id lookup above does, not from the wrapper div.
                            first_url = 'https://search.daum.net/search?%s' % a_tag.attrib[
                                'href']
                        MovieSearch.movie_append(
                            movie_list, {
                                'id': daum_id,
                                'title': daum_name,
                                'year': year,
                                'score': 80
                            })
                    logger.debug('first_url : %s' % first_url)
                    if first_url is not None:
                        new_ret = MovieSearch.get_movie_info_from_home(
                            first_url)
                        if new_ret is not None:
                            MovieSearch.movie_append(
                                movie_list, _home_entry(new_ret, 100))

        # Stage 3: enrich the best candidate from the detail endpoint.
        try:
            # Deliberately reversed(sorted(...)) and not sorted(reverse=True):
            # the two order equal scores differently, which would change
            # which candidate ends up first.
            movie_list = list(
                reversed(sorted(movie_list, key=lambda k: k['score'])))
            top = movie_list[0]
            logger.debug('smw - id: %s, score:%s, myear:%s, year:%s',
                         top['id'], top['score'], movie_year, top['year'])
            id_url = 'http://movie.daum.net/data/movie/movie_info/detail.json?movieId=%s' % top[
                'id']
            from framework.common.daum import headers, session
            from system.logic_site import SystemLogicSite
            res = session.get(id_url,
                              headers=headers,
                              cookies=SystemLogicSite.get_daum_cookies())
            meta_data = res.json()
            logger.debug('smw - more search')
            if meta_data is not None:
                logger.debug('smw - more search....ing')
                info = meta_data['data']
                if int(top['year']) == 0:
                    top['year'] = unicode(info['prodYear'])
                elif int(movie_year) == int(info['prodYear']):
                    # Detail data confirms the requested year: small bonus.
                    top['year'] = unicode(info['prodYear'])
                    top['score'] = top['score'] + 5
                top['title'] = info['titleKo']
                logger.debug('smw - eng title:%s', info['titleEn'])
                # Replaces any 'more' dict set earlier for this candidate.
                top.update({
                    'more': {
                        'eng_title': "",
                        'rate': "",
                        'during': "",
                        'genre': []
                    }
                })
                more = top['more']
                more['during'] = unicode(info['showtime'])
                if info['admissionDesc']:
                    more['rate'] = info['admissionDesc']
                logger.debug('smw - rate:%s', more['rate'])
                more['eng_title'] = info['titleEn']
                # Only the first listed country is kept.
                for item in info['countries']:
                    top['country'] = item['countryKo']
                    break
                for item in info['genres']:
                    more['genre'].append(item['genreName'])
                    logger.debug('%s', item['genreName'])
        except Exception as e:
            # Enrichment is optional (an empty candidate list lands here
            # too); keep it quiet, but leave a trace for debugging.
            logger.debug('smw - detail lookup skipped: %s', e)
    except Exception as e:
        logger.error('Exception:%s', e)
        logger.error(traceback.format_exc())

    movie_list = list(
        reversed(sorted(movie_list, key=lambda k: k['score'])))
    return movie_list
def get_movie_info_from_home(url):
    """Scrape the featured-movie box of a Daum search result page.

    Args:
        url: Daum search URL to fetch through the shared session.

    Returns:
        ``None`` when the page has no ``movieEColl`` element or when any
        step raises (the error is logged). Otherwise a dict with:

        * ``movie``   - the ``movieEColl`` lxml element,
        * ``title``   - text of the ``<b>`` inside the title link,
        * ``daum_id`` - the part of the title link's href after the first
          ``=``,
        * ``year``    - four-digit production year, or ``''`` if not found,
        * ``country`` - first ``|``-separated field of the info cell,
        * ``more``    - ``poster``, ``title`` and ``info``; ``eng_title`` when
          a production year was found; ``country``, ``genre``, ``date``,
          ``rate`` and ``during`` when the info cell has 3 to 5 fields.
    """
    try:
        from framework.common.daum import headers, session
        from system.logic_site import SystemLogicSite
        res = session.get(url,
                          headers=headers,
                          cookies=SystemLogicSite.get_daum_cookies())
        doc = lxml.html.document_fromstring(res.content)

        # get_element_by_id raises KeyError for an unknown id when no default
        # is given; that simply means the page has no movie box.
        try:
            movie = doc.get_element_by_id('movieEColl')
        except KeyError:
            movie = None
        if movie is None:
            logger.debug('gmifh - movie is none')
            return None

        title_tag = movie.get_element_by_id('movieTitle')
        a_tag = title_tag.find('a')
        href = a_tag.attrib['href']
        title = a_tag.find('b').text_content()

        # The title block reads "<title> <english title>, <year> <word>",
        # where the trailing word is u'\uc81c\uc791' ("production").
        title_text = title_tag.text_content()
        tmp_year = ''
        more = {}
        match = re.compile(u'(?P<year>\\d{4})\\s\uc81c\uc791').search(
            title_text)
        if match:
            tmp_year = match.group('year')
            # Whatever remains after removing title, year and the marker word
            # is kept as the English title.
            more['eng_title'] = title_text.replace(title, '').replace(
                tmp_year, '').replace(u'\uc81c\uc791',
                                      '').replace(u',', '').strip()

        # XPath expressions starting with '//' search the whole document,
        # not only the subtree below `movie`.
        country_tag = movie.xpath('//div[3]/div/div[1]/div[2]/dl[1]/dd[2]')
        country = ''
        if country_tag:
            country = country_tag[0].text_content().split('|')[0].strip()
            logger.debug(country)

        more['poster'] = movie.xpath(
            '//*[@id="nmovie_img_0"]/a/img')[0].attrib['src']
        more['title'] = movie.xpath(
            '//*[@id="movieTitle"]/span')[0].text_content()

        # A missing info cell (like a missing poster or title above) raises
        # IndexError here and makes the whole lookup return None.
        info_line = country_tag[0].text_content().strip()
        more['info'] = [info_line]
        logger.debug(info_line)

        # Info cell layout by field count:
        #   5: country | genre | date | rate | during
        #   4: country | genre | rate | during
        #   3: country | genre | during
        # u'\uc678' ("and others") and u'\uac1c\ubd09' ("release") are
        # stripped from the values.
        fields = info_line.split('|')
        count = len(fields)
        if 3 <= count <= 5:
            more['country'] = fields[0].replace(u'\uc678', '').strip()
            more['genre'] = fields[1].replace(u'\uc678', '').strip()
            if count == 5:
                more['date'] = fields[2].replace(u'\uac1c\ubd09', '').strip()
            else:
                more['date'] = ''
            more['rate'] = fields[-2].strip() if count >= 4 else ''
            more['during'] = fields[-1].strip()

        daum_id = href.split('=')[1]
        return {
            'movie': movie,
            'title': title,
            'daum_id': daum_id,
            'year': tmp_year,
            'country': country,
            'more': more
        }
    except Exception as e:
        logger.error('Exception:%s', e)
        logger.error(traceback.format_exc())
    return None