def api_download(bbs_id):
    """Serve the file attachment referenced by *bbs_id* ("<rss_id>_<file_index>").

    Looks up the stored BBS entity, then either streams the file through a
    Selenium-driven browser session (sites flagged USE_SELENIUM) or delegates
    to download2(). On any exception the error is logged and None is returned
    (implicit), matching the original behavior.
    """
    logger.debug('api download :%s', bbs_id)
    try:
        rss_id, index = bbs_id.split('_')
        entity = ModelBbs2.get(id=int(rss_id)).as_dict()
        scheduler_instance = ModelScheduler2.get2(sitename=entity['site'], board_id=entity['board'])
        # data = [detail page url, file download url, file name]
        data = [entity['url'], entity['files'][int(index)][0], entity['files'][int(index)][1]]
        site_instance = ModelSite2.get(name=entity['site']).info
        if 'USE_SELENIUM' in site_instance['EXTRA']:
            from system import SystemLogicSelenium
            driver = SystemLogicSelenium.get_driver()
            # Load the detail page first so the subsequent download request
            # carries the browser session/cookies.
            # NOTE: 'pagesoruce' typo is the actual helper name in the system module.
            SystemLogicSelenium.get_pagesoruce_by_selenium(data[0], site_instance['SELENIUM_WAIT_TAG'])
            logger.debug(data[1])
            logger.debug('selenium download go..')
            driver.get(data[1])
            logger.debug('selenium wait before...')
            # Fixed wait for the browser download to finish;
            # waitUntilDownloadCompleted() was abandoned here (see history).
            import time
            time.sleep(10)
            logger.debug('selenium wait end')
            files = SystemLogicSelenium.get_downloaded_files()
            logger.debug(files)
            # Pick the downloaded file whose name contains the expected stem;
            # falls back to the first file when nothing matches (original behavior).
            filename_stem = os.path.splitext(data[2].split('/')[-1])[0]
            file_index = 0
            for idx, value in enumerate(files):
                if filename_stem in value:
                    file_index = idx
                    break
            logger.debug('fileindex : %s', file_index)
            content = SystemLogicSelenium.get_file_content(files[file_index])
            # FIX: serve the bytes directly. The old code copied `content` into a
            # BytesIO, pulled the copy back out with getvalue(), and wrapped that
            # in a second BytesIO - two pointless full copies of the payload.
            return send_file(
                io.BytesIO(content),
                mimetype='application/octet-stream',
                as_attachment=True,
                attachment_filename=data[2])  # attachment_filename: Flask < 2.0 API
        return download2(data, scheduler_instance)
    except Exception as e:
        logger.error('Exception:%s', e)
        logger.error(traceback.format_exc())
def get_html(url, referer=None, stream=False, cookie=None, selenium_tag=None):
    """Fetch *url* and return its body.

    Returns rendered page source when *selenium_tag* is given, the raw
    streaming response object when stream=True, otherwise the response
    bytes. On any request failure, falls back to LogicFromSite.get_data(url)
    (logged as 'Known..').
    """
    try:
        logger.debug('get_html :%s', url)
        if selenium_tag:
            from system import SystemLogicSelenium
            data = SystemLogicSelenium.get_pagesoruce_by_selenium(url, selenium_tag)
        else:
            # NOTE: mutates the shared module-level `headers` dict; the Referer
            # entry intentionally persists across calls (original behavior).
            headers['Referer'] = '' if referer is None else referer
            if cookie is not None:
                headers['Cookie'] = cookie
            try:
                if LogicFromSite.proxyes:
                    page_content = LogicFromSite.session.get(url, headers=headers, proxies=LogicFromSite.proxyes, stream=stream, verify=False)
                else:
                    page_content = LogicFromSite.session.get(url, headers=headers, stream=stream, verify=False)
            finally:
                # FIX: always remove the Cookie header again. Previously it was
                # only deleted on the success path, so an exception inside
                # session.get() leaked this call's cookie into the shared
                # headers dict and thus into every later request.
                if cookie is not None:
                    headers.pop('Cookie', None)
            if stream:
                return page_content
            data = page_content.content
    except Exception as e:
        logger.error('Exception:%s', e)
        logger.error(traceback.format_exc())
        logger.error('Known..')
        data = LogicFromSite.get_data(url)
    return data
def pageparser(url):
    """Return the page source of *url* as text, or None on failure.

    Uses Selenium (waiting for the site footer) when the 'use_selenium'
    setting is enabled; otherwise performs a plain requests.get with
    browser-like headers.
    """
    try:
        if ModelSetting.get('use_selenium') == 'True':
            from system import SystemLogicSelenium
            return SystemLogicSelenium.get_pagesoruce_by_selenium(
                url, '//footer[@class="at-footer"]')
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7'
        }
        # NOTE(review): the proxy / cloudflare_bypass branch chain that used to
        # follow this point was unreachable - an unconditional return preceded
        # it - so the 'proxy' and 'cloudflare_bypass' settings were already
        # being ignored at runtime. The dead branches have been removed;
        # behavior is unchanged. Re-implement them here (requests proxies= /
        # cfscrape.create_scraper) if those settings should be honored again.
        return requests.get(url, headers=headers).text
    except Exception as e:
        logger.error('Exception:%s', e)
        logger.error(traceback.format_exc())
        return None
def __get_bbs_list(site_instance, board, max_page, max_id, xpath_dict, is_test=False):
    """Scrape up to *max_page* pages of *board* and return new post items.

    Each item is a dict with 'title', 'url', 'id', extracted via the
    site-specific rules in *xpath_dict* / site_instance.info. Crawling stops
    early once a post id <= *max_id* is seen (unless NO_BREAK_BY_MAX_ID).
    When *is_test* is True, every parsed item is returned without any
    duplicate/max_id filtering. Per-item parse errors are logged and skipped.
    """
    bbs_list = []
    # Row iteration parameters: some boards interleave ad rows, so the
    # xpath index may start past 1 and/or advance by more than 1.
    index_step = xpath_dict['INDEX_STEP'] if 'INDEX_STEP' in xpath_dict else 1
    index_start = xpath_dict['INDEX_START'] if 'INDEX_START' in xpath_dict else 1
    stop_by_maxid = False
    if 'FORCE_FIRST_PAGE' in site_instance.info['EXTRA']:
        max_page = 1
    cookie = None
    if 'COOKIE' in site_instance.info:
        cookie = site_instance.info['COOKIE']
    for p in range(max_page):  # pages are 1-based in the board URL
        url = LogicFromSite.get_board_url(site_instance, board, str(p + 1))
        # The list-container xpath is everything before the row placeholder '[%s]'.
        list_tag = xpath_dict['XPATH'][:xpath_dict['XPATH'].find('[%s]')]
        logger.debug('list_tag : %s', list_tag)
        logger.debug('Url : %s', url)
        if 'USE_SELENIUM' in site_instance.info['EXTRA']:
            from system import SystemLogicSelenium
            tmp = SystemLogicSelenium.get_pagesoruce_by_selenium(url, list_tag)
        else:
            tmp = LogicFromSite.get_html(url, cookie=cookie)
        tree = html.fromstring(tmp)
        lists = tree.xpath(list_tag)
        logger.debug('Count : %s', len(lists))
        for i in range(index_start, len(lists) + 1, index_step):
            try:
                a_tag = tree.xpath(xpath_dict['XPATH'] % i)
                # Use the LAST matching anchor in the row (skips icon/category links).
                a_tag_index = len(a_tag) - 1
                if a_tag_index == -1:
                    logger.debug('a_tag_index : %s', a_tag_index)
                    continue
                item = {}
                # --- title extraction ---------------------------------------
                if 'TITLE_XPATH' in xpath_dict:
                    if xpath_dict['TITLE_XPATH'].endswith('text()'):
                        logger.debug(a_tag[a_tag_index].xpath(xpath_dict['TITLE_XPATH']))
                        # text() xpath returns strings; take the last fragment.
                        item['title'] = py_urllib.unquote(a_tag[a_tag_index].xpath(xpath_dict['TITLE_XPATH'])[-1]).strip()
                    else:
                        item['title'] = py_urllib.unquote(a_tag[a_tag_index].xpath(xpath_dict['TITLE_XPATH'])[0].text_content()).strip()
                else:
                    item['title'] = py_urllib.unquote(a_tag[a_tag_index].text_content()).strip()
                # Optional cleanup substitution on the title.
                if 'TITLE_SUB' in xpath_dict:
                    item['title'] = re.sub(xpath_dict['TITLE_SUB'][0], xpath_dict['TITLE_SUB'][1], item['title']).strip()
                # After general title handling, extract via regex if one is configured.
                if 'TITLE_REGEX' in xpath_dict:
                    match = re.compile(xpath_dict['TITLE_REGEX']).search(item['title'])
                    if match:
                        item['title'] = match.group('title')
                # --- url extraction -----------------------------------------
                item['url'] = a_tag[a_tag_index].attrib['href']
                if 'DETAIL_URL_SUB' in site_instance.info:
                    item['url'] = re.sub(
                        site_instance.info['DETAIL_URL_SUB'][0],
                        site_instance.info['DETAIL_URL_SUB'][1].format(URL=site_instance.info['TORRENT_SITE_URL']),
                        item['url'])
                # Make relative links absolute against the site base URL.
                if not item['url'].startswith('http'):
                    form = '%s%s' if item['url'].startswith('/') else '%s/%s'
                    item['url'] = form % (site_instance.info['TORRENT_SITE_URL'], item['url'])
                # --- id extraction ------------------------------------------
                item['id'] = ''
                if 'ID_REGEX' in site_instance.info:
                    id_regexs = [site_instance.info['ID_REGEX']]
                else:
                    # Defaults cover gnuboard (wr_id=), *.html ids, and trailing-path ids.
                    id_regexs = [r'wr_id\=(?P<id>\d+)', r'\/(?P<id>\d+)\.html', r'\/(?P<id>\d+)$']
                for regex in id_regexs:
                    match = re.compile(regex).search(item['url'])
                    if match:
                        item['id'] = match.group('id')
                        break
                # Second pass without the query string (e.g. trailing-path ids
                # hidden behind '?...' parameters).
                if item['id'] == '':
                    for regex in id_regexs:
                        match = re.compile(regex).search(item['url'].split('?')[0])
                        if match:
                            item['id'] = match.group('id')
                            break
                logger.debug('ID : %s, TITLE : %s', item['id'], item['title'])
                if item['id'].strip() == '':
                    continue
                if is_test:
                    # Test mode: collect everything, no dedup / max_id checks.
                    bbs_list.append(item)
                else:
                    if 'USING_BOARD_CHAR_ID' in site_instance.info['EXTRA']:  # e.g. javdb: non-numeric ids
                        from .model import ModelBbs2
                        # Dedup against the DB since char ids can't be compared to max_id.
                        entity = ModelBbs2.get(site=site_instance.info['NAME'], board=board, board_char_id=item['id'])
                        if entity is None:
                            bbs_list.append(item)
                            logger.debug('> Append..')
                        else:
                            logger.debug('> exist..')
                    else:
                        # 2019-04-04 torrentpong: some sites reuse/reorder numeric ids,
                        # so NO_BREAK_BY_MAX_ID skips old ids instead of stopping the crawl.
                        try:
                            if 'NO_BREAK_BY_MAX_ID' in site_instance.info['EXTRA']:
                                if int(item['id']) <= max_id:
                                    continue
                                else:
                                    bbs_list.append(item)
                            else:
                                if int(item['id']) <= max_id:
                                    logger.debug('STOP by MAX_ID(%s)', max_id)
                                    stop_by_maxid = True
                                    break
                                bbs_list.append(item)
                        except Exception as e:
                            # int() can fail on unexpected id formats; skip the item.
                            logger.error('Exception:%s', e)
                            logger.error(traceback.format_exc())
            except Exception as e:
                # Per-row parse failure: log with site info and continue with next row.
                logger.error('Exception:%s', e)
                logger.error(traceback.format_exc())
                logger.error(site_instance.info)
        if stop_by_maxid:
            break
    logger.debug('Last count :%s', len(bbs_list))
    return bbs_list