def api_download(bbs_id):
    """Serve the file attached to a BBS entry as an HTTP download.

    ``bbs_id`` is a composite string ``"<rss_id>_<index>"`` where ``rss_id``
    is the ModelBbs2 primary key and ``index`` selects one entry of the
    record's ``files`` list (each entry looks like ``[download_url, filename]``
    judging by how data[1]/data[2] are used below).

    For sites flagged ``USE_SELENIUM`` the file is fetched by driving a real
    browser (the site presumably blocks plain HTTP downloads) and streamed
    back via Flask's ``send_file``; otherwise the work is delegated to
    ``download2``.  Returns ``None`` on error (the exception is only logged).
    """
    logger.debug('api download :%s', bbs_id)
    try :
        rss_id, index = bbs_id.split('_')
        entity = ModelBbs2.get(id=int(rss_id)).as_dict()
        #logger.debug(entity)
        scheduler_instance = ModelScheduler2.get2(sitename=entity['site'], board_id=entity['board'])
        # data = [detail page url, file download url, filename]
        data = [entity['url'], entity['files'][int(index)][0], entity['files'][int(index)][1]]
        site_instance = ModelSite2.get(name=entity['site']).info
        if 'USE_SELENIUM' in site_instance['EXTRA']:
            from system import SystemLogicSelenium
            driver = SystemLogicSelenium.get_driver()
            # Load the detail page first (sets cookies/referer state in the
            # browser), then trigger the actual file download.
            SystemLogicSelenium.get_pagesoruce_by_selenium(data[0], site_instance['SELENIUM_WAIT_TAG'])
            logger.debug(data[1])
            logger.debug('selenium download go..')
            driver.get(data[1])
            logger.debug('selenium wait before...')
            #SystemLogicSelenium.waitUntilDownloadCompleted(120)
            #SystemLogicSelenium.waitUntilDownloadCompleted(10)
            # Fixed sleep instead of waitUntilDownloadCompleted (commented out
            # above) — NOTE(review): 10s may not be enough for large files.
            import time
            time.sleep(10)
            logger.debug('selenium wait end')
            files = SystemLogicSelenium.get_downloaded_files()
            logger.debug(files)
            # Pick the downloaded file whose name contains the expected
            # filename (extension stripped); falls back to index 0 if no match.
            filename_no_ext = os.path.splitext(data[2].split('/')[-1])
            file_index = 0
            for idx, value in enumerate(files):
                if value.find(filename_no_ext[0]) != -1:
                    file_index = idx
                    break
            logger.debug('fileindex : %s', file_index)
            content = SystemLogicSelenium.get_file_content(files[file_index])
            byteio = io.BytesIO()
            byteio.write(content)
            filedata = byteio.getvalue()
            return send_file(
                io.BytesIO(filedata),
                mimetype='application/octet-stream',
                as_attachment=True,
                attachment_filename=data[2])
        # Non-selenium sites: hand off to the plain HTTP download helper.
        return download2(data, scheduler_instance)
    except Exception as e:
        logger.error('Exception:%s', e)
        logger.error(traceback.format_exc())
def scheduler_function_task():
    """Periodic RSS scan: fetch new board posts for every registered scheduler.

    For each ``ModelScheduler2`` entry this collects new posts via
    ``LogicFromSite.get_list`` (stopping at the last-seen board id unless the
    site uses character ids), saves them, and forwards the saved list to the
    torrent server process tagged with the scheduler's group name.
    All work is best-effort: any exception is logged and swallowed.
    """
    try:
        logger.debug('RSS scheduler_function')
        items = ModelScheduler2.get_list()
        for item in items:
            logger.debug(u'스케쥴링 시작')
            logger.debug('%s %s', item.sitename, item.board_id)
            if item.site is None:
                continue
            if not item.include_scheduler:
                logger.debug('not include_scheduler')
                continue
            # Sites with numeric ids resume from the highest id already saved;
            # char-id sites (e.g. javdb-style) always scan from 0 and dedupe
            # later by board_char_id.
            if 'USING_BOARD_CHAR_ID' not in item.site.info['EXTRA']:
                last_bbs = item.get_last_bbs()
                if last_bbs is not None:
                    max_id = last_bbs.board_id
                else:
                    max_id = 0
            else:
                max_id = 0
            rss_list = LogicFromSite.get_list(
                item.site,
                item.board_id,
                max_id=max_id,
                page=ModelSetting.get_int('max_page'),
                scheduler_instance=item)
            if rss_list:
                save_list = LogicSelf.__db_save_list(item.site, item, rss_list)
                #logger.debug(save_list)
                # Find which group this scheduler belongs to so torrents get
                # categorized; group_name stays None if no group matches.
                groups = LogicSelf.get_group_list()
                group_name = None
                for group in groups:
                    for sched in group['schedulers']:
                        if sched['sitename'] == item.sitename and sched['board_id'] == item.board_id:
                            group_name = group['groupname']
                            break
                    if group_name is not None:
                        break
                from framework.common.torrent.process import TorrentProcess
                TorrentProcess.server_process(save_list, category=group_name)
        # When selenium runs inside celery the driver is not released
        # automatically, so close it explicitly at the end of the run.
        from system import SystemLogicSelenium
        SystemLogicSelenium.close_driver()
    except Exception as e:
        logger.error('Exception:%s', e)
        logger.error(traceback.format_exc())
def get_html(url, referer=None, stream=False, cookie=None, selenium_tag=None):
    """Fetch *url* and return its body.

    :param url: page to fetch.
    :param referer: optional Referer header value ('' is sent when None).
    :param stream: when True, return the raw streaming ``requests`` response
        object instead of its body (caller iterates ``iter_content``).
    :param cookie: optional raw Cookie header value for this request only.
    :param selenium_tag: when set, render the page with Selenium and wait for
        this tag instead of doing a plain HTTP request.
    :returns: page content bytes, a streaming response (``stream=True``), or
        whatever ``LogicFromSite.get_data(url)`` returns as a fallback when
        the primary fetch raises.
    """
    try:
        logger.debug('get_html :%s', url)
        if selenium_tag:
            #if 'USE_SELENIUM' in site_instance.info['EXTRA']:
            from system import SystemLogicSelenium
            data = SystemLogicSelenium.get_pagesoruce_by_selenium(url, selenium_tag)
        else:
            # BUG FIX: the original mutated the shared module-level ``headers``
            # dict in place (set Referer, set/del Cookie).  That is not
            # thread-safe and leaked the Cookie header to later requests if an
            # exception fired between the set and the delete.  Work on a
            # per-call copy instead; the shared dict is never modified.
            request_headers = dict(headers)
            request_headers['Referer'] = '' if referer is None else referer
            if cookie is not None:
                request_headers['Cookie'] = cookie
            if LogicFromSite.proxyes:
                page_content = LogicFromSite.session.get(url, headers=request_headers, proxies=LogicFromSite.proxyes, stream=stream, verify=False)
            else:
                page_content = LogicFromSite.session.get(url, headers=request_headers, stream=stream, verify=False)
            if stream:
                return page_content
            data = page_content.content
        #logger.debug(data)
    except Exception as e:
        logger.error('Exception:%s', e)
        logger.error(traceback.format_exc())
        logger.error('Known..')
        # Fallback fetch path for known-problematic sites.
        data = LogicFromSite.get_data(url)
    return data
def pageparser(url):
    """Fetch *url* and return its HTML source, or ``None`` on failure.

    When the ``use_selenium`` setting is 'True' the source is taken from a
    Selenium-rendered page (waiting for the site footer); otherwise a plain
    ``requests.get`` with browser-like headers is used.
    """
    try:
        if ModelSetting.get('use_selenium') == 'True':
            from system import SystemLogicSelenium
            return SystemLogicSelenium.get_pagesoruce_by_selenium(
                url, '//footer[@class="at-footer"]')
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7'
        }
        # NOTE(review): the original function contained proxy and
        # cloudflare-bypass branches (keyed on the 'proxy' and
        # 'cloudflare_bypass' settings) *after* an unconditional return, making
        # them unreachable — those settings were silently ignored.  The dead
        # code has been removed here without changing behavior; if the settings
        # should be honored again, the request below must branch on them
        # (proxies={'https': proxy_url, 'http': proxy_url} /
        # cfscrape.create_scraper()).
        page_source = requests.get(url, headers=headers).text
        #logger.debug(page_source)
        return page_source
    except Exception as e:
        logger.error('Exception:%s', e)
        logger.error(traceback.format_exc())
        return None
def __get_download_list(html, tree, site_instance, item):
    """Extract torrent/subtitle download links from a post's HTML.

    Runs the site's ``DOWNLOAD_REGEX`` (named groups ``url`` and ``filename``)
    over *html*, normalizes each hit (unquote, unescape, optional
    ``DOWNLOAD_URL_SUB`` / ``FILENAME_SUB`` rewrites, absolute-URL fixup) and
    returns a de-duplicated list of ``{'link', 'filename'[, 'direct_url']}``
    dicts.  On the SJVA server, subtitle files (.smi/.srt/.ass) found on posts
    that also carry a magnet are additionally downloaded and mirrored to a
    Discord webhook, whose attachment URL is stored as ``direct_url``.
    Always returns a list; all exceptions are logged and swallowed.
    """
    download_list = []
    try:
        if 'DOWNLOAD_REGEX' not in site_instance.info:
            return download_list
        #logger.debug(html)
        #tmp = html.find('a href="https://www.rgtorrent.me/bbs/download.php')
        #if tmp != -1:
        #    logger.debug(html[tmp-300:tmp+300])
        #logger.debug(site_instance.info['DOWNLOAD_REGEX'])
        tmp = re.compile(site_instance.info['DOWNLOAD_REGEX'], re.MULTILINE).finditer(html)
        for t in tmp:
            #logger.debug(t.group('url'))
            #logger.debug(t.group('filename'))
            if t.group('filename').strip() == '':
                continue
            entity = {}
            entity['link'] = py_urllib.unquote(t.group('url').strip()).strip()
            entity['link'] = unescape(entity['link'])
            logger.debug(entity['link'])
            entity['filename'] = py_urllib.unquote(t.group('filename').strip())
            entity['filename'] = unescape(entity['filename'])
            # Optional per-site URL rewrite; {URL} expands to the site root.
            if 'DOWNLOAD_URL_SUB' in site_instance.info:
                logger.debug(entity['link'])
                entity['link'] = re.sub(
                    site_instance.info['DOWNLOAD_URL_SUB'][0],
                    site_instance.info['DOWNLOAD_URL_SUB'][1].format(URL=site_instance.info['TORRENT_SITE_URL']),
                    entity['link']).strip()
            # Make relative links absolute against the site root.
            if not entity['link'].startswith('http'):
                form = '%s%s' if entity['link'].startswith('/') else '%s/%s'
                entity['link'] = form % (site_instance.info['TORRENT_SITE_URL'], entity['link'])
            if 'FILENAME_SUB' in site_instance.info:
                entity['filename'] = re.sub(
                    site_instance.info['FILENAME_SUB'][0],
                    site_instance.info['FILENAME_SUB'][1],
                    entity['filename']).strip()
            # De-duplicate by link.
            exist = False
            for tt in download_list:
                if tt['link'] == entity['link']:
                    exist = True
                    break
            if not exist:
                if app.config['config']['is_sjva_server'] and len(item['magnet']) > 0:  # or True:
                    try:
                        ext = os.path.splitext(entity['filename'])[1].lower()
                        #item['magnet']
                        if ext in ['.smi', '.srt', '.ass']:
                        #if True:
                            import io
                            if 'USE_SELENIUM' in site_instance.info['EXTRA']:
                                # Browser-mediated download; fixed 10s wait for
                                # the file to land in the download directory.
                                from system import SystemLogicSelenium
                                driver = SystemLogicSelenium.get_driver()
                                driver.get(entity['link'])
                                import time
                                time.sleep(10)
                                files = SystemLogicSelenium.get_downloaded_files()
                                logger.debug(files)
                                # Pick the downloaded file whose name matches
                                # the expected filename (extension stripped).
                                filename_no_ext = os.path.splitext(entity['filename'].split('/')[-1])
                                file_index = 0
                                for idx, value in enumerate(files):
                                    if value.find(filename_no_ext[0]) != -1:
                                        file_index = idx
                                        break
                                logger.debug('fileindex : %s', file_index)
                                content = SystemLogicSelenium.get_file_content(files[file_index])
                                byteio = io.BytesIO()
                                byteio.write(content)
                            else:
                                data = LogicFromSite.get_html(entity['link'], referer=item['url'], stream=True)
                                byteio = io.BytesIO()
                                for chunk in data.iter_content(1024):
                                    byteio.write(chunk)
                            # Mirror the subtitle to Discord; the resulting
                            # attachment URL becomes a stable direct link.
                            from discord_webhook import DiscordWebhook, DiscordEmbed
                            webhook_url = app.config['config']['rss_subtitle_webhook']
                            text = '%s\n<%s>' % (item['title'], item['url'])
                            webhook = DiscordWebhook(url=webhook_url, content=text)
                            webhook.add_file(file=byteio.getvalue(), filename=entity['filename'])
                            response = webhook.execute()
                            discord = response.json()
                            logger.debug(discord)
                            if 'attachments' in discord:
                                entity['direct_url'] = discord['attachments'][0]['url']
                    except Exception as e:
                        # Best-effort: a failed mirror must not drop the link.
                        logger.debug('Exception:%s', e)
                        logger.debug(traceback.format_exc())
                download_list.append(entity)
        return download_list
    except Exception as e:
        logger.debug('Exception:%s', e)
        logger.debug(traceback.format_exc())
    return download_list
def __get_bbs_list(site_instance, board, max_page, max_id, xpath_dict, is_test=False):
    """Scrape up to *max_page* pages of a board and return new post items.

    Each returned item is ``{'title', 'url', 'id'}``.  Rows are located via
    ``xpath_dict['XPATH']`` (a template containing ``[%s]`` for the row
    index); titles and ids are refined through the optional TITLE_XPATH /
    TITLE_SUB / TITLE_REGEX and ID_REGEX hooks.  Scanning stops early when a
    post id <= *max_id* is seen (unless the site opts out via
    NO_BREAK_BY_MAX_ID, or uses character ids and is de-duplicated against
    the DB instead).  With ``is_test`` every parsed row is returned without
    any max-id/duplicate filtering.
    """
    bbs_list = []
    # Some boards interleave non-post rows; INDEX_STEP/INDEX_START let a site
    # skip them (defaults: start at row 1, step 1).
    index_step = xpath_dict['INDEX_STEP'] if 'INDEX_STEP' in xpath_dict else 1
    index_start = xpath_dict['INDEX_START'] if 'INDEX_START' in xpath_dict else 1
    stop_by_maxid = False
    if 'FORCE_FIRST_PAGE' in site_instance.info['EXTRA']:
        max_page = 1
    cookie = None
    if 'COOKIE' in site_instance.info:
        cookie = site_instance.info['COOKIE']
    for p in range(max_page):
        url = LogicFromSite.get_board_url(site_instance, board, str(p + 1))
        # The list container xpath is the row template truncated at '[%s]'.
        list_tag = xpath_dict['XPATH'][:xpath_dict['XPATH'].find('[%s]')]
        #list_tag = '/html/body/main/div/div/div[3]/div/table/tbody'
        logger.debug('list_tag : %s', list_tag)
        logger.debug('Url : %s', url)
        if 'USE_SELENIUM' in site_instance.info['EXTRA']:
            from system import SystemLogicSelenium
            tmp = SystemLogicSelenium.get_pagesoruce_by_selenium(url, list_tag)
        else:
            tmp = LogicFromSite.get_html(url, cookie=cookie)
        #logger.debug(tmp)
        tree = html.fromstring(tmp)
        #tree = html.fromstring(LogicFromSite.get_html(url)))
        lists = tree.xpath(list_tag)
        logger.debug('Count : %s', len(lists))
        for i in range(index_start, len(lists) + 1, index_step):
            try:
                a_tag = tree.xpath(xpath_dict['XPATH'] % i)
                a_tag_index = len(a_tag) - 1
                if a_tag_index == -1:
                    logger.debug('a_tag_index : %s', a_tag_index)
                    continue
                item = {}
                #
                if 'TITLE_XPATH' in xpath_dict:
                    #logger.debug(a_tag[a_tag_index].xpath(xpath_dict['TITLE_XPATH']))
                    if xpath_dict['TITLE_XPATH'].endswith('text()'):
                        logger.debug(a_tag[a_tag_index].xpath(xpath_dict['TITLE_XPATH']))
                        item['title'] = py_urllib.unquote(
                            a_tag[a_tag_index].xpath(xpath_dict['TITLE_XPATH'])[-1]).strip()
                    else:
                        item['title'] = py_urllib.unquote(
                            a_tag[a_tag_index].xpath(xpath_dict['TITLE_XPATH'])[0].text_content()).strip()
                else:
                    item['title'] = py_urllib.unquote(a_tag[a_tag_index].text_content()).strip()
                if 'TITLE_SUB' in xpath_dict:
                    item['title'] = re.sub(xpath_dict['TITLE_SUB'][0],
                                           xpath_dict['TITLE_SUB'][1],
                                           item['title']).strip()
                # After generic title handling, extract via regex if provided.
                if 'TITLE_REGEX' in xpath_dict:
                    match = re.compile(xpath_dict['TITLE_REGEX']).search(item['title'])
                    if match:
                        item['title'] = match.group('title')
                item['url'] = a_tag[a_tag_index].attrib['href']
                if 'DETAIL_URL_SUB' in site_instance.info:
                    #item['url'] = item['url'].replace(site_instance.info['DETAIL_URL_RULE'][0], site_instance.info['DETAIL_URL_RULE'][1].format(URL=site_instance.info['TORRENT_SITE_URL']))
                    item['url'] = re.sub(
                        site_instance.info['DETAIL_URL_SUB'][0],
                        site_instance.info['DETAIL_URL_SUB'][1].format(URL=site_instance.info['TORRENT_SITE_URL']),
                        item['url'])
                if not item['url'].startswith('http'):
                    form = '%s%s' if item['url'].startswith('/') else '%s/%s'
                    item['url'] = form % (site_instance.info['TORRENT_SITE_URL'], item['url'])
                # Extract the numeric post id from the URL; a site-specific
                # ID_REGEX overrides the common fallbacks.
                item['id'] = ''
                if 'ID_REGEX' in site_instance.info:
                    id_regexs = [site_instance.info['ID_REGEX']]
                    #id_regexs.insert(0, site_instance.info['ID_REGEX'])
                else:
                    id_regexs = [
                        r'wr_id\=(?P<id>\d+)', r'\/(?P<id>\d+)\.html',
                        r'\/(?P<id>\d+)$'
                    ]
                for regex in id_regexs:
                    match = re.compile(regex).search(item['url'])
                    if match:
                        item['id'] = match.group('id')
                        break
                # Retry on the URL without its query string (e.g. for the
                # '\/(?P<id>\d+)$' pattern defeated by trailing parameters).
                if item['id'] == '':
                    for regex in id_regexs:
                        match = re.compile(regex).search(item['url'].split('?')[0])
                        if match:
                            item['id'] = match.group('id')
                            break
                logger.debug('ID : %s, TITLE : %s', item['id'], item['title'])
                if item['id'].strip() == '':
                    continue
                if is_test:
                    bbs_list.append(item)
                else:
                    if 'USING_BOARD_CHAR_ID' in site_instance.info['EXTRA']:
                        # Char-id sites (e.g. javdb): de-dupe against the DB
                        # instead of comparing numeric ids.
                        from .model import ModelBbs2
                        entity = ModelBbs2.get(
                            site=site_instance.info['NAME'],
                            board=board,
                            board_char_id=item['id'])
                        if entity is None:
                            bbs_list.append(item)
                            logger.debug('> Append..')
                        else:
                            logger.debug('> exist..')
                    else:
                        # 2019-04-04 (torrent-pong): sites whose ids are not
                        # monotonically increasing skip-but-continue instead
                        # of stopping at max_id.
                        try:
                            if 'NO_BREAK_BY_MAX_ID' in site_instance.info['EXTRA']:
                                if int(item['id']) <= max_id:
                                    continue
                                else:
                                    bbs_list.append(item)
                            else:
                                if int(item['id']) <= max_id:
                                    logger.debug('STOP by MAX_ID(%s)', max_id)
                                    stop_by_maxid = True
                                    break
                                bbs_list.append(item)
                            #logger.debug(item)
                        except Exception as e:
                            logger.error('Exception:%s', e)
                            logger.error(traceback.format_exc())
            except Exception as e:
                # Per-row failures are logged with the site info and skipped.
                logger.error('Exception:%s', e)
                logger.error(traceback.format_exc())
                logger.error(site_instance.info)
        if stop_by_maxid:
            break
    logger.debug('Last count :%s', len(bbs_list))
    return bbs_list
def episode_download(queue_entity_episode): import plugin wr_id = queue_entity_episode.wr_id logger.debug('Episode Download wr_id:%s', wr_id) try: from selenium.webdriver.support.ui import WebDriverWait from system import SystemLogicSelenium if LogicNormal.driver is None: LogicNormal.driver = SystemLogicSelenium.create_driver() driver = LogicNormal.driver url = '%s/bbs/board.php?bo_table=manga&wr_id=%s' % ( ModelSetting.get('sitecheck'), wr_id) driver.get(url) fix_tag = WebDriverWait( driver, 30 ).until(lambda driver: driver.find_element_by_xpath( '//*[@id="thema_wrapper"]/div[3]/div/div/div[1]/div[2]/div[3]/div' )) SystemLogicSelenium.remove_element(driver, fix_tag) tag = WebDriverWait( driver, 30 ).until(lambda driver: driver.find_element_by_xpath( '//*[@id="thema_wrapper"]/div[3]/div/div/div[1]/div[2]/div[1]/div/div[1]/a[2]' )) queue_entity_episode.manga_id = tag.get_attribute('href').split( '=')[-1] title = driver.title queue_entity_episode.title = LogicNormal.titlereplace(title) match = re.compile( ur'(?P<main>.*?)((단행본.*?)?|특별편)?(\s(?P<sub>(\d|\-|\.)*?(화|권)))?(\-)?(전|후|중)?(\s?\d+(\-\d+)?화)?(\s\(완결\))?\s?$' ).match(title) if match: queue_entity_episode.maintitle = match.group('main').strip() else: match2 = re.compile(ur'(?P<main>.*?)\s시즌') if match2: queue_entity_episode.maintitle = match2.group( 'main').strip() else: queue_entity_episode.maintitle = title logger.debug('not match') queue_entity_episode.maintitle = LogicNormal.titlereplace( queue_entity_episode.maintitle) if ModelSetting.get('use_title_folder') == 'True': download_path = os.path.join(ModelSetting.get('dfolder'), queue_entity_episode.maintitle, queue_entity_episode.title) else: download_path = os.path.join(ModelSetting.get('dfolder'), queue_entity_episode.title) logger.debug(title) logger.debug(queue_entity_episode.maintitle) image_tags = WebDriverWait( driver, 30 ).until(lambda driver: driver.find_elements_by_xpath( '//*[@id="thema_wrapper"]/div[3]/div/div/div[1]/div[2]/div[5]/div/img' )) 
queue_entity_episode.total_image_count = len(image_tags) if not os.path.exists(download_path): os.makedirs(download_path) queue_entity_episode.status = '캡처중' plugin.socketio_callback('episode', queue_entity_episode.as_dict(), encoding=False) full = SystemLogicSelenium.full_screenshot(driver) if full is None: queue_entity_episode.status = '실패' plugin.socketio_callback('episode', queue_entity_episode.as_dict(), encoding=False) else: queue_entity_episode.status = '파일 생성중' for idx, tag in enumerate(image_tags): image_filepath = os.path.join( download_path, str(idx + 1).zfill(5) + '.png') left = tag.location['x'] top = tag.location['y'] right = tag.location['x'] + tag.size['width'] bottom = top + tag.size['height'] logger.debug('%s %s %s %s %s', idx, left, top, right, bottom) im = full.crop( (left, top, right, bottom)) # defines crop points im.save(image_filepath) queue_entity_episode.current_image_index = idx plugin.socketio_callback('episode', queue_entity_episode.as_dict(), encoding=False) if ModelSetting.get('zip') == 'True': LogicNormal.makezip(download_path) queue_entity_episode.status = '완료' plugin.socketio_callback('episode', queue_entity_episode.as_dict(), encoding=False) return True except Exception as e: logger.error('Exception:%s', e) logger.error(traceback.format_exc())
def download2(self, entity):
    """Capture one Naver webtoon episode to a single PNG file.

    Loads the episode page identified by ``entity['title_id']`` /
    ``entity['episode_id']`` in the shared Selenium driver, reads the series
    and episode titles, disables the remote-control overlay, screenshots the
    page and crops the comic viewer area into
    ``<download_path>[/<title>]/<episode_id> <title> <episode_title>.png``.
    Status codes written to ``entity``: 1 waiting, 11 done, 12 file already
    exists, 2 failed, 13 retry limit (>= 5 attempts) exceeded; every change
    is pushed to the UI via ``LogicNormal.update_ui`` and the entity is
    persisted at the end regardless of outcome.
    """
    from selenium.webdriver.support.ui import WebDriverWait
    try:
        from system import SystemLogicSelenium
        from . import plugin
        if LogicNormal.driver is None:
            LogicNormal.driver = SystemLogicSelenium.create_driver()
        driver = LogicNormal.driver
        url = 'https://comic.naver.com/webtoon/detail.nhn?titleId=%s&no=%s' % (
            entity['title_id'], entity['episode_id'])
        logger.debug(url)
        ret = False  # NOTE(review): never used after assignment
        driver.get(url)
        entity['download_count'] += 1
        entity['status'] = 1
        entity['str_status'] = '대기'
        LogicNormal.update_ui(self, entity)
        tag = WebDriverWait(driver, 30).until(
            lambda driver: driver.find_element_by_xpath(
                '//*[@id="content"]/div[1]/div[1]/div[2]/h2'))
        entity['title'] = SystemLogicSelenium.get_text_excluding_children(driver, tag).strip()
        tag = WebDriverWait(driver, 30).until(
            lambda driver: driver.find_element_by_xpath(
                '//*[@id="content"]/div[1]/div[2]/div[1]/h3'))
        entity['episode_title'] = tag.text
        entity['str_status'] = '분석'
        LogicNormal.update_ui(self, entity)
        # Turn the "remote control" toggle OFF so its overlay does not appear
        # in the capture.
        tag = WebDriverWait(driver, 30).until(
            lambda driver: driver.find_element_by_xpath(
                '//*[@id="btnRemoteConOnOff"]'))
        if tag.find_element_by_xpath('em').text == 'ON':
            #logger.debug('ON to OFF')
            tag.click()
        #S = lambda X: driver.execute_script('return document.body.parentNode.scroll'+X)
        #driver.set_window_size(S('Width'),S('Height'))  # May need manual adjustment
        #logger.debug(S('Width'))
        #logger.debug(S('Height'))
        tag = WebDriverWait(driver, 30).until(
            lambda driver: driver.find_element_by_xpath(
                '//*[@id="comic_view_area"]/div[1]'))
        dirname = ModelSetting.get('download_path')
        if ModelSetting.get_bool('use_title_folder'):
            dirname = os.path.join(dirname, Util.change_text_for_use_filename(entity['title']))
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        tmp = u'%s %s %s.png' % (entity['episode_id'].zfill(3),
                                 entity['title'], entity['episode_title'])
        entity['filename'] = os.path.join(dirname, Util.change_text_for_use_filename(tmp))
        if os.path.exists(entity['filename']):
            entity['status'] = 12
            entity['str_status'] = '파일 있음'
            LogicNormal.update_ui(self, entity)
        else:
            entity['str_status'] = '다운로드중'
            LogicNormal.update_ui(self, entity)
            from system import SystemLogicSelenium
            full = SystemLogicSelenium.full_screenshot(driver)
            if full is not None:
                # Prefer the second <img> inside the viewer when present,
                # else the first, else fall back to the container itself.
                img_tag = tag.find_elements_by_xpath('img')
                if len(img_tag) > 1:
                    img_tag = img_tag[1]
                elif len(img_tag) == 1:
                    img_tag = img_tag[0]
                else:
                    img_tag = tag
                # Horizontal bounds from the image, vertical bounds from the
                # container — presumably intentional so the full episode
                # height is captured; TODO(review) confirm the mixed
                # img_tag/tag coordinates are deliberate.
                left = img_tag.location['x']
                top = tag.location['y']
                right = img_tag.location['x'] + img_tag.size['width']
                bottom = tag.location['y'] + tag.size['height']
                im = full.crop((left, top, right, bottom))  # defines crop points
                im.save(entity['filename'])
                entity['status'] = 11
                entity['str_status'] = '완료'
                LogicNormal.update_ui(self, entity)
            else:
                raise Exception('capture fail.')
    except Exception as e:
        logger.error('Exception:%s', e)
        logger.error(traceback.format_exc())
        entity['status'] = 2
        entity['str_status'] = '실패'
        if entity['download_count'] >= 5:
            entity['status'] = 13
            entity['str_status'] = '재시도초과'
        LogicNormal.update_ui(self, entity)
    ModelItem.save_as_dict(entity)