Example #1
def api_download(bbs_id):
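    # bbs_id has the form "<ModelBbs2 id>_<file index>"; look up the post and the selected file,
    # then download via Selenium for USE_SELENIUM sites or hand off to download2().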
    logger.debug('api download :%s', bbs_id)
    try:
        rss_id, index = bbs_id.split('_')

        entity = ModelBbs2.get(id=int(rss_id)).as_dict()

        #logger.debug(entity)
        scheduler_instance = ModelScheduler2.get2(sitename=entity['site'], board_id=entity['board'])
        data = [
            entity['url'],
            entity['files'][int(index)][0],
            entity['files'][int(index)][1]
        ]
        
        site_instance = ModelSite2.get(name=entity['site']).info

        if 'USE_SELENIUM' in site_instance['EXTRA']:
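            # For sites flagged with USE_SELENIUM, fetch the file through the shared Selenium browser session.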
            from system import SystemLogicSelenium
            driver = SystemLogicSelenium.get_driver()
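            # Open the post page in the browser first, then navigate the same session to the attachment URL.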

            SystemLogicSelenium.get_pagesoruce_by_selenium(data[0], site_instance['SELENIUM_WAIT_TAG'])

            logger.debug(data[1])
            logger.debug('selenium download go..')
            driver.get(data[1])
            logger.debug('selenium wait before...')
            #SystemLogicSelenium.waitUntilDownloadCompleted(120)
            #SystemLogicSelenium.waitUntilDownloadCompleted(10)
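            # In place of the commented-out download-completion waits, fall back to a fixed 10-second sleep.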
            import time
            time.sleep(10)
            logger.debug('selenium wait end')
            files = SystemLogicSelenium.get_downloaded_files()
            logger.debug(files)
            # Locate the downloaded file whose name matches the expected attachment filename.
            filename_no_ext = os.path.splitext(data[2].split('/')[-1])[0]
            file_index = 0
            for idx, value in enumerate(files):
                if value.find(filename_no_ext) != -1:
                    file_index = idx
                    break
            logger.debug('fileindex : %s', file_index)
            content = SystemLogicSelenium.get_file_content(files[file_index])
            return send_file(
                io.BytesIO(content),
                mimetype='application/octet-stream',
                as_attachment=True,
                attachment_filename=data[2])
       
        return download2(data, scheduler_instance)
    except Exception as e:
        logger.error('Exception:%s', e)
        logger.error(traceback.format_exc())
Example #2
    def get_html(url, referer=None, stream=False, cookie=None, selenium_tag=None):
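        # Fetch a page either through Selenium (when selenium_tag is given) or with the shared
        # requests session, optionally through the configured proxy and with a per-call Cookie header.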
        try:
            logger.debug('get_html :%s', url)
            if selenium_tag:
                #if 'USE_SELENIUM' in site_instance.info['EXTRA']:
                from system import SystemLogicSelenium
                data = SystemLogicSelenium.get_pagesoruce_by_selenium(url, selenium_tag)

            else:
                headers['Referer'] = '' if referer is None else referer
                if cookie is not None:
                    headers['Cookie'] = cookie

                if LogicFromSite.proxyes:
                    page_content = LogicFromSite.session.get(url, headers=headers, proxies=LogicFromSite.proxyes, stream=stream, verify=False)
                else:
                    page_content = LogicFromSite.session.get(url, headers=headers, stream=stream, verify=False)
                if cookie is not None:
                    del headers['Cookie']
                if stream:
                    return page_content
                data = page_content.content
            #logger.debug(data)
        except Exception as e:
            logger.error('Exception:%s', e)
            logger.error(traceback.format_exc())
            logger.error('Known..')
            data = LogicFromSite.get_data(url)
        return data
Example #3
    def pageparser(url):
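        # Return the page source: via Selenium when 'use_selenium' is enabled, otherwise with
        # requests, optionally through a proxy and/or a cfscrape session for Cloudflare bypass.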
        try:
            if ModelSetting.get('use_selenium') == 'True':
                from system import SystemLogicSelenium
                return SystemLogicSelenium.get_pagesoruce_by_selenium(
                    url, '//footer[@class="at-footer"]')
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7'
            }
            # Choose the fetch path based on the 'proxy' and 'cloudflare_bypass' settings.
            if ModelSetting.get('proxy') == 'False' and ModelSetting.get('cloudflare_bypass') == 'False':
                page_source = requests.get(url, headers=headers).text
            elif ModelSetting.get('proxy') == 'True' and ModelSetting.get('cloudflare_bypass') == 'False':
                proxies = {'https': ModelSetting.get('proxy_url'), 'http': ModelSetting.get('proxy_url')}
                page_source = requests.get(url, headers=headers, proxies=proxies).text
            elif ModelSetting.get('proxy') == 'False' and ModelSetting.get('cloudflare_bypass') == 'True':
                if LogicNormal.scraper is None:
                    LogicNormal.scraper = cfscrape.create_scraper()
                page_source = LogicNormal.scraper.get(url, headers=headers).text
            elif ModelSetting.get('proxy') == 'True' and ModelSetting.get('cloudflare_bypass') == 'True':
                if LogicNormal.scraper is None:
                    LogicNormal.scraper = cfscrape.create_scraper()
                proxies = {'https': ModelSetting.get('proxy_url'), 'http': ModelSetting.get('proxy_url')}
                page_source = LogicNormal.scraper.get(url, headers=headers, proxies=proxies).text
            #logger.debug(page_source)
        except Exception as e:
            logger.error('Exception:%s', e)
            logger.error(traceback.format_exc())
            return None
        return page_source
Example #4
    def __get_bbs_list(site_instance,
                       board,
                       max_page,
                       max_id,
                       xpath_dict,
                       is_test=False):
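        # Crawl up to max_page board pages, build an item (title/url/id) per list row from xpath_dict,
        # and stop early once an id at or below max_id is seen (unless NO_BREAK_BY_MAX_ID is set).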
        bbs_list = []
        index_step = xpath_dict['INDEX_STEP'] if 'INDEX_STEP' in xpath_dict else 1
        index_start = xpath_dict['INDEX_START'] if 'INDEX_START' in xpath_dict else 1
        stop_by_maxid = False
        if 'FORCE_FIRST_PAGE' in site_instance.info['EXTRA']:
            max_page = 1
        cookie = None
        if 'COOKIE' in site_instance.info:
            cookie = site_instance.info['COOKIE']

        for p in range(max_page):
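            # Build the page URL and derive the list-container xpath by stripping the trailing '[%s]' row index from the row xpath.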
            url = LogicFromSite.get_board_url(site_instance, board, str(p + 1))
            list_tag = xpath_dict['XPATH'][:xpath_dict['XPATH'].find('[%s]')]
            #list_tag = '/html/body/main/div/div/div[3]/div/table/tbody'
            logger.debug('list_tag : %s', list_tag)

            logger.debug('Url : %s', url)
            if 'USE_SELENIUM' in site_instance.info['EXTRA']:
                from system import SystemLogicSelenium
                tmp = SystemLogicSelenium.get_pagesoruce_by_selenium(
                    url, list_tag)
            else:
                tmp = LogicFromSite.get_html(url, cookie=cookie)
            #logger.debug(tmp)
            tree = html.fromstring(tmp)
            #tree = html.fromstring(LogicFromSite.get_html(url)))

            lists = tree.xpath(list_tag)

            logger.debug('Count : %s', len(lists))

            for i in range(index_start, len(lists) + 1, index_step):
                try:
                    a_tag = tree.xpath(xpath_dict['XPATH'] % i)
                    a_tag_index = len(a_tag) - 1

                    if a_tag_index == -1:
                        logger.debug('a_tag_index : %s', a_tag_index)
                        continue
                    item = {}
                    #
                    if 'TITLE_XPATH' in xpath_dict:

                        #logger.debug(a_tag[a_tag_index].xpath(xpath_dict['TITLE_XPATH']))
                        if xpath_dict['TITLE_XPATH'].endswith('text()'):
                            logger.debug(a_tag[a_tag_index].xpath(
                                xpath_dict['TITLE_XPATH']))

                            item['title'] = py_urllib.unquote(
                                a_tag[a_tag_index].xpath(
                                    xpath_dict['TITLE_XPATH'])[-1]).strip()
                        else:
                            item['title'] = py_urllib.unquote(
                                a_tag[a_tag_index].xpath(
                                    xpath_dict['TITLE_XPATH'])
                                [0].text_content()).strip()
                    else:
                        item['title'] = py_urllib.unquote(
                            a_tag[a_tag_index].text_content()).strip()

                    if 'TITLE_SUB' in xpath_dict:
                        item['title'] = re.sub(xpath_dict['TITLE_SUB'][0],
                                               xpath_dict['TITLE_SUB'][1],
                                               item['title']).strip()

                    # After the normal title handling, extract the title with a regex if one is configured.
                    if 'TITLE_REGEX' in xpath_dict:
                        match = re.compile(xpath_dict['TITLE_REGEX']).search(
                            item['title'])
                        if match:
                            item['title'] = match.group('title')

                    item['url'] = a_tag[a_tag_index].attrib['href']
                    if 'DETAIL_URL_SUB' in site_instance.info:
                        #item['url'] = item['url'].replace(site_instance.info['DETAIL_URL_RULE'][0], site_instance.info['DETAIL_URL_RULE'][1].format(URL=site_instance.info['TORRENT_SITE_URL']))
                        item['url'] = re.sub(
                            site_instance.info['DETAIL_URL_SUB'][0],
                            site_instance.info['DETAIL_URL_SUB'][1].format(
                                URL=site_instance.info['TORRENT_SITE_URL']),
                            item['url'])

                    if not item['url'].startswith('http'):
                        form = '%s%s' if item['url'].startswith('/') else '%s/%s'
                        item['url'] = form % (
                            site_instance.info['TORRENT_SITE_URL'],
                            item['url'])

                    item['id'] = ''
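                    # Extract the post id from the detail URL: use the site's ID_REGEX if defined,
                    # otherwise fall back to the common wr_id / numeric-path patterns.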
                    if 'ID_REGEX' in site_instance.info:
                        id_regexs = [site_instance.info['ID_REGEX']]
                        #id_regexs.insert(0, site_instance.info['ID_REGEX'])
                    else:
                        id_regexs = [
                            r'wr_id\=(?P<id>\d+)', r'\/(?P<id>\d+)\.html',
                            r'\/(?P<id>\d+)$'
                        ]
                    for regex in id_regexs:
                        match = re.compile(regex).search(item['url'])
                        if match:
                            item['id'] = match.group('id')
                            break
                    if item['id'] == '':
                        for regex in id_regexs:
                            match = re.compile(regex).search(
                                item['url'].split('?')[0])
                            if match:
                                item['id'] = match.group('id')
                                break

                    logger.debug('ID : %s, TITLE : %s', item['id'],
                                 item['title'])
                    if item['id'].strip() == '':
                        continue
                    if is_test:
                        bbs_list.append(item)
                    else:
                        if 'USING_BOARD_CHAR_ID' in site_instance.info['EXTRA']:
                            # javdb
                            from .model import ModelBbs2
                            entity = ModelBbs2.get(
                                site=site_instance.info['NAME'],
                                board=board,
                                board_char_id=item['id'])
                            if entity is None:
                                bbs_list.append(item)
                                logger.debug('> Append..')
                            else:
                                logger.debug('> exist..')
                        else:
                            # 2019-04-04 TorrentPong
                            try:
                                if 'NO_BREAK_BY_MAX_ID' in site_instance.info['EXTRA']:
                                    if int(item['id']) <= max_id:
                                        continue
                                    else:
                                        bbs_list.append(item)
                                else:
                                    if int(item['id']) <= max_id:
                                        logger.debug('STOP by MAX_ID(%s)',
                                                     max_id)
                                        stop_by_maxid = True
                                        break
                                    bbs_list.append(item)
                                    #logger.debug(item)
                            except Exception as e:
                                logger.error('Exception:%s', e)
                                logger.error(traceback.format_exc())
                except Exception as e:
                    logger.error('Exception:%s', e)
                    logger.error(traceback.format_exc())
                    logger.error(site_instance.info)
            if stop_by_maxid:
                break
        logger.debug('Last count :%s', len(bbs_list))
        return bbs_list