Python defaultlist示例，JavHelper.core.utils.defaultlist Python示例

示例#1

0

显示文件

def javlib_set_page(page_template: str,
                    page_num=1,
                    url_parameter=None,
                    config=None) -> tuple:
    """Fetch one javlibrary listing page and collect the videos on it.

    Args:
        page_template: URL suffix appended to the configured javlibrary
            base URL. It is formatted with the ``page_num`` and
            ``url_parameter`` keywords.
        page_num: Page number substituted into ``page_template``.
        url_parameter: Extra value substituted into ``page_template``.
        config: Not used by this function.

    Returns:
        A 2-tuple ``(jav_objs_raw, max_page)``. ``jav_objs_raw`` is a
        ``defaultlist`` of dicts holding the 'title', 'javid', 'img' and
        'car' values found on the page. ``max_page`` is whatever
        ``find_max_page`` returns for the href of the "page last" link,
        or ``page_num`` when the page has no such link.
    """
    field_xpaths = {
        'title': '//*[@class="video"]/a/@title',
        'javid': '//*[@class="video"]/@id',
        'img': '//*[@class="video"]/a/img/@src',
        'car': '//*/div[@class="video"]/a/div[@class="id"]/text()'
    }
    last_page_xpath = '//*/div[@class="page_selector"]/a[@class="page last"]/@href'

    # Look the base URL up on every call instead of caching it (per the
    # original note, it is read from the ini file).
    javlib_url = return_config_string(['其他设置', 'javlibrary网址'])

    lib_url = javlib_url + page_template.format(page_num=page_num,
                                                url_parameter=url_parameter)
    print(f'accessing {lib_url}')

    res = return_post_res(lib_url, behind_cloudflare=True).content
    root = etree.HTML(res)

    # Results are matched by position: the i-th hit of every XPath is
    # stored in the i-th dict.
    jav_objs_raw = defaultlist(dict)
    for field, xpath in field_xpaths.items():
        for index, value in enumerate(root.xpath(xpath)):
            jav_objs_raw[index][field] = value

    try:
        max_page = find_max_page(root.xpath(last_page_xpath)[0])
    except IndexError:
        # No "page last" link was found; fall back to the requested page.
        max_page = page_num

    return jav_objs_raw, max_page

示例#2

0

显示文件

def javbus_magnet_search(car: str):
    """Collect the magnet links javbus lists for one car.

    The video page is fetched first to read the ``gid`` value embedded in
    its script. The ajax endpoint is then queried with that gid, sending
    the video page URL as the referer header.

    Args:
        car: Appended to the configured javbus base URL to form the video
            page URL.

    Returns:
        A ``defaultlist`` of dicts with whitespace-stripped 'magnet',
        'title' and 'size' values, plus 'size_sort', which is the result
        of ``parsed_size_to_int`` on the stripped size text.

    Raises:
        AttributeError: If the video page contains no ``var gid = ...``
            assignment (the regex search returns ``None``).
    """
    base_url = return_config_string(['其他设置', 'javbus网址'])
    gid_pattern = r'.*?var gid = (\d*);.*?'
    column_xpaths = {
        'magnet': '//tr/td[position()=1]/a[1]/@href',
        'title': '//tr/td[position()=1]/a[1]/text()',
        'size': '//tr/td[position()=2]/a[1]/text()'
    }
    video_url = (base_url + '{car}').format(car=car)
    ajax_template = base_url + 'ajax/uncledatoolsbyajax.php?gid={gid}&uc=0'

    page_text = return_get_res(video_url).text
    gid = re.search(gid_pattern, page_text).group(1)

    ajax_response = return_get_res(ajax_template.format(gid=gid),
                                   headers={'referer': video_url})
    root = etree.HTML(ajax_response.content)

    # Results are matched by position: the i-th hit of every XPath is
    # stored in the i-th dict.
    magnets = defaultlist(dict)
    for field, xpath in column_xpaths.items():
        for row, raw_text in enumerate(root.xpath(xpath)):
            # A bare strip() removes tabs, CR, LF and spaces in one pass.
            cleaned = raw_text.strip()
            magnets[row].update({field: cleaned})
            if field == 'size':
                magnets[row].update({'size_sort': parsed_size_to_int(cleaned)})

    return magnets

示例#3

0

显示文件

def javdb_set_page(page_template: str, page_num=1, url_parameter=None, config=None) -> tuple:
    """Fetch one javdb listing page and collect the videos on it.

    Args:
        page_template: URL suffix appended to the javdb base URL. It is
            formatted with the ``page_num`` and ``url_parameter`` keywords.
        page_num: Page number substituted into ``page_template``.
        url_parameter: Extra value substituted into ``page_template``.
        config: Not used by this function.

    Returns:
        A 2-tuple ``(jav_objs_raw, max_page)``. ``jav_objs_raw`` is a
        ``defaultlist`` of dicts holding the 'title', 'javid', 'img' and
        'car' values found on the page ('javid' and 'car' use the same
        XPath). ``max_page`` is the text of the last matching pagination
        link, or ``page_num`` when there is none or the text is empty.
    """
    xpath_dict = {
        'title': '//a[@class="box"]/div[@class="video-title"]/text()',
        'javid': '//a[@class="box"]/div[@class="uid"]/text()',
        'img': '//div[@class="item-image fix-scale-cover"]/img/@data-src',
        'car': '//a[@class="box"]/div[@class="uid"]/text()'
    }
    xpath_max_page = '//ul[@class="pagination-list"]/li/a[@class="pagination-link"][last()]/text()'

    # The javdb base URL is hard-coded here rather than read from config.
    javdb_url = 'https://javdb4.com/'
    set_url = javdb_url + page_template.format(page_num=page_num, url_parameter=url_parameter)
    print(f'accessing {set_url}')

    # not really behind cloudflare but may prevent python scrape
    res = return_post_res(set_url, cookies={'over18': "1"}, behind_cloudflare=True).content
    root = etree.HTML(res.decode('utf-8'))

    # Results are matched by position: the i-th hit of every XPath is
    # stored in the i-th dict.
    jav_objs_raw = defaultlist(dict)
    for k, v in xpath_dict.items():
        for _i, _value in enumerate(root.xpath(v)):
            jav_objs_raw[_i].update({k: _value})

    try:
        max_page = root.xpath(xpath_max_page)[-1]
    except IndexError:
        # No pagination link found. Catch only the empty-result case so
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        max_page = page_num
    if not max_page:
        max_page = page_num

    return jav_objs_raw, max_page

示例#4

0

显示文件

def javbus_set_page(page_template: str, page_num=1, url_parameter=None, config=None) -> tuple:
    """Fetch one javbus listing page and collect the videos on it.

    Args:
        page_template: URL suffix appended to the configured javbus base
            URL. It is formatted with the ``page_num`` and
            ``url_parameter`` keywords.
        page_num: Page number substituted into ``page_template``.
        url_parameter: Extra value substituted into ``page_template``.
        config: Not used by this function.

    Returns:
        A 2-tuple ``(jav_objs_raw, max_page)``. ``jav_objs_raw`` is a
        ``defaultlist`` of dicts holding the 'title', 'javid', 'img' and
        'car' values found on the page ('javid' and 'car' use the same
        XPath). ``max_page`` is the text of the second-to-last pagination
        link, or ``page_num`` when fewer than two links exist or the text
        is empty.
    """
    xpath_dict = {
        'title': '//div[@class="photo-frame"]/img[not(contains(@src, "actress"))]/@title',
        'javid': '//div[@class="photo-info"]/span/date[1]/text()',
        'img': '//div[@class="photo-frame"]/img[not(contains(@src, "actress"))]/@src',
        'car': '//div[@class="photo-info"]/span/date[1]/text()'
    }
    xpath_max_page = '//ul[@class="pagination pagination-lg"]/li/a/text()'

    # Look the base URL up on every call instead of caching it (per the
    # original note, it is read from the ini file).
    javbus_url = return_config_string(['其他设置', 'javbus网址'])
    set_url = javbus_url + page_template.format(page_num=page_num, url_parameter=url_parameter)
    print(f'accessing {set_url}')

    res = return_post_res(set_url).content
    root = etree.HTML(res)

    # Results are matched by position: the i-th hit of every XPath is
    # stored in the i-th dict.
    jav_objs_raw = defaultlist(dict)
    for k, v in xpath_dict.items():
        for _i, _value in enumerate(root.xpath(v)):
            jav_objs_raw[_i].update({k: _value})

    try:
        # Presumably the last link is a "next page" control, hence [-2];
        # verify against the live markup.
        max_page = root.xpath(xpath_max_page)[-2]
    except IndexError:
        # Fewer than two pagination links. Catch only that case so
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        max_page = page_num
    if not max_page:
        max_page = page_num

    return jav_objs_raw, max_page

示例#5

0

显示文件

文件： jav321.py 项目： windygu/JAVOneStop

def jav321_set_page(page_template: str, page_num=1, url_parameter=None, config=None) -> tuple:
    """Fetch one jav321 listing page and collect the videos on it.

    Args:
        page_template: URL suffix appended to the jav321 base URL. It is
            formatted with the ``page_num`` and ``url_parameter`` keywords.
        page_num: Page number substituted into ``page_template``.
        url_parameter: Extra value substituted into ``page_template``.
        config: Not used by this function.

    Returns:
        A 2-tuple ``(jav_objs_raw, max_page)``. ``jav_objs_raw`` is a
        ``defaultlist`` of dicts holding 'title', 'javid', 'img' and 'car'.
        'car' is rebuilt as ``<pre>-<digits>`` from the last
        space-separated token of the thumbnail text, or left as that raw
        token when it does not match ``DEFAULT_FILENAME_PATTERN``.
        ``max_page`` is ``int(page_num) + 1`` when the page has a "next"
        pager link, otherwise ``page_num``.
    """
    xpath_dict = {
        'title': '//div[@class="thumbnail"]/a/text()',
        # Stored as the raw href; the original note says the id still
        # needs to be extracted from the link.
        'javid': '//div[@class="thumbnail"]/a/@href',
        'img': '//div[@class="thumbnail"]/a/img/@src',
        'car': '//div[@class="thumbnail"]/a/text()'  # need to extract from title
    }
    xpath_max_page = '//ul[@class="pager"]/li[@class="next"]/a/text()'
    max_page = page_num  # default value

    # The jav321 base URL is hard-coded here rather than read from config.
    jav_url = 'https://www.jav321.com/'
    set_url = jav_url + page_template.format(page_num=page_num, url_parameter=url_parameter)
    print(f'accessing {set_url}')

    res = return_post_res(set_url).content
    root = etree.HTML(res)

    # Results are matched by position: the i-th hit of every XPath is
    # stored in the i-th dict.
    jav_objs_raw = defaultlist(dict)
    for k, v in xpath_dict.items():
        for _i, _value in enumerate(root.xpath(v)):
            # need to extract car from title, reusing file_scanner function
            if k == 'car':
                # The car is taken to be the last space-separated token.
                _preprocess = _value.split(' ')[-1]

                # try to extract proper car
                try:
                    name_group = re.search(DEFAULT_FILENAME_PATTERN, _preprocess)
                    name_digits = name_group.group('digit')

                    # only keep 0 under 3 digits
                    # keep 045, 0830 > 830, 1130, 0002 > 002, 005
                    if name_digits.isdigit():
                        name_digits = str(int(name_digits))
                    name_digits = name_digits.rjust(3, '0')
                    _value = name_group.group('pre') + '-' + name_digits
                except AttributeError as e:
                    # re.search returned None: the token is not a standard car.
                    print(f'cannot extract standard car format from {_preprocess} due to {e}')
                    _value = _preprocess
            jav_objs_raw[_i].update({k: _value})

    # The site exposes only a "next" link, so report one more page than
    # the current one whenever that link is present.
    if len(root.xpath(xpath_max_page)) > 0:
        try:
            max_page = int(max_page) + 1
        except (TypeError, ValueError):
            # page_num is not numeric; keep it unchanged (best effort).
            pass

    return jav_objs_raw, max_page

示例#6

0

显示文件

def jav777_set_page(page_template: str,
                    page_num=1,
                    url_parameter=None,
                    config=None) -> tuple:
    """Fetch one jav777 listing page and collect the videos on it.

    Args:
        page_template: URL suffix appended to ``JAV777_URL``. It is
            formatted with the ``page_num`` and ``url_parameter`` keywords.
        page_num: Page number substituted into ``page_template``.
        url_parameter: Extra value substituted into ``page_template``.
        config: Not used by this function.

    Returns:
        A 2-tuple ``(jav_objs_raw, max_page)``. ``jav_objs_raw`` is a
        ``defaultlist`` of dicts holding 'title', 'javid', 'img' and 'car'.
        'car' is rebuilt as ``<pre>-<digits>`` from the post title, or left
        as the title (minus any "(HD)" prefix) when the title does not
        match ``DEFAULT_FILENAME_PATTERN``. ``max_page`` is the text of the
        second-to-last ``<center>/<a>`` link, or ``page_num`` when there is
        none or the text is empty.
    """
    xpath_dict = {
        'title': '//h2[@class="post-title"]/a/@title',
        'javid': '//div[@class="post-container"]/div/@id',
        'img': '//div[@class="featured-media"]/a/img/@src',
        'car': '//h2[@class="post-title"]/a/@title'
    }
    xpath_max_page = '//center/a[position() = (last()-1)]/text()'
    hd_prefix = '(HD)'

    # The base URL comes from the JAV777_URL constant, not the ini file.
    jav777_url = JAV777_URL
    set_url = jav777_url + page_template.format(page_num=page_num,
                                                url_parameter=url_parameter)
    print(f'accessing {set_url}')

    res = return_post_res(set_url).content
    root = etree.HTML(res)

    # Results are matched by position: the i-th hit of every XPath is
    # stored in the i-th dict.
    jav_objs_raw = defaultlist(dict)
    for k, v in xpath_dict.items():
        for _i, _value in enumerate(root.xpath(v)):
            # need to extract car from title, reusing file_scanner function
            if k == 'car':
                # Remove literal "(HD)" prefixes. str.lstrip('(HD)') treats
                # its argument as a character set and would also eat
                # leading 'H'/'D' letters of the car ('HND-123' -> 'ND-123').
                while _value.startswith(hd_prefix):
                    _value = _value[len(hd_prefix):]

                try:
                    name_group = re.search(DEFAULT_FILENAME_PATTERN, _value)
                    name_digits = name_group.group('digit')

                    # only keep 0 under 3 digits
                    # keep 045, 0830 > 830, 1130, 0002 > 002, 005
                    if name_digits.isdigit():
                        name_digits = str(int(name_digits))
                    name_digits = name_digits.rjust(3, '0')
                    _value = name_group.group('pre') + '-' + name_digits
                except AttributeError as e:
                    # re.search returned None: keep the title as the car
                    # instead of aborting the whole page for one post.
                    print(f'cannot extract standard car format from {_value} due to {e}')
            jav_objs_raw[_i].update({k: _value})

    try:
        max_page = root.xpath(xpath_max_page)[0]
    except IndexError:
        # No pagination link found. Catch only the empty-result case so
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        max_page = page_num
    if not max_page:
        max_page = page_num

    return jav_objs_raw, max_page

示例#7

0

显示文件

def javlib_set_page(page_prefix: str, page_num: int, config=None) -> tuple:
    """Fetch one javlibrary listing page and collect the videos on it.

    Args:
        page_prefix: URL suffix appended to the configured javlibrary base
            URL, immediately followed by ``page_num``.
        page_num: Page number appended to the URL.
        config: Mapping that provides the 'proxies' and 'cookies' entries
            passed to the request. When ``None``, a deep copy of
            ``DEFAULT_JAVLIB_CONFIG`` is used.

    Returns:
        A 2-tuple ``(jav_objs_raw, max_page)``. ``jav_objs_raw`` is a
        ``defaultlist`` of dicts holding the 'title', 'javid', 'img' and
        'car' values found on the page. ``max_page`` is whatever
        ``find_max_page`` returns for the href of the "page last" link,
        or ``page_num`` when the page has no such link.
    """
    xpath_dict = {
        'title': '//*[@class="video"]/a/@title',
        'javid': '//*[@class="video"]/@id',
        'img': '//*[@class="video"]/a/img/@src',
        'car': '//*/div[@class="video"]/a/div[@class="id"]/text()'
    }
    xpath_max_page = '//*/div[@class="page_selector"]/a[@class="page last"]/@href'

    # Look the base URL up on every call instead of caching it (per the
    # original note, it is read from the ini file).
    javlib_url = return_config_string(['其他设置', 'javlibrary网址'])

    # Fill missing parameters. The identity check avoids invoking a custom
    # __eq__ on the config object.
    if config is None:
        config = deepcopy(DEFAULT_JAVLIB_CONFIG)

    lib_url = javlib_url + page_prefix + str(page_num)
    print(f'accessing {lib_url}')

    res = return_post_res(lib_url,
                          proxies=config['proxies'],
                          cookies=config['cookies']).content
    root = etree.HTML(res)

    # Results are matched by position: the i-th hit of every XPath is
    # stored in the i-th dict.
    jav_objs_raw = defaultlist(dict)
    for k, v in xpath_dict.items():
        for _i, _value in enumerate(root.xpath(v)):
            jav_objs_raw[_i].update({k: _value})

    try:
        max_page = find_max_page(root.xpath(xpath_max_page)[0])
    except IndexError:
        # No "page last" link was found; fall back to the requested page.
        max_page = page_num

    return jav_objs_raw, max_page