Exemplo n.º 1
0
def get_img_url_list(url):
    """Gather the de-duplicated selector matches for the page at ``url``.

    The page is loaded through ``BeautySoupTool.BeautySoupTool`` and queried
    with four CSS selectors targeting ``img[file]`` and ``a[href]`` elements
    inside the post content.

    Args:
        url: Address handed to ``BeautySoupTool.BeautySoupTool``.

    Returns:
        ``[matches, title]`` on success, where ``matches`` is the output of
        ``common.list_distinct`` over every selector hit and ``title`` is
        the wrapper's ``title`` attribute.
        ``[[url], exc]`` when anything raises; the failure is logged at
        INFO level and the exception object is returned, not re-raised.
    """
    first_selector, *other_selectors = (
        "body div[id='wrap'] div[id='postlist'] div[id] table tr td[class='postcontent'] div[class='defaultpost'] table tr td img[file]",
        "body div[id='wrap'] div[id='postlist'] div[id] table tr td[class='postcontent'] div[class='defaultpost'] div div table tr td img[file]",
        "body div[id='wrap'] div[id='postlist'] div[id] table tr td[class='postcontent'] div[class='defaultpost'] div div div[class='postattachlist'] dl dd p img[file]",
        "body div[id='wrap'] div[id='postlist'] div[id] table tr td[class='postcontent'] div[class='defaultpost'] div div table tbody tr td a[href]",
    )
    try:
        page = BeautySoupTool.BeautySoupTool(url)
        # The first result list doubles as the accumulator, so the object
        # handed to list_distinct is the one returned by select().
        matches = page.beautySoup.select(first_selector)
        for selector in other_selectors:
            matches.extend(page.beautySoup.select(selector))
        # Drop duplicates before handing the matches back.
        unique_matches = common.list_distinct(matches)
        return [unique_matches, page.title]
    except Exception as e:
        timestamp = common.get_datetime('%Y/%m/%d %H:%M')
        logger.info("get_img_url_list 请求失败,{},{}".format(timestamp, e))
        return [[url], e]
Exemplo n.º 2
0
def distinct_url():
    """Read download URLs from ``file_path`` and return them de-duplicated.

    Every non-empty line is scanned for the first occurrence of ``https``;
    the URL runs from there up to the next comma, or to the end of the line
    when no comma follows. Lines that contain no ``https`` are skipped.
    The number of URLs before and after de-duplication is logged.

    Returns:
        Whatever ``common.list_distinct`` returns for the collected URLs
        (presumably a list without duplicates -- confirm against ``common``).
    """
    down_urls = []
    with open(file_path, encoding='utf-8') as file_obj:
        for raw_line in file_obj:
            line = raw_line.strip('\n')
            if line == '':
                continue
            start = line.find('https')
            if start == -1:
                # No URL on this line. Slicing from -1 would only ever
                # contribute a junk empty entry, so skip the line instead.
                continue
            # Search for the delimiter after the URL start so a comma earlier
            # in the line cannot produce an empty slice.
            end = line.find(',', start)
            if end == -1:
                # str.find returns -1 when there is no comma; used as a slice
                # bound that would chop the URL's last character.
                end = len(line)
            down_urls.append(line[start:end])
    logger.info('去重前数据:' + str(len(down_urls)))
    list_distinct = common.list_distinct(down_urls)
    logger.info('去重后数据:' + str(len(list_distinct)))
    return list_distinct
Exemplo n.º 3
0
def get_child_img_url(url):
    """Return the de-duplicated selector matches for the page at ``url``.

    The document comes from ``common.get_beauty_soup`` and is queried with
    four CSS selectors targeting ``img[file]`` and ``a[href]`` elements
    inside the post content; the combined hits are passed through
    ``common.list_distinct``.

    Args:
        url: Address handed to ``common.get_beauty_soup``.

    Returns:
        The result of ``common.list_distinct`` over all matches.
    """
    first_selector, *other_selectors = (
        "body div[id='wrap'] div[id='postlist'] div[id] table tr td[class='postcontent'] div[class='defaultpost'] table tr td img[file]",
        "body div[id='wrap'] div[id='postlist'] div[id] table tr td[class='postcontent'] div[class='defaultpost'] div div table tr td img[file]",
        "body div[id='wrap'] div[id='postlist'] div[id] table tr td[class='postcontent'] div[class='defaultpost'] div div div[class='postattachlist'] dl dd p img[file]",
        "body div[id='wrap'] div[id='postlist'] div[id] table tr td[class='postcontent'] div[class='defaultpost'] div div table tbody tr td a[href]",
    )
    document = common.get_beauty_soup(url)
    # The first result list doubles as the accumulator, so list_distinct
    # receives the object returned by select().
    matches = document.select(first_selector)
    for selector in other_selectors:
        matches.extend(document.select(selector))
    return common.list_distinct(matches)
Exemplo n.º 4
0
def get_jh_img_url_list(line):
    """Return the de-duplicated selector matches and cleaned page title.

    The document comes from ``common.get_beauty_soup(line)``. Its title
    string is passed through ``common.replace_sub``, then four CSS selectors
    targeting ``img[file]`` and ``a[href]`` elements inside the post content
    are evaluated and their hits de-duplicated with ``common.list_distinct``.
    The number of remaining matches is printed to stdout.

    Args:
        line: Value handed to ``common.get_beauty_soup`` (presumably a page
            URL -- confirm against the caller).

    Returns:
        ``[matches, title]``: the de-duplicated matches and the cleaned
        title.
    """
    first_selector, *other_selectors = (
        "body div[id='wrap'] div[id='postlist'] div[id] table tr td[class='postcontent'] div[class='defaultpost'] table tr td img[file]",
        "body div[id='wrap'] div[id='postlist'] div[id] table tr td[class='postcontent'] div[class='defaultpost'] div div table tr td img[file]",
        "body div[id='wrap'] div[id='postlist'] div[id] table tr td[class='postcontent'] div[class='defaultpost'] div div div[class='postattachlist'] dl dd p img[file]",
        "body div[id='wrap'] div[id='postlist'] div[id] table tr td[class='postcontent'] div[class='defaultpost'] div div table tbody tr td a[href]",
    )
    document = common.get_beauty_soup(line)
    # Clean the title first, before any selector is evaluated.
    cleaned_title = common.replace_sub(document.title.string)
    matches = document.select(first_selector)
    for selector in other_selectors:
        matches.extend(document.select(selector))
    unique_matches = common.list_distinct(matches)
    print('去重后图片数量:' + str(len(unique_matches)))
    return [unique_matches, cleaned_title]