# Exemplo n.º 1
# 0
def write_to_text(params):
    """Crawl listing pages and append newly-seen ``.html`` item links to batch files.

    params keys used:
        start_page / end_page: page index range fed to ``range`` (end exclusive).
        select_str: CSS selector locating the link elements on each listing page.
        down_url: listing-page URL template containing one ``%i`` placeholder.

    Relies on module-level ``common``, ``cur_dir`` and ``pre_url``
    (defined elsewhere in this module — TODO confirm).
    Side effects only (writes link files); returns None.
    """
    temp = 0
    start_page = params['start_page']
    end_page = params['end_page']
    select_str = params['select_str']
    down_url = params['down_url']

    # File recording links already downloaded (first *.text file in cwd).
    done_down_text = common.get_file_name_list(os.getcwd(), "text")[0]
    print(done_down_text)
    # BUG FIX: with mode 'a+' the file pointer starts at EOF, so the original
    # read() always returned '' and every link looked new.  Seek to the start
    # before reading; 'a+' is kept so the file is created if missing.
    with open(done_down_text, 'a+') as fileObj:
        fileObj.seek(0)
        readLines = set(fileObj.read().splitlines())
    for i in range(start_page, end_page):
        soup = common.get_beauty_soup(down_url % i)
        itemUrl = soup.select(select_str)

        for item in itemUrl:
            fileUrl = item.get('href')
            # BUG FIX: the original printed before the None check, which made
            # the guard dead code (string concat with None raises TypeError).
            if fileUrl is None:
                continue
            print('fileUrl:' + fileUrl)
            if fileUrl.find('.html') >= 0 and fileUrl not in readLines:
                os.chdir(cur_dir)
                # Record the link as downloaded (保存已下载连接).
                with open(done_down_text, 'a+') as ff:
                    ff.write(fileUrl + '\n')
                fileUrl = pre_url + fileUrl
                temp += 1
                print("fileUrl:" + fileUrl)
                # Append the full URL to a batch file of ~500 links each
                # (保存下载连接).
                with open(
                        common.get_datetime('%Y-%m-%d') + '_' +
                        str(temp // 500) + '.txt', 'a+') as f:
                    f.write(fileUrl + '\n')
# Exemplo n.º 2
# 0
def get_child_img_url(url):
    """Return the de-duplicated image/attachment elements of the post at *url*.

    Tries several selector variants because attachments can sit at different
    nesting depths inside the post body; relies on module-level ``common``.
    """
    soup = common.get_beauty_soup(url)
    # Common prefix of all four original selectors; suffixes are tried in the
    # same order the original select() results were concatenated.
    base = ("body div[id='wrap'] div[id='postlist'] div[id] table tr "
            "td[class='postcontent'] div[class='defaultpost'] ")
    suffixes = (
        "table tr td img[file]",
        "div div table tr td img[file]",
        "div div div[class='postattachlist'] dl dd p img[file]",
        "div div table tbody tr td a[href]",
    )
    collected = []
    for suffix in suffixes:
        collected.extend(soup.select(base + suffix))
    return common.list_distinct(collected)
# Exemplo n.º 3
# 0
def down_noval(params):
    """Download novel text for every URL listed in the *.txt files of ``cur_dir``.

    params keys used:
        down_file_path: base directory; the current month is appended.
        select_str: CSS selector extracting the text content nodes.
        require_pre_url: whether each line must be prefixed with pre_url.
        pre_url: URL prefix, read only when require_pre_url is truthy.

    Relies on module-level ``common``, ``cur_dir`` and ``cur_month``
    (defined elsewhere in this module — TODO confirm).
    Writes one <title>.txt per page into down_path; URLs whose selector
    matches nothing are appended to a dated "未下载" log.  Returns None.
    """
    # Directory the downloaded novels are written to (文件下载路径).
    down_path = params['down_file_path'] + cur_month
    select_str = params['select_str']
    require_pre_url = params['require_pre_url']
    pre_url_ = params['pre_url'] if require_pre_url else ''
    if not os.path.exists(down_path):
        os.makedirs(down_path)
    # BUG FIX: the original mixed os.chdir(down_path) with a no-op
    # os.chdir(os.getcwd()); after the first successful download the process
    # cwd stayed in down_path, so the failure log and any relative
    # open(file_name) landed in the wrong directory.  Absolute paths are used
    # throughout instead of changing directory.
    fail_log = os.path.join(cur_dir,
                            common.get_datetime('%Y-%m-%d') + '_未下载.txt')
    file_name_list = common.get_file_name_list(cur_dir, 'txt')
    for index, file_name in enumerate(file_name_list, 1):
        print('下载第 %i 个文件: %s' % (index, file_name))
        # Each line of the file is one page URL.
        with open(file_name) as file_obj:
            for num, value in enumerate(file_obj, 1):
                line = value.strip('\n')
                if require_pre_url:
                    line = pre_url_ + line
                print('第 %i 行: %s' % (num, line))
                soup = common.get_beauty_soup(line)
                title = common.replace_special_char(soup.title.string)
                # Drop any leading site watermark such as "www...".
                newTitle = title.split('www')[-1]

                print(str(newTitle.strip()))
                textContent = soup.select(select_str)
                print('text数量:' + str(len(textContent)))
                if len(textContent) == 0:
                    # Selector matched nothing: record the line for retry.
                    with open(fail_log, 'a+') as f:
                        f.write('第' + str(num) + '行:' + line + ',' +
                                newTitle + '\n')
                else:
                    out_file = os.path.join(down_path,
                                            newTitle.strip() + '.txt')
                    with open(out_file, 'a+', encoding='utf8') as f:
                        for node in textContent:
                            f.write(node.text + '\n')
                print("-----down over----------------")
# Exemplo n.º 4
# 0
def get_jh_img_url_list(line):
    """Return ``[unique_elements, cleaned_title]`` for the post page at *line*.

    Collects image/attachment elements from several nesting depths of the
    post body and de-duplicates them; relies on module-level ``common``.
    """
    soup = common.get_beauty_soup(line)
    new_title = common.replace_sub(soup.title.string)
    # Shared prefix of the four original selectors; suffixes keep the
    # original concatenation order of the select() results.
    base = ("body div[id='wrap'] div[id='postlist'] div[id] table tr "
            "td[class='postcontent'] div[class='defaultpost'] ")
    suffixes = (
        "table tr td img[file]",
        "div div table tr td img[file]",
        "div div div[class='postattachlist'] dl dd p img[file]",
        "div div table tbody tr td a[href]",
    )
    found = []
    for suf in suffixes:
        found.extend(soup.select(base + suf))
    unique = common.list_distinct(found)
    print('去重后图片数量:' + str(len(unique)))
    return [unique, new_title]
# Exemplo n.º 5
# 0
import os

import common

# Crawl 46 listing pages of the "情感小说" category and append every story
# link found to batch files of 500 links each.
temp = 0
for i in range(1, 47):
    print('第 %i 页' % i)
    # BUG FIX: the page number was never substituted into the template, so
    # every iteration fetched the literal '...-%i.html' address.
    url = "https://www.4455sk.com/xiaoshuo/list-情感小说-%i.html" % i
    print(url)
    soup = common.get_beauty_soup(url)
    itemUrl = soup.select(
        "body div[class='maomi-content'] main[id='main-container'] div[class='text-list-html'] div ul li a"
    )

    for item in itemUrl:
        fileUrl = item.get('href')
        # print('fileUrl:'+fileUrl)
        if fileUrl is not None and fileUrl.find('.html') >= 0:
            fileUrl = "https://www.4455sk.com/" + fileUrl
            temp += 1
            print("fileUrl:" + fileUrl)
            # NOTE: the original os.chdir(os.getcwd()) here was a no-op and
            # has been removed.
            with open(
                    'qinggan_%s_%i.txt' %
                (common.get_datetime('%Y-%m-%d'), temp // 500), 'a+') as f:
                f.write(fileUrl + '\n')
print("打印完成")
# Exemplo n.º 6
# 0
import common

# 获取总行数
path = 'D:/down/qinggan/%s/' % common.get_datetime('%Y-%m-%d')
if not (os.path.exists(path)):
    os.makedirs(path)

file_list = common.get_file_name_list(os.getcwd(), 'txt')
for index, file_name in enumerate(file_list, 1):
    print('下载第 %i 个文件: %s' % (index, file_name))
    # 打开文件
    with open(file_name) as file_obj:
        for num, value in enumerate(file_obj, 1):
            line = value.strip('\n')
            print('第 %i 行: %s' % (num, line))
            itemSoup = common.get_beauty_soup(line)
            title = common.replace_special_char(itemSoup.title.string)
            newTitle = title.split('www')[-1]

            print(str(newTitle.strip()))
            textContent = itemSoup.select(
                "body div[class='maomi-content'] main[id='main-container'] div[class='content']"
            )
            if len(textContent) == 0:
                with open(
                        'qinggan_%s_未下载.txt' % common.get_datetime('%Y-%m-%d'),
                        'a+') as f:
                    f.write('第 %i 行:%s ; %s \n' % (num, line, newTitle) +
                            str(num))
            else:
                with open(newTitle.strip() + '.txt', 'a+',