Example #1
def find_link(region, zone, url_dir):
    if os.path.isfile(url_dir):
        if url_dir.endswith('Top_URL.txt'):
            print url_dir

            # Find every link in the link file and use each one as a seed address to start downloading automatically
            for line in open(url_dir):
                try:
                    line_content = line.strip('\n')
                    rank, userUrl = line_content.split('\t')

                    have_a_rest("23:00:00", "06:00:00")
                    print "Zone " + zone + ":No.", rank, " website is downloading: ", userUrl
                    dir_name = "more_sites"
                    save_path = web4.makeDIR(region, zone)
                    save_path = web4.makeDIR(save_path, dir_name)
                    # Set the crawl depth for this site: the higher the rank, the deeper the crawl
                    rank = int(rank)
                    depth = 1

                    dir_path, myUrl, filestamp = web4.saveHTM(
                        region, zone, save_path, userUrl, rank, depth)  # save the page
                    if dir_path != 'Error':
                        # Crawl succeeded; wait 1 second before crawling the next URL
                        print "please wait for 1 second"
                        time.sleep(1)

                    else:
                        print dir_path, myUrl, filestamp
                except Exception, e:
                    print 'Error', str(e)
                    # build the error-log path
                    error_path = web4.makeDIR(region, zone)
                    error_path = web4.makeDIR(error_path, "url")
                    error_path = os.path.join(error_path,
                                              'get_start_site_ErrorLog.txt')
                    ErrorLog = open(error_path, 'a')  # append any errors that occur
                    ErrorLog.write(line_content + '\tError:\t' + str(e) + '\n')
                    ErrorLog.close()
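
Both this example and the ones below call have_a_rest("23:00:00", "06:00:00") to pause crawling overnight, but that helper is not included in these snippets. A minimal sketch of what it might look like, assuming it only checks whether the current clock falls inside the given window and sleeps until the window ends (the project's real implementation may differ):

import time
from datetime import datetime, timedelta

def have_a_rest(start, end):
    # Sleep while the current time falls inside the [start, end) window.
    # The window may wrap past midnight, e.g. 23:00:00 - 06:00:00.
    fmt = "%H:%M:%S"
    now = datetime.now()
    start_t = datetime.strptime(start, fmt).time()
    end_t = datetime.strptime(end, fmt).time()
    if start_t <= end_t:
        inside = start_t <= now.time() < end_t
    else:
        inside = now.time() >= start_t or now.time() < end_t
    if inside:
        wake = now.replace(hour=end_t.hour, minute=end_t.minute,
                           second=end_t.second, microsecond=0)
        if wake <= now:
            wake += timedelta(days=1)
        print "Night time, sleeping until", wake
        time.sleep((wake - now).total_seconds())
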
Example #2
def move_url_file(filename):
    # Reconstruct the URL file path from the file name
    region, zone, timestamp, depth, postex = filename.split('_', 4)
    url_file_name = filename + ".txt"
    url_path = os.path.join(region, zone)
    url_path = os.path.join(url_path, "more_urls")
    dateDir = timestamp[0:8]
    url_path = os.path.join(url_path, dateDir)
    src_path = os.path.join(url_path, url_file_name)
    dst_path = web4.makeDIR(url_path, "done")
    dst_path = os.path.join(dst_path, url_file_name)

    shutil.move(src_path, dst_path)

    print "Now put file " + filename + " into Done directory..."
Example #3
def find_link(region, url_dir, timestamp, sec):  # walk the directory containing the link files, find each link file and visit it

    finish = "yes"  #下载链接的完成标志,初始默认为yes

    # build the error-log path
    error_path = web4.makeDIR(url_dir, "err_log")
    error_path = os.path.join(error_path, 'file_web_ErrorLog.txt')
    ErrorLog = open(error_path, 'a')  # append any crawl errors

    for s in os.listdir(url_dir):
        thisDir = os.path.join(url_dir, s)
        if "done" in thisDir or "err_log" in thisDir:
            # skip the err_log and done directories while walking
            continue
        elif thisDir.endswith('URL.txt'):
            # if a link file is found, read it and download the links inside
            print thisDir
            finish = "no"
            path, filename = os.path.split(thisDir)  # split path and file name
            filename, filetype = os.path.splitext(filename)  # split file name and extension

            # find every link in the link file and use each one as a seed address to start downloading automatically
            for line in open(thisDir):
                try:
                    line = line.strip('\n')
                    userUrl, fromUrl, rank = line.split('\t', 2)
                    if not userUrl.startswith('http'):
                        continue
                    have_a_rest("23:00:00", "06:00:00")  # pause downloading at night
                    get(filename, userUrl, rank, sec)
                except Exception, e:
                    print 'Error', str(e)
                    ErrorLog.write(line + '\t' + 'Error:' + '\t' + str(e) + '\n')
            print "Links in file " + filename + " are done!"
            # move the finished URL file into the done directory at the same level
            move_url_file(filename)
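
The *URL.txt files read by this loop are plain tab-separated text with the target URL, the referring URL, and the rank on each line (inferred from the split('\t', 2) call), and the file name itself encodes region, zone, timestamp, depth and a suffix (see move_url_file above). A hypothetical snippet that writes a small fixture in that layout for testing the loop; the file name and records are made up:

# Hypothetical fixture: <region>_<zone>_<timestamp>_<depth>_URL.txt with tab-separated records
records = [
    ("http://example.com/a.html", "http://example.com/", "1"),
    ("http://example.com/b.html", "http://example.com/", "2"),
]
with open("mil_zone1_20170810_1_URL.txt", "w") as f:
    for userUrl, fromUrl, rank in records:
        f.write(userUrl + "\t" + fromUrl + "\t" + rank + "\n")
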
Example #4
                    if not userUrl.startswith('http'):
                        continue
                    have_a_rest("23:00:00", "06:00:00")  # pause downloading at night
                    get(filename, userUrl, rank, sec)
                except Exception, e:
                    print 'Error', str(e)
                    ErrorLog.write(line + '\t' + 'Error:' + '\t' + str(e) + '\n')
            print "Links in file " + filename + " are done!"
            # move the finished URL file into the done directory at the same level
            move_url_file(filename)

    ErrorLog.close()
    return finish


#print now_time
if __name__ == '__main__':
    while True:
        region = raw_input("Please enter the crawler's region: ")
        zone = raw_input("Please enter the crawler's working directory: ")
        timestamp = raw_input("please input the date (e.g. 20170810): ")
        sec = raw_input("please input the delay (seconds) between two visits: ")

        url_path = web4.makeDIR(region, zone)
        url_path = web4.makeDIR(url_path, "more_urls")  #在more_url子目录下进行寻找
        url_path = web4.makeDIR(url_path, timestamp)

        # walk the directory containing the link files, find each link file and visit it
        find_link(region, url_path, timestamp, sec)
    print "所有网站处理完毕"
Example #5
-------------------------------------------------------------------------------
"""

#---------------------------------import---------------------------------------
import os
import shutil
import urllib2
import re
from BeautifulSoup import BeautifulSoup

import get_web_list
import web4
import thisLink3_0

#------------------------------------------------------------------------------

###############################################################################
if __name__ == "__main__":

    #1. Download the ranking pages
    region = raw_input("Please enter the crawler's field (mil, sports...): ")
    size = raw_input("Please enter the size of each zone in this field (in pages): ")
    src_dir = web4.makeDIR(region, "url")
    get_web_list.find_link(region, src_dir)
    print "The ChinaZ site already done!"
    #2. Extract the site links from the ranking pages
    site_path = web4.makeDIR(region, "site")  # create the save directory
    thisLink3_0.find_link(region, site_path, size)

    print "该领域的所有分区已建立,请启动main.py脚本开始多进程下载!"
Example #6
File: main.py  Project: maxin5452/coolyun
import thisLink2_1
import file_web2_1

#------------------------------------------------------------------------------

###############################################################################
if __name__ == "__main__":

    region = raw_input("Please enter the crawler's region: ")
    zone = raw_input("Please enter the crawler's working directory: ")
    #zone is a partition under the same region, which makes it easy to run several download processes in parallel
    timestamp = raw_input(
        "Please input the timestamp you want to process (e.g. 20170809): ")
    sec = raw_input("please input the delay (seconds) between two crawls: ")

    root_url_dir = web4.makeDIR(region, zone)
    root_url_dir = web4.makeDIR(root_url_dir, "url")
    # walk the directory containing the link files, find them and visit them
    get_start_site.find_link(region, zone, root_url_dir)
    print "All seed webpages are done ! Now begin to download other webpages...s"

    while True:

        # Extract links from the database; the timestamp in the database and the
        # timestamp used later by file_web2_1 are assumed to be the same
        over = thisLink2_1.find_file(region, zone, timestamp)
        print over

        # download the web pages listed in the URL files
        print "Now start the new round of crawling..."
Example #7
                        print dir_path, myUrl, filestamp
                except Exception, e:
                    print 'Error', str(e)
                    # build the error-log path
                    error_path = web4.makeDIR(region, zone)
                    error_path = web4.makeDIR(error_path, "url")
                    error_path = os.path.join(error_path,
                                              'get_start_site_ErrorLog.txt')
                    ErrorLog = open(error_path, 'a')  # append any errors that occur
                    ErrorLog.write(line_content + '\tError:\t' + str(e) + '\n')
                    ErrorLog.close()

    elif os.path.isdir(url_dir):
        for s in os.listdir(url_dir):
            newDir = os.path.join(url_dir, s)
            find_link(region, zone, newDir)


#print now_time
if __name__ == '__main__':
    while True:
        region = raw_input("Please enter the crawler's working region: ")
        zone = raw_input("Please enter the crawler's working zone: ")
        #zone is a partition under the same region, which makes it easy to run several download processes in parallel
        url_dir = web4.makeDIR(region, zone)
        url_dir = web4.makeDIR(url_dir, "url")
        # walk the directory containing the link files, find each link file and visit it
        find_link(region, zone, url_dir)
    print "所有网站处理完毕"