import os
import time

def download(urlfilepath):
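    # Download every URL listed in urlfilepath, append the responses to a temp
    # file, rename it to a timestamped '*.done' file, and remove the source
    # file. SpiderConfigure, FileUtility, Logger and the const/constant
    # modules come from the surrounding spider project.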
    whoami = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.SPIDER_LOCAL_WHOAMI)
    donepath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.mkdirs(donepath)  
    filename = os.path.basename(urlfilepath)
    writeTmpfile = os.path.join(donepath, filename + '.temp')
    writefile = os.path.join(donepath, filename + '.txt.' + str(int(time.time())) + '.done')
    if os.path.exists(writeTmpfile):
        os.remove(writeTmpfile)
    if os.path.exists(writefile):
        os.remove(writefile) 
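    # on Windows debug runs, a file containing any https URL is routed to the
    # webkit downloader below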
    httpsflag = False
    if constant.DEBUG_FLAG == constant.DEBUG_FLAG_WINDOWS:
        readlines = FileUtility.readlines(urlfilepath)
        for line in readlines:
            if line.strip().startswith('https'):
                httpsflag = True
                break
    # create an empty file
    with open(writeTmpfile, 'a+') as filetemp:
        filetemp.write('')
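    # dispatch on file suffix: webkit files (or https on Windows debug) go to
    # the webkit renderer, post files carry JSON url/data payloads, and
    # everything else is fetched with a plain GET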
    if urlfilepath.endswith(constant.WEBKIT_FILE_SUFFIX) or httpsflag:
        downWebkit(urlfilepath, writeTmpfile)
    elif urlfilepath.endswith(constant.POST_FILE_SUFFIX):
        downPost(urlfilepath, writeTmpfile)
    else:
        downGet(urlfilepath, writeTmpfile)
    if os.path.exists(writeTmpfile):
        os.rename(writeTmpfile, writefile)
        Logger.getlogging().debug('DoneFile Download Success: {f}'.format(f=writefile))
    FileUtility.remove(urlfilepath)       
Example #2
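Seeds QueryStorage: refreshes the database, stores each S2 query read from the s2 file (and again as a 'waibu', i.e. external, query), then stores validated tab-separated query/URL pairs from the configured S3 tieba input file.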
    def storagequery(self):
        QueryStorage.updatedb()
        SpiderConfigure.getinstance().setchannel(SPIDER_CHANNEL_S2)
        s2file = SpiderConfigure.getinstance().gets2file()
        if FileUtility.exists(s2file):
            lines = FileUtility.readlines(s2file)
            for strquery in lines:
                QueryStorage.getinstance().storequery(strquery)
                QueryStorage.getinstance().storewaibuquery(strquery)

        tiebafile = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                              const.SPIDER_S3_INPUT_FILE)
        if FileUtility.exists(tiebafile):
            lines = FileUtility.readlines(tiebafile)
            for strquery in lines:
                if not self.checks3query(strquery):
                    continue
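                # each valid line is tab-separated: query '\t' url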
                query = strquery.split('\t')[0].strip()
                url = strquery.split('\t')[1].strip()
                QueryStorage.getinstance().storetiebaquery(query, url)
Example #3
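For each query line in sfile, sets the S2 channel and query, regenerates the URL file path, and calls s2query on every site produced by the factory.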
    def s2upload(self, sfile):
        if FileUtility.exists(sfile):
            lines = FileUtility.readlines(sfile)
            for line in lines:
                try:
                    query = line.strip()
                    self.conf.setchannel(SPIDER_CHANNEL_S2)
                    self.conf.setquery(query)
                    URLFileManager.getinstance().generateurlfilepath()
                    allsite = self.factory.getall()
                    for site in allsite:
                        site.s2query(query)
                except:
                    Logger.printexception()
Example #4
import time

def downGet(urlfilepath, writeTmpfile, second=2):
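    # GET each URL in the file and append one JSON response per line to the
    # temp file, sleeping `second` seconds between requests; Get() is the
    # project's HTTP GET wrapper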
    get = Get()
    try:
        lines = FileUtility.readlines(urlfilepath)
        for line in lines:
            try:   
                jsonstr = get.get(line.strip())
                if not jsonstr:
                    Logger.getlogging().debug('Download URL failed: {url}'.format(url=line.strip()))
                    continue
                with open(writeTmpfile, 'a+') as filetemp:
                    filetemp.write(jsonstr + '\n')
                time.sleep(second)
            except:                 
                Logger.printexception()
    except:
        Logger.printexception()
Example #5
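Reads tab-separated query/URL pairs from tiebafile and calls s2query with the URL on the site the factory resolves for it, while collecting the distinct queries and sites it touched.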
    def s3upload(self, tiebafile):
        lines = FileUtility.readlines(tiebafile)
        querylist = []
        sitelist = []
        self.conf.setchannel(SPIDER_CHANNEL_S2)
        for strquery in lines:
            query = strquery.split('\t')[0].strip()
            url = strquery.split('\t')[1].strip()
            Logger.getlogging().debug(query)
            Logger.getlogging().debug(url)
            self.conf.setquery(query)
            URLFileManager.getinstance().generateurlfilepath()
            querylist.append(query)
            site = self.factory.getsite(url)
            site.s2query(url)
            if site not in sitelist:
                sitelist.append(site)
Example #6
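Queues every URL in sfile on the S1 channel for webkit download, wrapping each in a URLContext of type S1_MAIN_BODY.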
    def s1upload(self, sfile):
        if FileUtility.exists(sfile):
            lines = FileUtility.readlines(sfile)
            self.conf.setchannel(SPIDER_CHANNEL_S1)
            self.conf.setquery('')
            URLFileManager.getinstance().generateurlfilepath()
            for line in lines:
                try:
                    url = line.strip()
                    params = PageBasicInfo()
                    params.url = url
                    #NewsStorage.seturlinfos(params)
                    context = URLContext()
                    context.originalurl = url
                    context.type = URLContext.S1_MAIN_BODY
                    Logger.getlogging().debug(url)
                    URLManager.getinstance().storeurl(url, context,
                                                      REQUEST_TYPE_WEBKIT)
                except:
                    Logger.printexception()
Example #7
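Parses a downloaded 'waibu' (external-source) result file: each line is a JSON record with query, url, title, body, pubtime and an optional clicknum. Records are typed as video when the file path contains '302_tencent_video', otherwise as news, buffered per query, and finally written to NewsStorage.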
    def wb_analysis(self, filepath):
        Logger.getlogging().info(
            'Now, Start to analysis Waibu file {fl}'.format(fl=filepath))
        if '302_tencent_video' in filepath:
            type = constant.SPIDER_S2_WEBSITE_VIDEO
        else:
            type = constant.SPIDER_S2_WEBSITE_NEWS

        self.conf.setchannel(constant.SPIDER_CHANNEL_S2)
        lines = FileUtility.readlines(filepath)
        tempwaibustorage = {}
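        # buffer parsed pages per query: URLManager.waibustorage is the shared
        # cache, tempwaibustorage holds only this file's records for the
        # upload pass below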
        for line in lines:
            try:
                line = json.loads(line)
                params = PageBasicInfo()
                params.query = line['query']
                params.url = line['url']
                params.title = Common.strfilter(line['title'])
                params.body = Common.strfilter(line['body'])
                params.pubtime = line['pubtime']
                clicknum = line.get('clicknum', 0)
                if clicknum:
                    params.clicknum = int(clicknum)
                params.type = type
                if params.query not in URLManager.waibustorage:
                    URLManager.waibustorage[params.query] = []
                if params.query not in tempwaibustorage:
                    tempwaibustorage[params.query] = []
                URLManager.waibustorage[params.query].append(params)
                tempwaibustorage[params.query].append(params)
            except:
                Logger.printexception()

        Logger.getlogging().debug(
            'Now, Starting Select url to Insert and Update for uploading WAIBU data!'
        )
        for query in tempwaibustorage:
            paramslist = tempwaibustorage[query]
            for params in paramslist:
                self.conf.setquery(query)
                NewsStorage.seturlinfos(params)
Example #8
import json
import time

def downPost(urlfilepath, writeTmpfile, second=10):
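    # POST each JSON line ({'url': ..., 'data': ...}) and append the response
    # to the temp file, sleeping `second` seconds between requests but not
    # after the last one; Post() is the project's HTTP POST wrapper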
    post = Post()
    try:
        lines = FileUtility.readlines(urlfilepath)
        length = len(lines)
        for line in lines:
            try:
                # parse inside the try so one malformed line is logged and
                # skipped instead of aborting the whole file
                jsonline = json.loads(line.strip())
                url = jsonline['url']
                data = jsonline['data']
                #cookie = post.createCookie(url)
                jsonstr = post.post(url, data, cookie=None)
                if not jsonstr:
                    Logger.getlogging().debug('Download URL failed: url:{url}\tdata:{data}'.format(url=url, data=str(data)))
                    continue                
                with open(writeTmpfile, 'a+') as filetemp:
                    filetemp.write(jsonstr + '\n')
                if length > 1:
                    time.sleep(second)
                length -= 1
            except:               
                Logger.printexception()
    except:
        Logger.printexception()