def __init__(self,parent=None):
     SiteComments.__init__(self)
     self.r = RegexUtility()
     # self.basicstorage = BaseInfoStorage()
     # self.commentstorage = CommentsStorage()
     if parent:
         self.website = parent.website         
Example #2
 def __download__(self, downloaderlist):
     valid_json_files = []
     for impl in downloaderlist:
         json_files = impl.download()
         for dfile in json_files:
             for ufile in self.upload_file_list.keys():
                 if RegexUtility.match(
                         Downloader.DOWNLOAD_FORMAT1.format(file=ufile),
                         dfile):
                     self.upload_file_list.pop(ufile)
                     if FileUtility.exists(dfile):
                         valid_json_files.append(dfile)
                         Logger.getlogging().info('downloadedjsonfile\t' +
                                                  dfile)
                 elif RegexUtility.match(
                         Downloader.DOWNLOAD_FORMAT2.format(file=ufile),
                         dfile):
                     value = RegexUtility.parse(
                         Downloader.DOWNLOAD_FORMAT2.format(file=ufile),
                         dfile)[0]
                     if FileUtility.exists(dfile):
                         valid_json_files.append(dfile)
                         Logger.getlogging().info('downloadedjsonfile\t' +
                                                  dfile)
                     if value[0] == value[1]:
                         self.upload_file_list.pop(ufile)
         retransmissionfiles = impl.outtimefiles()
         for fl in retransmissionfiles.keys():
             # download failed
             if fl not in self.all_retransmissionfiles:
                 self.all_retransmissionfiles[fl] = retransmissionfiles[fl]
             self.all_retransmissionfiles[fl].retrans_num += 1
             self.all_retransmissionfiles[fl].taskinfo = impl
             self.retransmissionfiles[fl] = self.all_retransmissionfiles[fl]
             if self.retransmissionfiles[
                     fl].retrans_num <= self.retransmissionlimitnum:
                 # the download failed, but treat it as downloaded, so remove it from upload_file_list
                 self.upload_file_list.pop(fl)
                 Logger.getlogging().debug(
                     'download fail file {fl}:{num}th fail'.format(
                         fl=fl,
                         num=self.all_retransmissionfiles[fl].retrans_num))
             else:
                 # the download failed, but treat it as downloaded, so remove it from upload_file_list; stop retrying and remove it from the retransmission list
                 self.upload_file_list.pop(fl)
                 self.retransmissionfiles.pop(fl)
                 Logger.getlogging().debug(
                     'download fail file {fl}: more than {num} failures'.
                     format(
                         fl=fl,
                         num=self.all_retransmissionfiles[fl].retrans_num -
                         1))
     return valid_json_files
 def __init__(self, post_url, parent=None):
     # URL used to identify the class that handles returned S2 query results; the main site URL is recommended
     SiteS2Query.__init__(self)
     self.fakeoriginalurl = post_url
     self.r = RegexUtility()
     #self.post_url = BBSS2PostQuery.POST_URL
     self.post_url = post_url
     self.queryinfo = ''
     if parent:
         self.website = parent.website
     self.headers = {
         'User-Agent':
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
         'Accept-Encoding': 'gzip, deflate, sdch'
     }
 def get(self, url):
     saveJson = {}
     try:
         Logger.getlogging().debug('Downloading: {url}'.format(url=url))
         request = urllib2.Request(url, headers=self.headers)
         response = urllib2.urlopen(request, timeout=self.timeout)
         code = response.getcode()
         info = response.info()
         # check the response code; return None if it is not 200
         if code == 200:
             html = response.read()
             if (("Content-Encoding" in info) and (info['Content-Encoding'] == "gzip")):
                 html = zlib.decompress(html, 16 + zlib.MAX_WBITS);
             Logger.getlogging().debug('Request Sucessed: {url}'.format(url=url))
         else:
             Logger.getlogging().error('open {url} error, code = {code}'.format(url=url, code=code))
             Logger.getlogging().error('Request Failed: {url}'.format(url=url))
             return None
     except:
         Logger.getlogging().error('Request Failed: {url}'.format(url=url))
         Logger.printexception()
         return None
     charset = RegexUtility.getid('charset', html)
     html = Common.trydecode(html, charset)
     saveJson['foundin'] = Common.urlenc(url)
     saveJson['html'] = Common.urlenc(html.encode(constant.CHARSET_UTF8))
     saveJson['crawler_time'] = int(time.time())
     jsonStr = json.dumps(saveJson)
     return jsonStr     
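A hedged sketch of the reverse step: the JSON string returned above carries a percent-encoded URL and page source, which the analysis() helpers further down unpack. This assumes Common.urlenc/urldec are plain percent-encoding wrappers (those helpers are not shown in these snippets):

import json
try:
    from urllib.parse import unquote   # Python 3
except ImportError:
    from urllib import unquote         # Python 2

def unpack(json_str):
    # reverse of get(): recover the original URL, the page source and the crawl time
    record = json.loads(json_str)
    url = unquote(record['foundin'])
    html = unquote(record['html'])
    crawler_time = record['crawler_time']
    return url, html, crawler_time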
Example #5
 def __init__(self, parent=None):
     # URL used to identify the class that handles returned S2 query results; the main site URL is recommended
     SiteS2Query.__init__(self)
     self.fakeoriginalurl = 'https://donghua.dmzj.com/'
     self.r = RegexUtility()
     if parent:
         self.website = parent.website
Example #6
 def analysis(self, line, post=False):
     param = ProcessParam()
     js = json.loads(line)
     param.crawler_time = TimeUtility.getuniformtime2(js['crawler_time'])
     param.url = Common.urldec(js['foundin'])
     param.content = js['html']
     if post:
         param.data = js['data']
     if js['html'][:3] == constant.GZIP_CODE:
         param.content = zlib.decompress(param.content, 16 + zlib.MAX_WBITS)
     # decode
     content = Common.urldec(param.content)
     charset = RegexUtility.getid('charset', content)
     content = Common.trydecode(content, charset)
     param.content = content
     if 'property' in js:
         for property in js['property']:
             if not property.has_key('result'):
                 continue
             if property['property_name'] == u'page_body':
                 param.page_body = Common.trydecode(Common.urldec(property['result'][0]['text']),
                                                    constant.CHARSET_GBK)
             elif property['property_name'] == u'page_title':
                 param.page_title = Common.trydecode(Common.urldec(property['result'][0]['text']),
                                                     constant.CHARSET_GBK)
             elif property['property_name'] == u'html_time':
                 param.html_time = TimeUtility.getuniformtime2(property['result'][0]['text'])
     return param
Example #7
 def analysis(self, line, method):
     try:
         js = json.loads(line)
         param = ProcessParam()
         param.crawler_time = TimeUtility.getuniformtime(js['crawler_time'])
         param.url = Common.urldec(js['foundin'])
         param.content = js['html']
         if method == constant.REQUEST_TYPE_POST:
             param.data = js['data']
         if js['html'][:3] == constant.GZIP_CODE:
             param.content = zlib.decompress(param.content, 16 + zlib.MAX_WBITS)
         # decode
         content = Common.urldec(param.content)
         charset = RegexUtility.getid('charset', content)
         content = Common.trydecode(content, charset)
         param.content = content
         return param
     except:
         line = line.replace('\n', '').strip()
         if not line or line[0] == '#':
             return
         Logger.getlogging().debug(line)
         param = ProcessParam()
         param.url = line
         if method == constant.REQUEST_TYPE_POST:
             js = json.loads(line)
             param.url = js['url']
             param.data = js['data']
         param.content = HttpCacher.getcontent(line, method)
         if param.content is None:
             return
         return param
Example #8
 def updateurlfilecontext(self, filename, urlfilecontext):
     for key in self.urlsfilemap.keys():
         if RegexUtility.match(key + '.*', filename):
             self.urlsfilemap[key] = urlfilecontext
             break
     else:
         self.urlsfilemap[filename] = urlfilecontext
Example #9
def analysis(line):
    param = ProcessParam()
    js = json.loads(line)
  
    param.url = js['foundin']
    param.content = js['html']
    if js['html'][:3] == constant.GZIP_CODE:
        param.content = zlib.decompress(param.content, 16 + zlib.MAX_WBITS)
    # decode
    content = Common.urldec(param.content)
    charset = RegexUtility.getid('charset', content)
    content = Common.trydecode(content, charset)
    param.content = content
    return param
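constant.GZIP_CODE is not shown in these snippets; the prefix check above guards zlib.decompress(..., 16 + zlib.MAX_WBITS), which expects a gzip-wrapped stream. A small self-contained round trip illustrating that decompression mode:

import zlib

def gzip_bytes(data):
    # compress with a gzip wrapper (wbits = 16 + MAX_WBITS)
    compressor = zlib.compressobj(9, zlib.DEFLATED, 16 + zlib.MAX_WBITS)
    return compressor.compress(data) + compressor.flush()

def gunzip_bytes(data):
    # mirror of the decompress call used in analysis()
    return zlib.decompress(data, 16 + zlib.MAX_WBITS)

assert gunzip_bytes(gzip_bytes(b'hello')) == b'hello'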
Example #10
 def process(self, params):
     type = SinaComments.COMMON_PATTERN_TYPE
     for key in SinaComments.REGEX_PATTERNS.keys():
         if RegexUtility.match(SinaComments.REGEX_PATTERNS[key], params.originalurl):
             type = key
     Logger.getlogging().debug('{url}:{key}'.format(url=params.originalurl, key=type))
     if type == SinaComments.VIDEO_PATTERN_TYPE:
         CommonComments(self).process(params)
     elif type == SinaComments.BLOG_PATTERN_TYPE:
         BlogComments(self).process(params)
     elif type == SinaComments.MANHUA_PATTERN_TYPE:
         self.manhuaprocess(params)
     elif type == SinaComments.STGP_PATTERN_TYPE:
         CommonComments(self).process(params)
     else:
         CommonComments(self).process(params)
Example #11
 def referurl(self, params):
     res = True
     for pattern in ETLController.REFER_URLS_PATTERN:
         if RegexUtility.match(pattern, params.url):
             html = etree.HTML(params.content)
             oriurl = html.xpath('//*[@rel="canonical"]/@href')
             if oriurl:
                 Logger.getlogging().debug('inurl:' + params.url)
                 params.url = oriurl[0]
                 params.originalurl = params.url
                 Logger.getlogging().debug('outurl:' + params.url)
             else:
                 res = False
                 Logger.getlogging().warning('nocanonical:' + params.url)
             break
     return res
 def get(self, url):
     saveJson = {}
     try:
         Logger.getlogging().debug('Downloading: {url}'.format(url=url))
         self.driver.get(url)
         time.sleep(self.waitsec)
         html = self.driver.page_source
         Logger.getlogging().debug('Request Succeeded: {url}'.format(url=url))
     except:
         Logger.getlogging().error('Request Failed: {url}'.format(url=url))
         Logger.printexception()
         return None
     charset = RegexUtility.getid('charset', html)
     html = Common.trydecode(html, charset)
     saveJson['foundin'] = Common.urlenc(url)
     saveJson['html'] = Common.urlenc(html.encode(constant.CHARSET_UTF8))
     saveJson['crawler_time'] = int(time.time())
     jsonStr = json.dumps(saveJson)
     return jsonStr    
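self.driver and self.waitsec are created elsewhere and are not shown; for a code base of this era, a PhantomJS setup along these lines is a plausible assumption (any Selenium WebDriver with a page-load timeout would work the same way):

from selenium import webdriver

driver = webdriver.PhantomJS()     # assumed; a headless Chrome/Firefox driver also fits
driver.set_page_load_timeout(30)   # seconds allowed for driver.get(url)
waitsec = 5                        # extra settle time slept in get()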
Example #13
 def str2cmtnum(value, key):
     value = value.replace(',', '')
     multiplier = 1
     for unit in SiteBasicInfo.UNITS.keys():
         if unit in value:
             multiplier = SiteBasicInfo.UNITS[unit]
     values = re.findall(r'\d+[.]?\d*', value)
     res = -1
     if len(values) == 1:
         res = float(values[0]) * multiplier
     elif len(values) > 1:
         value = SiteBasicInfo.strip(value, '').replace(u'(', '(').replace(
             u')', ')').replace(u':', ':')
         for format in SiteBasicInfo.NUMBER_FORMATS[key]:
             str = RegexUtility.search(format, value)
             if str:
                 res, c = SiteBasicInfo.str2num(str.group(0))
                 break
         else:
             res = float(values[0]) * multiplier
     return int(res), len(values)
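SiteBasicInfo.UNITS and SiteBasicInfo.NUMBER_FORMATS are not included here. A self-contained sketch of the core idea, with an assumed UNITS table (万 = 10**4, 亿 = 10**8 are the usual values), might look like:

# -*- coding: utf-8 -*-
import re

UNITS = {u'万': 10000, u'亿': 100000000}   # assumed multipliers, not taken from SiteBasicInfo

def text_to_count(value):
    # '1.2万' -> 12000, '3,456' -> 3456, no digits -> -1
    value = value.replace(',', '')
    multiplier = 1
    for unit, factor in UNITS.items():
        if unit in value:
            multiplier = factor
    numbers = re.findall(r'\d+\.?\d*', value)
    if not numbers:
        return -1
    return int(float(numbers[0]) * multiplier)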
Example #14
 def __getcontent(self, url, method):
     database = bsddb.btopen(self.file, 'c')
     if database.has_key(Common.md5(url)):
         content = Common.urldec(
             database[Common.md5(url)]).decode(CHARSET_DEFAULT)
         database.close()
         return content
     if method == constant.REQUEST_TYPE_POST:
         js = json.loads(url)
         content = HttpUtility().post(js['url'], js['data'])
     elif method == constant.REQUEST_TYPE_WEBKIT:
         content = HttpUtility().wget(url)
     elif method == constant.REQUEST_TYPE_IMG:
         content = HttpUtility().get(url)
         content = binascii.b2a_hex(content)
     else:
         content = HttpUtility().get(url)
     if content is None:
         database.close()
         return None
     charset = RegexUtility().getid('charset', content)
     unic = Common.trydecode(content, charset)
     utf8str = unic.encode(CHARSET_UTF8)
     charset = CHARSET_UTF8
     self.urlmap[Common.md5(url)] = unic
     # content = content.encode('utf8')
     line = {
         "md5": Common.md5(url),
         "charset": charset,
         "html": Common.urlenc(utf8str),
         "url": Common.urlenc(url)
     }
     if len(utf8str) > 2000:
         database = bsddb.btopen(self.file, 'c')
         database[Common.md5(url)] = Common.urlenc(utf8str)
         database.close()
         # FileUtility.writeline(self.file, json.dumps(line))
     return utf8str.decode(CHARSET_UTF8)
 def __init__(self, parent):
     # URL used to identify the class that handles returned S2 query results; the main site URL is recommended
     SiteS2Query.__init__(self)
     self.fakeoriginalurl = 'https://tieba.baidu.com'
     self.r = RegexUtility()
     self.website = parent.website
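__getcontent above keeps its md5-keyed page cache in the Python 2-only bsddb module. As a rough sketch of the same caching pattern on the standard dbm interface (anydbm on Python 2):

import hashlib
try:
    import dbm                # Python 3
except ImportError:
    import anydbm as dbm      # Python 2

def cache_get(path, url):
    # return the cached page for url, or None when it has not been stored yet
    db = dbm.open(path, 'c')
    try:
        raw = db[hashlib.md5(url.encode('utf-8')).hexdigest()]
    except KeyError:
        raw = None
    db.close()
    return raw.decode('utf-8') if raw is not None else None

def cache_put(path, url, content):
    db = dbm.open(path, 'c')
    db[hashlib.md5(url.encode('utf-8')).hexdigest()] = content.encode('utf-8')
    db.close()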
Example #16
 def __init__(self):
     SiteS2Query.__init__(self)
     self.r = RegexUtility()
     self.fakeoriginalurl = 'http://cartoon.pptv.com/'
Example #17
 def match(self, url):
     for pt in self.patterns:
         if RegexUtility.match(pt, url):
             return True
     return False
 def __init__(self):
     # URL used to identify the class that handles returned S2 query results; the main site URL is recommended
     SiteS2Query.__init__(self)
     self.fakeoriginalurl = 'http://www.52tian.net/'
     self.querylastdays = SpiderConfigure.getinstance().getlastdays()
     self.r = RegexUtility()
class tian52S2Query(SiteS2Query):
    TIAN52_QUERY_TEMPLATE = 'http://www.52tian.net/-----------/{q}/'
    TIAN52_QUERY_P_TEMPLATE = 'http://www.52tian.net/-----------{p}/{q}/'
    TIAN52_S2QUERY_FIRST_PAGE = 'S2QUERY_FIRST_PAGE'
    TIAN52_S2QUERY_EACH_PAGE = 'S2QUERY_EACH_PAGE'
    TIAN52_S2QUERY_EACH_PAGE_CMP = 'S2QUERY_EACH_PAGE_CMP'

    ##############################################################################################
    # @functions:__init__
    # @param: none
    # @return:none
    # @author:Liyanrui
    # @date:2016/11/24
    # @note:constructor of the 52tian.net anime search class, initializes internal variables
    ##############################################################################################
    def __init__(self):
        # URL used to identify the class that handles returned S2 query results; the main site URL is recommended
        SiteS2Query.__init__(self)
        self.fakeoriginalurl = 'http://www.52tian.net/'
        self.querylastdays = SpiderConfigure.getinstance().getlastdays()
        self.r = RegexUtility()

    ################################################################################################################
    # @functions:query
    # @info: query condition
    # @return:none
    # @note:SiteS2Query,S2 query
    ################################################################################################################
    def query(self, info):
        q = Common.urlenc(info)
        urls = [tian52S2Query.TIAN52_QUERY_TEMPLATE.format(q=q)]
        self.__storeqeuryurllist__(urls, self.TIAN52_S2QUERY_FIRST_PAGE,
                                   {'query': q})

    ################################################################################################################
    # @functions:process
    # @params: see WebSite.process
    # @return:none
    # @note:SiteS2Query, process S2 query result, usually a list of matched URLs
    ################################################################################################################
    def process(self, params):
        try:

            # get the number of search results from the first results page and generate the search page URLs
            if params.step == tian52S2Query.TIAN52_S2QUERY_FIRST_PAGE:
                self.step1(params)
                # get the first page url parameters
            elif params.step == tian52S2Query.TIAN52_S2QUERY_EACH_PAGE:
                #print '########',params.content
                if self.r.search(u'/v7/404.asp', params.content):
                    Logger.getlogging().warning(
                        '{url}:40000. HttpResponse:404 Maybe no search results'.
                        format(url=params.url))
                    return
                if re.findall(
                        '^http[s]{0,1}://www\.52tian\.net/[(qingbao)|(tupian)|(yinyue)].*',
                        params.originalurl):
                    self.step3(params)
                else:
                    self.step2(params)

        except:
            Logger.printexception()

    #----------------------------------------------------------------------
    def step1(self, params):
        """获取查询的url列表"""
        q = params.customized['query']
        soup = BeautifulSoup(params.content, 'html5lib')
        pageobj = soup.select('.pages > a')
        if pageobj:
            pages = int(pageobj[-3].get_text())
        else:
            pages = 1
        # list of query urls to iterate over
        querylist = []
        # build one query url per page from the total page count
        for page in range(1, pages + 1, 1):
            url = tian52S2Query.TIAN52_QUERY_P_TEMPLATE.format(
                p=page, q=params.customized['query'])
            querylist.append(url)
        self.__storeqeuryurllist__(querylist,
                                   tian52S2Query.TIAN52_S2QUERY_EACH_PAGE,
                                   {'query': q})

    #----------------------------------------------------------------------
    def step2(self, params):
        """获取视频类的url列表"""
        key = Common.urldec(params.customized['query'])
        soup = BeautifulSoup(params.content, 'html5lib')
        lis = soup.select('.imagelist2 > ul > li')
        if lis:
            urllist = []
            for li in lis:
                title = li.select_one('a').get_text()
                if key not in title:
                    continue
                url = li.select_one('a').get('href')
                urllist.append(url)
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)

    #----------------------------------------------------------------------
    # the news part has not been implemented yet
    def step3(self, params):
        """Collect the news result URLs."""
        key = Common.urldec(params.customized['query'])
        soup = BeautifulSoup(params.content, 'html5lib')
        lis = soup.select('.wzlist > ul > li.wztitle')
        if lis:
            urllist = []
            for li in lis:
                title = li.select_one('a').get_text()
                # if key not in title:
                if not Common.checktitle(key, title):
                    continue
                pubtime = li.select_one('span').get_text()
                url = 'http://www.52tian.net' + li.select_one('a').get('href')
                if compareNow(getuniformtime(pubtime), self.querylastdays):
                    urllist.append(url)
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
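getuniformtime and compareNow are imported helpers not shown in these snippets; in step3 they implement a "published within the last querylastdays days" filter. A rough stand-in, assuming the publish time parses as YYYY-MM-DD:

import datetime

def within_last_days(pubtime_str, lastdays, fmt='%Y-%m-%d'):
    # roughly what compareNow(getuniformtime(pubtime), querylastdays) decides:
    # True when the publish date falls inside the recent window
    pubtime = datetime.datetime.strptime(pubtime_str, fmt)
    return datetime.datetime.now() - pubtime <= datetime.timedelta(days=lastdays)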
Example #20
 def __init__(self):
     SiteComments.__init__(self)
     self.r = RegexUtility()
     self.basicstorage = BaseInfoStorage()
     self.commentstorage = CommentsStorage()
Example #21
class MofangS2Query(SiteS2Query):
    QUERY_TEMPLATE = 'http://www.mofang.com/index.php?m=search&a=json_init&q={key}&type=video&page={pageno}&pagesize={pagesize}'
    QUERY_TEMPLATE_BBS = 'http://bbs.mofang.com/searchThread?keyword={key}&p={pageno}&pagesize={pagesize}'
    DEFAULT_PAGE_SIZE = 20
    S2QUERY_FIRST_PAGE = 'S2QUERY_FIRST_PAGE'
    S2QUERY_EACH_PAGE = 'S2QUERY_EACH_PAGE'

    ################################################################################################################
    # @functions:__init__
    # @param: none
    # @return:none
    # @note:MofangS2Query, initializes internal variables
    ################################################################################################################
    def __init__(self):
        # URL used to identify the class that handles returned S2 query results; the main site URL is recommended
        SiteS2Query.__init__(self)
        self.r = RegexUtility()
        self.fakeoriginalurl = 'http://www.mofang.com/'

    ################################################################################################################
    # @functions:query
    # @info: query condition
    # @return:none
    # @note:SiteS2Query,S2 query
    ################################################################################################################
    def query(self, info):
        Logger.getlogging().info("query")
        keyvalue = Common.urlenc(info)
        #keyvalue = info
        # step1: build the URL below from the key
        # http://www.mofang.com/index.php?m=search&a=json_init&q={key}&type=video&page={pageno}&pagesize=1
        # video S2 query
        urls = [
            MofangS2Query.QUERY_TEMPLATE.format(key=keyvalue,
                                                pageno=1,
                                                pagesize=1)
        ]
        Logger.getlogging().debug(urls[0])
        self.__storeqeuryurllist__(urls, self.S2QUERY_FIRST_PAGE,
                                   {'query': info})

        # 2016/12/20: testing showed the forum search switched to discuz and fetches data via POST, so the forum S2 feature is disabled.
        # forum S2 query
        # urls = [MofangS2Query.QUERY_TEMPLATE_BBS.format(key=keyvalue, pageno=1, pagesize=20)]
        # Logger.getlogging().debug(urls[0])
        # self.__storeqeuryurllist__(urls, self.S2QUERY_FIRST_PAGE, {'query':info})

    ################################################################################################################
    # @functions:process
    # @params: see WebSite.process
    # @return:none
    # @note:SiteS2Query, process S2 query result, usually a list of matched URLs
    ################################################################################################################
    def process(self, params):
        if self.r.match(r'^http://www\.mofang\.com/.*', params.url):
            self.processVideo(params)
        # else:
        #     self.processBBS(params)

    ################################################################################################################
    # @functions:processVideo
    # @params: see WebSite.processVideo
    # @return:none
    # @note:SiteS2Query, process S2 query result, usually a list of matched URLs
    ################################################################################################################
    def processVideo(self, params):
        if params.step == MofangS2Query.S2QUERY_FIRST_PAGE:
            # Step2: get the total number of videos from comments['totalnums'] in the returned JSON
            # each JSON response carries 20 items; divide the total by 20 to get the page count and put it into the page parameter
            info = params.customized['query']
            keyvalue = Common.urlenc(info)
            try:
                jsondate = json.loads(params.content)
                comments_count = jsondate['totalnums']
            except:
                Logger.getlogging().warning('{}:40000'.format(params.url))
                return
            # nothing found, return
            if int(comments_count) == 0:
                return

            page_count = int(
                math.ceil(float(comments_count) / self.DEFAULT_PAGE_SIZE))
            # build all the search result URLs from the page_count above (most recent week)
            querylist = []
            if page_count > 0:
                for page in range(1, page_count + 1, 1):
                    url = MofangS2Query.QUERY_TEMPLATE.format(
                        key=keyvalue,
                        pageno=page,
                        pagesize=self.DEFAULT_PAGE_SIZE)
                    Logger.getlogging().debug(url)
                    querylist.append(url)
                self.__storeqeuryurllist__(querylist,
                                           MofangS2Query.S2QUERY_EACH_PAGE,
                                           {'query': info})

        elif params.step == MofangS2Query.S2QUERY_EACH_PAGE:
            # Step3: from the JSON data returned by Step2, get:
            # title: comments['data'][0..19]['title']
            # link: comments['data'][0..19]['url']
            # video publish time: comments['data'][0..19]['inputtime']; only the first 10 characters are kept, so only dates can be compared

            info = params.customized['query']
            try:
                jsondate = json.loads(params.content)
                searchresult = jsondate['data']
            except:
                Logger.getlogging().warning('{}:40000'.format(params.url))
                return

            # get today's date (as a datetime object)
            today = datetime.datetime.strptime(TimeUtility.getcurrentdate(),
                                               TimeUtility.DATE_FORMAT_DEFAULT)

            urllist = []
            for index in range(0, len(searchresult), 1):
                #print searchresult[index]['title']
                #print searchresult[index]['inputtime']
                if searchresult[index]['title'] is not None:
                    # if the title contains the query keyword, save the corresponding url
                    # if searchresult[index]['title'].find(info) > -1:
                    if Common.checktitle(info, searchresult[index]['title']):
                        if searchresult[index]['inputtime'] is not None:
                            #inputtime = datetime.datetime.strptime(TimeUtility.getuniformtime2(int(searchresult[index]['inputtime'])), TimeUtility.TIME_FORMAT_DEFAULT)
                            #intervaldays = today - inputtime
                            #if intervaldays.days <= int(self.querylastdays):
                            pubtime = getuniformtime(
                                str(searchresult[index]['inputtime']))

                            if compareNow(pubtime, int(self.querylastdays)):
                                urllist.append(searchresult[index]['url'])
                        else:
                            # no publish time available; assume it is within the window
                            urllist.append(searchresult[index]['url'])

            if len(urllist) > 0:
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)

    ################################################################################################################
    # @functions:processBBS
    # @params: see WebSite.processBBS
    # @return:none
    # @note:SiteS2Query, process S2 query result, usually a list of matched URLs
    ################################################################################################################
    def processBBS(self, params):
        if params.step == MofangS2Query.S2QUERY_FIRST_PAGE:
            # Step2: get the total count from comments['totalnums'] in the returned JSON
            # each JSON response carries 20 items; divide the total by 20 to get the page count and put it into the page parameter
            info = params.customized['query']
            keyvalue = Common.urlenc(info)
            try:
                jsondate = json.loads(params.content)
                comments_count = jsondate['data']['total']
            except:
                Logger.getlogging().warning('{}:40000'.format(params.url))
                return
            # nothing found, return
            if int(comments_count) == 0:
                return

            page_count = int(
                math.ceil(float(comments_count) / self.DEFAULT_PAGE_SIZE))
            # build all the search result URLs from the page_count above (most recent week)
            querylist = []
            if page_count > 0:
                for page in range(1, page_count + 1, 1):
                    url = MofangS2Query.QUERY_TEMPLATE_BBS.format(
                        key=keyvalue,
                        pageno=page,
                        pagesize=self.DEFAULT_PAGE_SIZE)
                    Logger.getlogging().debug(url)
                    querylist.append(url)
                self.__storeqeuryurllist__(querylist,
                                           MofangS2Query.S2QUERY_EACH_PAGE,
                                           {'query': info})

        elif params.step == MofangS2Query.S2QUERY_EACH_PAGE:
            # Step3: from the JSON data returned by Step2, get:
            # title: comments['data']['threads'][0..19]['subject']
            # link: comments['data']['threads'][0..19]['link_url']
            # publish time: comments['data']['threads'][0..19]['create_time']; only the first 10 characters are kept, so only dates can be compared

            info = params.customized['query']
            try:
                jsondate = json.loads(params.content)
                searchresult = jsondate['data']['threads']
            except:
                Logger.getlogging().warning('{}:40000'.format(params.url))
                return

            # get today's date (as a datetime object)
            today = datetime.datetime.strptime(TimeUtility.getcurrentdate(),
                                               TimeUtility.DATE_FORMAT_DEFAULT)

            urllist = []
            for index in range(0, len(searchresult), 1):

                if searchresult[index]['subject'] is not None:
                    # if the title contains the query keyword, save the corresponding url
                    # if searchresult[index]['subject'].find(info) > -1:
                    if Common.checktitle(info, searchresult[index]['subject']):
                        if searchresult[index]['create_time'] is not None:
                            #inputtime = datetime.datetime.strptime(TimeUtility.getuniformtime2(int(searchresult[index]['create_time'])), TimeUtility.TIME_FORMAT_DEFAULT)
                            #intervaldays = today - inputtime
                            #if intervaldays.days <= int(self.querylastdays):
                            #urllist.append(searchresult[index]['link_url'])
                            inputtime = getuniformtime(
                                str(searchresult[index]['create_time']))

                            if compareNow(inputtime, int(self.querylastdays)):
                                urllist.append(searchresult[index]['link_url'])
                        else:
                            # no publish time available; assume it is within the window
                            urllist.append(searchresult[index]['link_url'])

            if len(urllist) > 0:
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
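Both processVideo and processBBS turn the reported total into a page count by dividing by DEFAULT_PAGE_SIZE and rounding up; a compact illustration of that arithmetic:

import math

def page_count(total_items, page_size=20):
    # e.g. 45 items at 20 per page -> 3 pages
    return int(math.ceil(float(total_items) / page_size))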
Example #22
 def __init__(self):
     SiteComments.__init__(self)
     self.r = RegexUtility()
     self.client_id = 'cytaCBUri'
Example #23
 def __init__(self):
     SiteS2Query.__init__(self)
     self.r = RegexUtility()
     self.fakeoriginalurl = 'http://www.acfun.cn/v/'
     self.querylastdays = SpiderConfigure.getinstance().getlastdays()
class NarutomS2Query(SiteS2Query):
    QUERY_TEMPLATE = 'http://search.narutom.com/cse/search?q={key}&p={pageno}&s=7660238846226745217&entry=1'
    FIRST_PAGE = 'http://search.narutom.com/cse/search?s=7660238846226745217&entry=1&q={key}'
    DEFAULT_PAGE_SIZE = 10
    MAX_COUNT = 750
    S2QUERY_FIRST_PAGE = 'S2QUERY_FIRST_PAGE'
    S2QUERY_EACH_PAGE = 'S2QUERY_EACH_PAGE'

    ################################################################################################################
    # @functions:__init__
    # @param: none
    # @return:none
    # @note:NarutomS2Query constructor, initializes internal variables
    ################################################################################################################
    def __init__(self):
        # URL used to identify the class that handles returned S2 query results; the main site URL is recommended
        SiteS2Query.__init__(self)
        self.fakeoriginalurl = 'http://www.narutom.com/'
        self.r = RegexUtility()

    ################################################################################################################
    # @functions:query
    # @info: query condition
    # @return:none
    # @note:SiteS2Query,S2 query
    ################################################################################################################
    def query(self, info):
        Logger.getlogging().info("query")
        keyvalue = Common.urlenc(info)

        # Step1: build the URL below from the key (the "latest" and "past week" filters cannot be set)
        # http://search.narutom.com/cse/search?s=7660238846226745217&entry=1&ie=gbk&q=<urlencoded key>
        url = NarutomS2Query.FIRST_PAGE.format(key=keyvalue)
        urls = [url]
        Logger.getlogging().debug(urls[0])
        self.__storeqeuryurllist__(urls, self.S2QUERY_FIRST_PAGE,
                                   {'query': info})

    ################################################################################################################
    # @functions:process
    # @params: see WebSite.process
    # @return:none
    # @note:SiteS2Query, process S2 query result, usually a list of matched URLs
    ################################################################################################################
    def process(self, params):
        if params.step == NarutomS2Query.S2QUERY_FIRST_PAGE:
            # Step2: from the returned page, get the total result count via xpath //*[@id="results"]/span, build the result page URLs from it, and write them to file
            html = etree.HTML(params.content)
            nodes = html.xpath('//*[@id="results"]/span')
            # nothing found, return
            if len(nodes) == 0:
                return

            # get the total result count (the page shows e.g. "found about 1,307 related results")
            count = 0
            totalstr = nodes[0].text.replace(',', '')
            if self.r.search(u'\d+', totalstr):
                countstr = self.r.parse(u'(\d+)', totalstr)[0]
                count = int(countstr)
                # the site returns at most 750 search results
                if count > self.MAX_COUNT:
                    count = self.MAX_COUNT
            else:
                return

            # build all search result URLs from the count above
            info = params.customized['query']
            keyvalue = Common.urlenc(info)
            page_count = float(count / self.DEFAULT_PAGE_SIZE)
            firstpage = NarutomS2Query.FIRST_PAGE.format(key=keyvalue)
            querylist = []
            querylist.append(firstpage)
            if count > 10:
                # page 2 has page number 1, page 3 has 2, ...; the range is 1-74 (i.e. pages 2 through 75)
                for page in range(1, int(math.ceil(page_count)), 1):
                    url = NarutomS2Query.QUERY_TEMPLATE.format(key=keyvalue,
                                                               pageno=page)
                    querylist.append(url)

            self.__storeqeuryurllist__(querylist,
                                       NarutomS2Query.S2QUERY_EACH_PAGE,
                                       {'query': info})

        elif params.step == NarutomS2Query.S2QUERY_EACH_PAGE:
            # Step3: from the Step2 results, get the result URLs via xpath //*[@id="results"]/div/h3/a/@href and write them to file
            info = params.customized['query']
            html = etree.HTML(params.content)
            nodes = html.xpath('//*[@id="results"]/div/h3/a/@href')
            #titles = html.xpath('//*[@id="results"]/div/h3/a')
            pubtimestr = html.xpath('//*[@class="c-showurl"]')

            datecheck = False
            if len(pubtimestr) == len(nodes):
                datecheck = True

            urllist = []
            for index in range(0, len(nodes), 1):
                # if titles[index] is not None and titles[index].find(info) > -1:
                # if titles[index] is not None and Common.checktitle(info, titles[index]):
                # if the title contains the query keyword, save the corresponding url
                if datecheck:
                    # if xpath returned a string containing a date, check it against the window
                    if self.r.search('(\d+-\d+-\d+)', pubtimestr[index].text):
                        pubtime = getuniformtime(
                            self.r.parse('(\d+-\d+-\d+)',
                                         pubtimestr[index].text)[0])
                        if compareNow(pubtime, int(self.querylastdays)):
                            urllist.append(nodes[index])
                else:
                    urllist.append(nodes[index])
            '''
            urllist = []
            for node in nodes:
                urllist.append(node)
            '''
            if len(urllist) > 0:
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
Example #25
class YoukuComments(SiteComments):
    COMMENTS_URL = 'http://p.comments.youku.com/ycp/comment/pc/commentList?objectId=%s&app=100-DDwODVkv' \
                   '&currentPage=%d&pageSize=%d&listType=0&sign=%s&time=%s'
    PLAYINFO_URL = 'http://v.youku.com/action/getVideoPlayInfo?vid={vid}&param%5B%5D=updown&callback=data'
    PAGE_SIZE = 30
    STEP_1 = None
    STEP_2 = 2
    STEP_3 = 3

    ##############################################################################################
    # @functions:__init__
    # @param: none
    # @return:none
    # @author:QW_Liang
    # @date:2017/09/07
    # @note:YoukuComments constructor, initializes internal variables
    ##############################################################################################
    def __init__(self):
        SiteComments.__init__(self)
        self.r = RegexUtility()

    ##############################################################################################
    # @functions:process
    # @param:parameters passed in by the common module (target url, original url, current step, custom parameters)
    # @return:Step1: the URL of the first comments page
    #          Step2: the URLs of all comment pages
    #          Step3: the extracted comments and the creation time of the latest comment
    # @author:QW_Liang
    # @date:2017/09/07
    # @note:Step1: get the oid from the html passed in by the common module, build the URL that returns the total number of comment pages, and hand it to the common module
    #        Step2: get the total number of comment pages from the html, build the comment page URLs, and hand them to the common module
    #        Step3: extract the comments and the latest comment's creation time from the html and hand them to the common module
    ##############################################################################################
    def process(self, params):
        try:
            if params.step is YoukuComments.STEP_1:
                # extract from the page the parameters needed to build the comment url
                objectId = self.r.getid('videoId', params.content, '\s*:\s*"')
                pTime = str(
                    int(
                        time.mktime(
                            datetime.datetime.timetuple(
                                datetime.datetime.now())) * 1000))
                # get the random-number (sign) parameter
                sign = MD5().m(
                    '100-DDwODVkv&6c4aa6af6560efff5df3c16c704b49f1&' + pTime)
                # build the first comment page url
                comments_url = YoukuComments.COMMENTS_URL % (
                    objectId, 1, YoukuComments.PAGE_SIZE, sign, pTime)
                # ask the download platform to fetch the first page of comments from the comment url
                self.storeurl(comments_url, params.originalurl,
                              YoukuComments.STEP_2, {'objectId': objectId})

                # laifeng.com live-stream view count
                if self.r.search(r'^http://v\.laifeng\.com/\d+',
                                 params.originalurl):
                    clicknum = int(self.r.getid('onlineNum', params.content))
                    NewsStorage.setclicknum(params.originalurl, clicknum)

                if objectId:
                    playinfo_url = YoukuComments.PLAYINFO_URL.format(
                        vid=objectId)
                    self.storeurl(playinfo_url, params.originalurl,
                                  YoukuComments.STEP_2, {'objectId': objectId})
            # parse the first page of comments and loop to request all comment page urls
            elif params.step == YoukuComments.STEP_2:
                if re.findall('getVideoPlayInfo\?vid', params.url):
                    playinfo = json.loads((params.content)[20:-2])
                    clicknum = int(playinfo['data']['stat']['vv'].replace(
                        ',', ''))
                    votenum = int(playinfo['data']['updown']['up'].replace(
                        ',', ''))
                    NewsStorage.setclicknum(params.originalurl, clicknum)
                    NewsStorage.setvotenum(params.originalurl, votenum)
                else:
                    objectId = params.customized['objectId']
                    pTime = str(
                        int(
                            time.mktime(
                                datetime.datetime.timetuple(
                                    datetime.datetime.now())) * 1000))
                    # get the random-number (sign) parameter
                    sign = MD5().m(
                        '100-DDwODVkv&6c4aa6af6560efff5df3c16c704b49f1&' +
                        pTime)
                    # parse the JSON comment response
                    comments = json.loads(params.content)
                    # compare the comment count from the previous crawl of this url with the current one
                    if not comments.has_key('data'):
                        Logger.getlogging().warning(
                            "{url}:30000 No comments!".format(
                                url=params.originalurl))
                        return
                    if not comments['data']:
                        Logger.getlogging().warning(
                            "{url}:30000 No comments!".format(
                                url=params.originalurl))
                        return

                    # check whether there are new comments since the last crawl
                    comments_count = comments['data']['totalSize']
                    cmtnum = CMTStorage.getcount(params.originalurl, True)
                    if int(comments_count) <= cmtnum:
                        return
                    NewsStorage.setcmtnum(params.originalurl, comments_count)

                    # get the total number of comment pages
                    comments_pages = int(comments['data']['totalPage'])
                    if comments_pages == 0:
                        return
                    # if there are too many comments, only take the first self.maxpages pages
                    if comments_pages >= self.maxpages:
                        comments_pages = self.maxpages

                    lasttime = CMTStorage.getlastpublish(
                        params.originalurl, True)
                    # loop to build the comment urls and submit them to the download platform to fetch the comment data
                    for page in range(0, comments_pages + 1, 1):
                        commentUrl = YoukuComments.COMMENTS_URL % (
                            objectId, page + 1, YoukuComments.PAGE_SIZE, sign,
                            pTime)
                        self.storeurl(commentUrl, params.originalurl,
                                      YoukuComments.STEP_3,
                                      {'objectId': objectId})

                    NewsStorage.setcmtnum(params.originalurl,
                                          int(comments['data']['totalSize']))

            # parse the comment data
            elif params.step == YoukuComments.STEP_3:
                commentsinfo = json.loads(params.content)
                for comment in commentsinfo['data']['comment']:
                    content = str(comment['content'])
                    curtime = TimeUtility.getuniformtime(
                        int(comment['createTime']))
                    nick = comment['user']['userName']
                    # use the timestamp to detect new comments
                    # if curtime > lasttime:
                    if not CMTStorage.exist(params.originalurl, content,
                                            curtime, nick):
                        CMTStorage.storecmt(params.originalurl, content,
                                            curtime, nick)
        except:
            Logger.printexception()
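MD5().m(...) above is not defined in these snippets; from the way the sign is built, it appears to be a plain MD5 hex digest of the concatenated string. A minimal stand-in under that assumption:

import hashlib

def md5_hex(text):
    # assumed equivalent of MD5().m(): hex digest of the UTF-8 bytes
    return hashlib.md5(text.encode('utf-8')).hexdigest()

# sign = md5_hex('100-DDwODVkv&6c4aa6af6560efff5df3c16c704b49f1&' + pTime)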
 def download(self):
     """
     Downloading via the platform takes two steps, while requesting the data directly on Windows only needs step2: download().
     step1: download the data from the platform to the local ./data/platform
     step2: copy the data from ./data/platform to ./data/temp/done, then store the parsed json data under ./data/temp/json
     """
     files = []
     if self.completed():
         return files
     Logger.getlogging().debug(self.download_path)
     srclist = FileUtility.getfilelist(self.download_path, [])
     for donefile in srclist:
         filename = FileUtility.getfilename(donefile)
         if donefile.endswith(
                 'done') and filename not in self.download_file_list:
             self.download_file_list.append(filename)
             self.download_time = time.time()
             for upfile in self.upload_file_list.keys():
                 if filename.startswith(upfile):
                     FileUtility.copy(donefile, self.cache_path)
                     binfile = self.cache_path + FileUtility.getfilename(
                         donefile)
                     if FileUtility.getfilesize(
                             donefile) == FileUtility.getfilesize(binfile):
                         Logger.getlogging().info(
                             'Remove {file}'.format(file=donefile))
                         FileUtility.remove(donefile)
                         if FileUtility.exists(donefile):
                             Logger.getlogging().error(
                                 'Remove {file} failed'.format(
                                     file=donefile))
                     else:
                         Logger.getlogging().error(
                             'File not equal {file}'.format(file=donefile))
                     jsonfile = self.bin2json(binfile)
                     files.append(jsonfile)
                     uploadtime = self.uploadfile_retranslist[
                         upfile].start_time
                     if RegexUtility.match(
                             TencentDownloader.DOWNLOAD_FORMAT1.format(
                                 file=upfile), filename):
                         self.upload_file_list.pop(upfile)
                         self.uploadfile_retranslist.pop(upfile)
                     elif RegexUtility.match(
                             TencentDownloader.DOWNLOAD_FORMAT2.format(
                                 file=upfile), filename):
                         value = \
                         RegexUtility.parse(TencentDownloader.DOWNLOAD_FORMAT2.format(file=upfile), filename)[0]
                         if value[0] == value[1]:
                             self.upload_file_list.pop(upfile)
                             self.uploadfile_retranslist.pop(upfile)
                     if not FileUtility.exists(jsonfile):
                         Logger.getlogging().error(
                             'no json file generate from done file:{done}'.
                             format(done=binfile))
                         os.mknod(jsonfile)
                     # update upload time
                     keys = self.sortkeys()
                     for fl in keys:
                         if self.uploadfile_retranslist[
                                 fl].start_time >= uploadtime:
                             self.uploadfile_retranslist[
                                 fl].start_time = time.time()
                             time.sleep(0.1)
                     break
     return files
 def __init__(self):
     # URL used to identify the class that handles returned S2 query results; the main site URL is recommended
     SiteS2Query.__init__(self)
     self.fakeoriginalurl = 'http://www.narutom.com/'
     self.r = RegexUtility()
Example #28
 def __init__(self):
     # URL used to identify the class that handles returned S2 query results; the main site URL is recommended
     SiteS2Query.__init__(self)
     self.fakeoriginalurl = 'http://bbs.laohu.com/'
     self.r = RegexUtility()
     self.inputtime = self.querylastdays
Example #29
 def __init__(self):
     SiteComments.__init__(self)
     self.r = RegexUtility()
Example #30
class LaohuS2Query(SiteS2Query):
    #LAOHU_QUERY_TEMPLATE = 'http://bbs.laohu.com/plugin.php?id=esearch&mymod=search&myac=thread&word={KEY}&page={pn}'
    LAOHU_QUERY_TEMPLATE = 'http://bbs.laohu.com/plugin.php?id=esearch&mymod=search&myac=thread&word={KEY}&page={pn}&srchfrom={time}'
    LAOHU_S2QUERY_FIRST_PAGE = 'S2QUERY_FIRST_PAGE'
    LAOHU_S2QUERY_EACH_PAGE = 'S2QUERY_EACH_PAGE'
    LAOHU_MAIN_DOMAIN = 'http://bbs.laohu.com/'
    LAOHU_LINK = 'http://bbs.laohu.com/thread-{tid}-1-1.html'
    DEFAULT_TIME = 86400
    tids = []

    ##############################################################################################
    # @functions:__init__
    # @param: none
    # @return:none
    # @author:HuBorui
    # @date:2016/11/28
    # @note:constructor of the Laohu game forum search class, initializes internal variables
    ##############################################################################################
    def __init__(self):
        # URL used to identify the class that handles returned S2 query results; the main site URL is recommended
        SiteS2Query.__init__(self)
        self.fakeoriginalurl = 'http://bbs.laohu.com/'
        self.r = RegexUtility()
        self.inputtime = self.querylastdays

    def preprocess(self, mid_url):
        if self.r.search('tid=\d+', mid_url):
            tid = self.r.parse('tid=(\d+)', mid_url)[0]
            if len(self.tids) == 0:
                self.tids.append(tid)
                newurl = self.LAOHU_LINK.format(tid=tid)
            else:
                if tid not in self.tids:
                    self.tids.append(tid)
                    newurl = self.LAOHU_LINK.format(tid=tid)
                else:
                    newurl = None
        else:
            newurl = self.LAOHU_MAIN_DOMAIN + mid_url
        return newurl

    ################################################################################################################
    # @functions:pageprocess
    # @info: query condition
    # @return:none
    # @note:SiteS2Query,S2 query
    ################################################################################################################
    def pageprocess(self, params):
        # parse the page content
        xparser = XPathUtility(params.content)
        # get the hyperlinks on this page
        hreflist = xparser.xpath('//h3/a/@href')
        hrefs = []
        for mid_url in hreflist:
            mid = self.preprocess(mid_url)
            if mid is not None:
                hrefs.append(mid)

        # get all publish times on this page
        publictime = xparser.xpath('//*[@class="scontent"]/text()[1]')
        publicTimes = []
        for timeindex in publictime:
            middle = str(timeindex).replace('\n', '').replace('\t', '').strip()
            publicTimes.append(
                str(str(middle).split(' ')[0]) + ' ' +
                str(str(middle).split(' ')[1]))
        # get all titles on this page
        titles = []
        titles_list = xparser.getlist('//h3')
        for title in titles_list:
            mid_title = str(title).replace('\n', '').replace('\t', '').strip()
            titles.append(mid_title)
        # get the keyword
        KEY_mid = params.customized['KEY']
        KEY = Common.urldec(KEY_mid)
        # title matching pattern (regex)
        titlePatten = KEY
        # compute the cutoff date (self.inputtime days ago)
        today = datetime.datetime.now()
        before_days = today + datetime.timedelta(-self.inputtime)
        before_arr = str(before_days).split('.')
        before_time = before_arr[0]

        urllist = []
        len_hrefs = len(hrefs)
        number = 0
        for index in publicTimes[:len_hrefs]:
            # check whether the title matches
            # mid_value = re.compile(titlePatten)
            # flg = mid_value.search(str(titles[number]))
            flg = Common.checktitle(titlePatten, str(titles[number]))
            # the item was published within the window and the title matches
            if index > before_time and flg:
                url = hrefs[number]
                urllist.append(url)
            number = number + 1

        # store the final url list
        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)

    ################################################################################################################
    # @functions:query
    # @info: query condition
    # @return:none
    # @note:SiteS2Query,S2 query
    ################################################################################################################
    def query(self, info):
        urlkey = Common.urlenc(info)
        time = self.querylastdays * LaohuS2Query.DEFAULT_TIME
        urls = [
            LaohuS2Query.LAOHU_QUERY_TEMPLATE.format(KEY=urlkey,
                                                     pn=1,
                                                     time=time)
        ]
        self.__storeqeuryurllist__(urls, self.LAOHU_S2QUERY_FIRST_PAGE, {
            'KEY': urlkey,
            'time': time
        })

    ################################################################################################################
    # @functions:process
    # @params: see WebSite.process
    # @return:none
    # @note:SiteS2Query, process S2 query result,一般为查询到的URL列表
    ################################################################################################################
    def process(self, params):
        # get the number of search results from the first results page and generate the search page URLs
        if params.step == LaohuS2Query.LAOHU_S2QUERY_FIRST_PAGE:
            # get the first page url parameters
            KEY = params.customized['KEY']
            time = params.customized['time']
            # get the total page count
            xparser = XPathUtility(params.content)
            pageCounts = xparser.getlist('//*[@id="main"]/div[2]/span')
            if len(pageCounts) > 0:
                page = str(pageCounts[0]).split('/')[1]

                # process the search results on the first page
                self.pageprocess(params)

                if int(page) > 1:
                    if int(page) >= self.maxpages:
                        page = self.maxpages
                    querylist = []
                    # build the query list from the total page count (page 1 is already processed, so build urls starting from page 2)
                    for pages in range(2, int(page) + 1, 1):
                        url = LaohuS2Query.LAOHU_QUERY_TEMPLATE.format(
                            KEY=KEY, pn=pages, time=time)
                        querylist.append(url)
                    self.__storeqeuryurllist__(
                        querylist, LaohuS2Query.LAOHU_S2QUERY_EACH_PAGE,
                        {'KEY': KEY})

            else:
                Logger.getlogging().debug('Sorry, no posts related to ' + KEY +
                                          ' were found')

        # get the video URLs from the query result pages
        elif params.step == LaohuS2Query.LAOHU_S2QUERY_EACH_PAGE:
            self.pageprocess(params)