Example #1
def spider_crawl():
    print 'Crawling at gtainside homepage...'
    global _info
    global _info2
    spider_homepage = SpiderHomePage()
    spider_homepage.narrow_collect_range()
    spider_homepage.set_type_info()
    topics = _info.keys()
    while True:
        print 'Crawling at gtainside topicpage:'
        print '    Available topics:'
        for topic in topics:
            print '        id:%3d - (%-3s/%-12s)' % (
                int(_info[topic]['id']), _info[topic]['ver'],
                _info[topic]['type'])
        print 'Please input: [id], [startpage], [endpage]',
        print 'to start the crawl.'
        print 'Note1: Either page set to 0 will crawl to the last page.'
        print 'Note2: Input \'finish\' to end the crawl.'
        r = raw_input('->')

        if r.startswith('finish'):
            print 'Collect action at gtainside finished.'
            pause()
            break

        if r.count(',') != 2:
            print 'Please input with specific format: [id], [start], [end]'
            print 'Note: Either page set to 0 will crawl to the last page.'
            pause()
            continue

        i, st, ed = r.split(',')
        i, st, ed = i.strip(), st.strip(), ed.strip()
                
        if not (i.isdigit() and st.isdigit() and ed.isdigit()) \
           or (int(ed) != 0 and int(st) > int(ed)):
            print 'Please input with specific format: [id], [start], [end]'
            print 'Note: Either page set to 0 will crawl to the last page.'
            pause()
            continue

        if i not in _info2:
            print '[Input error] ID:', i, 'not found.'
            pause()
            continue

        topiclink = _info2[i]
        st, ed = int(st), int(ed)  # ed == 0 means crawl to the last page
        spider = SpiderTopicPage(topiclink, ed)
        spider.set_maximum_depth()
        spider.get_info(st)
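
For context, Example #1 relies on a pause() helper and on the _info/_info2 globals that SpiderHomePage is expected to populate. A minimal sketch of their assumed shapes, inferred from the calls above alone (names and structure are assumptions, not from the original source):

# Assumed module-level state -- a sketch inferred from usage, not the original.
_info = {}   # topic name -> {'id': ..., 'ver': ..., 'type': ...}
_info2 = {}  # id string -> topic page link

def pause():
    # Block until the user presses Enter, as the menu loop above expects.
    raw_input('Press Enter to continue...')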
Example #2
def spider_crawl():
    while True:
        print 'Crawling at gtabbs'
        print 'Please input a crawling range of pages (recommended: 1 - 300)'
        r = raw_input('Please input: [min], [max] - to set the page range\n->')

        if r.count(',') != 1:
            print 'Please input with specific format: [min], [max]'
            pause()
            continue
        
        st, ed = r.split(',')
        st, ed = st.strip(), ed.strip()
        
        if not (st.isdigit() and ed.isdigit() and int(st) < int(ed)):
            print 'Please input with specific format: [min], [max]'
            pause()
            continue

        link_pages = ['http://www.gtabbs.com/bbs-141-%d'
                      % i for i in range(int(st), int(ed) + 1)]  # include [max]
        for link_page in link_pages:
            link_topics = SpiderTopicPage(link_page).get_topics()
            for link_topic in link_topics:
                spider = SpiderTopicContent(link_topic)
                mod = modinfo.ModInfo(link_topic)
                mod.updatekey('site', 'http://www.gtabbs.com')
                mod.updatekey('link', link_topic)
                mod.updatekey('has_att', spider.detect_attachment())
                mod.updatekey('name', spider.get_name())
                mod.updatekey('type', '')
                mod.updatekey('subtype', '')
                mod.updatekey('ver', spider.get_gtaver())
                mod.updatekey('imglink', spider.get_img())
                mod.updatekey('publisher', spider.get_publisher())
                mod.updatekey('date', strftime('%Y%m%d%H%M%S'))
                mod.updatekey('collecttime', strftime('%Y%m%d%H%M%S'))
                print 'Collected: %s' % link_topic
                #mod.show()
                #break
                
        #modinfo.show()
        filename = 'gtabbs_%s.pkl' % strftime('%Y%m%d%H%M%S')
        modinfo.dump(filename)
        print 'Collect action at gtabbs finished.'
        print 'Data stored in file:', filename
        pause()
        break
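
Every example stores records through the modinfo module, whose implementation is not shown. Below is a minimal pickle-based sketch that would satisfy the calls used above (ModInfo, updatekey, show, dump, clear) -- an assumption inferred from the call sites, not the original module:

# modinfo -- minimal sketch of the assumed API.
import cPickle as pickle

_mods = {}

class ModInfo(object):
    def __init__(self, key):
        # Register one collected record under its link/key.
        self.key = key
        self.data = {}
        _mods[key] = self

    def updatekey(self, key, value):
        # Store one field of the record.
        self.data[key] = value

    def show(self):
        print self.key, self.data

def dump(filename):
    # Serialize all collected records to a pickle file.
    with open(filename, 'wb') as f:
        pickle.dump(dict((k, m.data) for k, m in _mods.items()), f)

def clear():
    _mods.clear()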
Example #3
def spider_crawl():
    while True:
        print 'Crawling at gtagarage'
        print 'Please input a crawling range of ids (recommended: 0 - 20000)'
        r = raw_input('Please input: [min], [max] - to set the id range\n->')

        if r.count(',') != 1:
            print 'Please input with specific format: [min], [max]'
            pause()
            continue
        
        st, ed = r.split(',')
        st, ed = st.strip(), ed.strip()
        
        if not (st.isdigit() and ed.isdigit() and int(st) < int(ed)):
            print 'Please input with specific format: [min], [max]'
            pause()
            continue

        links = ['http://www.gtagarage.com/mods/show.php?id=%d'
                 % i for i in range(int(st), int(ed) + 1)]  # include [max]
        for link in links:
            spider = SpiderLinkPage(link)
            mod = modinfo.ModInfo(link)
            mod.updatekey('site', 'http://www.gtagarage.com')
            mod.updatekey('link', link)
            mod.updatekey('authorlink', spider.get_mod_authorlink())
            mod.updatekey('dldlink', spider.get_mod_dldlink())
            mod.updatekey('imglink', spider.get_mod_imglink())
            mod.updatekey('name', spider.get_mod_name())
            mod.updatekey('type', spider.get_mod_type())
            mod.updatekey('subtype', spider.get_mod_subtype())
            mod.updatekey('ver', spider.get_mod_gtaver())
            mod.updatekey('author', spider.get_mod_author())
            mod.updatekey('status', spider.get_mod_status())
            mod.updatekey('date', spider.get_mod_lastupdated())
            mod.updatekey('collecttime', strftime('%Y%m%d%H%M%S'))
            print 'Collected: %s' % link
            #mod.show()
        #modinfo.show()
        filename = 'gtagarage_%s.pkl' % strftime('%Y%m%d%H%M%S')
        modinfo.dump(filename)
        modinfo.clear()
        print 'Collect action at gtagarage finished.'
        print 'Data stored in file:', filename
        pause()
        break
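
Note: every example builds its timestamped filename with an unqualified strftime(), which assumes a module-level import along these lines:

from time import strftime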
Example #4
    def get_info(self, cur_depth):
        fac_depth = (
            self.maximum_depth
            if (self.depth >= self.maximum_depth or self.depth == 0)
            else self.depth)
        while cur_depth < fac_depth:
            cur_link = format("%s&start=%d&orderBy=" %
                              (self.link, cur_depth * 7))
            self.cont = spiderutils.openurlex(cur_link).read()
            cur_depth += 1

            # collect info
            name_iter = re.finditer(
                r'Title:</B></TD>\s+<TD><B>(.*?)</B></TD>', self.cont)
            author_iter = re.finditer(
                r'Author:</TD>\s+<TD>(.*?)</TD>', self.cont)
            date_iter = re.finditer(
                r'Date:</TD>\s+<TD>(.*?)</TD>', self.cont)
            img_iter = re.finditer(
                r'Image:</TD>\s+<TD><img src="(.*?)"><BR><BR></TD>', self.cont)
            # The same DOWNLOAD pattern is scanned twice on purpose: each
            # re.finditer iterator can only be consumed once, and the id is
            # needed for both the info-page and the download link below.
            id_iter_forview = re.finditer(
                r'<BR><center><a href="[\D]+(.*?)"><.*?><B>DOWNLOAD</B>'
                , self.cont)
            id_iter_fordld = re.finditer(
                r'<BR><center><a href="[\D]+(.*?)"><.*?><B>DOWNLOAD</B>'
                , self.cont)
            mod_name = (name.group(1) for name in name_iter)
            mod_author = (author.group(1) for author in author_iter)
            mod_date = (date.group(1) for date in date_iter)
            mod_img = (
                ("%s%s%s" % (
                    self.info["homepage"],
                    self.info["imglink"],
                    imglink))
                for imglink in (imglink.group(1) for imglink in img_iter))
            mod_infopage = (
                ("%s%s%d" % (
                    self.info["homepage"],
                    self.info["infopage"],
                    int(index))
                for index in (index.group(1) for index in id_iter_forview)))
            mod_dldlink = (
                ("%s%s%d" % (
                    self.info["homepage"],
                    self.info["dldlink"],
                    int(index))
                 for index in (index.group(1) for index in id_iter_fordld)))

            # store info
            for infopage in mod_infopage:
                mod = modinfo.ModInfo(infopage)
                mod.updatekey('site', 'http://www.gtainside.com')
                mod.updatekey('link', infopage)
                mod.updatekey('name', mod_name.next())
                mod.updatekey('type', get_type_fromlink(self.link))
                mod.updatekey('subtype', '')
                mod.updatekey('ver', get_ver_fromlink(self.link))
                mod.updatekey('imglink', mod_img.next())
                mod.updatekey('dldlink', mod_dldlink.next())
                mod.updatekey('author', mod_author.next())
                mod.updatekey('date', mod_date.next())
                mod.updatekey('collecttime', strftime('%Y%m%d%H%M%S'))
                print 'Collected: %s' % infopage
                #mod.show()
                #break

        #modinfo.show()
        filename = 'gtainside_%s.pkl' % strftime('%Y%m%d%H%M%S')
        modinfo.dump(filename)
        modinfo.clear()
        print 'Single collect action at gtainside finished.'
        print 'Data stored in file:', filename
        pause()
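
Example #4 fetches each page through spiderutils.openurlex(), which is also not shown. Judging from the single call site it must return a file-like object with read(); a plausible urllib2-based sketch (the retry count and User-Agent are assumptions, not the original helper):

# spiderutils.openurlex -- assumed sketch inferred from the call site above.
import urllib2

def openurlex(url, retries=3):
    # Open the URL with a browser-like User-Agent, retrying on failure.
    req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    for attempt in range(retries):
        try:
            return urllib2.urlopen(req)
        except urllib2.URLError:
            if attempt == retries - 1:
                raise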