# Example #1
# 0
def spider_crawl():
    while True:
        print 'Crawling at gtagarage'
        print 'Please input crawling range/index(Recommendation: 0 - 20000)'
        r = raw_input('Please input: [min], [max] - to ensure range/index\n->')

        if r.count(',') is not 1:
            print 'Please input with specific format: [min], [max]'
            pause()
            continue
        
        st, ed = r.split(',')
        st, ed = st.strip(), ed.strip()
        
        if not (st.isdigit() is True and ed.isdigit() is True and st < ed):
            print 'Please input with specific format: [min], [max]'
            pause()
            continue

        links = ['http://www.gtagarage.com/mods/show.php?id=%d'
                 % i for i in range(int(st), int(ed))]
        for link in links:
            spider = SpiderLinkPage(link)
            mod = modinfo.ModInfo(link)
            mod.updatekey('site', 'http://www.gtagarage.com')
            mod.updatekey('link', link)
            mod.updatekey('authorlink', spider.get_mod_authorlink())
            mod.updatekey('dldlink', spider.get_mod_dldlink())
            mod.updatekey('imglink', spider.get_mod_imglink())
            mod.updatekey('name', spider.get_mod_name())
            mod.updatekey('type', spider.get_mod_type())
            mod.updatekey('subtype', spider.get_mod_subtype())
            mod.updatekey('ver', spider.get_mod_gtaver())
            mod.updatekey('author', spider.get_mod_author())
            mod.updatekey('status', spider.get_mod_status())
            mod.updatekey('date', spider.get_mod_lastupdated())
            mod.updatekey('collecttime', strftime('%Y%m%d%H%M%S'))
            print 'Collected: %s' % link
            #mod.show()
        #modinfo.show()
        filename = 'gtagarage_%s.pkl' % strftime('%Y%m%d%H%M%S')
        modinfo.dump(filename)
        modinfo.clear()
        print 'Collect action at gtagarage finished.'
        print 'Data store at file:', filename
        pause()
        break
# Example #2
# 0
    def get_info(self, cur_depth):
        fac_depth = (
            self.maximum_depth
            if (self.depth >= self.maximum_depth or self.depth == 0)
            else self.depth)
        while cur_depth < fac_depth:
            cur_link = format("%s&start=%d&orderBy=" %
                              (self.link, cur_depth * 7))
            self.cont = spiderutils.openurlex(cur_link).read()
            cur_depth += 1

            # collect info
            name_iter = re.finditer(
                r'Title:</B></TD>\s+<TD><B>(.*?)</B></TD>', self.cont)
            author_iter = re.finditer(
                r'Author:</TD>\s+<TD>(.*?)</TD>', self.cont)
            date_iter = re.finditer(
                r'Date:</TD>\s+<TD>(.*?)</TD>', self.cont)
            img_iter = re.finditer(
                r'Image:</TD>\s+<TD><img src="(.*?)"><BR><BR></TD>', self.cont)
            id_iter_forview = re.finditer(
                r'<BR><center><a href="[\D]+(.*?)"><.*?><B>DOWNLOAD</B>'
                , self.cont)
            id_iter_fordld = re.finditer(
                r'<BR><center><a href="[\D]+(.*?)"><.*?><B>DOWNLOAD</B>'
                , self.cont)
            mod_name = (name.group(1) for name in name_iter)
            mod_author = (author.group(1) for author in author_iter)
            mod_date = (date.group(1) for date in date_iter)
            mod_img = (
                ("%s%s%s" % (
                    self.info["homepage"],
                    self.info["imglink"],
                    imglink))
                for imglink in (imglink.group(1) for imglink in img_iter))
            mod_infopage = (
                ("%s%s%d" % (
                    self.info["homepage"],
                    self.info["infopage"],
                    int(index))
                for index in (index.group(1) for index in id_iter_forview)))
            mod_dldlink = (
                ("%s%s%d" % (
                    self.info["homepage"],
                    self.info["dldlink"],
                    int(index))
                 for index in (index.group(1) for index in id_iter_fordld)))

            # store info
            for mod_infopage in mod_infopage:
                mod = modinfo.ModInfo(mod_infopage)
                mod.updatekey('site', 'http://www.gtainside.com')
                mod.updatekey('link', mod_infopage)
                mod.updatekey('name', mod_name.next())
                mod.updatekey('type', get_type_fromlink(self.link))
                mod.updatekey('subtype', '')
                mod.updatekey('ver', get_ver_fromlink(self.link))
                mod.updatekey('imglink', mod_img.next())
                mod.updatekey('dldlink', mod_dldlink.next())
                mod.updatekey('author', mod_name.next())
                mod.updatekey('date', mod_date.next())
                mod.updatekey('collecttime', strftime('%Y%m%d%H%M%S'))
                print 'Collected: %s' % mod_infopage
                #mod.show()
                #break

        #modinfo.show()
        filename = 'gtainside_%s.pkl' % strftime('%Y%m%d%H%M%S')
        modinfo.dump('gtainside_%s.pkl' % strftime('%Y%m%d%H%M%S'))
        modinfo.clear()
        print 'Single collect action at gtainside finished.'
        print 'Data store at file:', filename
        pause()