Example #1
 def __save_bucket(self, to_db=True):
     # Persist the collected crawl results either to the database
     # or, as a fallback, to flat files on disk.
     if to_db:
         storage = Storage()
         storage.connect('crawldb')
         storage.set_collection('crawlset')
         Helper.log("Insert count", len(self.crawlset_bucket))
         storage.insert_documents(self.crawlset_bucket)
     else:
         FileStorage.bulk_write(self.crawlset_bucket, 'content')
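A minimal usage sketch: a hypothetical public wrapper on the same class that guards the empty case before delegating; only __save_bucket and crawlset_bucket themselves come from the source.

 def save(self, to_db=True):
     # Hypothetical public entry point; to_db=False selects the
     # flat-file fallback instead of the database.
     if not self.crawlset_bucket:
         Helper.log("Nothing to save")
         return
     self.__save_bucket(to_db=to_db)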
Example #2
 def __finalize_crawlset_bucket(self):
     Helper.log("Adding content..")
     # De-duplicate the collected crawl results before serializing them.
     self.crawlset_bucket = set(self.crawlset_bucket)
     json_bucket = []
     for crawlset in self.crawlset_bucket:
         # Fetch the page body for any entry collected without one.
         if crawlset.content == '':
             Helper.log('Downloading content for URL', crawlset.link)
             crawlset.content = UrlGrabber.retrieve_html(crawlset.link)
             Helper.log('OK!')
         json_bucket.append(crawlset.to_dictionary())
     # Replace the bucket with plain dictionaries, ready for storage.
     self.crawlset_bucket = json_bucket
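The set() conversion only de-duplicates if the bucket items are hashable. A minimal sketch of what the item class might define, assuming link uniquely identifies a page; the class itself is not shown in the source, only the link, content, and to_dictionary members it must expose.

 class Crawlset:
     # Hypothetical item class; only its attribute and method names
     # are taken from the example above.
     def __init__(self, link, content=''):
         self.link = link
         self.content = content

     def __eq__(self, other):
         return isinstance(other, Crawlset) and self.link == other.link

     def __hash__(self):
         # Hash on the link so set() collapses duplicate URLs.
         return hash(self.link)

     def to_dictionary(self):
         return {'link': self.link, 'content': self.content}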
Example #3
 def __fill_crawlset_bucket(self):
     # Crawl up to four levels deep. Every level grabs at a fixed
     # depth, so sibling pages are always recorded at the same depth
     # and depth d is crawled whenever self.level >= d.
     if self.level < 0:
         return
     # Level 1
     Helper.log('Depth', 0)
     grabber1 = UrlGrabber(0, self.site_url, self.site_root)
     crawlsets1 = grabber1.grab()
     if crawlsets1 is None:
         return
     self.crawlset_bucket.extend(crawlsets1)
     # Level 2
     if self.level < 1:
         return
     Helper.log('Depth', 1)
     for set1 in crawlsets1:
         grabber2 = UrlGrabber(1, set1.link, self.site_root)
         crawlsets2 = grabber2.grab()
         if crawlsets2 is None:
             continue
         self.crawlset_bucket.extend(crawlsets2)
         # Level 3
         if self.level < 2:
             continue
         Helper.log('Depth', 2)
         for set2 in crawlsets2:
             grabber3 = UrlGrabber(2, set2.link, self.site_root)
             crawlsets3 = grabber3.grab()
             if crawlsets3 is None:
                 continue
             self.crawlset_bucket.extend(crawlsets3)
             # Level 4
             if self.level < 3:
                 continue
             Helper.log('Depth', 3)
             for set3 in crawlsets3:
                 grabber4 = UrlGrabber(3, set3.link, self.site_root)
                 crawlsets4 = grabber4.grab()
                 if crawlsets4 is not None:
                     self.crawlset_bucket.extend(crawlsets4)
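The hand-unrolled nesting above caps the crawl at four levels. A minimal sketch of a depth-agnostic alternative, driven by self.level and using the same UrlGrabber and Helper interfaces shown above; the breadth-first frontier handling is an assumption, not part of the source.

 def __fill_crawlset_bucket(self):
     # Breadth-first sketch: `frontier` holds the URLs discovered at
     # the previous depth; depth d is crawled whenever self.level >= d,
     # matching the unrolled version above.
     frontier = [self.site_url]
     for depth in range(self.level + 1):
         Helper.log('Depth', depth)
         next_frontier = []
         for url in frontier:
             crawlsets = UrlGrabber(depth, url, self.site_root).grab()
             if crawlsets is None:
                 continue
             self.crawlset_bucket.extend(crawlsets)
             next_frontier.extend(item.link for item in crawlsets)
         frontier = next_frontier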