Example #1
    def scrape(self, targeturl):
        target = get_target(targeturl)
        log.info('Scraping user %s at URL: %s', target, targeturl)
        # Scrape posts first because the crawler is already on the timeline.
        if self.settings['posts']:
            self.mapping['posts'](targeturl)
        # Dispatch every other enabled scrape type through the mapping table.
        for key, value in self.settings.items():
            if value and key != 'posts':
                self.mapping[key](targeturl)
        log.info('Finished scraping user %s', target)
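The dispatch loop above assumes a settings dict of per-scrape-type flags and a mapping dict from those same keys to bound scrape methods; neither is shown in these examples. A minimal sketch of how they might be wired up (illustrative, not the project's actual initialization):

    def __init__(self):
        # One boolean flag per scrape type; keys must match self.mapping.
        self.settings = {'posts': True, 'about': True, 'checkins': False}
        # Dispatch table from setting key to the bound scrape method.
        self.mapping = {
            'posts': self.scrape_posts_by_year,
            'about': self.scrape_about,
            'checkins': self.scrape_checkins,
        }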
Example #2
    def scrape_about(self, targeturl):
        target = get_target(targeturl)
        rec = record.Record(self._output_file(target, 'about'), ['section', 'text'])

        def callback(section, content):
            rec.add_record({'section': section, 'text': content})
            log.info('Scraped section %s with the following text:\n#### START ####\n%s\n####  END  ####',
                     section, content)

        self.crawl_about(targeturl, callback)
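Each of the tabular examples here assumes a record.Record helper exposing a filename attribute and an add_record(dict) method that appends one row. A minimal CSV-backed sketch of that assumed interface (not the project's actual implementation):

    import csv

    class Record:
        """Writes dict rows to a CSV file with a fixed set of columns."""

        def __init__(self, filename, fields):
            self.filename = filename
            self._fh = open(filename, 'w', newline='', encoding='utf-8')
            self._writer = csv.DictWriter(self._fh, fieldnames=fields)
            self._writer.writeheader()

        def add_record(self, row):
            self._writer.writerow(row)
            self._fh.flush()  # keep output usable even if the scrape aborts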
Example #3
    def scrape_checkins(self, targeturl):
        target = get_target(targeturl)
        rec = record.Record(self._output_file(target, 'checkins'), ['name', 'url'])

        def callback(name, url, i):
            rec.add_record({'name': name, 'url': url})
            log.info('Scraped check-in %d: %s', i, name)

        scraped = self.crawl_checkins(targeturl, callback)
        log.info('Scraped %d check-ins into %s', scraped, rec.filename)
Example #4
    def scrape_likes(self, targeturl):
        target = get_target(targeturl)
        rec = record.Record(self._output_file(target, 'likes'), ['name', 'url'])
        log.info('Scraping likes into %s', rec.filename)

        def callback(name, page_url, i):
            rec.add_record({'name': name, 'url': page_url})
            log.info('Scraped like %d: %s', i, name)

        likes_scraped = self.crawl_likes(targeturl, callback)
        log.info('Scraped %d likes into %s', likes_scraped, rec.filename)
Example #5
    def scrape_friends(self, targeturl):
        target = get_target(targeturl)
        rec = record.Record(self._output_file(target, 'friends'), ['name', 'profile'])
        log.info('Scraping friends into %s', rec.filename)

        def callback(name, url, _imgurl, i):
            # The crawler also supplies a profile image URL, which is not recorded.
            friend_url = strip_query(url)
            rec.add_record({'name': name, 'profile': friend_url})
            log.info('Scraped friend %d: %s', i, name)

        friends_scraped = self.crawl_friends(targeturl, callback)
        log.info('Scraped %d friends into %s', friends_scraped, rec.filename)
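strip_query is not defined in these examples; presumably it normalizes a friend's profile URL by dropping the query string. A sketch of that assumed behavior:

    from urllib.parse import urlsplit, urlunsplit

    def strip_query(url):
        """Return the URL without its query string or fragment (assumed)."""
        scheme, netloc, path, _query, _fragment = urlsplit(url)
        return urlunsplit((scheme, netloc, path, '', ''))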
Example #6
    def scrape_photos(self, targeturl):
        """Scrapes the targets photos and only the photos on the target's photo page.
        Photos in albums are not scraped.
        """
        target = get_target(targeturl)
        # scrape main photos
        photo_album = record.Album(self._output_file(target, 'photos'), True)

        def photo_cb(photourl, description, perma, _):
            self._save_to_album(photourl, description, perma, photo_album)

        photos_scraped = self.crawl_photos(targeturl, photo_cb)
        log.info('Scraped %d photos into %s', photos_scraped, photo_album.name)
Example #7
    def scrape_all_albums(self, targeturl):
        target = get_target(targeturl)

        def album_cb(name, url, _):
            """What to do for each album.
            """
            album_name = 'album-' + path_safe(name)
            album = record.Album(self._output_file(target, album_name), True)

            def album_download_cb(photourl, perma, _):
                self._save_to_album(photourl, '', perma, album)

            scraped = self.crawl_one_album(url, album_download_cb)
            log.info('Scraped %d photos into %s', scraped, album.name)

        self.crawl_albums(targeturl, album_cb)
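path_safe is likewise not shown; it presumably sanitizes the album name for use in a file path. A sketch of that assumed behavior:

    import re

    def path_safe(name):
        """Replace characters that are unsafe in file names (assumed)."""
        return re.sub(r'[^\w\-.]+', '_', name)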
Example #8
    def scrape_posts_by_year(self, targeturl, year=None):
        target = get_target(targeturl)
        rec_name = 'posts'
        if year:
            rec_name += '_' + str(year)
        rec = record.Record(self._output_file(target, rec_name), ['date', 'post', 'translation', 'permalink'])
        log.info('Scraping posts into %s', rec.filename)

        def callback(p_time, post_text, p_link, translation, i):
            rec.add_record({
                'date': timestring(p_time),
                'post': post_text,
                'translation': translation,
                'permalink': p_link,
            })

            # Wrap the translation in a banner so it stands out in the log;
            # default to an empty string so the log never shows 'None'.
            translation = '==== TRANSLATION ====\n{}\n'.format(translation) if translation else ''
            log.info(('Scraped post %d\n\n#### START POST ####\n%s\n%s'
                      '####  END POST  ####\n'), i, post_text, translation)

        posts_scraped = self.crawl_posts(targeturl, callback, year)
        log.info('Scraped %d posts into %s', posts_scraped, rec.filename)
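timestring is also assumed: it converts the post's timestamp into the 'date' column's display string. A sketch, assuming the crawler reports Unix epoch seconds:

    from datetime import datetime

    def timestring(epoch_seconds):
        """Format a Unix timestamp as a readable date string (assumed)."""
        return datetime.fromtimestamp(epoch_seconds).strftime('%Y-%m-%d %H:%M:%S')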