Exemplo n.º 1
0
class YouJizzScraper():
    """Scrape video pages and publish every unseen video through a post creator.

    The collaborators (browser, HTML helper, parser, post creator) are handed
    to the scraping methods as arguments; ``main`` builds them for the
    youjizz homepage.

    ``print`` is always called with one parenthesised argument, which prints
    the same text under Python 2 and also parses under Python 3.
    """

    # Argument given to ``wpPost.get_posts`` when loading the existing posts
    # that the duplicate check compares against.
    POST_HISTORY_LIMIT = 10000

    def __init__(self):
        """Make ``$HOME/ScraperBot`` importable and create the data handler."""
        homeDirectory = os.getenv("HOME")
        # HOME can be unset (cron jobs, service managers); concatenating None
        # with a string would raise TypeError before the bot even starts.
        if homeDirectory:
            scraperPath = os.path.join(homeDirectory, "ScraperBot")
            # Instances can be created repeatedly; keep sys.path free of
            # duplicate entries.
            if scraperPath not in sys.path:
                sys.path.append(scraperPath)
        self.dataHandler = DataHandler()

    def scrape_videos(self, br, htmlscraper, parser, wpPost, videoUrls):
        """Create a post for every URL in ``videoUrls`` that is not posted yet.

        Args:
            br: browser object; ``br.scrap_website(url)`` returns the page HTML.
            htmlscraper: helper used for title/category/iframe string handling.
            parser: site parser used to extract title, thumbnail, video id
                and duration.
            wpPost: post creator providing ``get_posts`` and ``createPost``.
            videoUrls: sequence of video page URLs.

        A failure while handling one URL is reported and that URL is skipped;
        the remaining URLs are still processed.
        """
        postList = wpPost.get_posts(self.POST_HISTORY_LIMIT)
        total = len(videoUrls)
        for i, videoUrl in enumerate(videoUrls):
            try:
                print("---------------------" + str(i) + " from " + str(total) + "------------------------")
                title = htmlscraper.convert_hypen_into_space(parser.split_url(videoUrl))
                print("title: " + htmlscraper.uppercase_first_letter_from_string(title))
                if self.dataHandler.is_this_item_on_the_list(title, postList):
                    print("Content already posted")
                    continue
                print("Video scraping started ...")
                tags = htmlscraper.convert_title_to_categories(str(title))
                soup = BeautifulSoup(br.scrap_website(videoUrl))
                thumbnail = parser.get_thumbnail(soup)
                print("thumbnail: " + thumbnail)
                paraVideo = parser.parse_video_id(videoUrl)
                iframe = parser.create_video_iframe(paraVideo[0], paraVideo[1])
                print("iframe: " + iframe)
                video_duration = parser.get_duration(soup)
                print("video duration: " + video_duration)
                embedurl = htmlscraper.parse_src_from_video_iframe(iframe)
                print("embedurl " + embedurl)
                duration_for_snippets = parser.prepare_duration_for_snippets(video_duration)
                print("duration for snippets: " + duration_for_snippets)
                print("Wordpress post creator starting ...")
                wpPost.createPost(title, thumbnail, iframe, video_duration, duration_for_snippets, tags, embedurl)
                print("Scraped video [OK]")
            except Exception as error:
                # Best effort: one broken page must not abort the batch, but
                # the failure has to be visible. Only Exception is caught so
                # that KeyboardInterrupt/SystemExit still stop the bot.
                # repr() keeps the message ASCII-safe for non-ASCII URLs.
                print("Scraping failed for " + repr(videoUrl) + ": " + repr(error))

    def scrape_from_category(self, br, htmlscraper, parser, wpPost, categoryUrls, scraper):
        """Scrape the videos listed on every category page in ``categoryUrls``.

        Each category page is fetched with ``br``, its video URLs are
        de-duplicated and passed to ``scraper.scrape_videos``.
        """
        print("scraping videos from categories")
        for categoryUrl in categoryUrls:
            soup = BeautifulSoup(br.scrap_website(categoryUrl))
            categoryVideoUrls = list(set(parser.getUrlsFromVideos(soup)))
            scraper.scrape_videos(br, htmlscraper, parser, wpPost, categoryVideoUrls)

    def main(self):
        """Build the collaborators and collect the homepage video/category URLs."""
        print("Youjizz scraper bot is starting ...")
        br = common.startBrowser.BotBrowser()
        homepage = br.scrap_website('http://www.youjizz.com')
        htmlscraper = common.html_tag_parser.HtmlTagParser(homepage)
        # NOTE(review): a Youporn parser is used for the youjizz homepage;
        # confirm this is the intended parser.
        parser = parsers.parser_youporn.YoupornParser(homepage)
        wpPost = common.postCreator.PostCreator()
        # Use this instance instead of constructing a second scraper.
        scraper = self
        soup = BeautifulSoup(homepage)
        totalUrlsVideos = list(set(parser.getUrlsFromVideos(soup)))
        totalUrlsCategories = list(set(parser.getUrlsFromCategories(soup)))
        # NOTE(review): both scraping passes are disabled, so main() currently
        # only fetches and parses the homepage. Confirm whether that is
        # intentional before re-enabling them.
        #scraper.scrape_videos(br, htmlscraper, parser, wpPost, totalUrlsVideos)
        #scraper.scrape_from_category(br, htmlscraper, parser, wpPost, totalUrlsCategories, scraper)
        print("Youjizz scraper bot is finishing ...")
Exemplo n.º 2
0
class xVideosScraper():
    """Scrape video pages and publish every unseen video through a post creator.

    The collaborators (browser, HTML helper, parser, post creator) are handed
    to the scraping methods as arguments; ``main`` builds them for the
    xvideos homepage.

    ``print`` is always called with one parenthesised argument, which prints
    the same text under Python 2 and also parses under Python 3.
    """

    # Argument given to ``wpPost.get_posts`` when loading the existing posts
    # that the duplicate check compares against.
    # NOTE(review): presumably the number of existing posts fetched; with a
    # value this small, videos posted earlier may not be recognised as
    # duplicates. Confirm the limit is intended.
    POST_HISTORY_LIMIT = 10

    def __init__(self):
        """Create the data handler used for the duplicate check."""
        self.dataHandler = DataHandler()

    def scrape_videos(self, br, htmlscraper, parser, wpPost, videoUrls):
        """Create a post for every URL in ``videoUrls`` that is not posted yet.

        Args:
            br: browser object; ``br.scrap_website(url)`` returns the page HTML.
            htmlscraper: helper used for title/category/iframe string handling.
            parser: site parser used to extract title, thumbnail, video id
                and duration.
            wpPost: post creator providing ``get_posts`` and ``createPost``.
            videoUrls: sequence of video page URLs.

        A failure while handling one URL is reported and that URL is skipped;
        the remaining URLs are still processed.
        """
        postList = wpPost.get_posts(self.POST_HISTORY_LIMIT)
        total = len(videoUrls)
        for i, videoUrl in enumerate(videoUrls):
            try:
                print("---------------------" + str(i) + " from " + str(total) + "------------------------")
                title = htmlscraper.convert_underscore_into_space(parser.split_url(videoUrl))
                print("title: " + htmlscraper.uppercase_first_letter_from_string(title))
                if self.dataHandler.is_this_item_on_the_list(title, postList):
                    print("Content already posted")
                    continue
                print("url " + videoUrl)
                title_as_categories = htmlscraper.convert_hypen_into_space(title)
                categories = htmlscraper.convert_string_into_categories(title_as_categories)
                print("title convert to categories: " + str(categories))
                soup = BeautifulSoup(br.scrap_website(videoUrl))
                print("video page scraped ")
                duration = parser.get_video_duration(soup)
                # The snippet duration is derived from the raw value, before
                # the "min" suffix is appended for display.
                duration_for_snippets = parser.prepare_duration_for_snippets(duration)
                print("duration for snippets: " + duration_for_snippets)
                duration = duration + "min"
                print("duration " + duration)
                thumbnail = parser.get_thumbnail(soup)
                print("thumbnail: " + thumbnail)
                video_id = parser.get_video_id(videoUrl)
                iframe_object = parser.create_video_iframe(video_id)
                print("iframe: " + iframe_object)
                embedurl = htmlscraper.parse_src_from_video_iframe(iframe_object)
                print("embedurl " + embedurl)
                print("Wordpress post creator starting ...")
                wpPost.createPost(title, thumbnail, iframe_object, duration, duration_for_snippets, categories, embedurl)
                print("Scraped video [OK]")
            except Exception as error:
                # Best effort: one broken page must not abort the batch, but
                # the failure has to be visible. Only Exception is caught so
                # that KeyboardInterrupt/SystemExit still stop the bot.
                # repr() keeps the message ASCII-safe for non-ASCII URLs.
                print("Scraping failed for " + repr(videoUrl) + ": " + repr(error))

    def scrape_from_category(self, br, htmlscraper, parser, wpPost, categoryUrls, scraper):
        """Scrape the videos listed on every category page in ``categoryUrls``.

        Each category page is fetched with ``br``, its video URLs are
        de-duplicated and passed to ``scraper.scrape_videos``.
        """
        print("scraping videos from categories")
        for categoryUrl in categoryUrls:
            soup = BeautifulSoup(br.scrap_website(categoryUrl))
            categoryVideoUrls = list(set(parser.getUrlsFromVideos(soup)))
            scraper.scrape_videos(br, htmlscraper, parser, wpPost, categoryVideoUrls)

    def main(self):
        """Scrape the homepage videos, then the videos of every category."""
        print("xVideos scraper bot is starting ...")
        br = common.startBrowser.BotBrowser()
        homepage = br.scrap_website('http://www.xvideos.com/')
        htmlscraper = common.html_tag_parser.HtmlTagParser(homepage)
        parser = parsers.parser_xvideos.XvideosParser(homepage)
        wpPost = common.postCreator.PostCreator()
        soup = BeautifulSoup(homepage)
        totalUrlsVideos = list(set(parser.getUrlsFromVideos(soup)))
        totalUrlsCategories = list(set(parser.getUrlsFromCategories(soup)))
        # Use this instance instead of constructing a second scraper.
        self.scrape_videos(br, htmlscraper, parser, wpPost, totalUrlsVideos)
        self.scrape_from_category(br, htmlscraper, parser, wpPost, totalUrlsCategories, self)
        print("Scraping finished")
Exemplo n.º 3
0
 def __init__(self):
     """Make ``$HOME/ScraperBot`` importable and create the data handler."""
     homeDirectory = os.getenv("HOME")
     # HOME can be unset (cron jobs, service managers); concatenating None
     # with a string would raise TypeError before the object is usable.
     if homeDirectory:
         scraperPath = os.path.join(homeDirectory, "ScraperBot")
         # Instances can be created repeatedly; keep sys.path free of
         # duplicate entries.
         if scraperPath not in sys.path:
             sys.path.append(scraperPath)
     self.dataHandler = DataHandler()
Exemplo n.º 4
0
 def __init__(self):
     """Create a DataHandler and keep it on the instance as ``dataHandler``."""
     handler = DataHandler()
     self.dataHandler = handler