Example #1
    def run(self, args, opts):
        """
        Entry point for running commands
        """
        model = Movie()
        for v in model.detect_language():
            self.detect_language(v[1])
            self.detect_language(v[2])
Example #2
    def sequentialy(self):
        t = time()
        video = Movie()

        # Page through the table in chunks of 100 rows and index the
        # documents one at a time (one HTTP request per document).
        i = 0
        while True:
            chunk_videos = video.get_chunk(i)
            for v in chunk_videos:
                res = es.index(index="test2-index", doc_type='video-test2',
                               id=v[0], body=dict(v))
                print('first res', res)

            print(len(chunk_videos))

            # A short chunk means the last page has been reached.
            if len(chunk_videos) < 100:
                break
            i += 100

        print(time() - t)
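Note: this snippet relies on a module-level Elasticsearch client named `es` that is not shown. A minimal sketch of how such a client might be created (the host is an assumption):

# Hypothetical setup for the module-level `es` client used above.
from elasticsearch import Elasticsearch

es = Elasticsearch(['localhost:9200'])  # assumed local cluster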
Example #3
class TestModelVideo(object):
    movie = Movie()

    def test_find_id(self):
        row = self.movie.find(1)
        assert row[0] == 1

    def test_find_last_id(self):
        assert isinstance(self.movie.find_last_id(), str)

    def test_find_all_id(self):
        assert isinstance(self.movie.ids(), set)
Example #4
    def gendata(self):
        t = time()
        model = Movie()
        i = 0

        # Page through the table in chunks of 100 and yield one bulk
        # action per row.
        while True:
            chunk_videos = model.get_chunk(i)
            for v in chunk_videos:
                yield {
                    "_index": "i",
                    "_type": "t",
                    "_id": v[0],
                    "_source": dict(v)
                }

            print(len(chunk_videos))

            if len(chunk_videos) < 100:
                break

            i += 100

        print(time() - t)
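The `gendata` generator yields actions in the shape expected by the elasticsearch-py bulk helper. A minimal sketch of how it would typically be consumed (the client setup and the `indexer` object exposing `gendata` are assumptions, not part of the original):

# Hypothetical call site for the gendata() generator above.
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

es = Elasticsearch()                      # assumed local cluster
ok, errors = bulk(es, indexer.gendata())  # indexer: any object exposing gendata()
print(ok, errors)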
Example #5
    def SaveMovie(self, item):
        try:
            Movie().insert(item)
        except Exception as e:
            print(e)
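In a Scrapy project this kind of save logic usually lives in an item pipeline. A hedged sketch of how `SaveMovie` could be wrapped in `process_item` (the pipeline class name is an assumption):

# Hypothetical item pipeline built around the SaveMovie logic above.
class MoviePipeline:
    def process_item(self, item, spider):
        try:
            Movie().insert(item)       # same insert call as SaveMovie
        except Exception as e:
            spider.logger.error(e)     # log instead of printing
        return item                    # pipelines must return the item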
Example #6
class YoutubeSpider(Spider):
    name = 'youtube'
    allowed_domains = ['youtube.com']
    youtube: str = 'https://www.youtube.com/'
    model = Movie()
    all_categories = MovieCategory().all_name()
    start_yt_id: str = "EplXWaTek5o"  # or TODO random
    black_list_yt_ids: set = model.ids()
    start_urls = [f"{youtube}watch?v={start_yt_id}"]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Report how many ids are blacklisted and roughly how much memory the
        # set uses; getsizeof returns bytes, so divide by 1_000_000 for MB.
        print(len(self.black_list_yt_ids),
              str(int(getsizeof(self.black_list_yt_ids) / 1_000_000)) + 'mb')

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url)

    def parse_url(self, url, param='v'):
        parsed = urlparse.urlparse(url)
        return urlparse.parse_qs(parsed.query)[param]

    def parse(self, r):
        # Prefer the videoId meta tag; fall back to the ?v= query parameter.
        yt_id = r.xpath("//meta[@itemprop='videoId']/@content").get(
            self.parse_url(r.url, 'v')[0])
        if yt_id:
            if yt_id not in self.black_list_yt_ids:
                self.black_list_yt_ids.add(yt_id)

                tags = r.xpath(
                    "//meta[@property='og:video:tag']/@content").getall()
                tags = ','.join(set(x.lower() for x in tags))
                tags = self.optymize_text(tags)

                # Duration is ISO 8601, e.g. PT1H18M14S; convert it to seconds
                # (assumes `import re` at module level, not shown here).
                duration = r.xpath(
                    "//meta[@itemprop='duration']/@content").get('PT0M0S')
                duration = sum(
                    int(n) * {'H': 3600, 'M': 60, 'S': 1}[u]
                    for n, u in re.findall(r'(\d+)([HMS])', duration))

                sentiments = r.xpath(
                    "//div[@id='watch8-sentiment-actions']//span[@class='yt-uix-button-content']/text()"
                ).getall()
                likeCount = int(sentiments[0])
                dislikeCount = int(sentiments[-1])

                yield {
                    'title': self.optymize_text(
                        r.xpath('//title/text()').get(None)),
                    'description': self.optymize_text(
                        r.xpath('//p[@id="eow-description"]/text()').get('')),
                    'tags': tags,
                    'regions_allowed': r.xpath(
                        "//meta[@itemprop='regionsAllowed']/@content").get(),
                    'is_family_frendly': int('True' == r.xpath(
                        "//meta[@itemprop='isFamilyFriendly']/@content").get(0)),
                    'yt_id': yt_id,
                    'width': r.xpath(
                        "//meta[@itemprop='width']/@content").get(),
                    'height': r.xpath(
                        "//meta[@itemprop='height']/@content").get(),
                    'interaction_count': int(r.xpath(
                        "//meta[@itemprop='interactionCount']/@content").get(0)),
                    'date_published': r.xpath(
                        "//meta[@itemprop='datePublished']/@content").get(),
                    'duration': duration,
                    'channel': r.xpath(
                        "//meta[@itemprop='channelId']/@content").get(),
                    'channel_title': r.xpath(
                        "//div[@class='yt-user-info']/a/text()").get(None),
                    'likeCount': likeCount,
                    'dislikeCount': dislikeCount,
                    'category': self.all_categories.index(r.xpath(
                        "//meta[@itemprop='genre']/@content").get('None')),
                    'language': r.xpath(
                        '//span[contains(@class, "content-region")]/text()').get('')
                }
        else:
            print('no yt_id found for', r.url)

        for href in r.xpath("//a[contains(@href,'watch?v=')]/@href"):
            yield r.follow(href, callback=self.parse)

    def optymize_text(self, text=None):
        # Keep only alphanumerics, whitespace, commas and dots; strip the
        # 'YouTube' suffix and collapse double spaces.
        if text is None:
            return None
        return ''.join(
            c for c in text
            if c.isalnum() or c.isspace() or c == ',' or c == '.').replace(
                'YouTube', '').replace('  ', ' ').strip()

    def meta(self, r, name):
        # Helper for reading an arbitrary itemprop meta tag.
        return r.xpath(f"//meta[@itemprop='{name}']/@content").get()
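One way to run the spider outside of the usual `scrapy crawl youtube` command is Scrapy's CrawlerProcess; a minimal sketch (the settings values are assumptions):

# Hypothetical standalone runner for YoutubeSpider.
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
process.crawl(YoutubeSpider)
process.start()  # blocks until the crawl finishes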