Exemplo n.º 1
0
    def parse_meta(json_spec: Dict) -> BotJsonMeta:
        """Build a ``BotJsonMeta`` from a decoded bot specification.

        ``name``, ``race`` and ``botType`` are read by direct subscription,
        so they must be present in ``json_spec``. Every other field is
        optional and is stored as ``None`` when its key is absent.

        :param json_spec: mapping holding the bot description.
        :return: the populated ``BotJsonMeta`` instance.
        """
        meta = BotJsonMeta()
        meta.name = json_spec['name']
        race_key = json_spec['race'].upper()
        meta.race = PlayerRace[race_key]

        # "JAVA_JNI" and "JAVA_MIRROR" are both looked up as plain "JAVA".
        type_key = json_spec['botType']
        if type_key in ("JAVA_JNI", "JAVA_MIRROR"):
            type_key = "JAVA"
        meta.botType = BotType[type_key]

        meta.description = json_spec.get('description')

        # The date is only parsed when the key exists; otherwise it is None.
        if 'update' in json_spec:
            meta.update = parse_iso_date(json_spec['update'])
        else:
            meta.update = None

        # Remaining optional fields share their name with the JSON key.
        for field in ('botBinary', 'bwapiDLL', 'botProfileURL',
                      'javaDebugPort', 'javaOpts'):
            setattr(meta, field, json_spec.get(field))

        return meta
Exemplo n.º 2
0
def format_datetime_like(dt_object):
    """Return ``dt_object`` as a string formatted with ``ISO8601_FORMAT_Z``.

    * ``None`` is returned unchanged.
    * A ``str`` is passed through ``parse_iso_date`` purely as a validity
      check (any exception it raises propagates) and then returned as is.
    * A ``datetime`` is converted to UTC before being formatted.
    * Anything else is formatted directly through its ``strftime`` method.
    """
    if dt_object is None:
        return None

    if isinstance(dt_object, str):
        # Result is discarded: the call only validates the string.
        parse_iso_date(dt_object)
        return dt_object

    if isinstance(dt_object, dt.datetime):
        dt_object = dt_object.astimezone(dt.timezone.utc)

    return dt_object.strftime(ISO8601_FORMAT_Z)
Exemplo n.º 3
0
    def parse_meta(json_spec: Dict):
        """Build a ``BotJsonMeta`` from a decoded bot specification.

        Every key (``name``, ``race``, ``description``, ``botType``,
        ``update``, ``botBinary``, ``bwapiDLL``, ``botProfileURL``) is read
        by direct subscription, so all of them must be present.

        :param json_spec: mapping holding the bot description.
        :return: the populated ``BotJsonMeta`` instance.
        """
        meta = BotJsonMeta()
        meta.name = json_spec['name']
        race_key = json_spec['race'].upper()
        meta.race = PlayerRace[race_key]
        meta.description = json_spec['description']

        # "JAVA_JNI" and "JAVA_MIRROR" are both looked up as plain "JAVA".
        type_key = json_spec['botType']
        if type_key in ("JAVA_JNI", "JAVA_MIRROR"):
            type_key = "JAVA"
        meta.botType = BotType[type_key]

        update_raw = json_spec['update']
        meta.update = parse_iso_date(update_raw)

        # These fields are copied verbatim under the same name as their key.
        for field in ('botBinary', 'bwapiDLL', 'botProfileURL'):
            setattr(meta, field, json_spec[field])

        return meta
Exemplo n.º 4
0
    def task_post(self, grab, task):
        print 'task_post', grab.response.url

        self.platform.inc_api_calls()

        title = content = date = None

        # The Post's URL
        url = grab.response.url
        print "Checking url %s" % url
        # The Post's Title
        post_title = grab.doc.select(
            '//div[contains(@id,"Blog")]//*[contains(@class,"post-title") or contains(@class,"postTitle")]'
        )
        if not post_title.count():
            post_title = grab.doc.select(
                '//div[contains(@id,"Blog")]//div[contains(@class,"Post")]//*[contains(@class,"PostHeader")]'
            )
            if not post_title.count():
                post_title = grab.doc.select(
                    '//div[contains(@id,"Blog")]/div[@class="post"]/text()[1]')
        if post_title.count():
            title = post_title.text().strip()

        # The Post's Content
        post_content = grab.doc.select(
            '//div[contains(@id,"Blog")]//div[(contains(@class,"post-body") or contains(@class,"postBody"))]'
        )
        if not post_content.count():
            post_content = grab.doc.select(
                '//div[contains(@id,"Blog")]//div[contains(@class,"Post")]//div[contains(@class,"PostContent")]'
            )
        if post_content.count():
            content = post_content.html()
        else:
            return

        # The post date can be present in different variations
        # That's why we try to extract it from <abbr> (ISO format), after that from the date-header and finally from the URL
        # In the header the date can be present in different formats: "Tuesday, 6 December 2011", "February 18, 2014", etc.
        # Sometimes (http://www.tieandi.com/2014/02/valentines-day-wishlist.html) it's written not in English
        # In this case we can only get year and month from the url ("/2014/02/"") and we set day=1
        # We work with "date" over and over again, until this value is changed from None
        # But if the date is None after all tries... well, I have no idea about
        # when this post is published, may be a few thousands years ago?
        post_date = grab.doc.select(
            '//div[contains(@class,"post-footer")]//abbr[contains(@class,"published")]/@title'
        )
        # <abbr>
        if post_date.count():
            iso_date = post_date.text().strip()
            try:
                date = parse_iso_date(iso_date)
            except ValueError:
                # bad ISO-format? let's try another way
                pass
        # "date-header"
        if not date:
            post_date = grab.doc.select(
                '//*[contains(@class,"date-header")]//span')
            if not post_date.count():
                post_date = grab.doc.select(
                    '//*[contains(@class,"date-header")]')
                if not post_date.count():
                    post_date = grab.doc.select(
                        '//div[contains(@class,"post")]/*[contains(@class,"postAuthor")]/a'
                    )
            if post_date.count():
                YMDhm = self.parse_date(post_date.text())
                if YMDhm:
                    date = datetime(*YMDhm)
            # URL
            if not date:
                url_path = urlparse(grab.response.url).path
                if url_path.startswith('/'):
                    url_path_parts = url_path.split('/')
                    month = year = None
                    year_from_url = url_path_parts[1]
                    month_from_url = url_path_parts[2]
                    if month_from_url.isdigit() and int(
                            month_from_url) in range(1, 13):
                        month = int(month_from_url)
                    if year_from_url.isdigit() and int(year_from_url) in range(
                            2000, 2020):
                        year = int(year_from_url)
                    if month and year:
                        date = datetime(year, month, 1, 0, 0)

        post = models.Posts()
        post.influencer = self.platform.influencer
        post.show_on_search = self.platform.influencer.show_on_search
        post.platform = self.platform
        post.title = title
        post.url = url
        post.content = content
        post.create_date = date
        post.save()
        self._inc('posts_saved')
        print "Created post: %s " % post
        self.posts[post] = []

        # Comments
        # This section also can be present in different variations
        comments = []
        # The first type
        comments_blocks = grab.doc.select(
            '//div[contains(@id,"comments")]//dl[contains(@id,"comments-block")]'
        )
        if comments_blocks.count():
            comments_authors = comments_blocks.select(
                './/dt[contains(@class,"author")]')
            for author in comments_authors:
                author_name = author_url = url = None
                author_a = author.select('./a[@rel="nofollow"]')
                if not author_a.count():
                    author_a = author.select('.//a[@rel="nofollow"]')
                if author_a.count():
                    author_name = author_a.text().strip()
                    try:
                        author_url = author_a.attr('href')
                    except Exception:
                        pass
                comment_body = author.select(
                    './following-sibling::dd[contains(@class,"comment-body")][1]'
                )
                if comment_body.count():
                    content = comment_body.html()
                    # content = ' '.join([_.text().strip() for _ in comment_body.select('.//text()')])
                else:
                    continue
                comment_footer = author.select(
                    './following-sibling::dd[contains(@class,"comment-footer")][1]'
                )
                if comment_footer.count():
                    comment_timestamp = comment_footer.select(
                        './/span[contains(@class,"comment-timestamp")]/a')
                else:
                    comment_timestamp = author.select(
                        './/span[contains(@class,"comment-timestamp")]/a')
                if comment_timestamp.count():
                    try:
                        url = comment_timestamp.attr('href')
                    except Exception:
                        pass
                    timestamp = comment_timestamp.text()
                    if date and not str(
                            date.year)[:-1] in timestamp and ':' in timestamp:
                        timestamp = '.'.join([
                            str(_) for _ in (date.month, date.day, date.year)
                        ]) + ' ' + timestamp
                    YMDhm = self.parse_date(timestamp)
                    if YMDhm:
                        date = datetime(*YMDhm)
                comments.append(
                    dict(
                        author_name=author_name,
                        author_url=author_url,
                        content=content,
                        date=date,
                        url=url,
                    ))
        # The second type
        else:
            comments_blocks = grab.doc.select(
                '//div[@*="comments"]//div[@*="comment-header"]')
            for comment_header in comments_blocks:
                author_name = author_url = url = None
                author = comment_header.select(
                    './/cite[contains(@class,"user")]')
                if author.count():
                    author_a = author.select('./a')
                    if author_a.count():
                        author_name = author_a.text().strip()
                        try:
                            author_url = author_a.attr('href')
                        except Exception:
                            pass
                    else:
                        author_name = author.text().strip()
                else:
                    author = comment_header.select(
                        './/a[contains(@class, "autor-name")]')
                    if author.count():
                        author_name = author.text().strip()
                        try:
                            author_url = author.attr('href')
                        except Exception:
                            pass
                comment_timestamp = comment_header.select(
                    './/span[contains(@class,"datetime") or contains(@class,"timestamp") or contains(@id,"timestamp")]/a'
                )
                if comment_timestamp.count():
                    try:
                        url = comment_timestamp.attr('href')
                    except Exception:
                        pass
                    timestamp = comment_timestamp.text()
                    if date and not str(
                            date.year)[:-1] in timestamp and ':' in timestamp:
                        timestamp = '.'.join([
                            str(_) for _ in (date.month, date.day, date.year)
                        ]) + ' ' + timestamp
                    YMDhm = self.parse_date(timestamp)
                    if YMDhm:
                        date = datetime(*YMDhm)
                comment_body = comment_header.select(
                    './/p[contains(@class,"comment-content") or contains(@class,"comment-body")]'
                )
                if not comment_body.count():
                    comment_body = comment_header.select(
                        './following-sibling::p[contains(@class,"comment-content") or contains(@class,"comment-body")][1]'
                    )
                if comment_body.count():
                    content = comment_body.html()
                    # content = ' '.join([_.text().strip() for _ in comment_body.select('.//text()')])
                else:
                    continue
                comments.append(
                    dict(
                        author_name=author_name,
                        author_url=author_url,
                        content=content,
                        date=date,
                        url=url,
                    ))
        print "got %d comments " % len(comments)
        for comment in comments:
            self.posts[post].append(comment)
        # this will call self.fetch_post_interactions() first to see if we got any comments
        # if not, disqus will be crawled
        self.fetch_post_interactions_extra([post])