class GetSubsites:
    """Fetch the "sections" and "companies" subsite lists and upsert them.

    Every entry returned by the API is stored as a JSON string in the
    ``subsites`` table, keyed by the entry's ``id``.
    """

    @dataclass
    class Stats:
        """Run counters; this class creates them but never updates them."""

        request_count: int = 0
        post_count: int = 0
        error_count: int = 0
        requests_since_last_429: int = 0

    def __init__(self):
        """Build the API and database wrappers from the loaded config."""
        settings = ConfigLoader.load()
        self.api = OchobaApiWrapper(settings["api"])
        self.db = DataBaseWrapper(settings["db"])
        self.stats = self.Stats()

    def get_subsites(self):
        """Load both subsite lists, then commit once at the end."""
        started_at = datetime.now().strftime("%H:%M:%S")
        print("Started at " + started_at)
        for subsite_type in ("sections", "companies"):
            self.__get_subsites_list(subsite_type)
        self.db.commit()

    def __get_subsites_list(self, subsite_type):
        """Request ``subsites_list/<subsite_type>`` and upsert every entry.

        The caller is responsible for committing.
        """
        upsert_sql = """
                    insert into subsites (id, json)
                        values (%s, %s)
                    on conflict (id)
                        do update set json = excluded.json;
                """
        payload = self.api.execute("subsites_list/" + subsite_type).json()
        for entry in payload["result"]:
            print(entry)
            row = (entry["id"], json.dumps(entry))
            self.db.execute_insert(upsert_sql, row)
class ParseUserData:
    """Unpack stored user JSON documents into the typed columns of ``users``."""

    def __init__(self):
        """Build the database wrapper from the ``db`` section of the config."""
        settings = ConfigLoader.load()
        self.db = DataBaseWrapper(settings["db"])

    def parse(self):
        """Walk ``users`` in id order, 500 rows per page, updating each row.

        For every row the ``result`` object of the stored JSON is read and its
        fields are written to the row's columns. Each page is committed
        separately; the loop ends at the first empty page.
        """
        select_sql = """
                    select json from users
                        order by id
                        limit %s offset %s
                """
        update_sql = """
                        update users
                            set
                                created = to_timestamp(%s),
                                name = %s,
                                type = %s,
                                karma = %s,
                                is_plus = %s,
                                is_verified = %s,
                                is_available_for_messenger = %s,
                                entries_count = %s,
                                comments_count = %s,
                                favorites_count = %s,
                                subscribers_count = %s
                            where id = %s
                    """
        page_size = 500
        page = 0
        while True:
            offset = page_size * page
            print("Fetch page #{0} ({1})".format(page, offset))
            rows = self.db.execute_select(select_sql, (page_size, offset))
            if len(rows) == 0:
                break

            for row in rows:
                user = json.loads(row[0])["result"]
                # Parameter order mirrors the placeholders in update_sql.
                params = (
                    user["created"],
                    user["name"],
                    user["type"],
                    user["karma"],
                    user["is_plus"],
                    user["is_verified"],
                    user["isAvailableForMessenger"],
                    user["counters"]["entries"],
                    user["counters"]["comments"],
                    user["counters"]["favorites"],
                    user["subscribers_count"],
                    user["id"],
                )
                self.db.execute_update(update_sql, params)

            self.db.commit()
            page += 1
class GetSubsites:
    """Fetch subsites for ids that were recorded as 404 user errors.

    Ids are read from ``user_errors`` (``status_code = 404``). Every id that
    has no row in ``subsites`` yet is requested from ``subsite/<id>`` and, on
    a 200 response, upserted into ``subsites``.
    """

    @dataclass
    class Stats:
        """Counters for the current run."""

        request_count: int = 0
        post_count: int = 0
        error_count: int = 0
        requests_since_last_429: int = 0

    def __init__(self):
        """Build the API and database wrappers from the loaded config."""
        config = ConfigLoader.load()
        self.api = OchobaApiWrapper(config["api"])
        self.db = DataBaseWrapper(config["db"])
        self.stats = self.Stats()

    def get_subsites(self):
        """Request every 404'd user id that is not stored as a subsite yet."""
        print("Started at " + datetime.now().strftime("%H:%M:%S"))

        errors = self.db.execute_select(
            """
                select user_id
                from user_errors
                where status_code = 404
            """, None)

        for error in errors:
            user_id = error[0]
            subsite_data = self.db.execute_select_one(
                """
                    select id
                    from subsites
                    where id = %s
                """, (user_id, ))
            if subsite_data is None:
                self.__get_subsite(user_id)

    def __get_subsite(self, subsite_id):
        """Request one subsite and store it when the API answers 200.

        On 429 the request is repeated after a 60 second pause, for as long
        as the API keeps answering 429. Non-200 responses are printed and
        otherwise ignored.
        """
        # Retry in a loop rather than by recursion so that a long run of 429
        # responses cannot exhaust the interpreter's recursion limit.
        while True:
            response = self.api.execute("subsite/" + str(subsite_id))
            if response.status_code != 429:
                break
            # Too Many Requests
            print(
                datetime.now().strftime("%H:%M:%S") +
                ": 429 Too Many Requests. Requests processed since last 429 error: "
                + str(self.stats.requests_since_last_429) +
                ". Wait for 60 seconds and repeat")
            self.stats.requests_since_last_429 = 0
            time.sleep(60)

        # Decode the body once and reuse it for both the log line and the row.
        response_json = response.json()
        print(
            str(subsite_id) + ": " + str(response.status_code) + ": " +
            str(response_json))
        if response.status_code == 200:
            self.db.execute_insert(
                """
                    insert into subsites (id, json)
                        values (%s, %s)
                    on conflict (id)
                        do update set json = excluded.json;
                """, (subsite_id, json.dumps(response_json["result"])))
            self.db.commit()

        # Count the processed request; without this the number reported in
        # the 429 message above would always be 0.
        self.stats.request_count += 1
        self.stats.requests_since_last_429 += 1
示例#4
0
class ParsePostData:
    """Unpack stored post JSON into ``posts`` columns, blocks and tags.

    For every row of ``posts`` the ``result`` object of the stored JSON is
    read; its blocks are inserted into ``post_blocks``, tags found in block
    texts are inserted into ``post_tags``, and the row's own columns are
    updated.
    """

    def __init__(self):
        """Build the database wrapper and compile the configured tag regex."""
        config = ConfigLoader.load()
        self.db = DataBaseWrapper(config["db"])
        self.tag_regex = re.compile(config["api"]["tag_regex"])

    def __parse_tags(self, post_id, text):
        """Insert a ``post_tags`` row for every accepted tag found in ``text``.

        A match is URL-unquoted and accepted when it is at least 3 characters
        long and the part after its first character is not purely digits. The
        stored value is lower-cased; the whole ``text`` is stored as source.
        """
        # finditer yields the same non-overlapping matches as repeatedly
        # searching from the end of the previous match, but it always advances
        # past a zero-length match instead of looping on it forever.
        for match in self.tag_regex.finditer(text):
            parsed_tag = urllib.parse.unquote(match.group(0))
            if len(parsed_tag) >= 3 and not parsed_tag[1:].isdigit():
                self.db.execute_insert(
                    """
                        insert into post_tags (post_id, value, source)
                            values (%s, %s, %s)
                    """,
                    (post_id, parsed_tag.lower(), text)
                )

    def parse(self):
        """Process all posts in id order, 500 per page, committing per page.

        Any exception raised while handling a post is reported with the post
        id and re-raised, which ends the run.
        """
        offset_base = 0
        page_size = 500
        page = 0
        while True:
            offset = offset_base + page_size * page
            print(f"Fetch page #{page} ({offset})")
            result = self.db.execute_select(
                """
                    select id, json from posts
                        order by id
                        limit %s offset %s
                """,
                (page_size, offset)
            )
            if not result:
                break

            for row in result:
                post_id = row[0]
                try:
                    post_data = json.loads(row[1])["result"]
                    if "blocks" in post_data:
                        for block in post_data["blocks"]:
                            block_type = block["type"]
                            block_data = block["data"]
                            # text_length adds up the block's own text and,
                            # for "list" blocks, the length of every item.
                            text_length = 0
                            if "text" in block_data:
                                text_length = len(block_data["text"])
                                self.__parse_tags(post_id, block_data["text"])
                            if block_type == "list":
                                for item in block_data["items"]:
                                    text_length += len(item)
                                    self.__parse_tags(post_id, item)

                            self.db.execute_insert(
                                """
                                    insert into post_blocks (post_id, type, data, text_length)
                                        values (%s, %s, %s, %s)
                                """,
                                (post_id, block_type, json.dumps(block_data), text_length)
                            )

                    # The co-author is optional; both columns stay NULL
                    # when the post has none.
                    co_author_id = None
                    co_author_name = None
                    if "co_author" in post_data:
                        co_author_id = post_data["co_author"]["id"]
                        co_author_name = post_data["co_author"]["name"]

                    self.db.execute_update(
                        """
                            update posts
                                set
                                    created = to_timestamp(%s),
                                    type = %s,
                                    subsite_id = %s,
                                    subsite_name = %s,
                                    subsite_type = %s,
                                    author_id = %s,
                                    author_name = %s,
                                    co_author_id = %s,
                                    co_author_name = %s,
                                    title = %s,
                                    is_enabled_comments = %s,
                                    is_enabled_likes = %s,
                                    is_repost = %s,
                                    is_show_thanks = %s,
                                    is_filled_by_editors = %s,
                                    is_editorial = %s,
                                    hotness = %s,
                                    comments_count = %s,
                                    favorites_count = %s,
                                    hits_count = %s,
                                    likes_count = %s,
                                    likes_sum = %s
                                where id = %s
                        """,
                        (
                            post_data["date"],
                            post_data["type"],
                            post_data["subsite"]["id"],
                            post_data["subsite"]["name"],
                            post_data["subsite"]["type"],
                            post_data["author"]["id"],
                            post_data["author"]["name"],
                            co_author_id,
                            co_author_name,
                            post_data["title"],
                            post_data["isEnabledComments"],
                            post_data["isEnabledLikes"],
                            post_data["isRepost"],
                            # The next four keys are read with .get(), so a
                            # missing key is stored as NULL.
                            post_data.get("is_show_thanks"),
                            post_data.get("is_filled_by_editors"),
                            post_data.get("isEditorial"),
                            post_data.get("hotness"),
                            post_data["commentsCount"],
                            post_data["favoritesCount"],
                            post_data["hitsCount"],
                            post_data["likes"]["count"],
                            post_data["likes"]["summ"],
                            post_id
                        )
                    )

                except Exception:
                    # Identify the offending post before propagating.
                    print(f"Exception for post #{post_id}")
                    raise

            page += 1
            self.db.commit()
class GetSubsiteTimeline:
    """Page through one subsite's "new" timeline and store its posts.

    Pages of ``count`` entries are requested starting at ``offset`` until the
    API returns an empty (or missing) ``result``. Reposts are skipped; every
    other entry is upserted into ``posts``.
    """

    @dataclass
    class Stats:
        """Counters for the current run."""

        request_count: int = 0
        post_count: int = 0
        error_count: int = 0
        requests_since_last_429: int = 0

    def __init__(self):
        """Build the API and database wrappers and set the paging state."""
        config = ConfigLoader.load()
        self.api = OchobaApiWrapper(config["api"])
        self.db = DataBaseWrapper(config["db"])
        self.stats = self.Stats()
        # NOTE(review): paging starts at offset 1, not 0 — confirm that this
        # is intended, otherwise the first timeline entry may be skipped.
        self.offset = 1
        self.count = 50
        self.subsite_id = 203796

    @staticmethod
    def __time():
        """Return the current local time formatted as HH:MM:SS."""
        return datetime.now().strftime("%H:%M:%S")

    def get_posts(self):
        """Fetch and store timeline pages until an empty page is returned.

        Commits whenever the request counter is a multiple of 10, and once
        more after the last page.
        """
        print(f"Started at {self.__time()}")
        while True:
            timeline = self.__get_timeline(self.subsite_id, 'new', self.count,
                                           self.offset)
            if not timeline:
                break

            print(f'{len(timeline)}/{self.offset}')
            self.__db_insert(self.__parse_timeline(timeline))

            if self.stats.request_count % 10 == 0:
                self.db.commit()
                print(
                    f'{self.__time()}: {self.stats.request_count} requests processed ({self.stats.post_count} posts, {self.stats.error_count} errors)'
                )
            self.offset += self.count

        self.db.commit()

    def __db_insert(self, parsed_timeline: list):
        """Upsert one parsed page into ``posts`` and count it as one request.

        Each entry's values are passed positionally, so the key order of the
        dicts built by ``__parse_timeline`` must match the column list below.
        """
        for post in parsed_timeline:
            self.db.execute_insert(
                """
                    insert into posts (id, commentscount, favoritescount, hitscount, likescount, date_created, subsite_id, is_show_thanks, is_filled_by_editors, iseditorial)
                        values (%s, %s, %s, %s, %s, to_timestamp(%s), %s, %s, %s, %s)
                    on conflict (id)
                        do update set date_created = excluded.date_created;
                """, list(post.values()))
            self.stats.post_count += 1

        self.stats.request_count += 1
        self.stats.requests_since_last_429 += 1

    def __parse_timeline(self, timeline: list) -> list:
        """Reduce raw timeline entries to the fields stored in ``posts``.

        Entries with a truthy ``isRepost`` are dropped. A missing or null
        ``likes`` object yields a like count of 0 instead of raising.
        """
        parsed = []
        for post in timeline:
            if post.get('isRepost'):
                continue
            likes = post.get('likes') or {}
            parsed.append(
                dict(entry_id=post.get('id'),
                     commentsCount=post.get('commentsCount'),
                     favoritesCount=post.get('favoritesCount'),
                     hitsCount=post.get('hitsCount'),
                     likesCount=likes.get('count', 0),
                     date_created=post.get('date'),
                     subsite_id=self.subsite_id,
                     is_show_thanks=post.get('is_show_thanks'),
                     is_filled_by_editors=post.get('is_filled_by_editors'),
                     isEditorial=post.get('isEditorial')))
        return parsed

    def __get_timeline(self,
                       subsite: int,
                       sorting: str = 'new',
                       count: int = 50,
                       offset: int = 0) -> list:
        """Request one timeline page and return the response's ``result``.

        On 429 the request is repeated after a 60 second pause, for as long
        as the API keeps answering 429. Returns ``None`` when the decoded
        response has no ``result`` key.
        """
        path = f"subsite/{subsite}/timeline/{sorting}?count={count}&offset={offset}"
        # Retry in a loop rather than by recursion so that a long run of 429
        # responses cannot exhaust the interpreter's recursion limit.
        while True:
            response = self.api.execute(path)
            if response.status_code != 429:
                break
            print(
                f'{self.__time()}: 429 Too Many Requests. Requests processed since last 429 error: {self.stats.requests_since_last_429}'
            )
            self.stats.requests_since_last_429 = 0
            time.sleep(60)

        response_json = response.json()
        print(f"__get_timeline:{response.status_code}: {self.__time()}")

        return response_json.get('result')
示例#6
0
class GetPosts:
    """Download entries by sequential id and store the raw responses.

    Responses containing an ``error`` key go to ``post_errors``; all others
    are upserted into ``posts`` as JSON.
    """

    @dataclass
    class Stats:
        """Counters for the current run."""

        request_count: int = 0
        post_count: int = 0
        error_count: int = 0
        requests_since_last_429: int = 0

    def __init__(self):
        """Build the API and database wrappers from the loaded config."""
        config = ConfigLoader.load()
        self.api = OchobaApiWrapper(config["api"])
        self.db = DataBaseWrapper(config["db"])
        self.stats = self.Stats()

    def get_posts(self, start_id: int = 1, stop_id: int = 165000):
        """Request every entry id in ``range(start_id, stop_id)``.

        Args:
            start_id: First entry id to request.
            stop_id: Id at which to stop (exclusive).

        Commits and prints progress whenever the request counter is a
        multiple of 100, and always commits once more on the way out, even
        when an exception ends the run early (the exception is re-raised).
        """
        print("Started at " + datetime.now().strftime("%H:%M:%S"))
        try:
            for post_id in range(start_id, stop_id):
                if self.stats.request_count % 100 == 0:
                    self.db.commit()
                    print(
                        "{0}: {1} requests processed ({2} posts, {3} errors)".
                        format(datetime.now().strftime("%H:%M:%S"),
                               self.stats.request_count, self.stats.post_count,
                               self.stats.error_count))

                self.__get_post(post_id)

        except Exception:
            print("Exception!")
            raise

        finally:
            # Keep whatever was stored before the run ended.
            self.db.commit()

    def __get_post(self, post_id):
        """Request one entry and store either the entry or its error.

        On 429 the request is repeated after a 60 second pause, for as long
        as the API keeps answering 429.
        """
        # Retry in a loop rather than by recursion so that a long run of 429
        # responses cannot exhaust the interpreter's recursion limit.
        while True:
            response = self.api.execute("entry/" + str(post_id))
            if response.status_code != 429:
                break
            # Too Many Requests
            print(
                datetime.now().strftime("%H:%M:%S") +
                ": 429 Too Many Requests. Requests processed since last 429 error: "
                + str(self.stats.requests_since_last_429) +
                ". Wait for 60 seconds and repeat")
            self.stats.requests_since_last_429 = 0
            time.sleep(60)

        response_json = response.json()
        print(str(response.status_code) + ": " + str(response_json))

        if "error" in response_json:
            self.db.execute_insert(
                """
                    insert into post_errors (post_id, status_code, response)
                        values (%s, %s, %s);
                """,
                (post_id, response.status_code, json.dumps(response_json)))
            self.stats.error_count += 1

        else:
            self.db.execute_insert(
                """
                    insert into posts (id, json)
                        values (%s, %s)
                    on conflict (id)
                        do update set json = excluded.json;
                """, (post_id, json.dumps(response_json)))
            self.stats.post_count += 1

        self.stats.request_count += 1
        self.stats.requests_since_last_429 += 1
示例#7
0
import json
import time
from datetime import datetime

from src.common.config_loader import ConfigLoader
from src.common.data_base_wrapper import DataBaseWrapper
from src.common.ochoba_api_wrapper import OchobaApiWrapper

# Entry to track and the pause, in minutes, between two requests for it.
post_id = 220958
request_interval_minutes = 30

config = ConfigLoader.load()
api = OchobaApiWrapper(config["api"])
db = DataBaseWrapper(config["db"])

print(f'{datetime.now().strftime("%H:%M:%S")}: Started')

# Poll forever: request the entry, store the response in post_history when
# the status is 200, then wait for the configured interval.
while True:
    response = api.execute(f"entry/{post_id!s}")
    print(f'{datetime.now().strftime("%H:%M:%S")}: Got {response.status_code!s}')
    if response.status_code == 200:
        history_row = (post_id, datetime.now(), json.dumps(response.json()))
        db.execute_insert(
            """
                insert into post_history (post_id, request_time, json)
                    values (%s, %s, %s);
            """, history_row)
        db.commit()

    time.sleep(60 * request_interval_minutes)