import json
from dataclasses import dataclass
from datetime import datetime

from src.common.config_loader import ConfigLoader
from src.common.data_base_wrapper import DataBaseWrapper
from src.common.ochoba_api_wrapper import OchobaApiWrapper


class GetSubsites:
    @dataclass
    class Stats:
        request_count: int = 0
        post_count: int = 0
        error_count: int = 0
        requests_since_last_429: int = 0

    def __init__(self):
        config = ConfigLoader.load()
        self.api = OchobaApiWrapper(config["api"])
        self.db = DataBaseWrapper(config["db"])
        self.stats = self.Stats()

    def get_subsites(self):
        print("Started at " + datetime.now().strftime("%H:%M:%S"))
        self.__get_subsites_list("sections")
        self.__get_subsites_list("companies")
        self.db.commit()

    def __get_subsites_list(self, subsite_type):
        # Fetch the full list of subsites of the given type and upsert
        # each one into the `subsites` table as raw JSON.
        response = self.api.execute("subsites_list/" + subsite_type)
        subsites_list = response.json()["result"]
        for subsite_data in subsites_list:
            print(subsite_data)
            self.db.execute_insert(
                """
                insert into subsites (id, json)
                values (%s, %s)
                on conflict (id) do update set json = excluded.json;
                """,
                (subsite_data["id"], json.dumps(subsite_data)))
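
# Usage sketch: a hypothetical entry point, not part of the original file.
# It assumes ConfigLoader.load() finds a config with "api" and "db" sections.
if __name__ == "__main__":
    GetSubsites().get_subsites()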
import json

from src.common.config_loader import ConfigLoader
from src.common.data_base_wrapper import DataBaseWrapper


class ParseUserData:
    def __init__(self):
        config = ConfigLoader.load()
        self.db = DataBaseWrapper(config["db"])

    def parse(self):
        # Page through the raw user JSON stored in the `users` table and
        # copy selected fields into typed columns on the same rows.
        page_size = 500
        page = 0
        while True:
            print("Fetch page #{0} ({1})".format(page, page_size * page))
            result = self.db.execute_select(
                """
                select json
                from users
                order by id
                limit %s offset %s
                """,
                (page_size, page_size * page))
            if len(result) == 0:
                break
            for row in result:
                user_data = json.loads(row[0])["result"]
                self.db.execute_update(
                    """
                    update users
                    set created = to_timestamp(%s),
                        name = %s,
                        type = %s,
                        karma = %s,
                        is_plus = %s,
                        is_verified = %s,
                        is_available_for_messenger = %s,
                        entries_count = %s,
                        comments_count = %s,
                        favorites_count = %s,
                        subscribers_count = %s
                    where id = %s
                    """,
                    (user_data["created"],
                     user_data["name"],
                     user_data["type"],
                     user_data["karma"],
                     user_data["is_plus"],
                     user_data["is_verified"],
                     user_data["isAvailableForMessenger"],
                     user_data["counters"]["entries"],
                     user_data["counters"]["comments"],
                     user_data["counters"]["favorites"],
                     user_data["subscribers_count"],
                     user_data["id"]))
            page += 1
        self.db.commit()
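
# Usage sketch (hypothetical entry point, not in the original file; assumes
# the `users` table was already populated with raw JSON by the crawlers):
if __name__ == "__main__":
    ParseUserData().parse()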
import json
import time
from dataclasses import dataclass
from datetime import datetime

from src.common.config_loader import ConfigLoader
from src.common.data_base_wrapper import DataBaseWrapper
from src.common.ochoba_api_wrapper import OchobaApiWrapper


class GetSubsites:
    @dataclass
    class Stats:
        request_count: int = 0
        post_count: int = 0
        error_count: int = 0
        requests_since_last_429: int = 0

    def __init__(self):
        config = ConfigLoader.load()
        self.api = OchobaApiWrapper(config["api"])
        self.db = DataBaseWrapper(config["db"])
        self.stats = self.Stats()

    def get_subsites(self):
        # Re-fetch subsites for every user that previously failed with a 404,
        # skipping ids that are already present in the `subsites` table.
        print("Started at " + datetime.now().strftime("%H:%M:%S"))
        errors = self.db.execute_select(
            """
            select user_id
            from user_errors
            where status_code = 404
            """,
            None)
        for error in errors:
            user_id = error[0]
            subsite_data = self.db.execute_select_one(
                """
                select id
                from subsites
                where id = %s
                """,
                (user_id,))
            if subsite_data is None:
                self.__get_subsite(user_id)

    def __get_subsite(self, subsite_id):
        response = self.api.execute("subsite/" + str(subsite_id))
        if response.status_code == 429:  # Too Many Requests
            print(
                datetime.now().strftime("%H:%M:%S")
                + ": 429 Too Many Requests. Requests processed since last 429 error: "
                + str(self.stats.requests_since_last_429)
                + ". Wait for 60 seconds and repeat")
            self.stats.requests_since_last_429 = 0
            time.sleep(60)
            self.__get_subsite(subsite_id)
            return
        print(
            str(subsite_id) + ": " + str(response.status_code) + ": "
            + str(response.json()))
        if response.status_code == 200:
            self.db.execute_insert(
                """
                insert into subsites (id, json)
                values (%s, %s)
                on conflict (id) do update set json = excluded.json;
                """,
                (subsite_id, json.dumps(response.json()["result"])))
            self.db.commit()
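
# Usage sketch (hypothetical entry point, not in the original file; assumes
# `user_errors` was filled with 404s by an earlier crawl):
if __name__ == "__main__":
    GetSubsites().get_subsites()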
import json
import re
import urllib.parse

from src.common.config_loader import ConfigLoader
from src.common.data_base_wrapper import DataBaseWrapper


class ParsePostData:
    def __init__(self):
        config = ConfigLoader.load()
        self.db = DataBaseWrapper(config["db"])
        self.tag_regex = re.compile(config["api"]["tag_regex"])

    def __parse_tags(self, post_id, text):
        # Scan the text for hashtags, URL-decode each match, and keep it
        # only if it is at least 3 characters and not purely numeric.
        search_index = 0
        while True:
            match = self.tag_regex.search(text, search_index)
            if match is None:
                break
            parsed_tag = urllib.parse.unquote(match.group(0))
            if len(parsed_tag) >= 3 and not parsed_tag[1:].isdigit():
                self.db.execute_insert(
                    """
                    insert into post_tags (post_id, value, source)
                    values (%s, %s, %s)
                    """,
                    (post_id, parsed_tag.lower(), text)
                )
            search_index = match.end(0)

    def parse(self):
        # Page through the raw post JSON in `posts`, exploding content blocks
        # into `post_blocks` and copying selected fields into typed columns.
        offset_base = 0
        page_size = 500
        page = 0
        while True:
            offset = offset_base + page_size * page
            print(f"Fetch page #{page} ({offset})")
            result = self.db.execute_select(
                """
                select id, json
                from posts
                order by id
                limit %s offset %s
                """,
                (page_size, offset)
            )
            if len(result) == 0:
                break
            for row in result:
                post_id = row[0]
                try:
                    post_data = json.loads(row[1])["result"]
                    if "blocks" in post_data:
                        for block in post_data["blocks"]:
                            block_type = block["type"]
                            block_data = block["data"]
                            text_length = 0
                            if "text" in block_data:
                                text_length = len(block_data["text"])
                                self.__parse_tags(post_id, block_data["text"])
                            if block_type == "list":
                                for item in block_data["items"]:
                                    text_length += len(item)
                                    self.__parse_tags(post_id, item)
                            self.db.execute_insert(
                                """
                                insert into post_blocks (post_id, type, data, text_length)
                                values (%s, %s, %s, %s)
                                """,
                                (post_id, block_type, json.dumps(block_data), text_length)
                            )
                    co_author_id = None
                    co_author_name = None
                    if "co_author" in post_data:
                        co_author_id = post_data["co_author"]["id"]
                        co_author_name = post_data["co_author"]["name"]
                    self.db.execute_update(
                        """
                        update posts
                        set created = to_timestamp(%s),
                            type = %s,
                            subsite_id = %s,
                            subsite_name = %s,
                            subsite_type = %s,
                            author_id = %s,
                            author_name = %s,
                            co_author_id = %s,
                            co_author_name = %s,
                            title = %s,
                            is_enabled_comments = %s,
                            is_enabled_likes = %s,
                            is_repost = %s,
                            is_show_thanks = %s,
                            is_filled_by_editors = %s,
                            is_editorial = %s,
                            hotness = %s,
                            comments_count = %s,
                            favorites_count = %s,
                            hits_count = %s,
                            likes_count = %s,
                            likes_sum = %s
                        where id = %s
                        """,
                        (
                            post_data["date"],
                            post_data["type"],
                            post_data["subsite"]["id"],
                            post_data["subsite"]["name"],
                            post_data["subsite"]["type"],
                            post_data["author"]["id"],
                            post_data["author"]["name"],
                            co_author_id,
                            co_author_name,
                            post_data["title"],
                            post_data["isEnabledComments"],
                            post_data["isEnabledLikes"],
                            post_data["isRepost"],
                            post_data.get("is_show_thanks"),
                            post_data.get("is_filled_by_editors"),
                            post_data.get("isEditorial"),
                            post_data.get("hotness"),
                            post_data["commentsCount"],
                            post_data["favoritesCount"],
                            post_data["hitsCount"],
                            post_data["likes"]["count"],
                            post_data["likes"]["summ"],
                            post_id
                        )
                    )
                except Exception:
                    print(f"Exception for post #{post_id}")
                    raise
            page += 1
        self.db.commit()
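
# Usage sketch (hypothetical entry point, not in the original file; assumes
# `posts` already holds raw JSON and the config defines "tag_regex"):
if __name__ == "__main__":
    ParsePostData().parse()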
import time
from dataclasses import dataclass
from datetime import datetime

from src.common.config_loader import ConfigLoader
from src.common.data_base_wrapper import DataBaseWrapper
from src.common.ochoba_api_wrapper import OchobaApiWrapper


class GetSubsiteTimeline:
    @dataclass
    class Stats:
        request_count: int = 0
        post_count: int = 0
        error_count: int = 0
        requests_since_last_429: int = 0

    def __init__(self):
        config = ConfigLoader.load()
        self.api = OchobaApiWrapper(config["api"])
        self.db = DataBaseWrapper(config["db"])
        self.stats = self.Stats()
        self.offset = 1
        self.count = 50
        self.subsite_id = 203796

    @staticmethod
    def __time():
        return datetime.now().strftime("%H:%M:%S")

    def get_posts(self):
        # Walk the subsite timeline page by page until the API returns an
        # empty result, upserting each non-repost entry into `posts`.
        print(f"Started at {self.__time()}")
        timeline = self.__get_timeline(self.subsite_id, 'new', self.count, self.offset)
        while timeline:
            print(f'{len(timeline)}/{self.offset}')
            parsed_timeline = self.__parse_timeline(timeline)
            self.__db_insert(parsed_timeline)
            if self.stats.request_count % 10 == 0:
                self.db.commit()
                print(
                    f'{self.__time()}: {self.stats.request_count} requests processed '
                    f'({self.stats.post_count} posts, {self.stats.error_count} errors)'
                )
            self.offset += self.count
            timeline = self.__get_timeline(self.subsite_id, 'new', self.count, self.offset)
        self.db.commit()

    def __db_insert(self, parsed_timeline: list):
        for post in parsed_timeline:
            # Relies on dicts preserving insertion order (Python 3.7+), so
            # the values built in __parse_timeline line up with the columns.
            self.db.execute_insert(
                """
                insert into posts (id, commentscount, favoritescount, hitscount,
                                   likescount, date_created, subsite_id, is_show_thanks,
                                   is_filled_by_editors, iseditorial)
                values (%s, %s, %s, %s, %s, to_timestamp(%s), %s, %s, %s, %s)
                on conflict (id) do update set date_created = excluded.date_created;
                """,
                ([*post.values()]))
            self.stats.post_count += 1
        self.stats.request_count += 1
        self.stats.requests_since_last_429 += 1

    def __parse_timeline(self, timeline: list) -> list:
        # Skip reposts; guard against a missing or null `likes` object
        # before reading the count.
        parsed = [
            dict(entry_id=post.get('id'),
                 commentsCount=post.get('commentsCount'),
                 favoritesCount=post.get('favoritesCount'),
                 hitsCount=post.get('hitsCount'),
                 likesCount=(post.get('likes') or {}).get('count', 0),
                 date_created=post.get('date'),
                 subsite_id=self.subsite_id,
                 is_show_thanks=post.get('is_show_thanks'),
                 is_filled_by_editors=post.get('is_filled_by_editors'),
                 isEditorial=post.get('isEditorial'))
            for post in timeline if not post.get('isRepost')
        ]
        return parsed

    def __get_timeline(self, subsite: int, sorting: str = 'new',
                       count: int = 50, offset: int = 0) -> list:
        response = self.api.execute(
            f"subsite/{subsite}/timeline/{sorting}?count={count}&offset={offset}"
        )
        if response.status_code == 429:  # Too Many Requests
            print(
                f'{self.__time()}: 429 Too Many Requests. Requests processed '
                f'since last 429 error: {self.stats.requests_since_last_429}'
            )
            self.stats.requests_since_last_429 = 0
            time.sleep(60)
            return self.__get_timeline(subsite, sorting, count, offset)
        response_json = response.json()
        print(f"__get_timeline: {response.status_code}: {self.__time()}")
        return response_json.get('result')
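
# Usage sketch (hypothetical entry point, not in the original file; the
# target subsite id is hard-coded to 203796 in __init__):
if __name__ == "__main__":
    GetSubsiteTimeline().get_posts()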
import json
import time
from dataclasses import dataclass
from datetime import datetime

from src.common.config_loader import ConfigLoader
from src.common.data_base_wrapper import DataBaseWrapper
from src.common.ochoba_api_wrapper import OchobaApiWrapper


class GetPosts:
    @dataclass
    class Stats:
        request_count: int = 0
        post_count: int = 0
        error_count: int = 0
        requests_since_last_429: int = 0

    def __init__(self):
        config = ConfigLoader.load()
        self.api = OchobaApiWrapper(config["api"])
        self.db = DataBaseWrapper(config["db"])
        self.stats = self.Stats()

    def get_posts(self):
        # Crawl posts by sequential id, committing every 100 requests so a
        # crash loses at most one batch.
        print("Started at " + datetime.now().strftime("%H:%M:%S"))
        try:
            for post_id in range(1, 165000):
                if self.stats.request_count % 100 == 0:
                    self.db.commit()
                    print(
                        "{0}: {1} requests processed ({2} posts, {3} errors)"
                        .format(datetime.now().strftime("%H:%M:%S"),
                                self.stats.request_count,
                                self.stats.post_count,
                                self.stats.error_count))
                self.__get_post(post_id)
        except Exception:
            print("Exception!")
            raise
        finally:
            self.db.commit()

    def __get_post(self, post_id):
        response = self.api.execute("entry/" + str(post_id))
        if response.status_code == 429:  # Too Many Requests
            print(
                datetime.now().strftime("%H:%M:%S")
                + ": 429 Too Many Requests. Requests processed since last 429 error: "
                + str(self.stats.requests_since_last_429)
                + ". Wait for 60 seconds and repeat")
            self.stats.requests_since_last_429 = 0
            time.sleep(60)
            self.__get_post(post_id)
            return
        response_json = response.json()
        print(str(response.status_code) + ": " + str(response_json))
        if "error" in response_json:
            self.db.execute_insert(
                """
                insert into post_errors (post_id, status_code, response)
                values (%s, %s, %s);
                """,
                (post_id, response.status_code, json.dumps(response_json)))
            self.stats.error_count += 1
        else:
            self.db.execute_insert(
                """
                insert into posts (id, json)
                values (%s, %s)
                on conflict (id) do update set json = excluded.json;
                """,
                (post_id, json.dumps(response_json)))
            self.stats.post_count += 1
        self.stats.request_count += 1
        self.stats.requests_since_last_429 += 1
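
# A possible refactor, offered as a sketch rather than the repo's own code:
# __get_post (and the other crawlers) retry 429 responses by recursing, which
# deepens the call stack under sustained rate limiting. An iterative helper
# keeps the same behaviour with a flat stack. The name `execute_with_retry`
# is my invention, not part of OchobaApiWrapper.
def execute_with_retry(api, path, stats, wait_seconds=60):
    # Repeat the request until the API stops answering 429 Too Many Requests.
    while True:
        response = api.execute(path)
        if response.status_code != 429:
            return response
        print(datetime.now().strftime("%H:%M:%S")
              + ": 429 Too Many Requests. Requests processed since last 429 error: "
              + str(stats.requests_since_last_429)
              + ". Wait for " + str(wait_seconds) + " seconds and repeat")
        stats.requests_since_last_429 = 0
        time.sleep(wait_seconds)


# Usage sketch (hypothetical entry point, not part of the original file):
if __name__ == "__main__":
    GetPosts().get_posts()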
import json
import time
from datetime import datetime

from src.common.config_loader import ConfigLoader
from src.common.data_base_wrapper import DataBaseWrapper
from src.common.ochoba_api_wrapper import OchobaApiWrapper

post_id = 220958
request_interval_minutes = 30

config = ConfigLoader.load()
api = OchobaApiWrapper(config["api"])
db = DataBaseWrapper(config["db"])

print(datetime.now().strftime("%H:%M:%S") + ": Started")

while True:
    response = api.execute("entry/" + str(post_id))
    print(datetime.now().strftime("%H:%M:%S") + ": Got " + str(response.status_code))
    if response.status_code == 200:
        db.execute_insert(
            """
            insert into post_history (post_id, request_time, json)
            values (%s, %s, %s);
            """,
            (post_id, datetime.now(), json.dumps(response.json())))
        db.commit()
    time.sleep(60 * request_interval_minutes)