class SqlPlot:
    """Render the result sets of one or more SQL queries as line plots
    on a single matplotlib figure."""

    def __init__(self):
        config = ConfigLoader.load()
        self.db = DataBaseWrapper(config["db"])

    def show(self, sql_queries, title, x_label, y_label):
        """Plot each query's column 0 (x) against column 1 (y).

        ``sql_queries`` is an iterable of dicts with a ``'query'`` key and
        an optional ``'label'`` key; the legend is shown only if at least
        one query supplied a label.
        """
        show_legend = False
        for sql_query in sql_queries:
            rows = self.db.execute_select(sql_query.get('query'), None)
            # Column 0 -> x axis, column 1 -> y axis.
            x = [row[0] for row in rows]
            y = [row[1] for row in rows]
            label = sql_query.get('label')
            plot.plot(x, y, label=label)
            show_legend |= label is not None
        plot.title(title)
        plot.xlabel(x_label)
        plot.xticks(rotation=45)
        plot.ylabel(y_label)
        plot.grid(True)
        if show_legend:
            plot.legend()
        plot.show()
class ParseUserData:
    """Backfill the typed columns of the ``users`` table from the raw JSON
    blob stored in each row, paging through the table in fixed-size batches."""

    def __init__(self):
        config = ConfigLoader.load()
        self.db = DataBaseWrapper(config["db"])

    def parse(self):
        """Walk the users table page by page; for each row, decode the
        stored JSON and update that user's structured columns.

        A single commit is issued after all pages are processed.
        """
        page_size = 500
        page = 0
        while True:
            # Hoist the offset: it was computed twice per page originally.
            offset = page_size * page
            print(f"Fetch page #{page} ({offset})")
            result = self.db.execute_select(
                """
                select json from users
                order by id
                limit %s offset %s
                """,
                (page_size, offset))
            if not result:
                break
            for row in result:
                user_data = json.loads(row[0])["result"]
                self.db.execute_update(
                    """
                    update users
                    set
                        created = to_timestamp(%s),
                        name = %s,
                        type = %s,
                        karma = %s,
                        is_plus = %s,
                        is_verified = %s,
                        is_available_for_messenger = %s,
                        entries_count = %s,
                        comments_count = %s,
                        favorites_count = %s,
                        subscribers_count = %s
                    where id = %s
                    """,
                    (user_data["created"],
                     user_data["name"],
                     user_data["type"],
                     user_data["karma"],
                     user_data["is_plus"],
                     user_data["is_verified"],
                     # NOTE: source JSON mixes snake_case and camelCase keys.
                     user_data["isAvailableForMessenger"],
                     user_data["counters"]["entries"],
                     user_data["counters"]["comments"],
                     user_data["counters"]["favorites"],
                     user_data["subscribers_count"],
                     user_data["id"]))
            page += 1
        self.db.commit()
class GetSubsites:
    """Fetch subsite JSON from the API for user ids that previously returned
    404, and upsert the result into the ``subsites`` table."""

    @dataclass
    class Stats:
        request_count: int = 0
        post_count: int = 0
        error_count: int = 0
        requests_since_last_429: int = 0

    def __init__(self):
        config = ConfigLoader.load()
        self.api = OchobaApiWrapper(config["api"])
        self.db = DataBaseWrapper(config["db"])
        self.stats = self.Stats()

    def get_subsites(self):
        """For every user id recorded with a 404 error, fetch its subsite
        from the API unless it is already present in ``subsites``."""
        print("Started at " + datetime.now().strftime("%H:%M:%S"))
        errors = self.db.execute_select(
            """
            select user_id from user_errors where status_code = 404
            """,
            None)
        for error in errors:
            user_id = error[0]
            subsite_data = self.db.execute_select_one(
                """
                select id from subsites where id = %s
                """,
                (user_id, ))
            if subsite_data is None:
                self.__get_subsite(user_id)

    def __get_subsite(self, subsite_id):
        """Fetch one subsite and upsert it; on 429 wait 60s and retry.

        Fix vs. original: the retry was a recursive self-call, so a long run
        of 429 responses could grow the call stack without bound — now a loop.
        """
        while True:
            response = self.api.execute("subsite/" + str(subsite_id))
            if response.status_code != 429:  # 429 == Too Many Requests
                break
            print(
                datetime.now().strftime("%H:%M:%S")
                + ": 429 Too Many Requests. Requests processed since last 429 error: "
                + str(self.stats.requests_since_last_429)
                + ". Wait for 60 seconds and repeat")
            self.stats.requests_since_last_429 = 0
            time.sleep(60)
        # Fix vs. original: these counters were only ever reset, never
        # incremented, so the 429 log message above always reported 0.
        self.stats.request_count += 1
        self.stats.requests_since_last_429 += 1
        print(
            str(subsite_id) + ": " + str(response.status_code) + ": "
            + str(response.json()))
        if response.status_code == 200:
            self.db.execute_insert(
                """
                insert into subsites (id, json)
                values (%s, %s)
                on conflict (id) do update set json = excluded.json;
                """,
                (subsite_id, json.dumps(response.json()["result"])))
            self.db.commit()
class ParsePostData:
    """Expand the raw JSON stored per post into ``post_blocks`` and
    ``post_tags`` rows, and the typed columns of the ``posts`` table."""

    def __init__(self):
        config = ConfigLoader.load()
        self.db = DataBaseWrapper(config["db"])
        # Compiled once; pattern text comes from config["api"]["tag_regex"].
        # NOTE(review): the pattern itself is not visible here — tag syntax
        # (marker character etc.) is assumed from usage below; confirm.
        self.tag_regex = re.compile(config["api"]["tag_regex"])

    def __parse_tags(self, post_id, text):
        """Scan *text* for tag matches and insert each qualifying tag into
        ``post_tags`` (URL-decoded, lower-cased); the full source text is
        stored alongside each tag."""
        search_index = 0
        while True:
            match = self.tag_regex.search(text, search_index)
            if match is None:
                break
            parsed_tag = urllib.parse.unquote(match.group(0))
            # Skip tags shorter than 3 chars and tags that are purely numeric
            # after the first character (presumably the tag marker — confirm
            # against the configured regex).
            if len(parsed_tag) >= 3 and not parsed_tag[1:].isdigit():
                self.db.execute_insert(
                    """
                    insert into post_tags (post_id, value, source)
                    values (%s, %s, %s)
                    """,
                    (post_id, parsed_tag.lower(), text)
                )
            # Continue scanning after the end of the current match.
            search_index = match.end(0)

    def parse(self):
        """Page through ``posts``; for each post, insert one ``post_blocks``
        row per content block, extract tags from block text and list items,
        and update the post's typed columns.

        Re-raises on any per-post failure after logging the post id; a
        single commit is issued after all pages.
        """
        offset_base = 0
        page_size = 500
        page = 0
        while True:
            offset = offset_base + page_size * page
            print(f"Fetch page #{page} ({offset})")
            result = self.db.execute_select(
                """
                select id, json from posts
                order by id
                limit %s offset %s
                """,
                (page_size, offset)
            )
            if len(result) == 0:
                break
            for row in result:
                post_id = row[0]
                try:
                    post_data = json.loads(row[1])["result"]
                    if "blocks" in post_data:
                        blocks = post_data["blocks"]
                        for block in blocks:
                            block_type = block["type"]
                            block_data = block["data"]
                            # text_length accumulates the block's "text"
                            # field plus, for list blocks, every item.
                            text_length = 0
                            if "text" in block_data:
                                text_length = len(block_data["text"])
                                self.__parse_tags(post_id, block_data["text"])
                            if block_type == "list":
                                for item in block_data["items"]:
                                    text_length += len(item)
                                    self.__parse_tags(post_id, item)
                            self.db.execute_insert(
                                """
                                insert into post_blocks (post_id, type, data, text_length)
                                values (%s, %s, %s, %s)
                                """,
                                (post_id, block_type, json.dumps(block_data), text_length)
                            )
                    # co-author is optional; columns stay NULL when absent.
                    co_author_id = None
                    co_author_name = None
                    if "co_author" in post_data:
                        co_author_id = post_data["co_author"]["id"]
                        co_author_name = post_data["co_author"]["name"]
                    self.db.execute_update(
                        """
                        update posts
                        set
                            created = to_timestamp(%s),
                            type = %s,
                            subsite_id = %s,
                            subsite_name = %s,
                            subsite_type = %s,
                            author_id = %s,
                            author_name = %s,
                            co_author_id = %s,
                            co_author_name = %s,
                            title = %s,
                            is_enabled_comments = %s,
                            is_enabled_likes = %s,
                            is_repost = %s,
                            is_show_thanks = %s,
                            is_filled_by_editors = %s,
                            is_editorial = %s,
                            hotness = %s,
                            comments_count = %s,
                            favorites_count = %s,
                            hits_count = %s,
                            likes_count = %s,
                            likes_sum = %s
                        where id = %s
                        """,
                        (
                            # NOTE: source JSON mixes snake_case and camelCase
                            # keys; .get() is used where the key is optional.
                            post_data["date"],
                            post_data["type"],
                            post_data["subsite"]["id"],
                            post_data["subsite"]["name"],
                            post_data["subsite"]["type"],
                            post_data["author"]["id"],
                            post_data["author"]["name"],
                            co_author_id,
                            co_author_name,
                            post_data["title"],
                            post_data["isEnabledComments"],
                            post_data["isEnabledLikes"],
                            post_data["isRepost"],
                            post_data.get("is_show_thanks"),
                            post_data.get("is_filled_by_editors"),
                            post_data.get("isEditorial"),
                            post_data.get("hotness"),
                            post_data["commentsCount"],
                            post_data["favoritesCount"],
                            post_data["hitsCount"],
                            post_data["likes"]["count"],
                            post_data["likes"]["summ"],
                            post_id
                        )
                    )
                except Exception:
                    # Log which post failed, then propagate the error.
                    print(f"Exception for post #{post_id}")
                    raise
            page += 1
        self.db.commit()
from src.common.config_loader import ConfigLoader from src.common.data_base_wrapper import DataBaseWrapper config = ConfigLoader.load() db = DataBaseWrapper(config["db"]) offset_base = 0 page_size = 500 page = 0 while True: offset = offset_base + page_size * page result = db.execute_select( """ select id, json from post_history where hits is null order by id limit %s offset %s """, (page_size, offset) ) if len(result) == 0: break for row in result: record_id = row[0] print("parsing " + str(record_id)) post_data = json.loads(row[1])["result"] db.execute_update( """ update post_history
from src.common.config_loader import ConfigLoader from src.common.data_base_wrapper import DataBaseWrapper config = ConfigLoader.load() db = DataBaseWrapper(config["db"]) offset_base = 0 page_size = 500 page = 0 while True: offset = offset_base + page_size * page print(f"Fetch page #{page} ({offset})") result = db.execute_select( """ select id, json from subsites order by id limit %s offset %s """, (page_size, offset)) if len(result) == 0: break for row in result: subsite_id = row[0] subsite_data = json.loads(row[1]) print(subsite_data) db.execute_update( """ update subsites set created = to_timestamp(%s), name = %s,