import json
import re
import urllib.parse


class ParseUserData:
    """Unpacks the raw user JSON stored in users.json into typed columns."""

    def __init__(self):
        config = ConfigLoader.load()
        self.db = DataBaseWrapper(config["db"])

    def parse(self):
        page_size = 500
        page = 0
        while True:
            print("Fetch page #{0} ({1})".format(page, page_size * page))
            result = self.db.execute_select(
                """
                select json from users
                order by id
                limit %s offset %s
                """,
                (page_size, page_size * page))
            if len(result) == 0:
                break
            for row in result:
                user_data = json.loads(row[0])["result"]
                self.db.execute_update(
                    """
                    update users
                    set created = to_timestamp(%s),
                        name = %s,
                        type = %s,
                        karma = %s,
                        is_plus = %s,
                        is_verified = %s,
                        is_available_for_messenger = %s,
                        entries_count = %s,
                        comments_count = %s,
                        favorites_count = %s,
                        subscribers_count = %s
                    where id = %s
                    """,
                    (user_data["created"],
                     user_data["name"],
                     user_data["type"],
                     user_data["karma"],
                     user_data["is_plus"],
                     user_data["is_verified"],
                     user_data["isAvailableForMessenger"],
                     user_data["counters"]["entries"],
                     user_data["counters"]["comments"],
                     user_data["counters"]["favorites"],
                     user_data["subscribers_count"],
                     user_data["id"]))
            # Commit after every page so a crash loses at most one page of work.
            page += 1
            self.db.commit()
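# The parsers here lean on two helpers not shown in this listing:
# ConfigLoader and DataBaseWrapper. Below is a minimal sketch of the
# interface the call sites appear to expect, assuming a psycopg2
# connection underneath. Method names and the config layout are taken
# from the call sites; the config path and everything else are
# illustrative assumptions, not the project's actual implementation.
import psycopg2


class ConfigLoader:
    CONFIG_PATH = "config.json"  # assumed location, not from the source

    @staticmethod
    def load():
        with open(ConfigLoader.CONFIG_PATH) as config_file:
            return json.load(config_file)


class DataBaseWrapper:
    def __init__(self, db_config):
        # db_config is assumed to hold psycopg2 connection kwargs,
        # e.g. {"host": ..., "dbname": ..., "user": ..., "password": ...}.
        self.connection = psycopg2.connect(**db_config)

    def execute_select(self, query, params):
        with self.connection.cursor() as cursor:
            cursor.execute(query, params)
            return cursor.fetchall()

    def execute_update(self, query, params):
        with self.connection.cursor() as cursor:
            cursor.execute(query, params)

    def execute_insert(self, query, params):
        # Inserts and updates go through the same code path here.
        self.execute_update(query, params)

    def commit(self):
        self.connection.commit()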
class ParsePostData:
    def __init__(self):
        config = ConfigLoader.load()
        self.db = DataBaseWrapper(config["db"])
        self.tag_regex = re.compile(config["api"]["tag_regex"])

    def __parse_tags(self, post_id, text):
        # Walk the text left to right, storing every tag the regex finds.
        search_index = 0
        while True:
            match = self.tag_regex.search(text, search_index)
            if match is None:
                break
            parsed_tag = urllib.parse.unquote(match.group(0))
            # Skip tags shorter than 3 characters and purely numeric ones.
            if len(parsed_tag) >= 3 and not parsed_tag[1:].isdigit():
                self.db.execute_insert(
                    """
                    insert into post_tags (post_id, value, source)
                    values (%s, %s, %s)
                    """,
                    (post_id, parsed_tag.lower(), text))
            search_index = match.end(0)

    def parse(self):
        offset_base = 0
        page_size = 500
        page = 0
        while True:
            offset = offset_base + page_size * page
            print(f"Fetch page #{page} ({offset})")
            result = self.db.execute_select(
                """
                select id, json from posts
                order by id
                limit %s offset %s
                """,
                (page_size, offset))
            if len(result) == 0:
                break
            for row in result:
                post_id = row[0]
                try:
                    post_data = json.loads(row[1])["result"]
                    if "blocks" in post_data:
                        blocks = post_data["blocks"]
                        for block in blocks:
                            block_type = block["type"]
                            block_data = block["data"]
                            # Accumulate the block's text length and collect
                            # tags from every text fragment it contains.
                            text_length = 0
                            if "text" in block_data:
                                text_length = len(block_data["text"])
                                self.__parse_tags(post_id, block_data["text"])
                            if block_type == "list":
                                for item in block_data["items"]:
                                    text_length += len(item)
                                    self.__parse_tags(post_id, item)
                            self.db.execute_insert(
                                """
                                insert into post_blocks (post_id, type, data, text_length)
                                values (%s, %s, %s, %s)
                                """,
                                (post_id, block_type, json.dumps(block_data), text_length))
                    co_author_id = None
                    co_author_name = None
                    if "co_author" in post_data:
                        co_author_id = post_data["co_author"]["id"]
                        co_author_name = post_data["co_author"]["name"]
                    self.db.execute_update(
                        """
                        update posts
                        set created = to_timestamp(%s),
                            type = %s,
                            subsite_id = %s,
                            subsite_name = %s,
                            subsite_type = %s,
                            author_id = %s,
                            author_name = %s,
                            co_author_id = %s,
                            co_author_name = %s,
                            title = %s,
                            is_enabled_comments = %s,
                            is_enabled_likes = %s,
                            is_repost = %s,
                            is_show_thanks = %s,
                            is_filled_by_editors = %s,
                            is_editorial = %s,
                            hotness = %s,
                            comments_count = %s,
                            favorites_count = %s,
                            hits_count = %s,
                            likes_count = %s,
                            likes_sum = %s
                        where id = %s
                        """,
                        (post_data["date"],
                         post_data["type"],
                         post_data["subsite"]["id"],
                         post_data["subsite"]["name"],
                         post_data["subsite"]["type"],
                         post_data["author"]["id"],
                         post_data["author"]["name"],
                         co_author_id,
                         co_author_name,
                         post_data["title"],
                         post_data["isEnabledComments"],
                         post_data["isEnabledLikes"],
                         post_data["isRepost"],
                         post_data.get("is_show_thanks"),
                         post_data.get("is_filled_by_editors"),
                         post_data.get("isEditorial"),
                         post_data.get("hotness"),
                         post_data["commentsCount"],
                         post_data["favoritesCount"],
                         post_data["hitsCount"],
                         post_data["likes"]["count"],
                         post_data["likes"]["summ"],
                         post_id))
                except Exception:
                    print(f"Exception for post #{post_id}")
                    raise
            page += 1
            self.db.commit()
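# A standalone rerun of the filtering logic inside __parse_tags, using a
# hypothetical hashtag pattern -- the real one is read from
# config["api"]["tag_regex"] and may well differ. It shows why unquote is
# needed (tags arrive percent-encoded in the block text) and which tags
# the length/digit filter drops.
def demo_parse_tags():
    demo_regex = re.compile(r"#[\w%]+")  # assumed pattern, for illustration
    text = "trip report #travel #2 #%D0%BF%D1%80%D0%B8%D1%80%D0%BE%D0%B4%D0%B0"
    search_index = 0
    while True:
        match = demo_regex.search(text, search_index)
        if match is None:
            break
        tag = urllib.parse.unquote(match.group(0))
        # "#2" is dropped: it is under 3 characters and purely numeric after "#".
        if len(tag) >= 3 and not tag[1:].isdigit():
            print(tag.lower())  # prints "#travel", then the decoded "#природа"
        search_index = match.end(0)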
# Refreshes the per-snapshot counters in post_history from the stored raw
# JSON. This snippet starts mid-loop in the original; the setup and the
# select are assumed here, following the pagination pattern of the parsers
# above.
config = ConfigLoader.load()
db = DataBaseWrapper(config["db"])
page_size = 500
page = 0
while True:
    result = db.execute_select(
        """
        select id, json from post_history
        order by id
        limit %s offset %s
        """,
        (page_size, page_size * page))
    if len(result) == 0:
        break
    for row in result:
        record_id = row[0]
        print("parsing " + str(record_id))
        post_data = json.loads(row[1])["result"]
        db.execute_update(
            """
            update post_history
            set hits = %s,
                rating = %s,
                comments = %s,
                favorites = %s
            where id = %s
            """,
            (post_data["hitsCount"],
             post_data["likes"]["summ"],
             post_data["commentsCount"],
             post_data["favoritesCount"],
             record_id))
    page += 1
    db.commit()
# Unpacks the raw subsite JSON into columns; note the payload here has no
# "result" envelope, unlike users and posts. The original snippet starts
# mid-query; the loop header is assumed, following the same pagination
# pattern as above.
config = ConfigLoader.load()
db = DataBaseWrapper(config["db"])
page_size = 500
page = 0
while True:
    offset = page_size * page
    result = db.execute_select(
        """
        select id, json from subsites
        order by id
        limit %s offset %s
        """,
        (page_size, offset))
    if len(result) == 0:
        break
    for row in result:
        subsite_id = row[0]
        subsite_data = json.loads(row[1])
        print(subsite_data)
        db.execute_update(
            """
            update subsites
            set created = to_timestamp(%s),
                name = %s,
                type = %s,
                description = %s,
                is_verified = %s,
                is_enable_writing = %s,
                subscriber_count = %s
            where id = %s
            """,
            (subsite_data["created"],
             subsite_data["name"],
             subsite_data["type"],
             subsite_data["description"],
             subsite_data["is_verified"],
             subsite_data["is_enable_writing"],
             subsite_data["subscribers_count"],
             subsite_id))
    db.commit()
    page += 1
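# All four loops above page with limit/offset, so every page re-scans the
# rows it skips; on large tables later pages get progressively slower. A
# common alternative is keyset pagination: remember the last seen id
# instead of an offset. A sketch of the users loop in that style --
# illustrative only, not the project's code:
def parse_users_keyset(db, page_size=500):
    last_id = 0
    while True:
        result = db.execute_select(
            """
            select id, json from users
            where id > %s
            order by id
            limit %s
            """,
            (last_id, page_size))
        if len(result) == 0:
            break
        for row in result:
            last_id = row[0]
            user_data = json.loads(row[1])["result"]
            # ... same update statement as in ParseUserData.parse ...
        db.commit()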