def parse_publication(payload, pub_id=None, return_dict=False):
    """Build a ``Publication`` from the ``Collection`` references of a payload.

    Args:
        payload: Decoded API response. The publication entry is read from
            ``payload["payload"]["references"]["Collection"][pub_id]``.
        pub_id: Key of the publication to parse. When ``None`` it is taken
            from ``payload["payload"]["collection"]["id"]``.
        return_dict: When true, return ``to_dict(publication)`` instead of
            the object; the flag is also forwarded to ``parse_images``.

    Returns:
        The populated ``Publication``, or its ``to_dict`` form.

    Raises:
        KeyError: If a required key is missing from the payload.
    """
    if pub_id is None:
        pub_id = payload["payload"]["collection"]["id"]
    publication_dict = payload["payload"]["references"]["Collection"][pub_id]

    publication = Publication(pub_id)
    publication.display_name = publication_dict["name"]
    publication.description = publication_dict["description"]
    publication.creator_user_id = publication_dict["creatorId"]

    # Image/logo attributes are only set when parse_images yields something.
    image = parse_images(publication_dict["image"], return_dict)
    if image is not None:
        publication.image = image
    logo = parse_images(publication_dict["logo"], return_dict)
    if logo is not None:
        publication.logo = logo

    metadata = publication_dict["metadata"]
    publication.follower_count = metadata["followerCount"]
    if "postCount" in metadata:
        publication.post_count = metadata["postCount"]

    # A "domain" key that is present but empty/None must not be used: it
    # would produce a bare "http://" URL or a TypeError on concatenation.
    # parse_post_dict applies the same "present and non-empty" test.
    # NOTE(review): parse_post_dict uses "https://" for custom domains while
    # this function uses "http://" - confirm which scheme is intended.
    domain = publication_dict.get("domain")
    if domain:
        publication.url = "http://" + domain
    else:
        publication.url = ROOT_URL + publication_dict["slug"]
    publication.name = publication_dict["slug"]

    if return_dict:
        return to_dict(publication)
    return publication
def populateDataframe(postObjectList):
    """Flatten post objects into a pandas DataFrame, one row per post.

    Each post is converted with ``to_dict``. ``preview_image`` is dropped,
    ``number_post_tags`` and ``number_blockquotes`` are derived from the
    lengths of ``post_tags`` and ``blockquote_list``, and empty
    ``blockquote_list`` / ``post_tags`` / ``tophighlight`` values are
    replaced by NaN so missing data can be detected uniformly.

    Posts that cannot be converted are skipped on a best-effort basis; the
    failure is logged as a warning instead of being silently discarded.

    Args:
        postObjectList: Iterable of post objects accepted by ``to_dict``.

    Returns:
        A DataFrame whose first columns are the expected post columns
        (numeric ones cast to float where the data allows), followed by any
        additional keys found in the converted posts.
    """
    import logging  # function-scope import keeps the block self-contained

    col_names = ['post_date', 'post_creatorId', 'post_id', 'post_content',
                 'post_username', 'read_time', 'detectedLanguage',
                 'blockquote_list', 'existTophighlight', 'image_count',
                 'subtitle', 'title', 'number_blockquotes', 'tophighlight',
                 'word_count', 'post_tags', 'response_count',
                 'number_post_tags', 'url', 'recommend_count']
    obj_col_names = ['blockquote_list', 'post_tags']
    numeric_col_names = ['post_date', 'read_time', 'image_count',
                         'number_blockquotes', 'word_count', 'response_count',
                         'number_post_tags', 'recommend_count']

    # Rows are collected first and the frame is built once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []
    for post in postObjectList:
        try:
            post_dict = to_dict(post)
            post_dict.pop("preview_image", None)
            num_tags = len(post_dict["post_tags"])
            num_blockquotes = len(post_dict["blockquote_list"])
            post_dict["number_post_tags"] = num_tags
            post_dict["number_blockquotes"] = num_blockquotes
            # Use NaN for "nothing here" so missing values can be checked.
            if num_blockquotes == 0:
                post_dict["blockquote_list"] = np.nan
            if num_tags == 0:
                post_dict["post_tags"] = np.nan
            if not post_dict["tophighlight"]:
                post_dict["tophighlight"] = np.nan
        except Exception as exc:
            # Best effort: one malformed post must not abort the batch, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            logging.getLogger(__name__).warning(
                "Skipping post that could not be flattened: %r", exc)
            continue
        rows.append(post_dict)

    # Expected columns first, then any extra keys in first-seen order (the
    # append-based version also grew the frame with unknown keys).
    columns = list(col_names)
    seen = set(columns)
    for row in rows:
        for key in row:
            if key not in seen:
                seen.add(key)
                columns.append(key)

    postdf = pd.DataFrame(rows, columns=columns)
    postdf[obj_col_names] = postdf[obj_col_names].astype(object)
    for col in numeric_col_names:
        try:
            postdf[col] = postdf[col].astype(float)
        except (TypeError, ValueError):
            # Leave unexpected non-numeric data untouched rather than fail.
            pass
    return postdf
def parse_tags(tags_list_dict, return_dict=False):
    """Convert a sequence of raw tag dicts into ``Tag`` objects.

    Args:
        tags_list_dict: Sequence of dicts with ``slug``, ``name``,
            ``postCount`` and ``metadata`` keys, or ``None``.
        return_dict: When true, each tag is stored as ``to_dict(tag)``
            instead of the ``Tag`` instance.

    Returns:
        A list with one entry per input dict, or ``None`` when the input is
        ``None`` or empty.
    """
    # Nothing to parse: the result is None, not an empty list.
    if tags_list_dict is None or len(tags_list_dict) == 0:
        return None

    parsed = []
    for entry in tags_list_dict:
        tag = Tag()
        tag.unique_slug = entry["slug"]
        tag.name = entry["name"]
        tag.post_count = entry["postCount"]
        # follower_count is only set when a metadata dict is present.
        meta = entry["metadata"]
        if meta is not None:
            tag.follower_count = meta["followerCount"]
        parsed.append(to_dict(tag) if return_dict else tag)
    return parsed
def parse_user(payload, return_dict=False):
    """Build a ``User`` from a user-profile payload.

    Args:
        payload: Decoded API response. The profile is read from
            ``payload["payload"]["user"]`` and related data from
            ``payload["payload"]["references"]``.
        return_dict: When true, return ``to_dict(user)`` instead of the
            object; the flag is also forwarded to ``parse_publication``.

    Returns:
        The populated ``User``, or its ``to_dict`` form. ``publications`` is
        only set when the references contain at least one ``Collection``.

    Raises:
        KeyError: If a required key is missing from the payload.
    """
    user_dict = payload["payload"]["user"]
    user_id = user_dict["userId"]
    user = User(user_id)

    username = user_dict["username"]
    display_name = user_dict["name"]
    avatar = user_dict["imageId"]
    bio = user_dict["bio"]
    twitter_name = user_dict["twitterScreenName"]
    facebook_id = user_dict["facebookAccountId"]

    ref_dict = payload["payload"]["references"]

    # Interest/author tag parsing is currently disabled, so the "userMeta"
    # section is no longer looked up (the lookup could only raise KeyError):
    # interest_tags = payload["payload"]["userMeta"]["interestTags"]
    # user.interest_tags = parse_tags(interest_tags, return_dict)
    # author_tags = payload["payload"]["userMeta"]["authorTags"]
    # user.author_tags = parse_tags(author_tags, return_dict)

    # .get(): a missing "Collection" entry is treated like an empty one, the
    # case the None/empty guard below was already written for.
    publication_ids = ref_dict.get("Collection")
    if publication_ids:
        user.publications = [
            parse_publication(payload, pub_id, return_dict)
            for pub_id in publication_ids
        ]

    stats_dict = ref_dict["SocialStats"][user_id]

    user.user_id = user_id
    user.username = username
    user.display_name = display_name
    user.avatar = avatar
    user.bio = bio
    user.twitter = twitter_name
    user.facebook = facebook_id
    user.following_count = stats_dict["usersFollowedCount"]
    user.followedby_count = stats_dict["usersFollowedByCount"]

    if return_dict:
        return to_dict(user)
    return user
def parse_images(image_dict, return_dict=False):
    """Convert a raw image dict into an ``Image``.

    Args:
        image_dict: Dict describing the image, or ``None``. The identifier
            comes from ``imageId`` when that key is present, otherwise from
            ``id``.
        return_dict: When true, return ``to_dict(image)`` instead of the
            ``Image`` instance.

    Returns:
        The image (or its dict form), or ``None`` when ``image_dict`` is
        ``None`` or its identifier is empty.
    """
    if image_dict is None:
        return None

    # "imageId" wins whenever the key exists, even if its value is empty;
    # "id" is only consulted when "imageId" is absent.
    if "imageId" in image_dict:
        identifier = image_dict["imageId"]
    else:
        identifier = image_dict["id"]
    if not identifier:
        return None

    img = Image(identifier)
    img.original_width = image_dict["originalWidth"]
    img.original_height = image_dict["originalHeight"]
    # URL construction stays disabled; the original note says it isn't working:
    # image.url = u"https://cdn-images-1.medium.com/fit/t/{width}/{height}/{id}" \
    #     .format(width=image.original_width,
    #             height=image.original_height,
    #             id=image.image_id)
    return to_dict(img) if return_dict else img
def parse_post_dict(post_dict, post_id=None, payload=None, return_dict=False):
    """Build a ``Post`` from a single raw post dict.

    Args:
        post_dict: Raw post entry (``uniqueSlug``, ``title``, ``createdAt``,
            ``approvedHomeCollectionId``, ``creatorId``, ``virtuals`` ...).
        post_id: Identifier for the post. Defaults to ``post_dict["id"]``.
        payload: Full API response. Its ``["payload"]["references"]`` section
            is used to resolve the publication or author needed for the URL.
            The body used to read this name without it being a parameter;
            when omitted, a module-level ``payload`` is still honoured.
        return_dict: When true, return ``to_dict(post)`` instead of the
            object; the flag is also forwarded to ``parse_images``. This was
            also read without being a parameter before.

    Returns:
        The populated ``Post``, or its ``to_dict`` form.

    Raises:
        ValueError: If no payload is available to resolve references.
        KeyError: If a required key is missing from the inputs.
    """
    if payload is None:
        # Backward compatibility for call sites that relied on a global.
        payload = globals().get("payload")
    if payload is None:
        raise ValueError(
            "parse_post_dict() needs the response payload to resolve "
            "publication/user references")

    if post_id is None:
        post_id = post_dict["id"]
    post = Post(post_id)

    unique_slug = post_dict["uniqueSlug"]
    title = post_dict["title"]
    post_date = post_dict["createdAt"]
    publication_id = post_dict["approvedHomeCollectionId"]

    url = ROOT_URL
    ref_dict = payload["payload"]["references"]
    if publication_id:
        publication_dict = ref_dict["Collection"][publication_id]
        if "domain" in publication_dict and publication_dict["domain"]:
            # Publication served from its own custom domain.
            url = "https://" + publication_dict["domain"]
        else:
            # Simple publication hosted under the root URL.
            url += publication_dict["slug"]
    else:
        # Personal post, no publication: the URL is based on the author.
        creator_id = post_dict["creatorId"]
        username = ref_dict["User"][creator_id]["username"]
        url += "@{username}".format(username=username)
    url += u"/{path}".format(path=unique_slug)

    virtual_dict = post_dict["virtuals"]
    preview_image = virtual_dict["previewImage"]

    # Tag parsing is currently disabled:
    # post_tags = virtual_dict["tags"]
    # post.post_tags = parse_tags(post_tags, return_dict)
    # Nick: Need to add post_content here. create new function from the payload
    # post.unique_slug = unique_slug
    post.title = title
    post.post_date = post_date
    post.url = url
    post.recommend_count = virtual_dict["recommends"]
    post.response_count = virtual_dict["responsesCreatedCount"]
    post.read_time = virtual_dict["readingTime"]
    post.word_count = virtual_dict["wordCount"]
    post.image_count = virtual_dict["imageCount"]

    # preview_image is only set when parse_images yields something.
    image = parse_images(preview_image, return_dict)
    if image is not None:
        post.preview_image = image

    if return_dict:
        return to_dict(post)
    return post