def _get_content_from_data(self, data, session: Session, self_html: str,
                           post_hint: str) -> str:
    """Assemble the html content of a post from its json node.

    The content is made, in this order, of the unescaped self html, the
    gallery (when the "is_gallery" flag is "True") and whatever
    _manage_external_content returns. The result then goes through
    _manage_reddit_preview_images and every "<video " tag gets a
    "controls" attribute.

    Args:
        data: json node of the post
        session (Session): session forwarded to _manage_external_content
        self_html (str): escaped html of the post itself ("" if none)
        post_hint (str): post hint forwarded to _manage_external_content

    Returns:
        str: html content
    """
    target_url: str = get_node_value_if_exists(data,
                                               "url_overridden_by_dest")
    if target_url.startswith("/"):
        # link relative to the site root: make it absolute
        target_url = "https://www.reddit.com" + target_url

    preview_image: Optional[str] = cast(
        Optional[str],
        get_node(data, "preview", "images", 0, "source", "url"))
    gallery_flag: str = str(get_node_value_if_exists(data, "is_gallery"))
    domain: Optional[str] = cast(str, get_node(data, "domain"))

    parts = []
    if self_html != "":
        parts.append(html.unescape(self_html))
    if gallery_flag == "True":
        parts.append(self._manage_gallery(data))

    external: Optional[str] = self._manage_external_content(
        session, target_url, post_hint, preview_image, domain, data)
    if external is not None:
        parts.append(external)

    content: str = self._manage_reddit_preview_images("".join(parts))
    return content.replace("<video ", "<video controls ")
def _format_video(self, node: dict) -> str:
    """Build the html of a video node.

    A node without "id" is only a link to another node: the linked node
    (link/__ref) is built instead. Otherwise the id is base64 decoded, the
    part following the "Video:" prefix is used to download the player feed,
    and a <video> tag (feed with "VideoUrl") or an <iframe> (feed with
    "EmbedUrl") is produced, followed by the node title as a caption.

    Args:
        node (dict): video node

    Returns:
        str: html content, "" if nothing could be built
    """
    content: str = ""
    if "id" not in node:
        link: Optional[str] = cast(
            Optional[str], json_utils.get_node(node, "link", "__ref"))
        if link is not None:
            content = self._build(link)
        return content

    video_id: str = b64decode(cast(
        str, node["id"]).encode("ascii")).decode("ascii")
    # A timeout is mandatory: without it requests may wait forever on an
    # unresponsive server.
    page = requests.get(
        url="https://www.eurosport.fr/cors/feed_player_video_vid%s.json" %
        video_id[len("Video:"):],
        timeout=30)
    j = json.loads(page.text)

    poster = ""
    if "PictureUrl" in j:
        poster = j["PictureUrl"]

    if "VideoUrl" in j:
        content = """<video width="100%%" controls="" preload="auto" poster="%s"> <source src="%s" /> </video>""" % (poster, j["VideoUrl"])
    elif "EmbedUrl" in j:
        # <iframe> is not a void element: a self-closed "<iframe .../>"
        # makes html parsers swallow the following markup, so it is closed
        # explicitly.
        content = """<iframe src="%s"></iframe>""" % (j["EmbedUrl"])

    # NOTE(review): get_node is assumed to return None for a missing key
    # (other callers test its result against None) -- do not print "None".
    title = json_utils.get_node(node, "title")
    if title is not None:
        content += "<p><i><small>%s</small></i></p>" % title

    return content
def _format(self, node: dict) -> str:
    """Return the html template matching the "__typename" of a node.

    When the returned template contains CONTENT_MARKER, _build replaces the
    marker with the content built from the children of the node.

    Args:
        node (dict): node to format

    Returns:
        str: html template; an "Unknown type name" notice (followed by the
            marker) if the type name is not handled
    """
    type_name = cast(str, json_utils.get_node(node, "__typename"))

    # containers: only a wrapper around the children content
    if type_name in ("Paragraph", "ListItem"):
        return "<p>%s</p>" % CONTENT_MARKER
    if type_name == "H2":
        return "<h2>%s</h2>" % CONTENT_MARKER
    if type_name == "Blockquote":
        return "<blockquote>%s</blockquote>" % CONTENT_MARKER
    if type_name in ("Body", "CyclingStage", "Program"):
        return CONTENT_MARKER

    # leaves: content taken from the node itself
    if type_name == "Text":
        return node["content"]
    if type_name == "Link":
        return node["url"]
    if type_name == "BreakLine":
        return "<br/>"
    if type_name == "HyperLink":
        return "<a href=\"%s\">%s</a>" % (node["url"], CONTENT_MARKER)
    if type_name == "Picture":
        return "<img src=\"%s\" alt=\"%s\"></img>%s" % (
            node["url"], node["caption"], CONTENT_MARKER)

    # references to another node
    if type_name == "HyperLinkInternal":
        target = self._build(
            cast(str, json_utils.get_node(node, "content", "__ref")))
        return "<a href=\"%s\">%s</a>" % (target, node["label"])
    if type_name == "InternalContent":
        return self._build(
            cast(str, json_utils.get_node(node, "content", "__ref")))
    if type_name in ("TeamSportsMatch", "Article"):
        return self._build(
            cast(str, json_utils.get_node(node, "link", "__ref")))

    # types handled by a dedicated formatter
    if type_name == "Video":
        return self._format_video(node)
    if type_name == "List":
        return self._format_list(node)
    if type_name == "Table":
        return self._format_table(node)
    if type_name == "TableLine":
        return self._format_table_line(node)
    if type_name == "TableColumn":
        return self._format_table_column(node)
    if type_name == "Embed":
        return self._format_embed(node)

    # type name not handled
    return "<p><i><small>Unknown type name: '%s'%s</small></i></p>" % (
        type_name, CONTENT_MARKER)
def get_comments(self, comments: dict, deep: int = 0) -> str:
    """Render a comment and its replies as a nested <ul> list.

    Only the body of each comment is kept (no nickname, no points, no
    date). Replies are followed recursively, one level deeper each time,
    and nothing is produced once the depth limit is reached.

    Args:
        comments (dict): json containing comments
        deep (int): current depth in the thread, 0 for the first level

    Returns:
        str: html content for comments
    """
    if deep >= 3:  # too deep
        return ""

    items = []
    if "body_html" in comments:
        items.append("<li>%s</li>" % html.unescape(comments["body_html"]))

    replies = get_node(comments, "replies", "data", "children")
    if isinstance(replies, list):
        for reply in replies:
            # only "t1" children carrying data are rendered
            is_comment = "kind" in reply and reply["kind"] == "t1"
            if is_comment and "data" in reply:
                items.append(self.get_comments(reply["data"], deep + 1))

    return "<ul>%s</ul>" % "".join(items)
def _format_table_column(self, node: dict) -> str:
    """Build the <td> cells of a table column node.

    Every reference listed in contents/__refs is built and wrapped in its
    own <td>. The cells are accumulated: the previous implementation
    overwrote the result on each iteration and only returned the last cell.

    Args:
        node (dict): table column node

    Returns:
        str: html of all the cells, "" if the node has no list of contents
    """
    refs = json_utils.get_node(node, "contents", "__refs")
    if not isinstance(refs, list):
        # no contents: same tolerance as _format_list and _build
        return ""
    return "".join("\n\t<td>%s</td>" % self._build(ref) for ref in refs)
def _format_list(self, node: dict) -> str:
    """Build the content of a list node.

    Args:
        node (dict): list node, its items are referenced in listItems/__refs

    Returns:
        str: concatenation of the built items, "" if there are no items
    """
    item_refs: Optional[List[str]] = cast(
        Optional[List[str]],
        json_utils.get_node(node, "listItems", "__refs"))
    if item_refs is None:
        return ""
    return "".join(self._build(item_ref) for item_ref in item_refs)
def __init__(self, data: dict, ql_ref: str) -> None:
    """Locate the article node in the json of a page.

    Args:
        data (dict): json of the page; the records are read from
            props/pageProps/serverQueryRecords
        ql_ref (str): key of the article in the records

    Raises:
        KeyError: if the records exist but do not contain ql_ref
    """
    self.data: dict = data
    self.root: Optional[dict] = json_utils.get_node(
        self.data, "props", "pageProps", "serverQueryRecords")

    # Always define the attribute: build_article tests it against None and
    # would otherwise fail with an AttributeError when the records are
    # missing.
    self.ql_article: Optional[dict] = None
    if self.root is not None:
        self.ql_article = self.root[ql_ref]

    # set by build_article
    self.graph_ql_body: Optional[str] = None
def _get_datatypes_json(self, data: dict, ttype: str) -> List[dict]:
    """Collect the "data" of every child node of a given kind.

    Args:
        data (dict): iterable of json nodes, children are read from
            data/children of each of them
        ttype (str): kind of the children to keep

    Returns:
        List[dict]: "data" entries of the matching children, in order
    """
    found: List[dict] = []
    if len(data) == 0:
        return found

    for entry in data:
        children = get_node(entry, "data", "children")
        if children is None:
            continue
        found.extend(
            child["data"] for child in children
            if "kind" in child and child["kind"] == ttype and "data" in child)
    return found
def _build(self, node_name: str) -> str:
    """Build the html of a node and, recursively, of its children.

    The node is first turned into a template by _format, then the content
    of the children (contents/__refs) is built and substituted for
    CONTENT_MARKER in that template.

    Args:
        node_name (str): key of the node in the root records

    Returns:
        str: html content of the node
    """
    root = cast(dict, self.root)
    template: str = self._format(
        cast(dict, json_utils.get_node(root, node_name)))

    child_refs = cast(
        Optional[List[str]],
        json_utils.get_node(root, node_name, "contents", "__refs"))

    children = []
    if isinstance(child_refs, list):
        for child_ref in child_refs:
            child_name = cast(str,
                              json_utils.get_node(root, child_ref, "__id"))
            if child_name is not None:
                children.append(self._build(child_name))

    return template.replace(CONTENT_MARKER, "".join(children))
def build_article(self) -> str:
    """Build the html of the article: title, picture then body.

    The reference of the body (graphQLBody/__ref) is stored in
    self.graph_ql_body as a side effect.

    Returns:
        str: html content, "" if there is no article
    """
    article = self.ql_article
    if article is None:
        return ""

    parts = []
    if "title" in article:
        parts.append("<h1>%s</h1>" % article["title"])

    # the article only holds a reference to its picture
    picture_ref = json_utils.get_node(cast(dict, article), "picture", "__ref")
    if picture_ref is not None:
        picture = json_utils.get_node(cast(dict, self.root),
                                      cast(str, picture_ref))
        if picture is not None:
            parts.append("<img src=\"%s\"/>" %
                         json_utils.get_node(picture, "url"))

    self.graph_ql_body = cast(
        str, json_utils.get_node(article, "graphQLBody", "__ref"))
    if self.graph_ql_body is not None:
        parts.append(self._build(self.graph_ql_body))

    return "".join(parts)
def _get_content_by_post_hint(self, session: Session, url: str,
                              post_hint: str, preview_image: Optional[str],
                              domain: str, data: dict) -> Optional[str]:
    """Build the html of the external content according to the post hint.

    Args:
        session (Session): session used to get the readable content
        url (str): url of the external content
        post_hint (str): hint about the kind of content
        preview_image (Optional[str]): url of the preview image
        domain (str): domain displayed as the source of a rich video
        data (dict): json node of the post

    Returns:
        Optional[str]: html content, None if the post hint is not handled
    """
    if post_hint == "rich:video":
        # preview picture followed by a link to the source
        return "<p><img src=\"%s\"/></p><p><a href=\"%s\">Source : %s</a></p>" % (
            preview_image, url, domain)

    if post_hint == "hosted:video":
        hls_url = get_node(data, "media", "reddit_video", "hls_url")
        return """<p><video poster="%s" muted="muted" > <source src="%s" type="application/vnd.apple.mpegURL"> </video></p>""" % (preview_image, hls_url)

    if post_hint == "image" or is_a_picture_url(url):
        return "<p><img src=\"%s\"/></p>" % url

    if post_hint in ("", "link"):
        return super().get_readable_content(
            session, url, headers=HEADERS, add_source_link=True)

    return None