def get_article_text(self, link):
    """Download an article page and extract its main fields.

    The page is fetched with the module-level ``get_article`` helper and
    parsed with BeautifulSoup.

    Returns a dict with the keys ``raw_html``, ``title``, ``author``
    (``None`` when no element with class ``author`` exists), ``text`` and,
    only when an ``article-flash`` element is present, ``subtitles``
    (a one-item list).

    Returns ``None`` when the page has no ``<body>``, no ``article-title``
    element or no ``art-content`` element.
    """
    logger.debug("Grabbing article %s", link)
    article_html = get_article(link)
    result = {"raw_html": article_html}

    article = bs4.BeautifulSoup(article_html)
    body = article.body
    if body is None:
        return None

    title = body.find(class_="article-title")
    if title is None:
        # ``title.text`` would raise AttributeError here; treat the page as
        # unparsable, the same way a missing body or content is treated.
        return None
    result["title"] = title.text.strip()

    author = body.find(class_="author")
    result["author"] = author.text.strip() if author is not None else None

    subtitle = body.find(class_="article-flash")
    if subtitle is not None:
        # ``Tag.string`` is None when the element has more than one child,
        # which made ``.string.strip()`` fail; ``.text`` works for any
        # number of children and is identical for a single text child.
        result["subtitles"] = [subtitle.text.strip()]

    content = body.find(class_="art-content")
    if content is None:
        return None
    result["text"] = u" ".join(content.stripped_strings)
    return result
def get_article(self, link):
    """Download an article page and extract author, subtitle and text.

    Returns a dict with the keys ``raw_html``, ``author`` (``None`` when no
    ``div.clanekAVTOR`` exists), ``text`` and, only when the orange
    ``<font>`` element contains a ``<b>``, ``subtitles`` (a one-item list).

    Returns ``None`` when the element with id ``_xclaimwords_wrapper`` is
    missing.
    """
    logger.debug("Grabbing article %s", link)
    # NOTE(review): the bare name ``get_article`` is looked up in module
    # scope, so this presumably calls a module-level fetch helper rather
    # than recursing into this method - confirm against the file header.
    article_html = get_article(link)
    result = {"raw_html": article_html}
    article = bs4.BeautifulSoup(article_html)

    # Try to find the subtitle
    subtitle = article.find('font', size=3, color="#ff8000")
    if subtitle is not None and subtitle.find('b') is not None:
        result["subtitles"] = [subtitle.b.text.strip()]

    author = article.find('div', class_="clanekAVTOR")
    result["author"] = author.text.strip() if author is not None else None

    text_container = article.find(id="_xclaimwords_wrapper")
    if text_container is None:
        return None

    # Remove all script tags from the text container. A plain loop is used
    # because extract() is called purely for its side effect.
    for script in text_container.find_all('script'):
        script.extract()

    result["text"] = u" ".join(text_container.stripped_strings)
    return result
def get_article(self, link):
    """Download a 24ur.com article and extract subtitles, author and text.

    The link is rewritten to the ``www.24ur.com`` host before fetching.

    Returns a dict with the keys ``raw_html``, ``subtitles`` (list of
    stripped strings taken from ``div.summary > p``), ``author`` (the text
    after the last ``|`` in ``div.containerLeftSide``, or ``None``) and
    ``text``.

    Returns ``None`` when there is no ``div#content`` or when the extracted
    text contains the site's "check the address" error message.
    """
    logger.debug("Grabbing article %s", link)
    # Only add the "www." prefix when it is not there yet; an unconditional
    # replace turned "www.24ur.com" into "www.www.24ur.com".
    if "www.24ur.com" in link:
        fetch_url = link
    else:
        fetch_url = link.replace("24ur.com", "www.24ur.com")
    article_html = get_article(fetch_url)
    result = {"raw_html": article_html}

    tree = etree.fromstring(article_html, etree.HTMLParser())

    # Store a real list of strings. Converting the xpath result list to a
    # single string stored its repr ("[u'...']") instead of the text.
    summary = tree.xpath('//div[@class="summary"]/p/text()')
    result["subtitles"] = [text.strip() for text in summary]

    author_texts = tree.xpath("//div[@class='containerLeftSide']/text()")
    author_text = u" ".join(text.strip() for text in author_texts)
    if u"|" in author_text:
        # Take what follows the last pipe; slicing from rfind() itself kept
        # the "|" separator as part of the author value.
        author = author_text.rsplit(u"|", 1)[1].strip() or None
    else:
        author = None
    result["author"] = author

    # Elaborate way of getting rid of all script tags and other garbage in
    # this HTML: serialise the content node and let BeautifulSoup pull the
    # text out of it. Looking for a better way.
    content = tree.xpath("//div[@id='content']")
    if not content:
        return None
    content_html = lxml.html.tostring(
        content[0], encoding="utf-8").decode("utf-8")
    text = re.sub(r"\s\s+", " ", bs4.BeautifulSoup(content_html).get_text())

    if u"Preverite vpisani naslov ali uporabite možnost iskanja po naših straneh." in text:
        return None
    result["text"] = text
    return result
def get_article(self, link):
    """Download an article page and extract subtitles, text and author.

    Returns a dict with the keys ``raw_html``, ``subtitles`` (stripped text
    nodes of ``article > p.uvod``), ``text`` (text nodes joined with
    newlines) and ``author`` (``None`` when ``p.bodyslika > span`` has no
    text). This method never returns ``None``.
    """
    logger.debug("Grabbing article %s", link)
    page_source = get_article(link)
    document = etree.fromstring(page_source, etree.HTMLParser())

    subtitles = []
    for lead in document.xpath('//article/p[@class="uvod"]/text()'):
        subtitles.append(lead.strip())

    # Sometimes they use bodytext for this
    body_parts = document.xpath('//article/p[@class="tekst"]//text()')
    if not body_parts:
        body_parts = document.xpath('//article/p[@class="bodytext"]/text()')

    author_parts = document.xpath(
        '//article/p[@class="bodyslika"]/span/text()')
    if author_parts:
        author = ' '.join(author_parts).strip()
    else:
        author = None

    return {
        "raw_html": page_source,
        "subtitles": subtitles,
        "text": '\n'.join(body_parts),
        "author": author,
    }
def get_article_text(self, article_id):
    """Download an article by ID and extract title, subtitles and text.

    The URL is built as ``self.RTV_ARTICLE_URL + str(article_id)``.

    Returns a dict with the keys ``raw_html``, ``title`` (stripped
    ``<title>`` text), ``subtitles`` (text of every ``div.subtitle``) and
    ``text`` (every ``<p>`` on the page, one per line).

    Returns ``None`` when the page has no ``<title>`` element.
    """
    logger.debug("[RTVSlo] Grabbing article ID %s", article_id)
    article_html = get_article(self.RTV_ARTICLE_URL + str(article_id))
    result = {"raw_html": article_html}

    article = bs4.BeautifulSoup(article_html)

    title = article.title
    if title is None:
        # ``article.title.text`` would raise AttributeError on a page
        # without a <title>; report the page as unparsable instead.
        return None
    result["title"] = title.text.strip()

    result["subtitles"] = [
        div.text for div in article.find_all("div", class_="subtitle")
    ]

    result["text"] = u"\n".join(
        u" ".join(p.stripped_strings) for p in article.find_all("p")
    )
    return result
def get_article_text(self, link):
    """Download an article page and extract title, subtitle, author, text.

    Returns a dict with the keys ``raw_html``, ``title``, ``author``
    (``None`` when no ``d_author`` element exists), ``text`` and, only when
    an ``EXCERPT`` / ``EXCERPT_mnenja`` element is found, ``subtitles``
    (a one-item list).

    Returns ``None`` when the page has no ``<title>`` or when neither
    ``D_NEWS`` nor ``D_NEWS_MNENJA`` is present (the latter case is logged
    as a warning).
    """
    logger.debug("Grabbing article %s", link)
    article_html = get_article(link)
    result = {"raw_html": article_html}
    article = bs4.BeautifulSoup(article_html)

    title = article.title
    if title is None:
        return None
    result["title"] = title.text.strip()

    subtitle = article.find(id="EXCERPT", text=True)
    if subtitle is None:
        subtitle = article.find(id="EXCERPT_mnenja", text=True)
    if subtitle is not None:
        result["subtitles"] = [subtitle.text.strip()]

    content_item = article.find(id="D_NEWS")
    if content_item is None:
        content_item = article.find(id="D_NEWS_MNENJA")
    if content_item is None:
        logger.warning("Unknown article content for %s", link)
        return None

    author = article.find(class_="d_author")
    result["author"] = author.text.strip() if author is not None else None

    # find_all() never yields None, so no per-item None filter is needed.
    paragraphs = content_item.find_all('p', text=True)
    text_content = u" ".join(p_item.text.strip() for p_item in paragraphs)

    # Normalise non-breaking spaces to plain spaces. The escape is spelled
    # out because a literal U+00A0 is indistinguishable from a space in
    # source, and a non-ASCII byte-string literal cannot be mixed with a
    # unicode string on Python 2.
    # NOTE(review): assumes the first argument was meant to be U+00A0 -
    # confirm against version control.
    result["text"] = text_content.replace(u"\xa0", u" ")
    return result
def get_article_text(self, article_id):
    """Download the print view of an article and extract its fields.

    The URL is built as ``self.ZURNAL_PRINT_URL + str(article_id)``.

    Returns a dict with the keys ``raw_html``, ``title`` (unstripped text of
    ``hgroup > h1``), ``author`` (text before the first ``/`` in
    ``#meta_el .left``, or ``None`` when that element or the slash is
    missing) and ``text`` (strings of the first ``div.entry``).

    Returns ``None`` when the page lacks a ``<body>``, an ``<article>``, the
    ``hgroup > h1`` heading or any ``div.entry``. Each of these previously
    raised AttributeError or IndexError.
    """
    logger.debug("Grabbing article ID %s", article_id)
    article_html = get_article(self.ZURNAL_PRINT_URL + str(article_id))
    result = {"raw_html": article_html}

    page = bs4.BeautifulSoup(article_html)
    article = page.body.find("article") if page.body is not None else None
    if article is None:
        return None

    heading = article.hgroup.h1 if article.hgroup is not None else None
    if heading is None:
        return None
    result["title"] = heading.text

    meta = article.find(id="meta_el")
    byline = meta.find(class_="left") if meta is not None else None
    author = None
    if byline is not None:
        # Only the part before the first slash is kept as the author; when
        # there is no slash the author stays None.
        name, slash, _ = byline.text.partition('/')
        if slash:
            author = name.strip()
    result["author"] = author

    content_divs = article.find_all("div", class_="entry")
    if not content_divs:
        return None
    result["text"] = u" ".join(content_divs[0].stripped_strings)
    return result
def get_article_text(self, link):
    """Download an article page and extract author, subtitle and text.

    Returns a dict with the keys ``raw_html``, ``author`` (``None`` when
    neither supported layout yields one), ``subtitles`` (a one-item list
    built from the last ``<p>`` of the article header, or ``None`` when the
    header or its paragraphs are missing) and ``text`` (the article's
    direct ``<p>`` children, separated by blank lines).
    """
    logger.debug("Grabbing article %s", link)
    article_html = get_article(link)
    result = {"raw_html": article_html}
    tree = etree.fromstring(article_html, etree.HTMLParser())

    # This is a structure for editorials: the author is the third text node
    # of the article's second <div>.
    author = None
    try:
        author = tree.xpath('//article[@id="article"]/div')[1].xpath(
            "./text()")[2].strip()
    except IndexError:
        # Fall back to an "Avtor: ..." byline in the first header paragraph.
        # Only IndexError is caught: it is what a missing node produces, and
        # a bare ``except`` would also swallow KeyboardInterrupt/SystemExit.
        try:
            byline = tree.xpath('//article[@id="article"]/header/p')[0].xpath(
                './i/text()')[0].strip()
        except IndexError:
            author = None
        else:
            if "Avtor:" in byline:
                author = byline.replace("Avtor:", "").strip()
    result["author"] = author

    try:
        header = tree.xpath('//article[@id="article"]/header')[0]
        last_paragraph = header.xpath('./p')[-1]
        result["subtitles"] = [
            ' '.join(last_paragraph.xpath('./text()')).strip()
        ]
    except IndexError:
        result["subtitles"] = None

    paragraphs = tree.xpath('//article[@id="article"]/p')
    result["text"] = '\n\n'.join(
        ' '.join(paragraph.xpath('.//text()')).strip()
        for paragraph in paragraphs
    ).strip()
    return result
def parse_article(self, article_url):
    """Build the article record for one ``(link, data)`` pair.

    ``article_url`` is unpacked into the page link and a mapping that
    supplies ``text``, ``title``, ``published``, ``author`` and ``guid``.
    The page is fetched only to store its raw HTML.

    Returns the article dict, or ``None`` when fetching the page raises
    (the failure is logged with a traceback).
    """
    link, data = article_url

    try:
        page_html = get_article(link)
    except Exception:
        logger.warn("Failed to parse article %s", link, exc_info=True)
        return None

    return {
        "raw_html": page_html,
        "text": data["text"],
        "title": data["title"],
        "published": data["published"],
        "source": "Val202",
        "source_url": link,
        "language": "si",
        "author": data["author"],
        # The ID is the hash of the entry's guid field
        "id": get_sha_hash(data["guid"]),
    }
def get_article_text(self, link):
    """Download an article page and extract author, subtitle and text.

    Returns a dict with the keys ``raw_html``, ``author`` (text of the
    ``<strong>`` inside the ``article-source`` element, or ``None``),
    ``text`` (strings of the first ``<article>`` in the body) and, only when
    a ``p.intro-box`` with text exists, ``subtitles`` (a one-item list).

    Returns ``None`` when the page has no ``<body>`` or no ``<article>``.
    """
    logger.debug("Grabbing article %s", link)
    article_html = get_article(link)
    result = {"raw_html": article_html}

    article = bs4.BeautifulSoup(article_html)
    body = article.body
    if body is None:
        # Without this guard every ``article.body.find`` below raises
        # AttributeError on a page that has no <body>.
        return None

    source = body.find(class_="article-source")
    if source is not None and source.strong is not None:
        result["author"] = source.strong.text.strip()
    else:
        result["author"] = None

    subtitle = body.find('p', class_="intro-box", text=True)
    if subtitle is not None:
        result["subtitles"] = [subtitle.text.strip()]

    content = body.article
    if content is None:
        return None
    result["text"] = u" ".join(content.stripped_strings)
    return result