def test_happy(self) -> None:
    """Tests the happy path."""
    relations = get_relations()
    relation_name = "gazdagret"
    relation = relations.get_relation(relation_name)
    result_from_overpass = "******"
    expected = helpers.get_content(relations.get_workdir(), "streets-gazdagret.csv")
    relation.get_files().write_osm_streets(result_from_overpass)
    actual = helpers.get_content(relations.get_workdir(), "streets-gazdagret.csv")
    self.assertEqual(actual, expected)

def test_happy(self) -> None:
    """Tests the happy path."""
    refdir = os.path.join(os.path.dirname(__file__), "refdir")
    refpath = os.path.join(refdir, "utcak_20190514.tsv")
    relations = get_relations()
    relation_name = "gazdagret"
    relation = relations.get_relation(relation_name)
    expected = helpers.get_content(relations.get_workdir(), "streets-reference-gazdagret.lst")
    relation.write_ref_streets(refpath)
    actual = helpers.get_content(relations.get_workdir(), "streets-reference-gazdagret.lst")
    self.assertEqual(actual, expected)

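# The tests above (and several below) read expected and actual output with
# helpers.get_content(). A minimal sketch of that helper, assuming it joins
# an optional directory with a relative path and returns the file contents;
# the real helpers module may differ:
import os


def get_content(workdir: str, path: str = "") -> str:
    """Gets the contents of a file in workdir; workdir may already be the
    full path, in which case path is omitted."""
    if path:
        path = os.path.join(workdir, path)
    else:
        path = workdir
    with open(path) as stream:
        return stream.read()
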
def get_article(self, url):
    '''Implementation for getting an article from JPost.

    Args:
      url: A URL in the www.jpost.com/* domain.

    Returns:
      The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    try:
        h1 = soup.find('h1', attrs={'class': 'article-title'})
        headline = h1.text.strip()
        paragraphs = soup.find("div", {"class": "article-text"})
        article = paragraphs.find("p")
    except Exception as e:
        log.error('Error scraping JPost article at %s: %s' % (url, e))
        # Bail out here: headline/article are undefined if scraping failed.
        return None
    body = article.text
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.JPOST)

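# The get_article() scrapers use a same-named helpers.get_content() that
# instead fetches a URL and returns the page HTML, or a falsy value on
# failure. A minimal sketch, assuming Python 2 (which the str/unicode
# handling in these scrapers suggests); the real helper may add headers,
# retries, etc.:
import urllib2


def get_content(url):
    """Fetches url and returns the raw HTML, or None on any error."""
    try:
        return urllib2.urlopen(url, timeout=10).read()
    except Exception:
        return None
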
def test_happy(self) -> None:
    """Tests the happy path."""
    refdir = os.path.join(os.path.dirname(__file__), "refdir")
    refpath = os.path.join(refdir, "hazszamok_20190511.tsv")
    refpath2 = os.path.join(refdir, "hazszamok_kieg_20190808.tsv")
    relations = get_relations()
    relation_name = "gazdagret"
    expected = helpers.get_content(
        relations.get_workdir(), "street-housenumbers-reference-gazdagret.lst")
    relation = relations.get_relation(relation_name)
    relation.write_ref_housenumbers([refpath, refpath2])
    actual = helpers.get_content(
        relations.get_workdir(), "street-housenumbers-reference-gazdagret.lst")
    self.assertEqual(actual, expected)

def test_happy(self) -> None:
    """Tests the happy path."""
    relations = get_relations()
    relation_name = "gazdagret"
    relation = relations.get_relation(relation_name)
    expected = helpers.get_content(relations.get_workdir(), "gazdagret-streets.percent")
    ret = relation.write_missing_streets()
    todo_count, done_count, percent, streets = ret
    self.assertEqual(todo_count, 1)
    self.assertEqual(done_count, 4)
    self.assertEqual(percent, '80.00')
    self.assertEqual(streets, ['Only In Ref utca'])
    actual = helpers.get_content(relations.get_workdir(), "gazdagret-streets.percent")
    self.assertEqual(actual, expected)

def get_article(self, url):
    '''Implementation for getting an article from Russia Today.

    Args:
      url: A URL in the russia_today.com domain.

    Returns:
      The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    headline = helpers.decode(soup.h1.string)
    article = soup.find('div', attrs={'class': 'cont-wp'})
    paragraphs = article.find_all('p', attrs={'class': None})
    p_text = [helpers.decode(p.get_text()) for p in paragraphs]
    # Get rid of 'Tags' and 'Trends' headers, and 'READ MORE' links.
    body = ' '.join([
        p for p in p_text
        if not (p.startswith('\nREAD') or p == 'Tags' or p == 'Trends')
    ])
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url,
                                  news_orgs.RUSSIA_TODAY)

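# helpers.decode() appears throughout the scrapers; presumably it normalizes
# BeautifulSoup output to unicode. A hypothetical Python 2 sketch, assuming
# UTF-8 input:
def decode(text):
    """Returns a unicode string for either str or unicode input."""
    if isinstance(text, unicode):
        return text
    return text.decode('utf-8', 'ignore')
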
def test_one_arg(self) -> None:
    """Tests the case when only one argument is given."""
    workdir = os.path.join(os.path.dirname(__file__), "workdir")
    actual = helpers.get_content(os.path.join(workdir, "gazdagret.percent"))
    expected = "54.55"
    self.assertEqual(actual, expected)

def get_article(self, url):
    '''Implementation for getting an article from the Globe and Mail.

    Args:
      url: A URL in the theglobeandmail.com/* domain.

    Returns:
      The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    soup.h1.a.extract()
    headline = soup.h1.get_text().encode('ascii', 'ignore').strip('\n')
    article = soup.find('div', attrs={'class': 'entry-content'})
    # Remove other content that is inline with the article text.
    for div in article.find_all('div', attrs={'class': 'entry-related'}):
        div.extract()
    for aside in article.find_all('aside'):
        aside.extract()
    paragraphs = article.find_all('p', attrs={'class': None})
    body = ' '.join([p.get_text().encode('ascii', 'ignore') for p in paragraphs])
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url,
                                  news_orgs.GLOBE_AND_MAIL)

def get_article(self, url):
    '''Implementation for getting an article from CNN.

    Args:
      url: A URL in the www.cnn.* domain.

    Returns:
      The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    # The <title> text is split on "-"; the first part is the headline.
    title = soup.find("title")
    parts = title.text.split("-")
    headline = parts[0]
    date = parts[1]  # Currently unused: Article here takes no date.
    paragraphs = soup.findAll("p", attrs={'class': 'zn-body__paragraph'})
    body = ""
    for paragraph in paragraphs:
        try:
            body += paragraph.text.decode("utf-8").replace("\"", "'") + " "
        except UnicodeEncodeError:
            pass
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.CNN)

def get_article(self, url):
    '''Implementation for getting an article from the Guardian.

    Args:
      url: A URL in the guardian.com domain.

    Returns:
      The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    headline = soup.h1.string.strip('\n')
    if url.split('.com/')[1].startswith('theguardian'):
        article = soup.find('div', attrs={'class': 'flexible-content-body'})
    else:
        article = soup.find('div', attrs={'class': 'content__article-body'})
    paragraphs = article.find_all('p', attrs={'class': None})
    body = ' '.join([p.get_text() for p in paragraphs])
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.GUARDIAN)

def get_article(self, url):
    '''Implementation for getting an article from the CBC.

    Args:
      url: A URL in the cbc.ca/news/* domain.

    Returns:
      The Article representing the article at that url, or None if
      unable to scrape the article.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    try:
        headline = soup.h1.string
    except AttributeError:
        log.error('Exception trying to scrape CBC headline from %s' % (url))
        return None
    article = soup.find('div', attrs={'class': 'story-content'})
    paragraphs = article.find_all('p', attrs={'class': None})
    body = ' '.join([p.get_text() for p in paragraphs])
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.CBC)

def get_article(self, url):
    '''Implementation for getting an article from the NYTimes.

    Args:
      url: A URL in the ny_times.com domain.

    Returns:
      The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    headline = helpers.decode(soup.h1.string)
    try:
        article = soup.find('div', attrs={'class': 'articleBody'})
        paragraphs = article.find_all('p', attrs={'itemprop': 'articleBody'})
    except AttributeError:
        # This article's html uses different attributes... sigh...
        # Hopefully there are only 2 versions.
        article = soup.find('div', attrs={'class': 'story-body'})
        paragraphs = article.find_all('p', attrs={'class': 'story-content'})
    p_text = [helpers.decode(p.get_text()) for p in paragraphs]
    body = ' '.join(p_text)
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.NY_TIMES)

def get_article(self, url):
    '''Returns an Article representing the article at url.'''
    try:
        html = helpers.get_content(url)
        if not html:
            return None

        soup = BeautifulSoup(html)
        headline = self.get_headline(soup)
        body = self.get_body(soup)
        date = self.get_date(soup)
    except Exception as e:
        logger.log.error(
            "Hit exception on line number %s getting article for %s: %s"
            % (sys.exc_info()[-1].tb_lineno, url, e))
        return None

    try:
        headline = helpers.decode(headline)
        body = helpers.decode(body)
        date = helpers.decode(date)
    except Exception as e:
        logger.log.error('Error on line %s decoding url %s: %s'
                         % (sys.exc_info()[-1].tb_lineno, url, e))
        return None

    logger.log.info('URL: %s' % url)
    logger.log.info('headline: %s' % headline)
    logger.log.info('Body: %s' % body)
    return news_interface.Article(headline, body, url, self.news_org, date)

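# The generic get_article() above delegates to get_headline(), get_body() and
# get_date(), so each news org presumably only overrides those hooks. A
# hypothetical subclass sketch (the base class name and CSS selectors are
# made up for illustration):
class ExampleOrgScraper(NewsOrgBase):  # NewsOrgBase is hypothetical
    def get_headline(self, soup):
        return soup.h1.string

    def get_body(self, soup):
        article = soup.find('div', attrs={'class': 'article-body'})
        return ' '.join(p.get_text() for p in article.find_all('p'))

    def get_date(self, soup):
        return soup.find('time').get('datetime')
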
def test_happy(self) -> None:
    """Tests the happy path."""
    def get_abspath(path: str) -> str:
        if os.path.isabs(path):
            return path
        return os.path.join(os.path.dirname(__file__), path)
    with unittest.mock.patch('util.get_abspath', get_abspath):
        expected = helpers.get_content(
            get_abspath("workdir/streets-reference-gazdagret.lst"))
        argv = ["", "gazdagret"]
        with unittest.mock.patch('sys.argv', argv):
            get_reference_streets.main()
        actual = helpers.get_content(
            get_abspath("workdir/streets-reference-gazdagret.lst"))
        self.assertEqual(actual, expected)

def test_happy(self) -> None:
    """Tests the happy path."""
    relations = get_relations()
    relation_name = "gazdagret"
    result_from_overpass = "******"
    result_from_overpass += "1\tTörökugrató utca\t1\n"
    result_from_overpass += "1\tTörökugrató utca\t2\n"
    result_from_overpass += "1\tTűzkő utca\t9\n"
    result_from_overpass += "1\tTűzkő utca\t10\n"
    result_from_overpass += "1\tOSM Name 1\t1\n"
    result_from_overpass += "1\tOSM Name 1\t2\n"
    result_from_overpass += "1\tOnly In OSM utca\t1\n"
    expected = helpers.get_content(relations.get_workdir(), "street-housenumbers-gazdagret.csv")
    relation = relations.get_relation(relation_name)
    relation.get_files().write_osm_housenumbers(result_from_overpass)
    actual = helpers.get_content(relations.get_workdir(), "street-housenumbers-gazdagret.csv")
    self.assertEqual(actual, expected)

def test_happy(self) -> None:
    """Tests the happy path."""
    relations = get_relations()
    relation_name = "gazdagret"
    relation = relations.get_relation(relation_name)
    expected = helpers.get_content(relations.get_workdir(), "gazdagret.percent")
    ret = relation.write_missing_housenumbers()
    todo_street_count, todo_count, done_count, percent, table = ret
    self.assertEqual(todo_street_count, 3)
    self.assertEqual(todo_count, 5)
    self.assertEqual(done_count, 6)
    self.assertEqual(percent, '54.55')
    table = table_doc_to_string(table)
    self.assertEqual(
        table,
        [['Street name', 'Missing count', 'House numbers'],
         ['Törökugrató utca', '2', '7<br />10'],
         ['Tűzkő utca', '2', '1<br />2'],
         ['Hamzsabégi út', '1', '1']])
    actual = helpers.get_content(relations.get_workdir(), "gazdagret.percent")
    self.assertEqual(actual, expected)

def handle_static(request_uri: str) -> Tuple[str, str]:
    """Handles serving static content."""
    tokens = request_uri.split("/")
    path = tokens[-1]

    # Initialize so content_type is always bound, even for other extensions.
    content_type = ""
    if request_uri.endswith(".js"):
        content_type = "application/x-javascript"
    elif request_uri.endswith(".css"):
        content_type = "text/css"

    if path.endswith(".js") or path.endswith(".css"):
        return helpers.get_content(get_staticdir(), path), content_type

    return "", ""

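# Example handle_static() calls (the URIs are illustrative):
#   handle_static("/osm/static/osm.js")    -> (JS file contents, "application/x-javascript")
#   handle_static("/osm/static/osm.css")   -> (CSS file contents, "text/css")
#   handle_static("/osm/static/other.txt") -> ("", "")
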
def get_article(self, url):
    '''Implementation for getting an article from USA Today.

    Args:
      url: A URL in the http://www.usatoday.com/story/* domain.

    Returns:
      The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    article = soup.article
    headline = helpers.decode(article.h1.string)
    paragraphs = article.find_all('p', attrs={'class': None})
    body = ' '.join([helpers.decode(p.get_text()) for p in paragraphs])
    return news_interface.Article(headline, body, url, news_orgs.USA_TODAY)

def handle_main_street_percent(relation: helpers.Relation) -> Tuple[yattag.Doc, str]:
    """Handles the street percent part of the main page."""
    url = "/osm/missing-streets/" + relation.get_name() + "/view-result"
    percent = "N/A"
    if os.path.exists(relation.get_files().get_streets_percent_path()):
        percent = helpers.get_content(relation.get_files().get_streets_percent_path())

    doc = yattag.Doc()
    if percent != "N/A":
        date = get_last_modified(relation.get_files().get_streets_percent_path())
        with doc.tag("strong"):
            with doc.tag("a", href=url, title=_("updated") + " " + date):
                doc.text(percent + "%")
        return doc, percent

    with doc.tag("strong"):
        with doc.tag("a", href=url):
            doc.text(_("missing streets"))
    return doc, "0"

def get_article(self, url):
    '''Implementation for getting an article from BBC.

    Args:
      url: A URL in the www.bbc.* domain.

    Returns:
      The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    headline = soup.h1.string
    article = soup.find('div', attrs={'class': 'story-body'})
    paragraphs = article.find_all('p', attrs={'class': None})
    body = ' '.join([p.get_text() for p in paragraphs])
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.BBC)

def get_article(self, url):
    '''Implementation for getting an article from the New York Post.

    Args:
      url: A URL in the nypost.com domain.

    Returns:
      The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    headline = helpers.decode(soup.h1.a.string)
    article = soup.find('div', attrs={'class': 'entry-content'})
    paragraphs = article.find_all('p', attrs={'class': None})
    body = ' '.join([helpers.decode(p.get_text()) for p in paragraphs])
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.NY_POST)

def get_article(self, url):
    '''Implementation for getting an article from Al Jazeera.

    Args:
      url: A URL in the aljazeera.* domain.

    Returns:
      The Article representing the article at that url, or None if
      unable to get the Article.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    headline = None
    potential_classes = ["heading-story", "articleOpinion-title"]
    for h1_class in potential_classes:
        try:
            headline = soup.find("h1", {"class": h1_class}).string
            break
        except AttributeError:
            continue
    if not headline:
        log.error('Exception trying to scrape Al Jazeera headline from %s'
                  % (url))
        return None
    headline = helpers.decode(headline)

    try:
        paragraphs = soup.find("div", {"class": "article-body"})
        article = paragraphs.findAll("p")
    except AttributeError:
        paragraphs = soup.find("div", {"class": "text"})
        article = paragraphs.findAll("p")
    body = ' '.join([helpers.decode(p.text) for p in article])
    # log.info(headline)
    # log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.ALJAZEERA)

def get_article(self, url):
    '''Implementation for getting an article from Today's Zaman.

    Args:
      url: A URL in the www.todayszaman.com/* domain.

    Returns:
      The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    title = soup.find("title")
    headline = helpers.decode(title.text)
    paragraphs = soup.find("div", {"id": "newsText"})
    article = paragraphs.findAll("p")
    body = ' '.join([helpers.decode(p.text) for p in article])
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url,
                                  news_orgs.TODAYS_ZAMAN)

def run(self, anon=False, forward=False):
    try:
        data = get_content(self.message)
    except ContentError as e:
        # Report the error and stop: data is undefined past this point.
        self.send_message(e)
        return

    with config.DB.atomic() as tnx:
        try:
            content = Content.create(type=data[0], text=data[1], file_id=data[2])
            post = Post(content=content, token=gen_token(), address=Address.new())
            if not anon:
                post.user = self.user_id
            if forward:
                if self.message.forward_from:
                    self.send_message(
                        'you cannot forward messages for forward posting, '
                        'write your message')
                    return (type(self), {'forward': forward})
                post.forward_message_id = self.message.message_id
            post.created_at = self.message.date
            post.save()

            if (str(self.user_id) == config.ADMIN_ID
                    and config.ADMIN_DEFAULT_BALANCE > 0
                    and not config.DEBUG):
                post.send(config.ADMIN_DEFAULT_BALANCE, bot=self.bot)
                post.address.is_accepted = True
                post.address.save()
                self.send_message('message posted')
            else:
                self.send_message(TEXT_PAY.format(post.address.address))
                self.send_message(post.address.address)
            tnx.commit()
        except Exception as e:
            logger.error(e)
            tnx.rollback()

def get_article(self, url):
    '''Implementation for getting an article from REUTERS.

    Args:
      url: A URL in the www.reuters.com* domain.

    Returns:
      The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    # The headline and the paragraphs live in the same grid panel.
    article_div = soup.find('div', attrs={'class': 'column1 gridPanel grid8'})
    headline = helpers.decode(article_div.h1.string)
    paragraphs = article_div.findAll("p")
    body = ' '.join([helpers.decode(p.text) for p in paragraphs])
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.REUTERS)

def get_article(self, url):
    '''Implementation for getting an article from Times of Israel.

    Args:
      url: A URL in the www.timesofisrael.com/* domain.

    Returns:
      The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    h1 = soup.find('h1', attrs={'class': 'headline'})
    headline = helpers.decode(h1.text)
    paragraphs = soup.findAll("p", {"itemprop": "articleBody"})
    body = ' '.join([helpers.decode(p.text) for p in paragraphs])
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url,
                                  news_orgs.TIMES_OF_ISRAEL)

def test_happy(self) -> None:
    """Tests the happy path."""
    workdir = os.path.join(os.path.dirname(__file__), "workdir")
    actual = helpers.get_content(workdir, "gazdagret.percent")
    expected = "54.55"
    self.assertEqual(actual, expected)