def test_happy(self) -> None:
    """Tests the happy path."""
    relations = get_relations()
    relation_name = "gazdagret"
    relation = relations.get_relation(relation_name)
    result_from_overpass = "******"
    expected = helpers.get_content(relations.get_workdir(), "streets-gazdagret.csv")
    relation.get_files().write_osm_streets(result_from_overpass)
    actual = helpers.get_content(relations.get_workdir(), "streets-gazdagret.csv")
    self.assertEqual(actual, expected)

def test_happy(self) -> None:
    """Tests the happy path."""
    refdir = os.path.join(os.path.dirname(__file__), "refdir")
    refpath = os.path.join(refdir, "utcak_20190514.tsv")
    relations = get_relations()
    relation_name = "gazdagret"
    relation = relations.get_relation(relation_name)
    expected = helpers.get_content(relations.get_workdir(), "streets-reference-gazdagret.lst")
    relation.write_ref_streets(refpath)
    actual = helpers.get_content(relations.get_workdir(), "streets-reference-gazdagret.lst")
    self.assertEqual(actual, expected)

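# The tests above (and several below) read expected and actual output with
# helpers.get_content(). A minimal sketch of that helper, assuming it joins
# an optional directory with a relative path and returns the file contents;
# the real helpers module may differ:
import os


def get_content(workdir: str, path: str = "") -> str:
    """Gets the contents of a file in workdir; workdir may already be the
    full path, in which case path is omitted."""
    if path:
        path = os.path.join(workdir, path)
    else:
        path = workdir
    with open(path) as stream:
        return stream.read()
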
def get_article(self, url):
    '''Implementation for getting an article from JPost.

    Args:
      url: A URL in the www.jpost.com/* domain.

    Returns:
      The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    try:
        h1 = soup.find('h1', attrs={'class': 'article-title'})
        headline = h1.text.strip()
        paragraphs = soup.find("div", {"class": "article-text"})
        article = paragraphs.find("p")
    except Exception as e:
        log.error('Error scraping JPost article at %s: %s' % (url, e))
        # Bail out here: headline/article are undefined if scraping failed.
        return None
    body = article.text
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.JPOST)

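# The get_article() scrapers use a same-named helpers.get_content() that
# instead fetches a URL and returns the page HTML, or a falsy value on
# failure. A minimal sketch, assuming Python 2 (which the str/unicode
# handling in these scrapers suggests); the real helper may add headers,
# retries, etc.:
import urllib2


def get_content(url):
    """Fetches url and returns the raw HTML, or None on any error."""
    try:
        return urllib2.urlopen(url, timeout=10).read()
    except Exception:
        return None
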
def test_happy(self) -> None:
    """Tests the happy path."""
    refdir = os.path.join(os.path.dirname(__file__), "refdir")
    refpath = os.path.join(refdir, "hazszamok_20190511.tsv")
    refpath2 = os.path.join(refdir, "hazszamok_kieg_20190808.tsv")
    relations = get_relations()
    relation_name = "gazdagret"
    expected = helpers.get_content(
        relations.get_workdir(), "street-housenumbers-reference-gazdagret.lst")
    relation = relations.get_relation(relation_name)
    relation.write_ref_housenumbers([refpath, refpath2])
    actual = helpers.get_content(
        relations.get_workdir(), "street-housenumbers-reference-gazdagret.lst")
    self.assertEqual(actual, expected)

def test_happy(self) -> None:
    """Tests the happy path."""
    relations = get_relations()
    relation_name = "gazdagret"
    relation = relations.get_relation(relation_name)
    expected = helpers.get_content(relations.get_workdir(), "gazdagret-streets.percent")
    ret = relation.write_missing_streets()
    todo_count, done_count, percent, streets = ret
    self.assertEqual(todo_count, 1)
    self.assertEqual(done_count, 4)
    self.assertEqual(percent, '80.00')
    self.assertEqual(streets, ['Only In Ref utca'])
    actual = helpers.get_content(relations.get_workdir(), "gazdagret-streets.percent")
    self.assertEqual(actual, expected)

def get_article(self, url):
    '''Implementation for getting an article from Russia Today.

    Args:
      url: A URL in the russia_today.com domain.

    Returns:
      The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    headline = helpers.decode(soup.h1.string)
    article = soup.find('div', attrs={'class': 'cont-wp'})
    paragraphs = article.find_all('p', attrs={'class': None})
    p_text = [helpers.decode(p.get_text()) for p in paragraphs]
    # Get rid of 'Tags' and 'Trends' headers, and 'READ MORE' links.
    body = ' '.join([
        p for p in p_text
        if not (p.startswith('\nREAD') or p == 'Tags' or p == 'Trends')
    ])
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url,
                                  news_orgs.RUSSIA_TODAY)

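# helpers.decode() appears throughout the scrapers; presumably it normalizes
# BeautifulSoup output to unicode. A hypothetical Python 2 sketch, assuming
# UTF-8 input:
def decode(text):
    """Returns a unicode string for either str or unicode input."""
    if isinstance(text, unicode):
        return text
    return text.decode('utf-8', 'ignore')
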
def test_one_arg(self) -> None:
    """Tests the case when only one argument is given."""
    workdir = os.path.join(os.path.dirname(__file__), "workdir")
    actual = helpers.get_content(os.path.join(workdir, "gazdagret.percent"))
    expected = "54.55"
    self.assertEqual(actual, expected)

def get_article(self, url):
    '''Implementation for getting an article from the Globe and Mail.

    Args:
      url: A URL in the theglobeandmail.com/* domain.

    Returns:
      The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    soup.h1.a.extract()
    headline = soup.h1.get_text().encode('ascii', 'ignore').strip('\n')
    article = soup.find('div', attrs={'class': 'entry-content'})
    # Remove other content that is inline with the article text.
    for div in article.find_all('div', attrs={'class': 'entry-related'}):
        div.extract()
    for aside in article.find_all('aside'):
        aside.extract()
    paragraphs = article.find_all('p', attrs={'class': None})
    body = ' '.join([p.get_text().encode('ascii', 'ignore') for p in paragraphs])
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url,
                                  news_orgs.GLOBE_AND_MAIL)

def get_article(self, url):
    '''Implementation for getting an article from CNN.

    Args:
      url: A URL in the www.cnn.* domain.

    Returns:
      The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    # The <title> text is split on "-"; the first part is the headline.
    title = soup.find("title")
    parts = title.text.split("-")
    headline = parts[0]
    date = parts[1]  # Currently unused: Article here takes no date.
    paragraphs = soup.findAll("p", attrs={'class': 'zn-body__paragraph'})
    body = ""
    for paragraph in paragraphs:
        try:
            body += paragraph.text.decode("utf-8").replace("\"", "'") + " "
        except UnicodeEncodeError:
            pass
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.CNN)

def get_article(self, url):
    '''Implementation for getting an article from the Guardian.

    Args:
      url: A URL in the guardian.com domain.

    Returns:
      The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    headline = soup.h1.string.strip('\n')
    if url.split('.com/')[1].startswith('theguardian'):
        article = soup.find('div', attrs={'class': 'flexible-content-body'})
    else:
        article = soup.find('div', attrs={'class': 'content__article-body'})
    paragraphs = article.find_all('p', attrs={'class': None})
    body = ' '.join([p.get_text() for p in paragraphs])
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.GUARDIAN)

def get_article(self, url):
    '''Implementation for getting an article from the CBC.

    Args:
      url: A URL in the cbc.ca/news/* domain.

    Returns:
      The Article representing the article at that url, or None if
      unable to scrape the article.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    try:
        headline = soup.h1.string
    except AttributeError:
        log.error('Exception trying to scrape CBC headline from %s' % (url))
        return None
    article = soup.find('div', attrs={'class': 'story-content'})
    paragraphs = article.find_all('p', attrs={'class': None})
    body = ' '.join([p.get_text() for p in paragraphs])
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.CBC)

def get_article(self, url):
    '''Implementation for getting an article from the NYTimes.

    Args:
      url: A URL in the ny_times.com domain.

    Returns:
      The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    headline = helpers.decode(soup.h1.string)
    try:
        article = soup.find('div', attrs={'class': 'articleBody'})
        paragraphs = article.find_all('p', attrs={'itemprop': 'articleBody'})
    except AttributeError:
        # This article's html uses different attributes... sigh...
        # Hopefully there are only 2 versions.
        article = soup.find('div', attrs={'class': 'story-body'})
        paragraphs = article.find_all('p', attrs={'class': 'story-content'})
    p_text = [helpers.decode(p.get_text()) for p in paragraphs]
    body = ' '.join(p_text)
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.NY_TIMES)

def get_article(self, url):
    '''Returns an Article representing the article at url.'''
    try:
        html = helpers.get_content(url)
        if not html:
            return None

        soup = BeautifulSoup(html)
        headline = self.get_headline(soup)
        body = self.get_body(soup)
        date = self.get_date(soup)
    except Exception as e:
        logger.log.error(
            "Hit exception on line number %s getting article for %s: %s"
            % (sys.exc_info()[-1].tb_lineno, url, e))
        return None

    try:
        headline = helpers.decode(headline)
        body = helpers.decode(body)
        date = helpers.decode(date)
    except Exception as e:
        logger.log.error('Error on line %s decoding url %s: %s'
                         % (sys.exc_info()[-1].tb_lineno, url, e))
        return None

    logger.log.info('URL: %s' % url)
    logger.log.info('headline: %s' % headline)
    logger.log.info('Body: %s' % body)
    return news_interface.Article(headline, body, url, self.news_org, date)

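# The generic get_article() above delegates to get_headline(), get_body() and
# get_date(), so each news org presumably only overrides those hooks. A
# hypothetical subclass sketch (the base class name and CSS selectors are
# made up for illustration):
class ExampleOrgScraper(NewsOrgBase):  # NewsOrgBase is hypothetical
    def get_headline(self, soup):
        return soup.h1.string

    def get_body(self, soup):
        article = soup.find('div', attrs={'class': 'article-body'})
        return ' '.join(p.get_text() for p in article.find_all('p'))

    def get_date(self, soup):
        return soup.find('time').get('datetime')
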
def test_happy(self) -> None:
    """Tests the happy path."""
    def get_abspath(path: str) -> str:
        if os.path.isabs(path):
            return path
        return os.path.join(os.path.dirname(__file__), path)
    with unittest.mock.patch('util.get_abspath', get_abspath):
        expected = helpers.get_content(
            get_abspath("workdir/streets-reference-gazdagret.lst"))
        argv = ["", "gazdagret"]
        with unittest.mock.patch('sys.argv', argv):
            get_reference_streets.main()
        actual = helpers.get_content(
            get_abspath("workdir/streets-reference-gazdagret.lst"))
        self.assertEqual(actual, expected)

def test_happy(self) -> None:
    """Tests the happy path."""
    relations = get_relations()
    relation_name = "gazdagret"
    result_from_overpass = "******"
    result_from_overpass += "1\tTörökugrató utca\t1\n"
    result_from_overpass += "1\tTörökugrató utca\t2\n"
    result_from_overpass += "1\tTűzkő utca\t9\n"
    result_from_overpass += "1\tTűzkő utca\t10\n"
    result_from_overpass += "1\tOSM Name 1\t1\n"
    result_from_overpass += "1\tOSM Name 1\t2\n"
    result_from_overpass += "1\tOnly In OSM utca\t1\n"
    expected = helpers.get_content(relations.get_workdir(), "street-housenumbers-gazdagret.csv")
    relation = relations.get_relation(relation_name)
    relation.get_files().write_osm_housenumbers(result_from_overpass)
    actual = helpers.get_content(relations.get_workdir(), "street-housenumbers-gazdagret.csv")
    self.assertEqual(actual, expected)

def test_happy(self) -> None:
    """Tests the happy path."""
    relations = get_relations()
    relation_name = "gazdagret"
    relation = relations.get_relation(relation_name)
    expected = helpers.get_content(relations.get_workdir(), "gazdagret.percent")
    ret = relation.write_missing_housenumbers()
    todo_street_count, todo_count, done_count, percent, table = ret
    self.assertEqual(todo_street_count, 3)
    self.assertEqual(todo_count, 5)
    self.assertEqual(done_count, 6)
    self.assertEqual(percent, '54.55')
    table = table_doc_to_string(table)
    self.assertEqual(
        table,
        [['Street name', 'Missing count', 'House numbers'],
         ['Törökugrató utca', '2', '7<br />10'],
         ['Tűzkő utca', '2', '1<br />2'],
         ['Hamzsabégi út', '1', '1']])
    actual = helpers.get_content(relations.get_workdir(), "gazdagret.percent")
    self.assertEqual(actual, expected)

def handle_static(request_uri: str) -> Tuple[str, str]:
    """Handles serving static content."""
    tokens = request_uri.split("/")
    path = tokens[-1]

    # Initialize so content_type is always bound, even for other extensions.
    content_type = ""
    if request_uri.endswith(".js"):
        content_type = "application/x-javascript"
    elif request_uri.endswith(".css"):
        content_type = "text/css"

    if path.endswith(".js") or path.endswith(".css"):
        return helpers.get_content(get_staticdir(), path), content_type

    return "", ""

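# Example handle_static() calls (the URIs are illustrative):
#   handle_static("/osm/static/osm.js")    -> (JS file contents, "application/x-javascript")
#   handle_static("/osm/static/osm.css")   -> (CSS file contents, "text/css")
#   handle_static("/osm/static/other.txt") -> ("", "")
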
def get_article(self, url):
    '''Implementation for getting an article from USA Today.

    Args:
      url: A URL in the http://www.usatoday.com/story/* domain.

    Returns:
      The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    article = soup.article
    headline = helpers.decode(article.h1.string)
    paragraphs = article.find_all('p', attrs={'class': None})
    body = ' '.join([helpers.decode(p.get_text()) for p in paragraphs])
    return news_interface.Article(headline, body, url, news_orgs.USA_TODAY)

def handle_main_street_percent(relation: helpers.Relation) -> Tuple[yattag.Doc, str]:
    """Handles the street percent part of the main page."""
    url = "/osm/missing-streets/" + relation.get_name() + "/view-result"
    percent = "N/A"
    if os.path.exists(relation.get_files().get_streets_percent_path()):
        percent = helpers.get_content(relation.get_files().get_streets_percent_path())

    doc = yattag.Doc()
    if percent != "N/A":
        date = get_last_modified(relation.get_files().get_streets_percent_path())
        with doc.tag("strong"):
            with doc.tag("a", href=url, title=_("updated") + " " + date):
                doc.text(percent + "%")
        return doc, percent

    with doc.tag("strong"):
        with doc.tag("a", href=url):
            doc.text(_("missing streets"))
    return doc, "0"

def get_article(self, url):
    '''Implementation for getting an article from BBC.

    Args:
      url: A URL in the www.bbc.* domain.

    Returns:
      The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    headline = soup.h1.string
    article = soup.find('div', attrs={'class': 'story-body'})
    paragraphs = article.find_all('p', attrs={'class': None})
    body = ' '.join([p.get_text() for p in paragraphs])
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.BBC)

def get_article(self, url):
    '''Implementation for getting an article from the New York Post.

    Args:
      url: A URL in the nypost.com domain.

    Returns:
      The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    headline = helpers.decode(soup.h1.a.string)
    article = soup.find('div', attrs={'class': 'entry-content'})
    paragraphs = article.find_all('p', attrs={'class': None})
    body = ' '.join([helpers.decode(p.get_text()) for p in paragraphs])
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.NY_POST)

def get_article(self, url):
    '''Implementation for getting an article from Al Jazeera.

    Args:
      url: A URL in the aljazeera.* domain.

    Returns:
      The Article representing the article at that url, or None if
      unable to get the Article.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    headline = None
    potential_classes = ["heading-story", "articleOpinion-title"]
    for h1_class in potential_classes:
        try:
            headline = soup.find("h1", {"class": h1_class}).string
            break
        except AttributeError:
            continue
    if not headline:
        log.error('Exception trying to scrape Al Jazeera headline from %s'
                  % (url))
        return None
    headline = helpers.decode(headline)

    try:
        paragraphs = soup.find("div", {"class": "article-body"})
        article = paragraphs.findAll("p")
    except AttributeError:
        paragraphs = soup.find("div", {"class": "text"})
        article = paragraphs.findAll("p")
    body = ' '.join([helpers.decode(p.text) for p in article])
    # log.info(headline)
    # log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.ALJAZEERA)

def get_article(self, url):
    '''Implementation for getting an article from Today's Zaman.

    Args:
      url: A URL in the www.todayszaman.com/* domain.

    Returns:
      The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    title = soup.find("title")
    headline = helpers.decode(title.text)
    paragraphs = soup.find("div", {"id": "newsText"})
    article = paragraphs.findAll("p")
    body = ' '.join([helpers.decode(p.text) for p in article])
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url,
                                  news_orgs.TODAYS_ZAMAN)

def run(self, anon=False, forward=False):
    try:
        data = get_content(self.message)
    except ContentError as e:
        # Report the error and stop: data is undefined past this point.
        self.send_message(e)
        return

    with config.DB.atomic() as tnx:
        try:
            content = Content.create(type=data[0], text=data[1], file_id=data[2])
            post = Post(content=content, token=gen_token(), address=Address.new())
            if not anon:
                post.user = self.user_id
            if forward:
                if self.message.forward_from:
                    self.send_message(
                        'you cannot forward messages for forward posting, '
                        'write your message')
                    return (type(self), {'forward': forward})
                post.forward_message_id = self.message.message_id
            post.created_at = self.message.date
            post.save()

            if (str(self.user_id) == config.ADMIN_ID
                    and config.ADMIN_DEFAULT_BALANCE > 0
                    and not config.DEBUG):
                post.send(config.ADMIN_DEFAULT_BALANCE, bot=self.bot)
                post.address.is_accepted = True
                post.address.save()
                self.send_message('message posted')
            else:
                self.send_message(TEXT_PAY.format(post.address.address))
                self.send_message(post.address.address)
            tnx.commit()
        except Exception as e:
            logger.error(e)
            tnx.rollback()

def get_article(self, url):
    '''Implementation for getting an article from REUTERS.

    Args:
      url: A URL in the www.reuters.com* domain.

    Returns:
      The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    # The headline and the paragraphs live in the same grid panel.
    article_div = soup.find('div', attrs={'class': 'column1 gridPanel grid8'})
    headline = helpers.decode(article_div.h1.string)
    paragraphs = article_div.findAll("p")
    body = ' '.join([helpers.decode(p.text) for p in paragraphs])
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.REUTERS)

def get_article(self, url):
    '''Implementation for getting an article from Times of Israel.

    Args:
      url: A URL in the www.timesofisrael.com/* domain.

    Returns:
      The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    h1 = soup.find('h1', attrs={'class': 'headline'})
    headline = helpers.decode(h1.text)
    paragraphs = soup.findAll("p", {"itemprop": "articleBody"})
    body = ' '.join([helpers.decode(p.text) for p in paragraphs])
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url,
                                  news_orgs.TIMES_OF_ISRAEL)

def test_happy(self) -> None:
    """Tests the happy path."""
    workdir = os.path.join(os.path.dirname(__file__), "workdir")
    actual = helpers.get_content(workdir, "gazdagret.percent")
    expected = "54.55"
    self.assertEqual(actual, expected)