Пример #1
0
    def _get_zgqnb_article(self, url, date, page_name):
        """Scrape every article linked from one ZGQNB newspaper page.

        Fetches the page at *url*, follows each link inside the
        ``#titleList`` div, and creates or updates one ``Article`` row
        (keyed by medium + article URL) per linked article.

        Args:
            url: URL of the newspaper page that lists article links.
            date: publication date stored on newly created articles.
            page_name: page label stored on newly created articles.
        """
        print(url)
        medium = Medium.objects.get(pk=1836)  # ZGQNB medium record
        links = bs4.BeautifulSoup(requests.get(url).content).find(
            'div', {'id' : 'titleList'}).find_all('a')
        for a in links:
            article_url = urljoin(url, a.get('href'))
            soup = bs4.BeautifulSoup(requests.get(article_url).content)
            title = soup.find('h1').text
            print(title)

            # Reuse the existing row when one exists instead of building a
            # fully-populated throwaway Article and discarding it (the old
            # code did exactly that when the URL was already in the DB).
            # This mirrors the check-then-create pattern of _get_whb_article.
            if Article.objects.filter(medium=medium, url=article_url).count():
                article = Article.objects.filter(medium=medium).get(url=article_url)
            else:
                article = Article()
                article.medium = medium
                article.url = article_url
                article.publication_date = date
                article.page = page_name

            # Title and body text are refreshed on every scrape.
            article.title = title
            article.content = '\n'.join(
                p.text for p in soup.find('div', {'id' : 'ozoom'}).find_all('p'))
            article.save()
Пример #2
0
    def _get_rmrb_article(self, content):
        """Build (or fetch) an Article from a parsed RMRB article page.

        Args:
            content: BeautifulSoup fragment for one People's Daily
                (人民日报) article page.

        Returns:
            The Article instance returned by ``get_or_create``.
        """
        medium = Medium.objects.get(pk=1081)  # People's Daily medium record
        article = Article()
        article.medium = medium
        article.title = content.find('h1').text.strip()

        # The first summary block lists candidate bylines; attach the first
        # one that matches a known Journalist for this medium.
        for author_name in content.find_all('div', {'class' : 'summary'})[0].find_all('a'):
            try:
                author = Journalist.objects.get(medium=medium, name=author_name.text.strip())
            except Journalist.DoesNotExist:
                # Narrowed from a bare `except:` so real errors (DB down,
                # MultipleObjectsReturned, ...) are no longer swallowed.
                pass
            else:
                article.author = author
                break

        # The last summary block carries metadata <li> items.  Set defaults
        # once, up front: the old per-iteration `else: page = '头版'` clause
        # clobbered an already-found page name whenever a LATER <li> lacked
        # 版名, and publication_date was a NameError when no date <li> existed.
        publication_date = None
        page = '头版'
        for li in content.find_all('div', {'class' : 'summary'})[-1].find_all('li'):
            if li.text.find(u'报纸日期') != -1:
                publication_date = re.search(r'(\d+)-(\d+)-(\d+)', li.text).group()
            if li.text.find(u'版名') != -1:
                page = li.text.replace('\n','').replace(u'【版名】', '').replace(' ', '')

        article.issue = self._get_issue_from_date(publication_date, 'rmrb')
        article.page = page
        article.publication_date = datetime.datetime.strptime(publication_date, '%Y-%m-%d')
        # get_or_create keys on medium/title/issue/date, so a re-scrape of
        # the same article reuses the existing row.
        article, created = Article.objects.get_or_create(
            medium=article.medium, title=article.title,
            issue=article.issue, publication_date=article.publication_date)
        print(article.title)
        return article
Пример #3
0
    def _get_whb_article(self, url, date, issue, page):
        """Scrape every article listed on one WHB (文汇报) page index.

        Args:
            url: URL of the page's article index.
            date: publication date for newly created articles.
            issue: issue object for newly created articles.
            page: element whose ``.text`` is the page label
                (assumed to be a bs4 tag — TODO confirm against callers).
        """
        medium = Medium.objects.get(pk=1399)  # Wenhui Daily medium record
        soup = bs4.BeautifulSoup(requests.get(url).content)
        # Loop variable renamed from `title` (it is an <a> tag, not a
        # string) so it no longer shadows the article-title string below.
        for link in soup.find('div', {'id' : 'BT'}).find_all('a'):
            article_page_url = urljoin(url, link.get('href'))
            r = requests.get(article_page_url)
            if r.status_code == 404:
                # Dead link in the index; skip it.
                continue
            article_page = bs4.BeautifulSoup(r.content)

            # Update the existing row for this medium+url, else start a new
            # one; metadata fields are only set on first creation.
            if Article.objects.filter(medium=medium, url=article_page_url).count():
                article = Article.objects.filter(medium=medium).get(url=article_page_url)
            else:
                article = Article()
                article.medium = medium
                article.url = article_page_url
                article.publication_date = date
                article.page = page.text.strip()
                article.issue = issue

            print(article_page_url)
            # <title> looks like '文汇报 - <headline>'; drop the masthead prefix.
            article.title = article_page.title.text.strip().replace(u'文汇报 - ', '')
            article.content = article_page.find(
                'div', {'id' : 'articleText'}).text.strip().replace(u'  ', '\n  ')
            article.save()
Пример #4
0
    def _get_qlwb_article(self, url, date, issue, page):
        """Scrape a single QLWB (齐鲁晚报) article page.

        Creates an Article for *url* if this medium has none yet (filling
        in title, date, page, and issue), then refreshes its content from
        the live page and saves it.
        """
        print(page)
        medium = Medium.objects.get(pk=1025)  # Qilu Evening News medium record
        soup = bs4.BeautifulSoup(requests.get(url).content)

        existing = Article.objects.filter(medium=medium).filter(url=url)
        if existing.count():
            article = existing.get(url=url)
        else:
            article = Article()
            article.medium = medium
            article.title = soup.find('td', {'class' : 'font01'}).text.strip().replace(u'  ', '\n  ')
            article.url = url
            article.publication_date = date
            article.page = page
            article.issue = issue

        # Body text is refreshed on every scrape, even for existing rows.
        article.content = soup.find('span', {'id' : 'contenttext'}).text.strip().replace(u'  ', '\n  ')
        article.save()