Пример #1
0
    def _get_zgqnb_article(self, url, date, page_name):
        """Scrape every article linked from one index page and save it.

        Downloads ``url``, follows each anchor inside the ``titleList`` div,
        and stores one ``Article`` per link under the medium with pk 1836.
        When an article with the same medium and URL is already stored, only
        its content is refreshed; otherwise a new article is created with the
        scraped title plus the supplied ``date`` and ``page_name``.

        Args:
            url: Address of the index page listing the article links.
            date: Value stored as ``publication_date`` on new articles.
            page_name: Value stored as ``page`` on new articles.
        """
        print(url)
        medium = Medium.objects.get(pk=1836)
        index = bs4.BeautifulSoup(requests.get(url).content)
        for link in index.find('div', {'id': 'titleList'}).find_all('a'):
            href = link.get('href')
            if not href:
                # An anchor without href would make urljoin fall back to the
                # index page itself, which is not an article.
                continue
            article_url = urljoin(url, href)
            soup = bs4.BeautifulSoup(requests.get(article_url).content)
            title = soup.find('h1').text
            print(title)

            paragraphs = soup.find('div', {'id': 'ozoom'}).find_all('p')
            content = '\n'.join([p.text for p in paragraphs])

            # One lookup instead of count() followed by a second get().
            try:
                article = Article.objects.get(medium=medium, url=article_url)
            except Article.DoesNotExist:
                article = Article()
                article.medium = medium
                article.title = title
                article.url = article_url
                article.publication_date = date
                article.page = page_name

            article.content = content
            article.save()
Пример #2
0
    def _get_rmrb_article(self, content):
        """Return the stored ``Article`` matching a parsed article page.

        Reads the title from the page's ``h1``, looks for a known journalist
        among the links of the first ``summary`` div, and extracts the
        publication date and page name from the ``li`` items of the last
        ``summary`` div. The article is then fetched or created by medium
        (pk 1081), title, issue and publication date.

        Args:
            content: Parsed page exposing ``find`` / ``find_all``
                (presumably a BeautifulSoup tree -- confirm against caller).

        Returns:
            The existing or newly created ``Article``.

        Raises:
            ValueError: If no ``li`` carries a ``YYYY-MM-DD`` publication date.
        """
        medium = Medium.objects.get(pk=1081)
        title = content.find('h1').text.strip()
        summaries = content.find_all('div', {'class': 'summary'})

        # Take the first listed name that maps to exactly one known journalist.
        author = None
        for link in summaries[0].find_all('a'):
            try:
                author = Journalist.objects.get(medium=medium, name=link.text.strip())
            except (Journalist.DoesNotExist, Journalist.MultipleObjectsReturned):
                continue
            break

        date_pattern = re.compile(r'(\d+)-(\d+)-(\d+)')
        publication_date = None
        page = None
        for li in summaries[-1].find_all('li'):
            text = li.text
            if u'报纸日期' in text:
                match = date_pattern.search(text)
                if match:
                    publication_date = match.group()
            if u'版名' in text:
                page = text.replace('\n', '').replace(u'【版名】', '').replace(' ', '')

        # Fall back to the front page only when no item named a page at all;
        # the fallback must not overwrite a page name found in an earlier item.
        if page is None:
            page = u'头版'
        if publication_date is None:
            raise ValueError('no publication date found on article page')

        issue = self._get_issue_from_date(publication_date, 'rmrb')
        published_on = datetime.datetime.strptime(publication_date, '%Y-%m-%d')

        # Pass page/author through ``defaults`` so a newly created row keeps
        # them; existing rows are returned untouched.
        defaults = {'page': page}
        if author is not None:
            defaults['author'] = author
        article, created = Article.objects.get_or_create(
            medium=medium,
            title=title,
            issue=issue,
            publication_date=published_on,
            defaults=defaults)
        print(article.title)
        return article
Пример #3
0
    def _get_nfzm_article(self, url, date, issue):
        """Download one article page and return an unsaved ``Article`` for it.

        The article is attached to the medium with pk 951 and filled with the
        given ``url``, ``date`` and ``issue``, the title taken from the last
        '-'-separated part of the page ``<title>``, and the text of the
        ``articleContent`` section. The byline (``<em>`` tags inside the
        ``author`` span) decides whether ``author`` or ``author_name`` is set;
        a page without a usable byline leaves both untouched.

        Args:
            url: Address of the article page.
            date: Value stored as ``publication_date``.
            issue: Value stored as ``issue``.

        Returns:
            The populated ``Article``; this method does not save it.
        """
        medium = Medium.objects.get(pk=951)
        article = Article()
        article.medium = medium
        article.issue = issue
        article.url = url
        article.publication_date = date

        # NOTE(review): hard-coded session cookie; presumably needed to get
        # past a login wall -- consider moving it to configuration.
        r = requests.get(url, cookies={'PHPSESSID': 'l19dgbf6ticijmo9ka9osvufk0'})
        content = bs4.BeautifulSoup(r.content)
        article.title = content.title.string.split('-')[-1].strip()
        article.content = content.find('section', {'id': 'articleContent'}).text

        # The second <em> carries the byline role, the third the name. Guard
        # against pages where the span or some of the tags are missing.
        author_span = content.find('span', {'class': 'author'})
        ems = author_span.find_all('em') if author_span is not None else []
        role = ems[1].text if len(ems) > 1 else u''
        name = ems[2].text.strip() if len(ems) > 2 else None

        if u'南方周末记者' in role:
            if name is not None:
                journalist, created = Journalist.objects.get_or_create(medium=medium, name=name)
                # NOTE(review): only a pre-existing journalist is linked; a
                # freshly created one is not. Looks inverted -- confirm intent.
                if not created:
                    article.author = journalist
        elif u'南方周末特约撰稿' in role:
            if name is not None:
                article.author_name = name
        elif u'南方周末编辑部' in role:
            article.author_name = u'南方周末编辑部'

        print(article.author or article.author_name)
        return article
Пример #4
0
    def _get_whb_article(self, url, date, issue, page):
        """Scrape and save every article linked from one page of an issue.

        Downloads ``url``, follows each anchor inside the ``BT`` div and
        stores the linked article under the medium with pk 1399. Links that
        answer 404 are skipped. An article already stored for the same medium
        and URL is reused; otherwise a new one is created with the given
        ``date``, ``issue`` and the stripped text of ``page``. Title and
        content are refreshed and saved in both cases.

        Args:
            url: Address of the page listing the article links.
            date: Value stored as ``publication_date`` on new articles.
            issue: Value stored as ``issue`` on new articles.
            page: Object whose ``.text`` is stored (stripped) as ``page`` on
                new articles.
        """
        medium = Medium.objects.get(pk=1399)
        soup = bs4.BeautifulSoup(requests.get(url).content)
        for link in soup.find('div', {'id': 'BT'}).find_all('a'):
            article_page_url = urljoin(url, link.get('href'))
            r = requests.get(article_page_url)
            if r.status_code == 404:
                continue
            article_page = bs4.BeautifulSoup(r.content)

            # One lookup instead of count() followed by a second get().
            try:
                article = Article.objects.get(medium=medium, url=article_page_url)
            except Article.DoesNotExist:
                article = Article()
                article.medium = medium
                article.url = article_page_url
                article.publication_date = date
                article.page = page.text.strip()
                article.issue = issue

            print(article_page_url)
            article.title = article_page.title.text.strip().replace(u'文汇报 - ', '')
            body = article_page.find('div', {'id': 'articleText'})
            article.content = body.text.strip().replace(u'  ', '\n  ')
            article.save()
Пример #5
0
    def _get_qlwb_article(self, url, date, issue, page):
        """Download one article page and save it for the medium with pk 1025.

        An article already stored for the same medium and URL is reused and
        only its content is refreshed. Otherwise a new article is created
        with the title from the ``font01`` cell and the given ``date``,
        ``issue`` and ``page``. The content always comes from the
        ``contenttext`` span.

        Args:
            url: Address of the article page.
            date: Value stored as ``publication_date`` on a new article.
            issue: Value stored as ``issue`` on a new article.
            page: Value stored as ``page`` on a new article.
        """
        print(page)
        medium = Medium.objects.get(pk=1025)
        soup = bs4.BeautifulSoup(requests.get(url).content)

        # One lookup instead of count() followed by a second get().
        try:
            article = Article.objects.get(medium=medium, url=url)
        except Article.DoesNotExist:
            article = Article()
            article.medium = medium
            article.title = soup.find('td', {'class': 'font01'}).text.strip().replace(u'  ', '\n  ')
            article.url = url
            article.publication_date = date
            article.page = page
            article.issue = issue

        article.content = soup.find('span', {'id': 'contenttext'}).text.strip().replace(u'  ', '\n  ')
        article.save()
Пример #6
0
def add_view(request):
    """Show the "add story" form and create an article from a valid POST.

    Anonymous users are redirected to ``articles:login_required``. Any
    non-POST request renders the empty form. A POST missing either the
    ``title`` or the ``story`` field re-renders the form with an error
    message; a complete POST saves a new ``Article`` authored by the current
    user's ``UserProfile`` and redirects to ``articles:add_success``.
    """
    # Viewing this page requires authentication.
    # NOTE(review): is_authenticated is called as a method, which matches the
    # older Django API -- confirm the Django version before changing it.
    if not request.user.is_authenticated():
        return HttpResponseRedirect(reverse("articles:login_required"))
    if request.method != "POST":
        return render(request, "articles/add.html")

    # Get story title and text.
    title = request.POST.get("title")
    text = request.POST.get("story")
    if not title or not text:
        return render(
            request, "articles/add.html", {"msg": "У вашей истории обязательно должно быть название и сам текст."}
        )

    # Look the profile up by the current user's id directly; re-fetching the
    # User row first only added a query.
    profile = UserProfile.objects.get(user__pk=request.user.id)

    # Create and save the new story.
    article = Article(title=title, text=text, author=profile)
    article.publication_date = datetime.now()
    article.save()
    return HttpResponseRedirect(reverse("articles:add_success"))