Python Article.topics 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: newsroom.models

클래스/타입: Article

메소드/함수: topics

hotexamples.com에서의 예제들: 2

Python Article.topics - 2개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 newsroom.models.Article.topics에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

Article(12)

category(12)

slug(12)

title(12)

save(12)

published(8)

author_01(6)

author_02(5)

body(4)

external_primary_image(4)

publish_now(3)

byline(2)

created(2)

author_03(2)

pk(2)

primary_image_caption(2)

primary_image_size(2)

summary_text(2)

subtitle(1)

is_published(1)

include_in_rss(1)

exclude_from_list_views(1)

disqus_id(1)

topics(1)

예제 #1

파일 보기

파일: exportfromdrupal.py 프로젝트: danielsinaga1/djangonewsfix

def process(start, finish, html_file):
    soup = BeautifulSoup(open(html_file), "lxml")
    articles = soup.find_all("div")
    print("Number of articles", len(articles))
    if finish > len(articles):
        print("Truncating finish to ", len(articles))
        finish = len(articles)
    if start > finish:
        print("Start > finish")
        return

    articles = articles[start:finish]
    for idx, article in enumerate(articles):
        # Title, Link and date published
        link = article.find("a")["href"]
        link = 'http://gutest.nathangeffen.webfactional.com' + str(link)
        if link.find("/gallery/") == -1:
            print("Processing: ", idx + start, link)
        else:
            print("Ignoring: ", idx + start, link)
            continue

        newarticle = Article()
        newarticle.title = article.find("p",
                                        attrs={'class' : 'title'}).get_text(). \
            replace("â€™", "'").replace("â€œ",'"').replace("â€",'"')
        newarticle.slug = slugify(str(link).rpartition("/")[2])
        import datetime
        published = article.find('p', attrs={'class' : 'date'}).get_text()[2:] \
                    + " +0200"
        published_conv = datetime.datetime.strptime(published,
                                                    "%d/%m/%Y - %H:%M %z")
        newarticle.published = published_conv

        # Fetch article from web
        import urllib.request
        with urllib.request.urlopen(link) as response:
            html = response.read()
        soup = BeautifulSoup(html, "lxml")

        # Byline
        try:
            newarticle.byline = soup.find \
            ("div", { "class" : "article-author" }).get_text()
        except:
            print("Byline not found")

        # Intro
        try:
            intro = soup.find("div", {"class": "article-intro"})
            intro = intro.find("p")
            intro["class"] = "intro"
            newarticle.summary_text = str(intro)
        except:
            print("Intro not found")

        try:
            primary_image = soup.find("div", {"class" : "article-image"}). \
                            find("img")
            link = primary_image["src"]
            link = link.replace("http://gutest.nathangeffen.webfactional.com",
                                "http://groundup.org.za")
            newarticle.external_primary_image = link.replace(
                "/column_width/", "/article_image/")
            newarticle.primary_image_size = "LEAVE"
        except:
            print("Primary image not found")

        try:
            newarticle.primary_image_caption = primary_image["alt"]
        except:
            print("Primary image caption not found")

        # Body

        try:
            text = soup.find("div", {"class":"article-body"}). \
                   find("div", {"class":"field-item"})
        except:
            print("No text")
            text = ""

        body = str(intro) + str(text)  # "".join([str(item) for item in text])
        newarticle.body = body

        # category

        try:
            category = soup.find("div", attrs={
                'class': 'article-category'
            }).get_text()

            if category.lower() == "news":
                newarticle.category = "news"
            elif category.lower() == "featured story":
                newarticle.category = "featured story"
            elif category.lower() == "photo essay":
                newarticle.category = "photo essay"
            elif category.lower() == "photo":
                newarticle.category = "photo"
            elif category.lower() == "opinion":
                newarticle.category = "opinion"
            elif category.lower() == "brief":
                newarticle.category = "brief"
            elif category.lower() == "analysis":
                newarticle.category = "analysis"
            else:
                print("Unknown category: ", category)
        except:
            print("No category")

        # Topics
        try:
            topics = soup.find("div", {"class":"article-subject"}). \
                     get_text(", ")
        except:
            print("No topics")
            topics = ""
        try:
            tags = soup.find("div", {"class": "article-tags"}).get_text(", ")
        except:
            tags = ""

        if tags:
            if topics:
                topics += ", " + tags
            else:
                topics = tags

        topics_split = topics.split(",")
        if len(topics_split) > 8:
            topics = ", ".join(topics_split[0:8])
        newarticle.topics = topics

        # Saving
        try:
            article_to_replace = Article.objects.get(slug=newarticle.slug)
        except Article.DoesNotExist:
            print("Saving as new article")
            newarticle.save()
        else:
            print("Updating existing article")
            newarticle.pk = article_to_replace.pk
            newarticle.created = article_to_replace.created
            newarticle.save()

예제 #2

파일 보기

파일: exportfromdrupal.py 프로젝트: groundupnews/gu

def process(start, finish, html_file):
    soup = BeautifulSoup(open(html_file), "lxml")
    articles = soup.find_all("div")
    print("Number of articles", len(articles))
    if finish > len(articles):
        print("Truncating finish to ", len(articles))
        finish = len(articles)
    if start > finish:
        print("Start > finish")
        return

    articles = articles[start:finish]
    for idx,article in enumerate(articles):
        # Title, Link and date published
        link = article.find("a")["href"]
        link = 'http://gutest.nathangeffen.webfactional.com' + str(link)
        if link.find("/gallery/") == -1:
            print("Processing: ", idx + start, link)
        else:
            print("Ignoring: ", idx + start, link)
            continue

        newarticle = Article()
        newarticle.title = article.find("p",
                                        attrs={'class' : 'title'}).get_text(). \
            replace("â€™", "'").replace("â€œ",'"').replace("â€",'"')
        newarticle.slug = slugify(str(link).rpartition("/")[2])
        import datetime
        published = article.find('p', attrs={'class' : 'date'}).get_text()[2:] \
                    + " +0200"
        published_conv = datetime.datetime.strptime(published,
                                                    "%d/%m/%Y - %H:%M %z")
        newarticle.published = published_conv

        # Fetch article from web
        import urllib.request
        with urllib.request.urlopen(link) as response:
            html = response.read()
        soup = BeautifulSoup(html, "lxml")

        # Byline
        try:
            newarticle.byline = soup.find \
            ("div", { "class" : "article-author" }).get_text()
        except:
            print("Byline not found")

        # Intro
        try:
            intro = soup.find("div", {"class" : "article-intro"})
            intro = intro.find("p")
            intro["class"] = "intro"
            newarticle.summary_text = str(intro)
        except:
            print("Intro not found")

        try:
            primary_image = soup.find("div", {"class" : "article-image"}). \
                            find("img")
            link = primary_image["src"]
            link = link.replace("http://gutest.nathangeffen.webfactional.com",
                                "http://groundup.org.za")
            newarticle.external_primary_image = link.replace("/column_width/",
                                                             "/article_image/")
            newarticle.primary_image_size = "LEAVE"
        except:
            print("Primary image not found")

        try:
            newarticle.primary_image_caption = primary_image["alt"]
        except:
            print("Primary image caption not found")

        # Body

        try:
            text = soup.find("div", {"class":"article-body"}). \
                   find("div", {"class":"field-item"})
        except:
            print("No text")
            text = ""

        body = str(intro) + str(text) # "".join([str(item) for item in text])
        newarticle.body = body

        # category

        try:
            category = soup.find("div", attrs={'class' :
                                               'article-category'}).get_text()

            if category.lower() == "news":
                newarticle.category = "news"
            elif category.lower() == "featured story":
                newarticle.category = "featured story"
            elif category.lower() == "photo essay":
                newarticle.category = "photo essay"
            elif category.lower() == "photo":
                newarticle.category = "photo"
            elif category.lower() == "opinion":
                newarticle.category = "opinion"
            elif category.lower() == "brief":
                newarticle.category = "brief"
            elif category.lower() == "analysis":
                newarticle.category = "analysis"
            else:
                print("Unknown category: ", category)
        except:
            print("No category")

        # Topics
        try:
            topics = soup.find("div", {"class":"article-subject"}). \
                     get_text(", ")
        except:
            print("No topics")
            topics = ""
        try:
            tags = soup.find("div", {"class":"article-tags"}).get_text(", ")
        except:
            tags = ""

        if tags:
            if topics:
                topics += ", " + tags
            else:
                topics = tags

        topics_split = topics.split(",")
        if len(topics_split) > 8:
            topics = ", ".join(topics_split[0:8])
        newarticle.topics = topics

        # Saving
        try:
            article_to_replace = Article.objects.get(slug=newarticle.slug)
        except Article.DoesNotExist:
            print("Saving as new article")
            newarticle.save()
        else:
            print("Updating existing article")
            newarticle.pk = article_to_replace.pk
            newarticle.created = article_to_replace.created
            newarticle.save()