Python Article示例，news.models.Article Python示例

示例#1

0

显示文件

文件： views.py 项目： saurabhsood91/newsClassification

def GetNewArticles(request):
    """Classify the pickled articles, store them, and return them as JSON.

    The article list is read from the ``articles_dump`` pickle file, the
    ``filename.pkl`` classifier predicts one label per article ``content``,
    and every article is saved as an ``Article`` row and tagged with its
    ``category``.

    Args:
        request: The incoming HTTP request (not inspected).

    Returns:
        An ``HttpResponse`` whose body is the JSON-encoded article list.
    """
    classifier = Classifier("filename.pkl")

    # Pickle streams are binary: the file must be opened with "rb".  Text mode
    # fails on Python 3 and can corrupt the stream on Windows.
    # NOTE(review): pickle.load must only ever be given trusted files.
    with open("articles_dump", "rb") as dump:
        list_of_articles = pickle.load(dump)

    contents = [article["content"] for article in list_of_articles]
    res = classifier.predict(np.asarray(contents))

    for i, element in enumerate(list_of_articles):
        label = res[i]
        if label == 1:
            cat = "Sports"
        elif label == 2:
            cat = "Economy_business_finance"
        elif label == 3:
            cat = "Science_technology"
        else:
            # Every other label falls back to this category.
            cat = "Lifestyle_leisure"
        element["category"] = cat
        Article(
            article_title=element["title"],
            article_content=element["content"],
            article_category=cat,
        ).save()

    return HttpResponse(json.dumps(list_of_articles))

示例#2

0

显示文件

文件： tests.py 项目： muturi254/ctribune

class ArticleTestClass(TestCase):
    """Exercise the ``Article`` date helpers against one saved article."""

    def setUp(self):
        """Save an editor, a tag and an article that references both."""
        self.james = Editor(first_name='James',
                            last_name='Muriuki',
                            email='*****@*****.**')
        self.james.save_editor()

        # The tag must be saved before it can be attached to the article.
        self.new_tag = Tags(name='tetsting')
        self.new_tag.save()

        self.new_article = Article(title='Test Article',
                                   post='This is a random post',
                                   editor=self.james)
        self.new_article.save()

        self.new_article.tags.add(self.new_tag)

    def tearDown(self):
        """Delete every article, editor and tag."""
        Article.objects.all().delete()
        Editor.objects.all().delete()
        Tags.objects.all().delete()

    def test_get_news_today(self):
        """``Article.todays_news`` is non-empty after ``setUp`` saved an article."""
        today_news = Article.todays_news()
        # assertGreater reports the actual length on failure; assertTrue
        # would only say "False is not true".
        self.assertGreater(len(today_news), 0)

    def test_get_news_by_date(self):
        """``Article.days_news`` is empty for 2017-03-17."""
        test_date = '2017-03-17'
        date = dt.datetime.strptime(test_date, "%Y-%m-%d").date()
        news_by_date = Article.days_news(date)
        self.assertEqual(len(news_by_date), 0)

示例#3

0

显示文件

文件： parser.py 项目： AndreiFdv/SE_Project

    def handle(self, *args, **options):
        """Import the RSS articles, mirror each to Telegraph and announce it.

        For every ``(url, date)`` pair in ``RSSNews(RSS_Links).urls`` the page
        is fetched with ``NewsPlease``, saved as an ``Article``, published
        through the Telegraph client, recorded as a ``TelegraphArticle`` and
        its Telegraph URL is sent through ``bot``.  The success message is
        written to stdout afterwards, even when there were no URLs.
        """
        feed = RSSNews(RSS_Links)
        telegraph = Telegraph(access_token=os.getenv('TELEGRAPH_ACCESS_TOKEN'))

        if feed.urls:
            for url, date in feed.urls.items():
                parsed = NewsPlease.from_url(url)
                # An empty author list yields '' which falls back to 'Anonymous'.
                author = ', '.join(parsed.authors) or 'Anonymous'

                record = Article(
                    author=author,
                    title=parsed.title,
                    short_text=parsed.description,
                    content=parsed.maintext,
                    date=date,
                    source_link=url,
                    img=parsed.image_url,
                )
                record.save()

                page = telegraph.create_page(
                    title=record.title,
                    html_content=record.content,
                )
                TelegraphArticle(title=record.title, link=page['url']).save()
                bot.send_telegraph_msg(page['url'])

        self.stdout.write(self.style.SUCCESS('Success'))

示例#4

0

显示文件

文件： views.py 项目： sachinmurali/newsClassifcation

def GetNewArticles(request):
    """Classify the pickled articles, store them, and return them as JSON.

    Reads the article list from the ``articles_dump`` pickle file, asks the
    ``filename.pkl`` classifier for one label per article ``content``, saves
    an ``Article`` row per item and adds a ``category`` key to each item.

    Args:
        request: The incoming HTTP request (not inspected).

    Returns:
        An ``HttpResponse`` whose body is the JSON-encoded article list.
    """
    classifier = Classifier("filename.pkl")

    # A pickle file has to be read in binary mode ("rb"); the default text
    # mode raises on Python 3 and is unsafe on Windows under Python 2.
    # NOTE(review): only unpickle files this application wrote itself.
    with open("articles_dump", "rb") as dump:
        list_of_articles = pickle.load(dump)

    texts = [item["content"] for item in list_of_articles]
    res = classifier.predict(np.asarray(texts))

    for position, element in enumerate(list_of_articles):
        predicted = res[position]
        if predicted == 1:
            cat = "Sports"
        elif predicted == 2:
            cat = "Economy_business_finance"
        elif predicted == 3:
            cat = "Science_technology"
        else:
            # Any label other than 1, 2 or 3.
            cat = "Lifestyle_leisure"
        element["category"] = cat
        article = Article(
            article_title=element["title"],
            article_content=element["content"],
            article_category=cat,
        )
        article.save()

    json_object = json.dumps(list_of_articles)
    return HttpResponse(json_object)

示例#5

0

显示文件

文件： tests.py 项目： diyarkudrat/GameDayNews

    def test_page_slugify_on_save(self):
        """Saving an article titled "My Test Page" yields the slug 'my-test-page'."""
        author = User()
        author.save()

        page = Article(title="My Test Page", content="test", author=author)
        page.save()

        expected_slug = 'my-test-page'
        self.assertEqual(page.slug, expected_slug)

示例#6

0

显示文件

文件： api.py 项目： rugwirobaker/watch

def create_new_article(data):
    """Create and save an ``Article`` from a dictionary of fields.

    Args:
        data: Mapping that must contain the keys ``title``, ``author``,
            ``publication_date``, ``summary``, ``image_url`` and
            ``article_url``.

    Raises:
        KeyError: If any of those keys is missing from ``data``.
    """
    # (Article attribute, key in ``data``) pairs, applied in this order.
    field_sources = (
        ('title', 'title'),
        ('author', 'author'),
        ('publication_date', 'publication_date'),
        ('summary', 'summary'),
        ('article_image', 'image_url'),
        ('article_url', 'article_url'),
    )

    new_article = Article()
    for attribute, key in field_sources:
        setattr(new_article, attribute, data[key])
    new_article.save()

示例#7

0

显示文件

文件： import_legacy_article.py 项目： Dikutal/Dikutal

def import_legacy_article(title, author, teaser, content, published, slug):
    """Save a legacy article as a Danish HTML ``Article``.

    The legacy ``author`` is kept only as an italic "By ..." line prepended
    to the content; the saved row always gets ``author_id=1``.
    """
    body = "<p><em>By %s</em></p>\n%s" % (author, content)
    fields = dict(
        title=title,
        author_id=1,
        teaser=teaser,
        content=body,
        content_format=formats.HTML,
        published=published,
        slug=slug,
        language=languages.DA,
    )
    Article(**fields).save()

示例#8

0

显示文件

文件： views.py 项目： xsunfeng/cssanews

def add(request):
	print "add"
	title = request.REQUEST.get('title','').encode('utf-8')
	author = request.REQUEST.get('author', '').encode('utf-8')
	content = request.REQUEST.get('content', '').encode('utf-8')
	thumb_url = request.REQUEST.get('thumb_url', '').encode('utf-8')
	desc = request.REQUEST.get('desc', '').encode('utf-8')
	tz = pytz.timezone("US/Eastern")
	tmp = Article(title=title, author=author, content=content, thumb_url=thumb_url, desc=desc, pub_date=tz.localize(datetime.datetime.now()))
	tmp.save()
	response = {}
	return HttpResponse(json.dumps(response), content_type='application/json')

示例#9

0

显示文件

文件： news.py 项目： wleddy/news

def edit(article_handle='0'):
    """Create or edit an article and render the edit form.

    A handle that cleans to record id 0 starts a new record.  A handle that
    matches nothing (and is not 0) flashes an error and redirects home.  When
    form data is posted and ``valid_form`` accepts it, the record is saved and
    committed and the user is redirected home; a failed save is rolled back
    and reported.  Otherwise the edit template is rendered with the record.
    """
    setExits()
    g.title = "Article"
    table = Article(g.db)

    rec_id = cleanRecordID(article_handle)
    is_new = rec_id == 0
    rec = table.get(article_handle)
    if not rec and not is_new:
        flash('Could not find that artcle')
        return redirect(g.homeURL)

    if is_new:
        rec = table.new()

    if request.form:
        table.update(rec, request.form)
        if valid_form(rec):
            if request.form['publication_date']:
                # Store the submitted text as a datetime.
                rec.publication_date = getDatetimeFromString(
                    request.form['publication_date'])
            try:
                table.save(rec)
                g.db.commit()
                return redirect(g.homeURL)
            except Exception as e:
                g.db.rollback()
                flash(printException("Error when attepting to save article.", e))

    return render_template('news/article_edit.html', rec=rec)

示例#10

0

显示文件

文件： crawler.py 项目： tuner24/practice

def test():
	base_url = "http://www.lz13.cn/lizhi/qingchunlizhi.html"
	response = requests.get(base_url)
	parsed_body = html.fromstring(response.text)
	article_urls = parsed_body.xpath('//a[contains(@href, "/qingchunlizhi/")]/@href')
	g = Goose({'stopwords_class': StopWordsChinese})

	for url in article_urls:
		article = g.extract(url=url)
		t = article.title
		c = article.cleaned_text
		art = Article(title=t, content=c)
		art.author = 'lizhi'
		art.save()
		print 'get data from %s at %s' % (url, time.ctime())

示例#11

0

显示文件

def news_today(request):
    """Render the today's-news template with today's date and articles."""
    context = {
        "date": dt.date.today(),
        "news": Article.todays_news(),
    }
    return render(request, 'all-news/today-news.html', context)

示例#12

0

显示文件

文件： tests.py 项目： muturi254/ctribune

    def setUp(self):
        """Save an editor, a tag and an article that references both."""
        editor = Editor(
            first_name='James',
            last_name='Muriuki',
            email='*****@*****.**',
        )
        self.james = editor
        editor.save_editor()

        # The tag is saved first so it can be attached to the article below.
        tag = Tags(name='tetsting')
        self.new_tag = tag
        tag.save()

        article = Article(
            title='Test Article',
            post='This is a random post',
            editor=editor,
        )
        self.new_article = article
        article.save()

        article.tags.add(tag)

示例#13

0

显示文件

文件： news.py 项目： wleddy/news

def valid_form(rec):
    """Validate the submitted article form, flashing one message per problem.

    When no slug was submitted but a title was, a slug is derived from the
    lower-cased title and stored on ``rec.slug``.

    Args:
        rec: The article record being edited; ``rec.id`` is excluded from the
            slug uniqueness check.

    Returns:
        True when the title and slug are present, the slug is unique and the
        publication date (if any) parses; False otherwise.
    """
    is_valid = True
    slug = request.form.get('slug', '').strip()
    title = request.form.get('title', '').strip()

    if title and not slug:
        # Replace characters that are awkward in a URL with dashes.
        slug = title.lower()
        for unsafe in ' /<>"\'#.()':
            slug = slug.replace(unsafe, '-')
        rec.slug = slug

    if not title:
        flash("The title may not be empty")
        is_valid = False

    if not slug:
        flash("The slug line may not be empty")
        is_valid = False
    else:
        # Look for any other record that already uses this slug.
        sql = 'select * from article where slug = ? and id <> ?'
        matches = Article(g.db).select_raw(sql, (slug, rec.id))
        if matches and len(matches) > 0:
            is_valid = False
            flash("The slug line must be unique")

    # The date is optional, but when present it must parse.
    publication_date = request.form.get('publication_date', '').strip()
    if publication_date and not getDatetimeFromString(publication_date):
        is_valid = False
        flash('{} is not a valid date'.format(publication_date))

    return is_valid

示例#14

0

显示文件

文件： views.py 项目： kpx13/eac

def home_page(request):
    """Render ``home.html`` with the slideshow, home content and newest article."""
    context = get_common_context(request)
    context['request_url'] = 'home'

    slides = Slider.objects.all()
    home_content = Page.get_by_slug('home')['content']
    # recent_some(1) is indexed directly, so an empty result raises IndexError.
    newest = Article.recent_some(1)[0]

    context['slideshow'] = slides
    context['content'] = home_content
    context['n'] = newest
    return render_to_response(
        'home.html', context, context_instance=RequestContext(request))

示例#15

0

显示文件

文件： views.py 项目： kpx13/ibm

def home(request):
    """Render ``home.html`` with recent news, recent photos and two page texts."""
    context = get_common_context(request)
    lang = context['lang']

    context['request_url'] = 'home'
    context['recent_news'] = Article.get_list(lang)[:6]
    context['recent_photos'] = Photo.objects.all()[:3]
    # Each page's content is stored in the context under the page's own name.
    for page_name in ('home_top', 'home_history'):
        context[page_name] = Page.get(page_name, lang)['content']

    return render_to_response(
        'home.html', context, context_instance=RequestContext(request))

示例#16

0

显示文件

def check_for_updates():
    """Poll the three gatherers forever and save every post they return.

    Each round calls ``aj_gather_data``, ``re_gather_data`` and
    ``ec_gather_data`` in that order, saving an ``Article`` per post with a
    category from ``categorizer``, then sleeps 60 seconds.  A
    ``ConnectionError`` anywhere in a round aborts that round and the loop
    retries after 300 seconds.  This function never returns.
    """
    while True:
        try:
            # Each gatherer is called only when its turn comes, so the posts
            # of one source are saved before the next source is fetched.
            for gather in (aj_gather_data, re_gather_data, ec_gather_data):
                for post in gather():
                    Article(
                        title=post.title,
                        content=post.text,
                        source=post.link,
                        category=categorizer(post.text),
                    ).save()
            time.sleep(60)

        except ConnectionError:
            time.sleep(300)

示例#17

0

显示文件

文件： views.py 项目： kpx13/dr1

def news_article_page(request, page_name):
    """Render one news article together with the list of all articles.

    Args:
        request: The incoming HTTP request.
        page_name: Slug of the article to show.

    Raises:
        Http404: If the lookup or the rendering fails.
    """
    c = get_common_context(request)
    try:
        c['item'] = Article.get_by_slug(page_name)
        c['news'] = Article.objects.all()
        return render_to_response('news_article.html', c, context_instance=RequestContext(request))
    except Exception:
        # A bare ``except:`` would also turn KeyboardInterrupt/SystemExit
        # into a 404; only ordinary errors should.
        raise Http404()

示例#18

0

显示文件

def send_message(article: Article) -> None:
    """Send an HTML announcement of *article* to every ``TelegramUser``.

    The message shows the title in bold, the short text and a "Read more"
    link to the article's absolute URL.
    """
    bot = Bot(token=os.getenv('TELEGRAM_BOT'))
    # NOTE(review): the link host is hard-coded to the local development server.
    link = 'http://127.0.0.1:8000' + article.get_absolute_url()

    # The text is the same for every recipient, so build it once instead of
    # once per user.
    message = f'<b>{article.title}</b>\n{article.short_text} \n<a href="{link}">Read more</a>'

    for user in TelegramUser.objects.only("user_id"):
        bot.send_message(chat_id=user.user_id, text=message, parse_mode='HTML')

示例#19

0

显示文件

文件： views.py 项目： kpx13/solar

def news_article(request, slug):
    """Show one news article, or store a posted comment and redirect back.

    A POST whose ``action`` is ``comment`` adds the posted ``content`` as a
    comment by ``request.user`` and redirects to ``/news/<slug>/``; every
    other request renders ``news_item.html``.
    """
    context = get_common_context(request)
    context['article'] = Article.get(slug, context['lang'])

    is_comment_post = (
        request.method == 'POST'
        and request.POST.get('action') == 'comment'
    )
    if is_comment_post:
        target = Article.objects.get(slug=slug)
        target.add_comment(request.user, request.POST.get('content'))
        return HttpResponseRedirect('/news/%s/' % slug)

    return render_to_response(
        'news_item.html', context, context_instance=RequestContext(request))

示例#20

0

显示文件

文件： parsers.py 项目： vsmaxim/news-clusterization-service

def parse_article_from_link(link: str) -> Article:
    """Download and parse the page at *link* and wrap it in an ``Article``.

    The page is processed by ``newspaper`` with language ``'ru'``; the
    returned ``Article`` is constructed but not saved here.
    """
    fetched = newspaper.Article(link, language='ru')
    fetched.download()
    fetched.parse()

    fields = {
        'title': fetched.title,
        'text': fetched.text,
        'source': link,
        'publish_date': fetched.publish_date,
    }
    return Article(**fields)

示例#21

0

显示文件

文件： views.py 项目： kpx13/est

def news_article_page(request, page_name):
    """Render the article with slug *page_name* plus the list of all articles.

    Raises:
        Http404: If looking up the article or rendering the page fails.
    """
    c = get_common_context(request)
    try:
        c['item'] = Article.get_by_slug(page_name)
        c['news'] = Article.objects.all()
        return render_to_response('news_article.html',
                                  c,
                                  context_instance=RequestContext(request))
    except Exception:
        # Narrowed from a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit are no longer reported as "page not found".
        raise Http404()

示例#22

0

显示文件

文件： views.py 项目： chernykovv/rcn-django

def view_article_list(request, p_slug):
    """Load the page for *p_slug* and the published articles.

    Any exception raised while fetching the published articles is logged and
    converted to ``Http404``.

    NOTE(review): the visible code stops after the ``try`` block;
    ``all_articles``, ``page`` and ``articles`` are assigned but never used
    and nothing is returned, so the snippet appears to be cut short.
    """
    all_articles = True
    page = get_page(p_slug)

    try:
        articles = Article.get_published_objects()

    # Python 2 ``except X, e`` syntax; this is a syntax error on Python 3.
    except Exception, e:
        logging.error(e)
        raise Http404

示例#23

0

显示文件

文件： news.py 项目： wleddy/news

def display():
    """Render the news page: the ``news/news.md`` text plus all article records."""
    setExits()

    intro_html = render_markdown_for(__file__, mod, 'news/news.md')
    records = Article(g.db).select()

    return render_template(
        'news/news.html',
        rendered_html=intro_html,
        recs=records,
    )

示例#24

0

显示文件

文件： fetchnews.py 项目： jctt1983/Plasterd

    def fetch_news(self, source_id=None, current_date=None):
        """Page through the news API and store every article not yet known.

        Pages are requested from ``self.fetch_api`` starting at 1 until a page
        comes back empty.  Items without a URL, or whose URL is already
        stored, are skipped.

        Args:
            source_id: Forwarded unchanged to ``self.fetch_api``.
            current_date: Forwarded unchanged to ``self.fetch_api``.
        """
        page = 1
        lang = settings.NEWS_API_LANG or 'en'
        while True:
            articles = self.fetch_api(
                page=page,
                source_id=source_id,
                current_date=current_date)

            if not articles:
                break

            for a in articles:
                raw_url = a.get('url')
                if not raw_url:
                    continue

                # URLs are stored lower-cased, so the duplicate check must use
                # the same form; looking up the raw URL let mixed-case URLs
                # slip through as "new" on every run.
                url = raw_url.lower()
                if Article.get_by_url(url):
                    continue

                published_at = a.get('publishedAt')
                pub_date = parse_datetime(published_at) if published_at else None
                if pub_date is None:
                    # Missing timestamp, or one the parser could not read
                    # (assumes parse_datetime returns None for malformed
                    # input, as Django's does - TODO confirm).
                    pub_date = timezone.now()

                article = Article(
                    title=a.get('title'),
                    description=a.get('description'),
                    url=url,
                    url_image=a.get('urlToImage'),
                    lang=lang,
                    pub_date=pub_date)

                source = a.get('source')
                if source:
                    article.source = self.get_source(
                        source.get('id'),
                        source.get('name'))

                # Saved before the authors are attached below.
                article.save()

                author_field = a.get('author')
                if author_field:
                    for name in author_field.split(','):
                        name = name.strip()
                        # "A, , B" or a trailing comma would otherwise
                        # create an author with an empty name.
                        if name:
                            article.authors.add(self.get_author(name))
                article.save()

            page += 1

示例#25

0

显示文件

文件： views.py 项目： kpx13/eac

def news_page(request, page_name=None):
    """Render the news page, either browsing or showing search results.

    A GET, or a POST without ``search_value``, shows the article with slug
    *page_name* (or the most recent one) next to up to 1000 recent articles.
    A POST with ``search_value`` shows the matching articles, or sets
    ``not_found`` when there are none.

    Raises:
        Http404: If the browsing branch fails for any ordinary error.
    """
    c = get_common_context(request)
    if (request.method == 'GET') or not request.POST.get('search_value', None):
        try:
            if page_name:
                c['recent'] = Article.get_by_slug(page_name)
            else:
                c['recent'] = Article.recent_some(1)[0]
            c['news'] = Article.recent_some(1000)
            return render_to_response('news.html', c, context_instance=RequestContext(request))
        except Exception:
            # Narrowed from a bare ``except:`` so KeyboardInterrupt and
            # SystemExit are not converted into a 404.
            raise Http404('page %s not found' % page_name)
    else:  # POST with a search value
        c['search_value'] = request.POST['search_value']
        c['news'] = Article.find(c['search_value'])
        if len(c['news']) > 0:
            c['recent'] = c['news'][0]
        else:
            c['not_found'] = True
        return render_to_response('news.html', c, context_instance=RequestContext(request))

示例#26

0

显示文件

文件： views.py 项目： kpx13/solar

def search(request):
    """Search projects, news and seminars for the posted query ``q``.

    Anything other than a POST with a non-empty ``q`` redirects to ``/``.
    """
    context = get_common_context(request)

    if request.method != 'POST':
        return HttpResponseRedirect('/')

    query = request.POST.get('q', '')
    if not query:
        return HttpResponseRedirect('/')

    lang = context['lang']
    context['q'] = query
    context['projects'] = Project.search(query, lang)
    context['news'] = Article.search(query, lang)
    context['seminars'] = Seminar.search(query, lang)
    return render_to_response(
        'search.html', context, context_instance=RequestContext(request))

示例#27

0

显示文件

文件： news.py 项目： wleddy/news

def delete(rec_id=None):
    """Delete the article with the given id and redirect to the list page.

    Args:
        rec_id: Record id; when None it is read from the form field ``id``,
            then from the query argument ``id``, defaulting to -1.

    Returns:
        A redirect to ``g.listURL`` in every case; an invalid id or a missing
        record is reported with a flash message.
    """
    setExits()
    g.title = "Article"

    # Identity test: ``== None`` would call a custom ``__eq__``.
    if rec_id is None:
        rec_id = request.form.get('id', request.args.get('id', -1))

    rec_id = cleanRecordID(rec_id)
    if rec_id <= 0:
        flash("That is not a valid record ID")
        return redirect(g.listURL)

    # One table wrapper serves both the lookup and the delete.
    articles = Article(g.db)
    rec = articles.get(rec_id)
    if rec:
        articles.delete(rec.id)
        g.db.commit()
    else:
        flash("Record not found")

    return redirect(g.listURL)

示例#28

0

显示文件

def search_results(request):
    """Render the search page for the ``article`` query parameter.

    With a non-empty term the page gets the matching articles and the term as
    its message; otherwise it gets only a "nothing searched" message.
    """
    template = 'all-news/search.html'
    search_term = request.GET.get("article")

    if not search_term:
        message = "You haven't searched for any term"
        return render(request, template, {"message": message})

    searched_articles = Article.search_by_title(search_term)
    context = {
        "message": f"{search_term}",
        "articles": searched_articles,
    }
    return render(request, template, context)

示例#29

0

显示文件

 def get_by_url(self, request, pk=None):
     """
     Return the promises related to the article at the ``url`` query parameter.

     The article is fetched or created by URL and re-analyzed on every call
     before it is serialized.

     Raises:
         ValidationError: If the ``url`` query parameter is missing or empty.
     """
     target_url = request.query_params.get('url', False)
     if not target_url:
         raise ValidationError({'error': 'url is required'})

     found = Article.get_or_create_by_url(url=target_url)
     found.analyze_article(redo=True)

     serializer = ArticlePromisesSerializer(found, context={'request': request})
     return Response(serializer.data)

示例#30

0

显示文件

文件： listing_kmib.py 项目： NullFull/K-knight

    def handle(self, *args, **options):
        """
        Crawl the kmib.co.kr article listings and register each article.

        Listing pages 10 down to 1 are fetched for today, and also for
        yesterday when the current hour is before 06:00.  Every listed item
        is passed to ``Article.create_new``; when that returns None the
        command sleeps 10 seconds and moves to the next item.  A connection
        error sleeps 5-15 minutes and then exits the process.
        """
        now = datetime.now()
        dates = [now]
        if now.hour < 6:
            # Requirement: before 06:00 the previous day is crawled as well.
            dates.append(now - timedelta(1))

        listing_urls = [
            f'http://news.kmib.co.kr/article/list.asp?sid1=all&sid2=&page={page}&sdate={adate:%Y%m%d}&st='
            for adate in dates
            for page in range(10, 0, -1)
        ]

        for listing_url in listing_urls:
            try:
                _header = {'User-Agent': self.press.user_agent}
                _response = requests.get(listing_url, headers=_header)
                _response.encoding = self.press.encoding
                _response.close()
            except requests.exceptions.ConnectionError:
                time.sleep(random.randrange(5 * 60, 15 * 60))
                exit()

            soup = BeautifulSoup(_response.text, 'lxml')
            listing = soup.select_one('.nws_list')

            for item in listing.select('div.nws'):
                anchor = item.select_one('dt').select_one('a')
                url = anchor['href']
                title = anchor.string

                datetime_str = item.select_one('dd.date').string
                naive = datetime.strptime(datetime_str, '%Y-%m-%d %H:%M')
                datetime_obj = timezone.make_aware(
                    naive, timezone=pytz.timezone('Asia/Seoul'), is_dst=False)

                # The summary and thumbnail are extracted but not used below.
                summary = item.select_one('dd.tx').string.strip()
                thumbnail_obj = item.select_one('p.pic a img')
                if thumbnail_obj:
                    thumbnail_src = thumbnail_obj['src']

                created = Article.create_new(
                    press=self.press, url=url, title=title, datetime=datetime_obj)
                if created is None:
                    time.sleep(10)
                    continue

                print(f'국민일보: {datetime_obj}: {title}: {url}')

            # Pause between listing pages.
            time.sleep(10)

示例#31

0

显示文件

文件： views.py 项目： kpx13/solar

def get_common_context(request):
    """Build the template context shared by the views.

    Returns:
        A dict with the request language and path, the debug flag, recent
        news/projects/comments, the user's expert/participant/authenticated
        flags, and the CSRF entries from ``csrf(request)``.
    """
    lang = request.LANGUAGE_CODE
    context = {
        'lang': lang,
        'request_url': request.path,
        'is_debug': settings.DEBUG,
        'recent_news': Article.get_list(lang)[:2],
        'recent_projects': Project.objects.all()[:3],
        'recent_comments': ProjectComment.objects.all()[:3],
        'ime_expert': Expert.exist(request.user),
        'ime_participant': Participant.exist(request.user),
        'auth': request.user.is_authenticated(),
    }
    context.update(csrf(request))
    return context

示例#32

0

显示文件

def past_days_news(request, past_date):
    """Render the news of a past day given as ``YYYY-MM-DD`` in the URL.

    Args:
        request: The incoming HTTP request.
        past_date: Date string taken from the URL.

    Returns:
        A redirect to ``news_today`` when the date is today, otherwise the
        rendered past-news page.

    Raises:
        Http404: If *past_date* is not a valid ``YYYY-MM-DD`` date.
    """
    try:
        # Convert the URL string into a date.
        date = dt.datetime.strptime(past_date, '%Y-%m-%d').date()
    except ValueError:
        # The unreachable ``assert False`` that followed this raise is gone.
        raise Http404()

    if date == dt.date.today():
        return redirect(news_today)

    news = Article.days_news(date)
    return render(request, 'all-news/past-news.html', {"date": date, "news": news})

示例#33

0

显示文件

def news_today(request):
    """Show today's news and handle newsletter sign-ups.

    A valid POST saves a ``NewsLetterRecipients`` row, sends the welcome
    email and redirects back to this page.  An invalid POST re-renders the
    page with the bound form; any other method renders an empty form.
    """
    date = dt.date.today()
    news = Article.todays_news()
    if request.method == 'POST':
        form = NewsLetterForm(request.POST)
        if form.is_valid():
            name = form.cleaned_data['your_name']
            email = form.cleaned_data['email']
            recipient = NewsLetterRecipients(name=name, email=email)
            recipient.save()
            send_welcome_email(name, email)
            # The redirect used to be built and discarded (no ``return``), so
            # a browser refresh re-submitted the form.  Redirecting to the
            # current path avoids relying on the relative URL 'news_today'.
            return HttpResponseRedirect(request.path)
    else:
        # Was ``NewsLetterForms()``; the POST branch uses ``NewsLetterForm``.
        form = NewsLetterForm()
    return render(request, 'all-news/today-news.html', {"date": date, "news": news, "letterForm": form})

示例#34

0

显示文件

文件： views.py 项目： antoniogs/news

    def get(self, request, *args, **kwargs):
        """Load articles published since the last stored one from newsapi.org.

        Result pages are requested one after another for as long as a page
        holds exactly 20 articles.  Each article is passed to
        ``save_article_and_source``; truthy results are counted as new.

        Returns:
            A ``JsonResponse`` with ``loaded_articles`` (whether any response
            had status "ok") and ``new_articles`` (the count of new ones).
        """
        last_published_at = Article.get_last_published_at()
        new_articles = 0
        loaded_articles = False
        # some english language news media, selected randomly
        sources = ["bbc-news", "reuters", "the-washington-post", "cnn"]

        url_template = ("https://newsapi.org/v2/everything"
                        "?sources=%s"
                        "&from=%s"
                        "&language=en"
                        "&page=%s&apiKey=%s")

        page = 1
        while True:
            url = url_template % (",".join(sources),
                                  last_published_at,
                                  page,
                                  settings.NEWSAPI_KEY)
            found_articles = 0

            response = requests.get(url)
            if hasattr(response, "json"):
                json_response = response.json()
                if json_response['status'] == "ok":
                    loaded_articles = True
                    articles = json_response['articles']
                    found_articles = len(articles)

                    for newapi_article in articles:
                        if save_article_and_source(newapi_article,
                                                   last_published_at):
                            new_articles += 1

            # Only a page of exactly 20 articles triggers another request.
            if found_articles != 20:
                break
            page += 1

        return JsonResponse({
            "loaded_articles": loaded_articles,
            "new_articles": new_articles
        })

示例#35

0

显示文件

文件： news.py 项目： wleddy/news

def view(article_handle=-1):
    """Render a single article, or flash an error and redirect home.

    The page title is the article title, cut to 20 characters plus "..."
    when it is longer than that.
    """
    setExits()

    rec = Article(g.db).get(article_handle)
    if not rec:
        flash("That article could not be found.")
        return redirect(g.homeURL)

    title = rec.title
    g.title = title[:20] + "..." if len(title) > 20 else title

    return render_template(
        'news/article.html',
        rendered_html=render_markdown_text(rec.words),
        rec=rec,
    )

示例#36

0

显示文件

文件： listing_news.py 项目： NullFull/K-knight

    def handle(self, *args, **options):
        """
        For now only Yonhap News is implemented, as an example.

        Listing pages ``<url_base>/news/10`` down to ``/news/1`` are fetched
        and their ``section02`` items are processed in reverse order.  Each
        item is passed to ``Article.create_new``; when that returns None the
        command sleeps 10 seconds and moves to the next item.  A connection
        error sleeps 5-15 minutes and then exits the process.
        """
        seoul = pytz.timezone('Asia/Seoul')
        # The listing shows only "MM-DD HH:MM", so the year has to be supplied.
        # It was hard-coded to 2019, which mis-dated every article crawled in
        # any other year.
        # NOTE(review): items from late December crawled in early January get
        # the new year - handle the rollover if that matters.
        current_year = datetime.datetime.now(seoul).year

        urls = ["{}/news/{}".format(self.url_base, idx)
                for idx in range(10, 0, -1)]

        for url in urls:
            try:
                _header = {
                    'User-Agent': self.press.user_agent
                }
                _response = requests.get(url, headers=_header)
                _response.encoding = self.press.encoding
                _response.close()

            except requests.exceptions.ConnectionError:
                _second = random.randrange(5 * 60, 15 * 60)
                time.sleep(_second)
                exit()

            soup_body = BeautifulSoup(_response.text, 'lxml')
            soup_list = soup_body.select_one('.headline-list ul')

            soup_section = soup_list.find_all('li', {'class': 'section02'})

            for item in reversed(soup_section):
                title = item.select_one('.news-tl').select_one('a').string
                url_orig = item.select_one('.news-tl').select_one('a')['href']
                url_parsed = urlparse(url_orig)
                # Stored without the scheme: host plus path only.
                url = url_parsed.netloc + url_parsed.path

                datetime_string = item.select_one('.lead').select_one('.p-time').string
                # Parsing with the year included also accepts "02-29", which
                # year-less parsing rejects (its default year, 1900, is not a
                # leap year).
                datetime_obj = datetime.datetime.strptime(
                    '{}-{}'.format(current_year, datetime_string),
                    '%Y-%m-%d %H:%M')
                datetime_obj = timezone.make_aware(datetime_obj, timezone=seoul, is_dst=False)

                if Article.create_new(press=self.press, url=url, title=title, datetime=datetime_obj) is None:
                    time.sleep(10)
                    continue

                print(f'연합뉴스: {datetime_obj}: {title}: {url}')
                # Article.perceive('https://' + url, title, datetime_obj)

            time.sleep(10)

示例#37

0

显示文件

文件： parsers.py 项目： vsmaxim/news-clusterization-service

def parse_articles_from_rss(rss: str) -> List[Article]:
    """Build one unsaved ``Article`` per entry of the RSS feed *rss*.

    The title, link and publication date come from the feed entry; the text
    comes from downloading and parsing the linked page with ``newspaper``.
    """
    parsed: List[Article] = []

    for entry in parse_rss_entries(rss):
        # TODO: Probably extend to other languages
        page = newspaper.Article(entry.link, language='ru')
        page.download()
        page.parse()

        parsed.append(Article(
            title=entry.title,
            text=page.text,
            source=entry.link,
            publish_date=entry.pub_date,
        ))

    return parsed

示例#38

0

显示文件

def check_for_new_links():
    """Save the unseen entries of every RSS feed of every site.

    For each site, the feed URLs are read from ``rss_links/<site>.txt``.  For
    each feed, the already-seen links are read from
    ``last_links/<site>/<make_filename(feed)>.txt``; entries are processed
    until the first already-seen link, each being parsed, classified and
    saved as an ``Article``.  The seen-links file is then rewritten with the
    new links first, followed by the old ones minus as many trailing links as
    were added.
    """
    requests_number = 0

    proxy = random.choice(proxies)

    for site in websites:

        print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
        print(site)

        path = "last_links/" + site + "/"

        # Both input files used to be opened without ever being closed.
        with open("rss_links/" + site + ".txt", "r") as feeds_file:
            links = feeds_file.read().splitlines()

        for link in links:
            name = path + make_filename(link) + ".txt"
            with open(name, "r") as seen_file:
                old_links = seen_file.read().splitlines()
            new_links = []

            for entry in feedparser.parse(link).entries:
                entry_link = entry['link']
                if entry_link in old_links:
                    # Stop at the first entry that was already seen.
                    break

                print(entry_link)
                # NOTE(review): the result of parse_article is indexed as
                # (title, content) AND passed back in as ``proxy`` on the next
                # call - confirm that is intended.
                proxy = parse_article(entry_link, site, requests_number, proxy)
                Article(title=proxy[0], content=proxy[1], source=entry_link,
                        category=article_classify(proxy[1])).save()
                requests_number += 1
                new_links.append(entry_link)

            # Keep the file length constant by dropping one old link per new
            # one.  The clamp matters when there are more new links than old
            # ones: a negative slice end would wrongly keep some old links.
            keep_count = max(0, len(old_links) - len(new_links))
            with open(name, "w") as new_file:
                new_file.writelines(
                    kept + "\n" for kept in new_links + old_links[:keep_count])

示例#39

0

显示文件

文件： views.py 项目： Dikutal/Dikutal

def index(model, request):
    """Render one page of the article list, optionally filtered by language.

    The ``lang`` query parameter falls back to ``'all'`` when it is not a key
    of ``lang_filter``.  A non-integer ``page`` shows the first page and an
    out-of-range one shows the last page.
    """
    lang = request.GET.get('lang')
    if lang not in lang_filter:
        lang = 'all'

    paginator = Paginator(Article.get_articles(lang), NUM_ARTICLES)

    requested_page = request.GET.get('page')
    try:
        latest = paginator.page(requested_page)
    except PageNotAnInteger:
        latest = paginator.page(1)
    except EmptyPage:
        latest = paginator.page(paginator.num_pages)

    context = {
        'active_tab': 'articles',
        'subtitle': 'Articles',
        'latest': latest,
        'active_lang': lang
    }
    return render_to_response('news/index.html', RequestContext(request, context))

示例#40

0

显示文件

文件： views.py 项目： kpx13/ibm

def news(request):
    """Render the paginated news list for the current language.

    The ``page`` query parameter selects the page. A value that is not an
    integer falls back to the first page, and a page number outside the
    available range falls back to the last page.

    The context passed to ``news.html`` gains ``page`` (the page number
    actually shown), ``page_range``, ``items`` (the page object) and, when
    there is more than one page, ``need_pagination = True``.
    """
    c = get_common_context(request)

    paginator = Paginator(Article.get_list(c['lang']), PAGINATION_COUNT)

    # int() rejects input such as "?page=abc" with ValueError before the
    # paginator ever sees it, so the fallback has to happen here.
    try:
        page = int(request.GET.get('page', '1'))
    except ValueError:
        page = 1

    try:
        items = paginator.page(page)
    except EmptyPage:
        page = paginator.num_pages
        items = paginator.page(page)

    c['page'] = page
    c['page_range'] = paginator.page_range
    if len(c['page_range']) > 1:
        c['need_pagination'] = True
    c['items'] = items

    return render_to_response('news.html', c, context_instance=RequestContext(request))

示例#41

0

显示文件

文件： check_feeds.py 项目： morreene/news-agg

    def handle(self, *args, **options):
        """Fetch every stored feed and save the entries not yet in the database.

        Each ``Feed`` URL is parsed with ``feedparser``. An entry whose link
        already exists as an ``Article.url`` is reported and skipped. Every
        other entry is downloaded and parsed through ``newspaper.Article`` to
        obtain its keywords and full text, then saved as a new ``Article``
        linked to the feed it came from.
        """
        for feed in Feed.objects.all():
            feedData = feedparser.parse(feed.url)

            for entry in feedData.entries:
                # exists() asks the database for a yes/no answer instead of
                # loading every matching row just to count them.
                if Article.objects.filter(url=entry.link).exists():
                    print("This article has already downloaded.")
                    continue

                article = Article()
                article.title = entry.title
                article.url = entry.link
                article.description = entry.description

                # Keywords and body text are read from the newspaper article
                # after it has been downloaded, parsed and run through nlp().
                a = newspaper.Article(entry.link, language='en')
                a.download()
                a.parse()
                a.nlp()
                article.keyword = a.keywords
                article.full = a.text

                # published_parsed is a time tuple; its first six fields are
                # year, month, day, hour, minute, second.
                d = datetime.datetime(*(entry.published_parsed[0:6]))
                dateString = d.strftime('%Y-%m-%d %H:%M:%S')

                article.publication_date = dateString
                article.feed = feed
                article.save()




        # for poll_id in options['poll_id']:
        #     try:
        #         poll = Poll.objects.get(pk=poll_id)
        #     except Poll.DoesNotExist:
        #         raise CommandError('Poll "%s" does not exist' % poll_id)

        #     poll.opened = False
        #     poll.save()

        #     self.stdout.write(self.style.SUCCESS('Successfully closed poll "%s"' % poll_id))

示例#42

0

显示文件

文件： tests.py 项目： susanjiang03/pipeline_monitor

    def test_saving_and_retrieving_articles(self):
        """Save two articles and check that both are stored with their fields intact."""
        # Field values per article, listed in the order they are assigned
        # and later asserted: newspaper | category | title | url | description
        expected = (
            (
                ('newspaper', "Python Daily"),
                ('category', "Programming"),
                ('title', "Python in 10 minutes"),
                ('url', "http://realpython.com"),
                ('description', "Learn the Python programming language in 10 minutes"),
            ),
            (
                ('newspaper', "NYT"),
                ('category', "Programming"),
                ('title', "Kung Fu React"),
                ('url', "http://flask.com"),
                ('description', "Best tutorial to learning react on the web"),
            ),
        )

        created = []
        for fields in expected:
            article = Article()
            for attr, value in fields:
                setattr(article, attr, value)
            article.save()
            created.append(article)

        saved_articles = Article.objects.all()
        self.assertEqual(saved_articles.count(), 2)

        # Each created article must match the stored row at the same position
        # and still carry the values it was given.
        for position, (article, fields) in enumerate(zip(created, expected)):
            self.assertEqual(article, Article.objects.all()[position])
            for attr, value in fields:
                self.assertEqual(getattr(article, attr), value)

示例#43

0

显示文件

文件： views.py 项目： kpx13/solar

def news(request):
    """Render ``news.html`` with the article list for the current language.

    The common context supplies ``lang``; the matching articles are stored
    under the ``list`` key before rendering.
    """
    context = get_common_context(request)
    lang = context['lang']
    context['list'] = Article.get_list(lang)
    return render_to_response(
        'news.html',
        context,
        context_instance=RequestContext(request),
    )

示例#44

0

显示文件

文件： crawler.py 项目： chickie8x/django_hello_world

        article_datetime = soup.select_one('span[title*=T]').attrs.__getitem__('title').split('T')[0].replace('-', '/')
        pub_date = datetime.strptime(article_datetime, '%Y/%m/%d').date()
    except:
        pass

    if not title or not article_description:
        print('no objects found')
    else:
        for detail in content:
            detail
        for img in images:
            item_id = random.randint(5, 1000)
            if url_img:
                dict_content = dict(id=item_id, article_feature_img=url_img, article_title=title.get_text(),
                                    article_desc=article_description.get_text(), article_content=str(detail),
                                    article_category=random.randint(1, 4), published_date=pub_date)
            else:
                print('no url img')
    return dict_content


# Crawl every listing page in [init_depth, depth) and keep each page's result.
# Rebinding one variable inside the loop would discard all but the last page
# and leave a single dict, whose keys (not items) would then be iterated.
test = []
for i in range(init_depth, depth):
    test.append(web_spider(url1 + str(i) + url2))

# Drop falsy results before building model instances.
a = filter(None, test)
obj = [Article(article_feature_img=item['article_feature_img'], article_title=item['article_title'],
               article_desc=item['article_desc'], article_content=item['article_content'],
               article_category=Category.objects.get(id=item['article_category']),
               published_date=item['published_date']) for item in a]
# Insert all collected articles with a single bulk_create call.
Article.objects.bulk_create(obj)

示例#45

0

显示文件

>>> r.full_name
'John Smith'
# Now the new reporter is in the database.
>>> Reporter.objects.all()
<QuerySet [<Reporter: John Smith>]>
# Django provides a rich database lookup API.
>>> Reporter.objects.get(id=1)
<Reporter: John Smith>
>>> Reporter.objects.get(full_name__startswith='John')
<Reporter: John Smith>
>>> Reporter.objects.get(full_name__contains='mith')
<Reporter: John Smith>

# Create an article.
>>> from datetime import date
>>> a = Article(pub_date=date.today(), headline='Django is cool',
...             content='Yeah.', reporter=r) # r = Reporter(full_name='John Smith')
>>> a.save()

# Now the article is in the database.
>>> Article.objects.all()
<QuerySet [<Article: Django is cool>]>

# Article objects get API access to related Reporter objects.
>>> r = a.reporter
>>> r.full_name
'John Smith'

# And vice versa: Reporter objects get API access to Article objects.
>>> r.article_set.all()
<QuerySet [<Article: Django is cool>]>

示例#46

0

显示文件

文件： views.py 项目： kpx13/h2h

def news_details(request, page_name):
    """Render ``new.html`` for the news item whose slug is ``page_name``.

    The item returned by ``News.get_by_slug`` is placed in the common
    context under the ``new`` key.
    """
    context = get_common_context(request)
    context["new"] = News.get_by_slug(page_name)
    request_context = RequestContext(request)
    return render_to_response("new.html", context, context_instance=request_context)

示例#47

0

显示文件

 def get(self, request):
     """Render the article index, fetching more rows for JSON requests.

     Requests whose content type is ``application/json`` get up to 100
     articles; every other request gets up to 20. The template data holds
     the articles, the total count and the limit that was applied.
     """
     if request.content_type == 'application/json':
         limit = 100
     else:
         limit = 20
     items, total = Article.get_articles(limit=limit)
     return render_view(
         request,
         'news/index.html',
         {'articles': items, 'total': total, 'limit': limit},
     )

示例#48

0

显示文件

文件： views.py 项目： Dikutal/Dikutal

def get_latest_articles(lang):
    """Return the most recently published articles for ``lang``.

    Articles are ordered by age (current time minus ``published``), youngest
    first, and at most ``NUM_ARTICLES_INDEX`` of them are returned, each
    wrapped as ``{'content': article}``.
    """
    # Read the clock once so every article's age is measured against the
    # same reference instant.
    now = datetime.datetime.now()
    by_age = sorted(Article.get_articles(lang), key=lambda a: now - a.published)
    # Slice before wrapping so dicts are built only for the articles returned.
    return [{'content': a} for a in by_age[:NUM_ARTICLES_INDEX]]