Example #1
  def set_yahoo_articles(self):
    page = 3
    url_list_yahoo = [
      {'category': '国内', 'url': ['http://news.yahoo.co.jp/list/?c=domestic']},
      {'category': '国際', 'url': ['http://news.yahoo.co.jp/list/?c=world']},
      {'category': '経済', 'url': ['http://news.yahoo.co.jp/list/?c=economy']},
      {'category': 'エンタメ', 'url': ['http://news.yahoo.co.jp/list/?c=entertainment']},
      {'category': 'スポーツ', 'url': ['http://news.yahoo.co.jp/list/?c=sports']},
      {'category': 'IT', 'url': ['http://news.yahoo.co.jp/list/?c=computer']},
      {'category': '科学', 'url': ['http://news.yahoo.co.jp/list/?c=science']},
      {'category': '地域', 'url': ['http://news.yahoo.co.jp/list/?c=local']}
    ]
    # make url list
    for item in url_list_yahoo:
      for page_num in range(2, page):
        url = item['url'][0] + '&p=' + str(page_num)
        item['url'].append(url)

    # make Article
    for item in url_list_yahoo:
      for page_num in range(0, page - 1):
        d = pq(item['url'][page_num])

        for (title, url) in zip(d('.list .ttl'), d('.list a')):
          url = 'http://news.yahoo.co.jp' + d(url).attr('href')
          category = item['category']
          title = d(title).text().encode('utf-8')
          content = pq(url)('.hbody').text().encode('utf-8')

          article = Article(url, category, title, content)
          article.get_info()
          self.collection.append(article)
Example #2
def main():
    _, file_name, nquestion = sys.argv
    clean_up(file_name)
    timer_log("reference resolution")
    questions = []
#     replaced
    with open("../temp/article.clean") as farticle:
        article = Article(farticle.read())
        for sent in article.sentences():
            print(sent)
            try:
                for simp_s in simplify_sen(sent):
                    q_generated = question(simp_s)
                    questions.extend(q_generated)
                    for q in q_generated:
                        print(q)
                    print("")
            except:
                print("failed")
                if debug:
                    traceback.print_exc()
            timer_log("one sentence")
    
    print(ranking.get_top_questions('\n'.join(questions), nquestion))
    timer_log("ranking")
Example #3
File: main.py Project: kwtucker/DPW
    def __init__(self, app):
        bmodel = BlogModel()
        bview = BlogFeedView()
        form = Form()
        art = Article()

        if app.request.GET:
            # On submit of the form, the info is stored in the self.the_post object.
            self.the_post = {}
            self.the_post["title"] = app.request.GET["title"]
            self.the_post["author"] = app.request.GET["author"]
            self.the_post["tags"] = app.request.GET["tags"]
            self.the_post["category"] = app.request.GET["category"]
            self.the_post["article"] = app.request.GET["article"]
            self.the_post["date"] = (time.strftime("%m/%d/%Y"))

            # Pass the post object into the format method art.a() to get back an HTML string.
            # Then pass the global log_posts list to add_post_list so that the post is appended to it.
            bmodel.add_post_list(art.a(self.the_post), log_posts)

            # This will then write the response to the browser after the full html document is returned.
            app.response.write(bview.form_success(bmodel.formator(log_posts), form.reg_form))

        else:
            # If no submission display form only to the browser.
            app.response.write(bview.print_out_form(form.reg_form))
Example #4
	def load_data(self):
		self.mp.get_cycles()
		self.mp.get_yearly_cycles()
		self.mp.get_monthly_cycles()
		self.tp.get_cycles()
		self.tp.get_yearly_cycles()
		self.tp.get_monthly_cycles()

		for art in self.pattern.arch.articles:
			try:
				art = Article(art,self.pattern.arch.articleTitleToURL[art],self.pattern.arch.articles[art],self.pattern)
				try:
					art.get_views()
					if art.views != []:
						self.average.add_art(art)
						if not len(art.views) > 24*len(self.pattern.relativeDays)-1:
							self.articles.append(art)
						else:
							print "Look at page views for ", art.title 
				except IndexError:
					print art.link_title
			except KeyError:
				pass

		self.average.calc_av_views()
		print "Number of articles for "+self.pattern.title+": ", self.average.num_added
Example #5
def article_api(version=1):
    """ Obtain information about an article, given its URL or id """

    if not (1 <= version <= 1):
        return better_jsonify(valid=False, reason="Unsupported version")

    if request.method == "GET":
        url = request.args.get("url")
        uuid = request.args.get("id")
    else:
        url = request.form.get("url")
        uuid = request.form.get("id")
    if url:
        url = url.strip()[0:_MAX_URL_LENGTH]
    if uuid:
        uuid = uuid.strip()[0:_MAX_UUID_LENGTH]
    if url:
        # URL has priority, if both are specified
        uuid = None
    if not url and not uuid:
        return better_jsonify(valid=False, reason="No url or id specified in query")

    with SessionContext(commit=True) as session:

        if uuid:
            a = ArticleProxy.load_from_uuid(uuid, session)
        elif url.startswith("http:") or url.startswith("https:"):
            a = ArticleProxy.load_from_url(url, session)
        else:
            a = None

        if a is None:
            return better_jsonify(valid=False, reason="Article not found")

        if a.html is None:
            return better_jsonify(valid=False, reason="Unable to fetch article")

        # Prepare the article for display
        a.prepare(session)
        register = a.create_register(session, all_names=True)
        # Fetch names of article topics, if any
        topics = (
            session.query(ArticleTopic).filter(ArticleTopic.article_id == a.uuid).all()
        )
        topics = [dict(name=t.topic.name, id=t.topic.identifier) for t in topics]

    return better_jsonify(
        valid=True,
        url=a.url,
        id=a.uuid,
        heading=a.heading,
        author=a.author,
        ts=a.timestamp.isoformat()[0:19],
        num_sentences=a.num_sentences,
        num_parsed=a.num_parsed,
        ambiguity=a.ambiguity,
        register=register,
        topics=topics,
    )
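A minimal client-side sketch of how an endpoint like article_api might be queried (the host and the /article.api route are assumptions for illustration, not taken from this project):

import requests

# Hypothetical host and route; adjust to wherever article_api is mounted.
resp = requests.get(
    "http://localhost:5000/article.api",
    params={"url": "http://example.com/some-news-story"},
)
data = resp.json()
if data.get("valid"):
    print(data["heading"], data["num_sentences"])
else:
    print("Lookup failed:", data.get("reason"))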
Example #6
 def create_article(cls, parent, slug, site=None, title="Root", **kwargs):
     if not site:
         site = Site.objects.get_current()
     newpath = cls.objects.create(site=site, parent=parent, slug=slug)
     article = Article(title=title)
     article.add_revision(ArticleRevision(title=title, **kwargs), save=True)
     article.add_object_relation(newpath)
     return newpath
Example #7
 def post(self,ID = None):
     atitle = cgi.escape(self.request.get('blogTitle'))
     atext = cgi.escape(self.request.get('blogText'))
     blog = Article(title=atitle,
                    text=atext)
     blog.put()
     #self.response.write(title)
     #self.response.write(atext)
     self.redirect_to('home')
Example #8
def check_for_dtd_error(args):
    a = Article(archive_file=args.article_file.name, read_only=True)
    error = a.check_for_dtd_error()
    if error:
        if args.format_ariespull:
            print "error: DTD error: %s" % error
        else:
            print error
    a.close()
Example #9
    def add_feed(self, feed):
        print "Adding feed =>",
        
        f = feedparser.parse(feed)
        for item in f['entries']:
            a = Article()
            
            # Set ID as integer, without feedzilla at beginning
            a.id = item['id']
            a.id = re.sub(r'.*feedzilla\.com:(.*)', r'\1', a.id)
            a.id = int(a.id)
            
            if a.id not in self.articles.keys():
                # Set source, author and title
                a.author = item['author']
                a.title = item['title']
                a.source=item['source']['links'][0]['href']
                a.trueSource="http://news.feedzilla.com/en_us/stories/world-news/"+str(a.id)

                # Set summary, get rid of all the junk at the end
                summary = item['summary']
                summary = summary[:summary.find("\n\n")]
                summary = summary[:summary.find("<")]
                a.summary = summary
            
                # Add the article if it doesn't already exist
                self.articles[a.id] = a

        print "Done"
Example #10
File: feed.py Project: ledrui/IRIS-News
    def add_feed(self, feed):
        """
        add_feed takes the URL or file path of a Feedzilla feed, cleans it up,
        and adds the articles to this Feed object's list.
        """

        log.info("Retrieving feed.")

        f = feedparser.parse(feed)
        for item in f['entries']:
            a = Article()

            # Set ID as integer, without feedzilla at beginning
            a.id = item['id']
            a.id = re.sub(r'.*feedzilla\.com:(.*)', r'\1', a.id)
            a.id = int(a.id)

            if a.id not in self.articles.keys():
                # Set source, author and title
                a.author = item['author']
                a.title = item['title']
                a.source=item['source']['links'][0]['href']
                a.trueSource="http://news.feedzilla.com/en_us/stories/world-news/"+str(a.id)

                # Set summary, get rid of all the junk at the end
                summary = item['summary']
                summary = summary[:summary.find("\n\n")]
                summary = summary[:summary.find("<")]
                a.summary = summary

                # Add the article if it doesn't already exist
                self.articles[a.id] = a
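A hedged usage sketch for the add_feed method above, assuming it lives on a Feed-like class (the class name and feed URL are hypothetical) that exposes an articles dict:

feed = Feed()  # hypothetical class exposing add_feed() and .articles
feed.add_feed("http://news.feedzilla.com/en_us/feeds/world-news.rss")  # hypothetical feed URL
for article_id, article in feed.articles.items():
    print(article_id, article.title)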
Example #11
    def setUpClass(cls):
        with open(INPUT_JSON) as f:
            cls.valid_data = json.load(f)

        with open(INPUT_HTML) as f:
          html = f.read()

        article = Article()
        article.url = cls.valid_data['url']
        article.source = cls.valid_data['source']
        parse(article, html)
        cls._crawled_article = article
Example #12
 def create_root(cls, site=None, title="Root", **kwargs):
     if not site:
         site = Site.objects.get_current()
     root_nodes = cls.objects.root_nodes().filter(site=site)
     if not root_nodes:
         # (get_or_create does not work for MPTT models??)
         root = cls.objects.create(site=site)
         article = Article(title=title)
         article.add_revision(ArticleRevision(title=title, **kwargs), save=True)
         article.add_object_relation(root)
     else:
         root = root_nodes[0]
     return root
Example #13
def speed_test(uuid):
    try:
        print("Starting speed test")
        t0 = time.time()
        with SessionContext(commit = True) as session:
            # Load the article
            a = Article.load_from_uuid(uuid, session)
            if a is not None:
                # Parse it and store the updated version
                a.parse(session, verbose = True)
        t1 = time.time()
        print("Parsing finished in {0:.2f} seconds".format(t1 - t0))
    finally:
        Article.cleanup()
Example #14
    def generate_relation_dict(self, news_sources, news_targets):
        '''
        generates a dictionary of string/list(int) in the format
        {source : target_count}
        i.e. {s1 : [tc1, tc2, ... tcn],
        s2 : [tc1, tc2, ... tcn], ...
        sn : [tc1, tc2, ... tcn]}
        where sn is the source, tcn is the citation count of each target
        '''
        # initialize the relation dictionary.
        relation_dict = {}

        for source_name, source_url in news_sources.iteritems():
            # create an empty list whose size is the number
            # of targets referenced by each source
            target_count = [0] * len(news_targets)
            # Find the articles which have a specific source website url
            articles = Article.objects(
                Q(website=Website.objects(homepage_url=source_url).only('homepage_url').first()) &
                Q(citations__exists=True)).only('citations')
            for article in articles:
                # Count the times that each target in the news_targets is in the
                # citation list for each article and put it in the target_count
                for citation in article.citations:
                    if not isinstance( citation, int ):
                        i = 0
                        while i < len(news_targets):
                            if citation.target_name.lower() == news_targets.keys()[i].lower():
                                target_count[i] += 1
                            i += 1
            relation_dict[source_name] = target_count
        return relation_dict
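To make the docstring's {source : target_count} format concrete, a hypothetical return value for two sources and three targets might look like this (source names and counts are invented for illustration):

# assuming news_targets holds three targets
relation_dict = {
    "source-one": [2, 0, 5],  # source-one cites target 1 twice and target 3 five times
    "source-two": [1, 3, 0],
}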
Example #15
 def update_sentiments(self):
     from watson_developer_cloud import ToneAnalyzerV3Beta
     tone_analyzer = ToneAnalyzerV3Beta(username='******',
                                password='******',
                                version='2016-02-11')
     client = connections.get_connection()
     search = Search(using=client, index='articles', doc_type='article')
     q = Q('bool', must=[Q('missing', field='watson_analyzed')])
     search = search.query(q)
     counter = 0
     for result in search.scan():
         doc = Article.get(result.meta.id)
         try:
             analysis = tone_analyzer.tone(text=doc.body)
             tone_categories = analysis['document_tone']['tone_categories']
             emotion_tones = list(filter(lambda x: x['category_id'] == 'emotion_tone', tone_categories))[0]
             doc.tone = {}
             for tone in emotion_tones['tones']:
                 doc.tone[tone['tone_id']] = tone['score']
             doc.watson_success = True
         except WatsonException:
             continue
         finally:
             doc.watson_analyzed = True
             doc.save()
             counter += 1
         print(counter)
     if counter == 0:
         raise RealError()
Example #16
def reparse_api(version=1):
    """ Reparse an already parsed and stored article with a given UUID """
    if not (1 <= version <= 1):
        return better_jsonify(valid="False", reason="Unsupported version")

    uuid = request.form.get("id", "").strip()[0:_MAX_UUID_LENGTH]
    tokens = None
    register = {}
    stats = {}

    with SessionContext(commit=True) as session:
        # Load the article
        a = ArticleProxy.load_from_uuid(uuid, session)
        if a is not None:
            # Found: Parse it (with a fresh parser) and store the updated version
            a.parse(session, verbose=True, reload_parser=True)
            # Save the tokens
            tokens = a.tokens
            # Build register of person names
            register = a.create_register(session)
            stats = dict(
                num_tokens=a.num_tokens,
                num_sentences=a.num_sentences,
                num_parsed=a.num_parsed,
                ambiguity=a.ambiguity,
            )

    # Return the tokens as a JSON structure to the client,
    # along with a name register and article statistics
    return better_jsonify(valid=True, result=tokens, register=register, stats=stats)
Example #17
def generate_model(lang, sites, mxParse=-1, mxSetSize=3):
    model = LanguageModel(lang)
    mongo = LanguageModel_Mongo("", lang, None)
    parsed = 0

    articleDB = ArticleDB()
    while (parsed < mxParse or (mxParse == -1 and parsed < articleDB.count())):
        a = articleDB.get(index=parsed)
        txt = ""#' '.join(a.get('text',''))
        adate = ' '.join(a.get('time',''))
        url = ""#''.join(a.get('url',''))
        atitle = ""

        if isinstance(a.get('url', []), list):
            url = ' '.join(a.get('url',''))
        elif isinstance(a.get('url', ""), basestring):
            url = a.get('url', "")
        if isinstance(a.get('text', []), list):
            txt = ' '.join(a.get('text',''))
        elif isinstance(a.get('text', ""), basestring):
            txt = a.get('text', "")
        if isinstance(a.get('title', []), list):
            atitle = ' '.join(a.get('title',''))
        elif isinstance(a.get('title', ""), basestring):
            atitle = a.get('title', "")
        for s in sites:
            if s in url:
                a = Article(text=txt, title=atitle, src=url, date=adate, nid=a['_id'], language_model=model)
                a.analyze(mxSetSize)


        parsed += 1

    print "Parsed ", parsed, " Articles. Inserting into Database"
    mongo.collection.drop()
    for k, w in model.words.iteritems():
        mongo.__process_word__(w)

    #Update Language Info
    langInfo = LanguageInfoModel_Mongo()

    keys = sorted(model.words.keys())
    freq = model.getWordsByFrequency()

    langInfo.updateLanguage(lang, parsed, len(model.words.keys()), sorted(freq.keys())[len(freq)-1], sites)

    return mongo
Example #18
def page():
    """ Handler for a page displaying the parse of an arbitrary web
        page by URL or an already scraped article by UUID """
    url = request.args.get("url", None)
    uuid = request.args.get("id", None)
    if url:
        url = url.strip()[0:_MAX_URL_LENGTH]
    if uuid:
        uuid = uuid.strip()[0:_MAX_UUID_LENGTH]
    if url:
        # URL has priority, if both are specified
        uuid = None
    if not url and not uuid:
        # !!! TODO: Separate error page
        return redirect(url_for("routes.main"))

    with SessionContext(commit=True) as session:

        if uuid:
            a = ArticleProxy.load_from_uuid(uuid, session)
        elif url.startswith("http:") or url.startswith("https:"):
            a = ArticleProxy.scrape_from_url(url, session)  # Forces a new scrape
        else:
            a = None

        if a is None:
            # !!! TODO: Separate error page
            return redirect(url_for("routes.main"))

        # Prepare the article for display (may cause it to be parsed and stored)
        a.prepare(session, verbose=True, reload_parser=True)
        register = a.create_register(session, all_names=True)

        # Fetch names of article topics, if any
        topics = (
            session.query(ArticleTopic).filter(ArticleTopic.article_id == a.uuid).all()
        )
        topics = [dict(name=t.topic.name, id=t.topic.identifier) for t in topics]

        # Fetch similar (related) articles, if any
        DISPLAY = 10  # Display at most 10 matches
        similar = Search.list_similar_to_article(session, a.uuid, n=DISPLAY)

        return render_template(
            "page.html", article=a, register=register, topics=topics, similar=similar
        )
Example #19
def parseArticle(args):
    if len(args) != 1:
        print 'No article given. Please specify a link to an article to parse.'
        return

    article = Article(url=args[0])
    print 'Working with: %s' % article.url

    print 'Attempting to download the article.'
    couldDownload = article.downloadArticle()
    if not couldDownload:
        print 'Could not download the article. Is the URL correct? Is their site up? Is GTWifi working for you right now?'
        return
    print 'Successfully downloaded the article.'
    article.parseArticle()
    print 'Article body:'
    print article.content
Example #20
    def get_articles(self, number=None):
        global username

        show_article_template = Template(filename='get_articles.html')
        sources = User.objects(name=username).first().news_sources
        targets = User.objects(name=username).first().news_targets
        articles = []

        for s in sources:
            articles += Article.objects(website=Website.objects(name=s).first()).only('title', 'url').all()
        for t in targets:
            articles += Article.objects(website=Website.objects(name=t).first()).only('title', 'url').all()

        if not number:
            number = len(articles)

        return show_article_template.render(articles=articles[ :int(number)])
Example #21
def consume_si(args):
    a = Article(archive_file=args.article_file.name)
    if args.si_package:
        si = MetadataPackage(archive_file=args.si_package.name)
    else:
        logger.debug("No metadata package provided in call, trying to find one in AI ...")
        si_package = find_si_package(a.doi) #TODO: add manual directory input
        if not si_package:
            a.close()
            raise IOError("Can't find metadata package for %s" %
                          a.doi)
        try:
            si = MetadataPackage(archive_file=si_package)
        except IOError, e:
            a.close()
            raise IOError("Can't find metadata package for %s at %s" %
                          (a.doi, si_package))
Example #22
 def get(self,ID = None):
     if ID == None:
         self.redirect_to('home')
     blog = Article.get_by_id(long(ID))
     template_values = {
         'blog': blog,
     }
     view = BaseTemplate.render('template/home/view.html',template_values)
     self.response.write(view)
Example #23
 def article_test(session):
     sentence_stream = Article.sentence_stream(limit = TEST_SET)
     for sent in sentence_stream:
         txt = " ".join(t["x"] for t in sent if "x" in t)
         if txt:
             toklist = tokenize(txt, enclosing_session = session)
             dlist = tagger.tag(toklist)
             print("Sentence: '{0}'".format(txt))
             print("Tagging result:\n{0}".format("\n".join(str(d) for d in dlist)))
Example #24
    def menuable_articles (self) :
	lines = self.__load_content_lines ()
	menu  = self.__load_book_menu (lines)
	articles = self.__parse_articles (lines, menu)
	for art in articles :
	    start_line = art['start_line']
	    end_line   = art['end_line']
	    author     = art['author']
	    title      = art['title']

	    print "%06d, %06d, %s, %s" % (start_line, end_line, author, title)

	    contents   = lines[start_line : end_line]
	    contents.insert(0, title.encode('utf8'))
	    contents.insert(1, author.encode('utf8'))

	    article = Article()
	    article.parse(contents, self.outer)
Example #25
 def __init__(self, user_id, article_id):
     self.user_id = user_id
     es = ElasticStorage.get_instance(dev=False)
     doc = ESArticle.get(article_id)
     self.anger = doc.tone.anger
     self.disgust = doc.tone.disgust
     self.fear = doc.tone.fear
     self.joy = doc.tone.joy
     self.sadness = doc.tone.sadness
     self.total_articles = 1
Example #26
def on_article_delete(instance, *args, **kwargs):
    # If an article is deleted, then throw out its URLPaths
    # But move all descendants to a lost-and-found node.
    site = Site.objects.get_current()

    try:
        lost_and_found = URLPath.objects.get(slug=settings.LOST_AND_FOUND_SLUG, parent=URLPath.root(), site=site)
    except URLPath.DoesNotExist:
        lost_and_found = URLPath.objects.create(slug=settings.LOST_AND_FOUND_SLUG, parent=URLPath.root(), site=site)
        article = Article(
            title=_(u"Lost and found"), group_read=True, group_write=False, other_read=False, other_write=False
        )
        article.add_revision(
            ArticleRevision(content=_(u"Articles who lost their parents" "==============================="))
        )

    for urlpath in URLPath.objects.filter(articles__article=article, site=site):
        for child in urlpath.get_children():
            child.move_to(lost_and_found)
Example #27
 def setUp(self):
     # A random article taken from Mongo with the id field removed.
     self.valid_article = Article()
     self.valid_article.guid="http://www.cnn.com/2015/09/17/opinions/spicer-facebook-dislike-button/index.html"
     self.valid_article.title="Facebook 'dislike' button a comeback for negative thinking"
     self.valid_article.url="http://rss.cnn.com/c/35494/f/676977/s/4a0b2f94/sc/15/l/0L0Scnn0N0C20A150C0A90C170Copinions0Cspicer0Efacebook0Edislike0Ebutton0Cindex0Bhtml0Deref0Frss0Itech/story01.htm"
     self.valid_article.timestamp=1442860397
     self.valid_article.source="cnn"
     self.valid_article.feed="cnn_technology"
     self.valid_article.content = "Facebook 'dislike' button a comeback for negative thinking \nBy Andre Spicer\nAndre Spicer\nThat is about to change. Facebook has announced it will create a \"dislike button.\" Only last year, Mark Zuckerberg said, \"Some people have asked for a dislike button because they want to be able to say 'that thing isn't good,' and we're not going to do that ... I don't think that's socially very valuable, or great for the community.\"\nNow, Zuckerberg has admitted that \"not every moment is good\" and perhaps a dislike button isn't such a bad idea after all.\nZuckerberg may have changed his mind, but many other people have not. Some think the \ndislike button\naggressive behavior online\n. But the biggest worry is the button will \"\nactively foster negativity\nIt seems we have become so fragile that any sign of negativity -- even a simple thumbs down on a social media website -- is something that must be avoided at all costs. All we want is a constant stream of thumbs up. The slightest sign someone might disagree with us is enough to send us into an emotional tailspin.  \nOne of the most insidious ideas of our time is positive thinking. It's drilled into many of us: think positive, don't think negative. It's no wonder people find the prospect of the dislike button so worrisome.  \nBeing positive certainly comes with benefits. But research is starting to reveal that all this \nupbeat thinking has some big downsides\n. When we are unable to express negative feelings, many human emotions become off limits. We avoid taking a realistic look at problems, which means we overlook risks and do stupid things. Those who don't feel on top of the world start to think there is something seriously wrong with them. Those in an upbeat mood tend to be more selfish and feel more socially disconnected. What is even more surprising is that people told to think positively often end up feeling worse.\nAs we start to recognize the limits of always looking on the bright side, negative thinking is making a comeback. \nIt's not just Facebook that will allow you to dislike things. Some companies have started to support their employees in pointing out problems. One particularly interesting method that firms are using to avoid the mistakes made by our bias toward positive thinking is the \"\npre-mortem\nLiving with the thumbs down will be tough. We may get upset, be disturbed and sometimes feel gloomy. Excessive negativity can easily become bullying. But having a space to share our negative feelings every now and then can help us own up to the many problems that we face, and hopefully, deal with them in a levelheaded way.\nJoin us on Facebook.com/CNNOpinion.\nRead CNNOpinion's Flipboard magazine.\nPowered by Livefyre"
Example #28
	def controled_crawl(self):
		while self.queue.count() > 0:
			for item in self.queue.find().sort('depth', pymongo.ASCENDING):
				logger.info(item["depth"])
				#logger.info("url %s depth %d" %(item["url"], item['depth']))
				
				p = Page(item["url"], item["source_url"],item["depth"], item["date"], True)
				
				if p.fetch():
					a = Article(p.url,p.html, p.source_url, p.depth,p.date, True)
					if a.extract(): 
						logging.info("extracted")
						if a.filter(self.query, self.directory):
							logging.info("valid")
							if a.check_depth(a.depth):
								
								a.fetch_links()
								if len(a.links) > 0:
									for url, domain in zip(a.links, a.domains):
										if url not in self.queue.distinct("url") and url not in self.results.distinct("url") and url not in self.logs.distinct("url"):
											self.queue.insert({"url": url, "source_url": item['url'], "depth": int(item['depth'])+1, "domain": domain, "date": a.date})
											
									logging.info("Inserted %d nexts url" %len(a.links))
								try:
									
									self.results.insert(a.export())
								except pymongo.errors.DuplicateKeyError:
									logging.info("Exists already")
									
									
					else:
						try:
							self.logs.insert(a.log())
						except pymongo.errors.DuplicateKeyError:
							logging.info("Exists already")
							
				else:
					try:
						self.logs.insert(p.log())
					except pymongo.errors.DuplicateKeyError:
						logging.info("Exists already")
						
						
				self.queue.remove(item)
				logging.info("Processing %i urls"%self.queue.count())
				if self.queue.count() == 0:
					break
			if self.queue.count() == 0:
				break
			if self.results.count() > 200000:
				self.queue.drop()
				break
Example #29
    def parseArticle(self, baseUrl, link):
        data = self.readUrl(link)

        searchItems = re.findall(
            r'<article class="news-article">.*?<div class="article-info">.*?<div class="col-sm-3 col-xs-6">.*?<strong>EMITIRANO</strong>:<br>(.+?)</div>.*?</div>.*?<div id="jplayer_container" class="audio-player  played repeat-on">.*?<div class="track-info">.*?<p class="track-title">(.+?)</p>.*?<div class="download-section">.*?<h4>Preuzmite datoteku</h4>.*?<a href="(.+?)" class="attachment-file">.*?<span class="file-size pull-right">(.+?)</span>.*?</a>.*?</div>.*?<blockquote>.*?<h3>.*?</h3>.*?<p>(.+?)</p>.*?</blockquote>.*?</article>',
            data,
            re.DOTALL,
        )

        if not searchItems:
            return None

        article = Article()
        article.pubDate = searchItems[0][0].strip()
        article.title = searchItems[0][1].strip()
        article.link = baseUrl + searchItems[0][2].strip()
        # article.size = searchItems[0][3].strip()
        article.description = searchItems[0][4].strip()

        return article
Example #30
def gen_simple_trees(criteria, stats):
    """ Generate simplified parse trees from articles matching the criteria """
    for a in Article.articles(criteria):
        if not a.root_domain or "raduneyti" in a.root_domain:
            # Skip ministry websites due to amount of chaff found there
            continue
        tree = Tree(url = a.url, authority = a.authority)
        # Note the parse timestamp
        stats["parsed"] = a.parsed
        tree.load(a.tree)
        for ix, stree in tree.simple_trees():
            yield stree, tree.score(ix), tree.length(ix)
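A minimal sketch of consuming the generator above (the criteria dict is an assumption; Article.articles defines which keys it actually accepts):

stats = {}
criteria = {"limit": 100}  # hypothetical criteria accepted by Article.articles
for stree, score, length in gen_simple_trees(criteria, stats):
    # each yielded item is a simplified tree plus its score and sentence length
    print(score, length)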
Example #31
    def add_feed(self, feed):
        """
        add_feed takes the URL, file path, or data of a feed, cleans it up,
        and adds the articles to this Feed object's list.
        """
        f = feedparser.parse(feed)
        ago24h = (
            datetime.datetime.utcnow().replace(tzinfo=pytz.utc)
            - datetime.timedelta(hours=24))

        ignore_total = 0
        for item in f['entries']:
            a = Article(item)
            try:
                if self.disableFilter or a.published > ago24h:
                    self.articles.append(Article(item))
                else:
                    ignore_total += 1
            except TypeError:
                log.error("TypeError in adding article {}.".format(
                    a.link))

        if ignore_total > 0:
            print "Ignored {} from more than 24h ago".format(ignore_total)
Example #32
File: richard.py Project: piskel/Richard
    def load_articles(self,
                      query: str,
                      quantity: int = 100,
                      sort_by: str = SORT_BY_CLOSE_TO_END,
                      filter_str: str = ""):
        result_dict = self.load_result_dict(
            Richard.build_search_url(query, sort_by, filter_string=filter_str))
        total_articles_count = RichardUtils.access_property(
            result_dict, Richard.PATH_TOTAL_ARTICLES_COUNT)

        loaded_articles = []
        current_offset = 0

        if total_articles_count < quantity:
            quantity = total_articles_count

        while quantity > len(loaded_articles):
            articles_list = RichardUtils.access_property(
                result_dict, Richard.PATH_ARTICLE_LIST)

            for article in articles_list:
                article_url = Richard.RICARDO_URL + RichardUtils.access_property(
                    article, Richard.PATH_ARTICLE_URL)
                article_obj = Article(article_url)
                article_obj.quick_update(article)
                loaded_articles.append(article_obj)

                if quantity <= len(loaded_articles):
                    break

            current_offset += Richard.SEARCH_OFFSET_STEP
            result_dict = self.load_result_dict(
                Richard.build_search_url(query, sort_by, current_offset,
                                         filter_str))

        self.article_list.extend(loaded_articles)
Example #33
def main(filename):
    Base.metadata.create_all(engine)
    session = Session()
    articles = pd.read_csv(filename)

    for index, row in articles.iterrows():
        logger.info('Loading article uid {} into DB'.format(row['uid']))
        article = Article(row['uid'], row['body'], row['host'],
                          row['newspapper_uid'], row['n_tokens_body'],
                          row['n_tokens_title'], row['title'], row['url'])

        session.add(article)

    session.commit()
    session.close()
Example #34
def scrape_category(category_url):
    try:
        soup = BeautifulSoup(requests.get(BASE_URL + category_url).text)
    except ConnectionError:
        print(
            "Couldn't connect to Internet! Please check your connection & Try again."
        )
        exit(1)

    # Selecting links which are in the category page
    links = [a.attrs.get("href") for a in soup.select("article li a")]
    # Removing links for the categories with anchor on same page
    links = [link for link in links if not link.startswith("#")]

    print("Found: " + str(len(links)) + " links")
    i = 1

    # Traverse each link to find article and save it.
    for link in links:
        try:
            if i % 10 == 0:
                sleep(5)  # Sleep for 5 seconds before scraping every 10th link
            link = link.strip()
            print("Scraping link no: " + str(i) + " Link: " + link)
            i += 1
            link_soup = BeautifulSoup(requests.get(link).text)
            # Remove the space occupied by Google Ads (Drop script & ins node)
            [script.extract() for script in link_soup(["script", "ins"])]
            for code_tag in link_soup.find_all("pre"):
                code_tag["class"] = code_tag.get("class", []) + ["prettyprint"]
            article = link_soup.find("article")
            # Now add this article to list of all articles
            page = Article(title=link_soup.title.string,
                           content=article.encode("UTF-8"))
            articles.append(page)
        # Sometimes hanging. So Ctrl ^ C, and try the next link.
        # Find out the reason & improve this.
        except KeyboardInterrupt:
            continue
        except ConnectionError:
            print(
                "Internet disconnected! Please check your connection & Try again."
            )
            if articles:
                print("Making PDF of links scraped till now.")
                break
            else:
                exit(1)
Example #35
 def next_tick(self):
     userInput = input("(a) to add, (r) to remove and (e) to exit: ")
     if (userInput == "a"):
         if (self.available == 0):
             self.status = "Stock is full! Not able to add new item!"
         else:
             name = input("Name of your article? ")
             weight = float(input("Weight of your article?"))
             place = input("On which Space should i put your item?")
             article = Article(name, weight)
             self.add_article(place, article)
     if (userInput == "r"):
         place = input("Which space should i empty")
         self.remove_article(place)
     if (userInput == "e"):
         self.run = False
Example #36
def main(filename):
    Base.metadata.create_all(engine)  # generate the schema
    session = Session()  # start the session
    articles = pd.read_csv(filename)  # read the csv

    # iterate over all the articles in the cleaned files
    for index, row in articles.iterrows():
        logger.info('Loading article uid {} into DB'.format(row['uid']))
        article = Article(row['uid'], row['body'], row['link'], row['title'],
                          row['newspaper_uid'], row['host'],
                          row['n_tokens_body'], row['n_tokens_title'])

        session.add(article)

    session.commit()  # commit the transaction
    session.close()  # close the session
Example #37
    def __IndexEnglishArticles(self):
        """Scans the original_root directory for English articles.

    Populates `self.__original_articles` with a list of Article objects,
    one per article found.

    Returns:
        list of Article objects
    """
        self.__original_articles = []
        for root, _, files in os.walk(self.__original_root):
            for name in files:
                if not name == '.DS_Store' and re.search(r'\/en$', root):
                    self.__original_articles.append(
                        Article(os.path.dirname(root)))
        return self.__original_articles
Example #38
def form():

    if request.method == 'POST':

        article_id = article_storage.add(
            Article(
                request.form.get('header'),
                request.form.get('signature'),
                request.form.get('body'),
            ))

        session[article_id] = True

        return redirect(url_for('show_article', article_id=article_id))

    return render_template('form.html')
Example #39
def main(filename):
    Base.metadata.create_all(
        engine)  # Lets us generate our schema in the DB
    session = Session()  # Initialize our session
    articles = pd.read_csv(filename)

    for index, row in articles.iterrows(
    ):  # iterrows lets us loop over each row of our DataFrame
        logger.info('Loading article uid {} into db'.format(row['id']))
        article = Article(row['id'], row['body'], row['host'],
                          row['newspaper_uid'], row['n_valid_tokens_title'],
                          row['n_valid_tokens_body'], row['title'], row['url'])

        session.add(article)

    session.close()
Example #40
def main(filename):
    # Generating schema in DB
    Base.metadata.create_all(Engine)
    session = Session()
    articles = pd.read_csv(filename)
    # Iterating the DataFrame
    for index, row in articles.iterrows():
        logger.info('Loading article uid {} into DB'.format(row['uid']))
        article = Article(row['uid'], row['body'], row['host'],
                          row['newspaper_uid'], row['tokens_body'],
                          row['tokens_title'], row['title'], row['url'])
        # Inserting the article into the DB
        session.add(article)
    # Commit the changes and close the session
    session.commit()
    session.close()
Example #41
    def fetch_articles(self):
        # TODO: REIMPLEMENT get_urls & not tmp
        urls = self.get_urls_tmp()
        #urls = self.get_urls()
        firsts = []
        for i in urls:
            try:
                firsts.append(urls[i][0])
            except:
                pass

        Articles = []
        for link in firsts:
            Articles.append(Article(link))
            
        return Articles
Example #42
def parse2Article(mdFiles={}, baseWriteDir=""):
    articles = []
    for (key, value) in mdFiles.items():
        for originFileName in value:
            ofn = originFileName.split('/')
            fileName = ofn[len(ofn) - 1]
            t = fileName.split('.')
            articleTitle = t[0]
            fileName = fileName.replace('md', 'html')
            sourceFileName = originFileName
            input_file = codecs.open(sourceFileName,
                                     mode="r",
                                     encoding="utf-8")
            text = input_file.read()
            input_file.close()
            articleHtml = markdown.markdown(
                text, extensions=["fenced_code", "tables"])
            # Get the creation date
            time_local = time.localtime(get_creation_date(sourceFileName))
            ctime = time_local
            # Author
            author = "弗兰克零"
            ## Write out the HTML file; a template engine can be used for rendering
            template = jinja_environment.get_template('article_template.html')
            htmlTemplate = template.render({
                'articleTitle':
                articleTitle,
                'articleContent':
                articleHtml,
                'mtime':
                time.strftime("%Y-%m-%d", ctime),
                'author':
                author
            })
            # Handle images with relative paths; absolute paths starting with HTTP are left untouched
            htmlTemplate = htmlTemplate.replace("src=\"../../", "src=\"./")
            ## Write the file
            rf = baseWriteDir + fileName
            output_file = codecs.open(rf,
                                      "w",
                                      encoding="utf-8",
                                      errors="xmlcharrefreplace")
            output_file.write(htmlTemplate)
            article = Article(articleTitle, fileName, sourceFileName, rf,
                              ctime)
            articles.append(article)
    return articles
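A hedged call of the converter above; the loop implies that mdFiles maps an arbitrary key to a list of markdown file paths (the paths below are invented, and the jinja environment plus article_template.html are assumed to be configured as in the snippet):

articles = parse2Article(
    mdFiles={"2023": ["./posts/2023/hello.md"]},  # hypothetical input mapping
    baseWriteDir="./output/",
)
for art in articles:
    print(art)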
Example #43
def get_page(url):
    # last_update_datetime = read_update_date()
    # take the last 4 hours
    local_now = dt.datetime.now(CONST_TIMEZONE)
    last_update_datetime = local_now - timedelta(hours=4)

    headers = {
        "User-Agent":
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:69.0) Gecko/20100101 Firefox/69.0'
    }

    page = requests.get(url, headers=headers)

    soup = BeautifulSoup(page.content, 'html.parser')

    entityList = soup.find("", {"class": "EntityList--ListItemRegularAd"})
    # entityListItem = entityList.find_all("a", {"class": "link"})

    # extract the title, price, and the other elements of each ad
    entityArticles = entityList.find_all("article")
    for article in entityArticles:
        id = article.find("a")["name"]
        title = article.find("a").get_text().strip()
        price = article.find("", {
            "class": "price--eur"
        }).get_text().strip().replace(u'\xa0', ' ') + "  " + article.find(
            "", {
                "class": "price--hrk"
            }).get_text().strip().replace(u'\xa0', ' ')
        datetime_str = article.find("time").get("datetime")
        # datetime = dt.datetime.strptime(datetime_str[0:19], '%Y-%m-%dT%H:%M:%S')
        datetime = dt.datetime.fromisoformat(datetime_str)
        link = "njuskalo.hr" + article.find("a")['href']
        # image_url = "https:" + article.find("img")['data-src']
        # req = requests.get(image_url, stream=True)
        image = ""
        # print(image)
        published = datetime.strftime("%d.%m.%Y. %H:%M:%S")

        # check which ads are new within the last 4 hours
        if datetime > last_update_datetime:
            # check whether this ad's id is already in the table (we record the ids we mail out so we don't repeat them),
            # because Njuškalo periodically re-publishes old ads
            if (not search_in_file(id)):
                articleList.append(
                    Article(title, price, image, link, published))
                write_in_file(id)
Example #44
def nytimes_scraper(url, sentiment):
    limit = 3  # Used to limit the number of articles to be scraped

    # Use Requests to get content from webpage
    website_domain = 'https://www.nytimes.com'
    response = requests.get(url)

    # Use Beautiful Soup to parse HTML and find articles
    soup = BeautifulSoup(response.content, 'html5lib')
    articles = soup.find_all('div', class_='css-1l4spti')

    # Get the title, link, and contents of each article in a list
    usable_articles = []
    for index, article in zip(range(limit), articles):
        this_article = Article()

        # Get article title
        this_article.title = article.find('h2').get_text()

        # Get article link
        this_article.link = website_domain + article.find('a')['href']

        # Only get article contents if sentiment analysis is toggled on.
        if sentiment == "on":
            # Get divisions (div's) from the article that are related to the article story.
            response = requests.get(this_article.link)
            soup = BeautifulSoup(response.content, 'html5lib')
            divisions = soup.find_all(
                'div', class_='css-1fanzo5 StoryBodyCompanionColumn')

            # Get paragraphs from each div
            content_from_each_div = []
            for div in divisions:
                paragraphs = div.find_all('p')

                # Join all the paragraphs from this div
                div_content = []
                for paragraph in paragraphs:
                    div_content.append(paragraph.get_text())
                div_content = " ".join(div_content)
                content_from_each_div.append(div_content)

            # Join all div contents together and set as articles contents
            this_article.content = " ".join(content_from_each_div)

            # Perform sentiment analysis
            this_article.sentiment = TextBlob(this_article.content).sentiment
        else:
            pass

        # Add article to list of articles
        usable_articles.append(this_article)

    return usable_articles
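A hedged usage sketch for the scraper above (the section URL is illustrative; any nytimes.com listing page that uses the css-1l4spti card markup should work):

articles = nytimes_scraper("https://www.nytimes.com/section/technology", sentiment="on")
for a in articles:
    print(a.title, a.link, a.sentiment)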
Example #45
 def read_parses(parse_path, relations_dict=None):
     parses = [json.loads(x) for x in open(parse_path)]
     for doc_id in parses[0]:
         print >> logs, "Doc ID:%s" % doc_id
         doc = parses[0][doc_id]
         sentences = []
         for sid, sen in enumerate(doc['sentences']):
             parse_tree = sen['parsetree']
             dep_tree = sen['dependencies']
             words = sen['words']
             sentences.append(Sentence(sid, parse_tree, dep_tree, words))
         if relations_dict is not None:
             relations = relations_dict[doc_id]
         else:
             relations = []
         params = {'sens': sentences, 'rels': relations}
         yield Article(doc_id, params)
Example #46
def main(filename):
    name = filename.split('.')[0]
    engine, Session = createEngine(name)
    Base.metadata.create_all(engine)
    session = Session()
    articles = pd.read_csv(filename)

    for index, row in articles.iterrows():
        logger.info(f'Loading article uid {row["uid"]} into DB')
        article = Article(row['uid'], row['body'], row['title'],
                          row['newspaper_uid'], row['n_tokens_body'],
                          row['n_tokens_title'], row['url'])

        session.add(article)

    session.commit()
    session.close()
Example #47
async def on_ready():
    for guild in bot.guilds:
        groceries_list = GroceriesList(guild.id, [])
        categories = get(guild.categories, name=shopping_category)
        if categories is None:
            categories = await guild.create_category(shopping_category)
        for text_channel in categories.text_channels:
            grocery = Grocery(text_channel.id, [])
            messages = await text_channel.history(limit=1000).flatten()
            for message in messages:
                if message.author == bot.user:
                    for embed in message.embeds:
                        for field in embed.fields:
                            footer=embed.footer.text.split(" | ")
                            groceries_list.add(grocery.add(Article(field.name, int(field.value), footer[2], message_id=int(message.id), most_similar=footer[0])))
        groceries_lists.add(groceries_list)
    print("Bot Started !")
Example #48
 def extract_article(self, soup: BeautifulSoup, source_url: str) -> \
         Optional[Article]:
     title_tag = soup.find('title')
     if title_tag is None:
         return None
     title = title_tag.text
     date_node = soup.find(id='author')
     if date_node is None:
         return None
     datetime = date_node.text.strip()
     text_node = soup.find(class_='article-content')
     if text_node is None:
         return None
     text = text_node.text
     text = self._cleanup(text)
     raw_html = str(soup)
     return Article(title, datetime, text, source_url, raw_html)
Example #49
def main(filename):
    Base.metadata.create_all(engine)
    session = Session()

    articles = pd.read_csv(filename)
    # iterrows lets us loop over each row of the dataframe,
    # returning the index and the row
    for index, row in articles.iterrows():
        logger.info('Loading article uid {} into DB'.format(row['uid']))
        article = Article(row['uid'], row['body'], row['host'],
                          row['newspaper_uid'], row['n_tokenise_body'],
                          row['n_tokenise_title'], row['title'], row['url'])
        # adding the articles to the database
        session.add(article)

    session.commit()
    session.close()
Example #50
def add_new_article_from_pdf():

    file = request.files['file']
    if file:
        tmp_file = tempfile.NamedTemporaryFile(delete=False)

        log.info("Saving PDF as temporary file %s, before processing..." %
                 tmp_file.name)
        file.save(tmp_file)

        article = Article.create_from_pdf(tmp_file.name)

        return repr(article)

    else:
        log.error("No file received!")
        return ""
Example #51
    def _process_urls(self):
        """Queue all manually entered URLs for the crawler.

        Running this method more than once does nothing.
        """
        if self._urls is None:
            return
        try:
            for url in self._urls:
                article = Article(url)
                self._downloader.queue_article(article)
        except TypeError:
            raise ValueError(
                "'urls' must be a list of article URLs to process.")
        finally:
            # Regardless of whether we successfully queued all of the links, we don't want to try again.
            self._urls = None
Example #52
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        print("message is broken")
        return

    task = msg

    article1 = Article(task['url'])
    article1.download()
    article1.parse()
    task['text'] = article1.text
    dedupe_news_queue_client.sendMessage(task)
Example #53
    def scrape_story(self, url: str):
        print(url)
        if url in self.seen_urls:
            return
        self.seen_urls.add(url)

        try:
            res = requests.get(url)
            res.raise_for_status()
        except:
            print("Failed to GET URL: {}".format(url))
            return

        soup = BeautifulSoup(res.text, 'html.parser')

        title = self.outlet_crawler.get_title(soup)
        img = self.outlet_crawler.get_img(soup)
        published_date = self.outlet_crawler.get_datetime(soup)
        lede = self.outlet_crawler.get_lede(soup)
        keywords, location = self.outlet_crawler.extract_keywords(soup)

        if not published_date:
            print("Failed to parse datetime: {}".format(url))
            return

        # Only keep articles from the previous 5 days
        if datetime.now(timezone.utc) - published_date > timedelta(days=5):
            return

        if published_date.date() not in self.dates:
            self.dates.append(published_date.date())

        title = title.replace('\'', '\\\'')

        article = Article(published_date, url, self.outlet, title, img,
                          location, keywords)
        self.stories.append(article)

        if len(self.stories) > 20:
            self.update_db()
            self.stories = []

        print(published_date, url, self.outlet, title, img, location, keywords)

        return article
Example #54
    def load_data(self, path, is_dir=False):

        data = []
        filename = None

        if is_dir:
            filenames = [name for name in os.listdir(path) if not name.startswith(".")]
        else:
            filenames = [path]

        for filename in filenames:
            with open(os.path.join(path, filename),'r', encoding="utf-8") as data:
                tp = json.load(data)
                for article in tp:
                    try:
                        self.corpus.append(Article(article))
                    except:
                        print("於 %s 發生某篇文章的解析錯誤" % filename)
Example #55
    def execute(self):
        current_lang_val = self.config_manager.language
        if current_lang_val == "en":
            new_name_en = input(i18n.t('ENTER_THE_NAME_OF_THE_ARTICLE_EN'))
            new_name_pl = input(i18n.t('ENTER_THE_NAME_OF_THE_ARTICLE_PL'))
        else:
            new_name_pl = input(i18n.t('ENTER_THE_NAME_OF_THE_ARTICLE_PL'))
            new_name_en = input(i18n.t('ENTER_THE_NAME_OF_THE_ARTICLE_EN'))
        new_quantity = int(input(i18n.t('ENTER_THE_QUANTITY_OF_THE_ARTICLE')))
        new_id = self.logger.get_available_id()
        new_obj = Article(new_id, [new_name_pl, new_name_en], new_quantity,
                          new_quantity, True)

        self.base.add_article(new_obj)
        self.logger.add_log(new_id,
                            Log(str(datetime.date(datetime.now())), "Added"))
        self.app_info_logger.log_info(i18n.t('ARTICLE_ADDED'))
        IOWrapper.continue_pause()
Example #56
def parse_xml(pmids, results):
    article_list = []
    # parse xml with BeautifoulSoup
    soup = BeautifulSoup(results, 'xml')
    try:
        # parse xml file into a list of Article objects
        titles = soup.find_all('ArticleTitle')
        abstracts = soup.find_all('Abstract')
        print("* Parsing xml file...")
        for i in range(0, len(titles)):
            article_list.append(
                Article(pmids[i], titles[i].text, abstracts[i].text))
    except:
        print('\033[91m' + 20 * "*" +
              " An error ocurred while parsing xml file. Try it again" +
              '\033[0m')
        sys.exit(1)
    return article_list
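A sketch of how parse_xml might be driven with XML fetched from NCBI's efetch endpoint (the request parameters are standard E-utilities usage rather than something taken from this project, and the PMIDs are placeholders):

import requests

pmids = ["31452104", "31452105"]  # placeholder PubMed IDs
resp = requests.get(
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
    params={"db": "pubmed", "id": ",".join(pmids), "retmode": "xml"},
)
articles = parse_xml(pmids, resp.text)
for art in articles:
    print(art)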
Example #57
def get_article_list():
    item_list = []
    for i in range(1, 58):
        # Save the article titles in h4_list
        link = "https://blog.csdn.net/liuchuo/article/list/" + str(i)
        r = requests.get(link)
        soup = BeautifulSoup(r.text, "html.parser")
        h4_list = soup.find_all("h4", class_="text-truncate")
        # Traverse h4_list and find all titles containing "乙级" but not "java"
        for each in h4_list:
            if "乙级" in each.a.text and "java" not in each.a.text.lower():
                re_result = re.findall('[0-9]{4}', each.a.text)
                if re_result:
                    title = each.a.text.strip().lstrip("原").strip()
                    item_list.append(
                        Article(re_result[0], title, each.a['href'],
                                get_article_content(each.a['href'])))
    return item_list
Example #58
 def extract_article(self, soup: BeautifulSoup, source_url: str) -> \
         Optional[Article]:
     story_body = soup.find(id='story-body')
     art_contents = story_body.find_all('p') if story_body else None
     if art_contents:
         text = document_fromstring('\n'.join(list(map(
             str, art_contents)))).text_content()
         text = self._remove_clutter_from_text(text)
         title_node = soup.find(class_='story-headline')
         if not title_node:
             return None
         title = title_node.text
         datetime = self._extract_date(soup)
         source_url = source_url
         raw_html = str(soup)
         return Article(title, datetime, text, source_url, raw_html)
     else:
         return None
Example #59
def apnews_scraper(url, sentiment):
    limit = 10  # May find fewer articles than the limit due to inconsistent HTML.
    website_domain = 'https://apnews.com'

    # Use Requests to get content for webpage
    response = requests.get(url)

    # Use Beautiful Soup to parse HTML and find articles
    soup = BeautifulSoup(response.content, 'html5lib')
    articles = soup.find_all('article', class_='feed')[0].find_all(
        'a')  # Class names on this site are inconsistent, so find all hyperlinks.

    # Get the title, link, and contents of each article in a list
    usable_articles = []
    for index, article in zip(range(limit), articles):
        # If the hyperlink element contains an h1 element, it is a link to an article.
        if article.find('h1') == None:
            continue
        else:
            this_article = Article()

            # Get article title
            this_article.title = article.find('h1').get_text()

            # Get article link
            this_article.link = website_domain + article['href']

            # Only get article contents if sentiment analysis is toggled on.
            if sentiment == "on":
                # Get paragraphs from the article that are related to the article story.
                response = requests.get(this_article.link)
                soup = BeautifulSoup(response.content, 'html5lib')
                paragraphs = soup.find_all('p')

                # Get content from each paragraph
                content = []
                for paragraph in paragraphs:
                    content.append(paragraph.get_text())

                # Join all paragraph content together and set as articles contents
                this_article.content = " ".join(content)

                # Perform sentiment analysis
                this_article.sentiment = TextBlob(
                    this_article.content).sentiment
            else:
                pass

            # Add article to list of articles
            usable_articles.append(this_article)

    return usable_articles
Example #60
    def load_articles(self, num_pages: int):
        for i in range(num_pages):
            url = '{}/search/page/1/news?page={}&sort=1'.format(
                self.base_url, i)
            r = requests.get(url)
            if not r.ok:
                continue

            soup = bs(r.json()['news'], 'html.parser')

            for group in soup.findAll('div', {'class': 'group'}):
                a = group.find('a')
                if a:
                    self.articles.append(
                        Article(title=a.text,
                                orig_link='{}{}'.format(
                                    self.base_url, a.get('href')),
                                backup_link=None))