def import_web_url(request, url):
    """ Import an URL from the web (can be anything). """

    form = forms.WebPagesImportForm({'urls': url,
                                     'status': IMPORT_STATUS.MANUAL})

    article = None

    if form.is_valid():
        user_import = form.save(request.user)

        if user_import.status == IMPORT_STATUS.FINISHED:
            if 'articles' in user_import.results['created']:
                article_url = user_import.results['created']['articles'][0]

                try:
                    article = Article.objects.get(url=article_url)

                except:
                    # Just in case we hit
                    # http://dev.1flow.net/1flow/1flow/group/51970/
                    # But it should have been wrapped earlier, thus we
                    # do not do it as a first attempt.
                    article = Article.objects.get(url=clean_url(article_url))

                if article.content_type in CONTENT_TYPES_FINAL:
                    return HttpResponsePermanentRedirect(
                        redirect_to_read(request.user, article))

            else:
                feed_url = user_import.results['created']['feeds'][0]

                subscription = Subscription.objects.get(
                    feed=BaseFeed.objects.get(url=feed_url),
                    user=request.user)

                return HttpResponsePermanentRedirect(
                    reverse('source_selector')
                    + u'#{0}'.format(subscription.id))

        else:
            messages.warning(
                request,
                _(u'Could not import url “<code>{0}</code>”. Check your '
                  u'latest history entry to know why.').format(url),
                extra_tags='sticky safe')

            return HttpResponsePermanentRedirect(
                reverse('historyentry_list'))

    return render(request, 'import-web-url.html', {
        'article': article,
        'url': url,
        'poll_url': reverse('article_conversion_status',
                            args=(article.id, )),
    })
def create_tweet_from_id(tweet_id, feeds=None, origin=None):
    """ From a Tweet ID, create a 1flow tweet via the REST API.

    https://dev.twitter.com/rest/reference/get/statuses/show/%3Aid

    .. todo:: use
        http://celery.readthedocs.org/en/latest/reference/celery.contrib.batches.html  # NOQA
        to bulk get statuses and not exhaust the API Quota.
    """

    raise NotImplementedError('Needs a full review / redesign for tweets.')

    if feeds is None:
        feeds = []

    elif not hasattr(feeds, '__iter__'):
        feeds = [feeds]

    # TODO: find tweet publication date while fetching content…
    # TODO: set Title during fetch…

    try:
        new_tweet, created = Tweet.create_tweet(
            url=tweet_id.replace(' ', '%20'),
            title=_(u'Imported item from {0}').format(clean_url(tweet_id)),
            feeds=feeds, origin=ORIGINS.WEBIMPORT)

    except:
        # NOTE: duplication handling is already
        # taken care of in Tweet.create_tweet().
        LOGGER.exception(u'Tweet creation from URL %s failed.', tweet_id)
        return None, False

    mutualized = created is None

    if created or mutualized:
        for feed in feeds:
            feed.recent_items_count += 1
            feed.all_items_count += 1

    ze_now = now()

    for feed in feeds:
        feed.latest_item_date_published = ze_now

        # Even if the tweet wasn't created, we need to create reads.
        # In the case of a mutualized tweet, it will be fetched only
        # once, but all subscribers of all feeds must be connected to
        # it to be able to read it.
        for subscription in feed.subscriptions.all():
            subscription.create_read(new_tweet, verbose=created)

    # Don't forget the parenthesis, else we return ``False`` every time.
    return new_tweet, created or (None if mutualized else False)
def resolve(cls, url, clean=False):
    """ Return the real URL of :param:`url` if it is a dupe.

    Return ``None`` if not registered as duplicate.
    """

    if clean:
        url = clean_url(url)

    try:
        # .get() returns a model instance; read its real_url attribute
        # directly (instances have no .values() method, only querysets do).
        return cls.objects.get(url=url).real_url

    except:
        return None
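A hypothetical usage sketch of the classmethod above; the model name ``DuplicateUrl`` and the calling context are assumptions for illustration only, not part of the original code.

# Hypothetical caller; ``DuplicateUrl`` stands for whatever model
# actually carries this classmethod in the real codebase.
real_url = DuplicateUrl.resolve(url, clean=True)

if real_url is None:
    # Not a known duplicate: keep working with the (cleaned) original URL.
    real_url = clean_url(url)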
def test_utm_with_other_things(self):

    for bad_url, good_url in (
        (u'http://www.begeek.fr/visitez-le-tardis-de-doctor-who-sur-google-maps-101125?utm_source=Plus+d‘actu&utm_medium=cpc&utm_campaign=Plus+d‘actu',  # NOQA
         u'http://www.begeek.fr/visitez-le-tardis-de-doctor-who-sur-google-maps-101125'),  # NOQA

        (u'http://techcrunch.com/2013/05/18/hell-no-tumblr-users-wont-go-to-yahoo/?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+francaistechcrunch+%28TechCrunch+en+Francais%29',  # NOQA
         u'http://techcrunch.com/2013/05/18/hell-no-tumblr-users-wont-go-to-yahoo/'),  # NOQA

        (u'http://www.liberation.fr/politiques/2013/09/24/la-niche-fiscale-pour-les-parents-d-enfants-scolarises-sera-conservee_934193?=rss-450',  # NOQA
         u'http://www.liberation.fr/politiques/2013/09/24/la-niche-fiscale-pour-les-parents-d-enfants-scolarises-sera-conservee_934193'),  # NOQA

        # This one must not be changed.
        (u'http://tctechcrunch2011.files.wordpress.com/2013/09/screen-shot-2013-09-24-at-5-57-35-am.png?w=1280&h=948',  # NOQA
         u'http://tctechcrunch2011.files.wordpress.com/2013/09/screen-shot-2013-09-24-at-5-57-35-am.png?w=1280&h=948'),  # NOQA
    ):
        self.assertEquals(clean_url(bad_url), good_url)
def save(self, user):
    """ Record the current user and the lines count. """

    # Just in case.
    self.instance.urls = u'\n'.join(
        clean_url(l.strip())
        for l in self.instance.urls.splitlines())

    self.instance.user = user
    self.instance.lines = self.instance.count

    super(WebPagesImportForm, self).save()

    if self.instance.status == models.IMPORT_STATUS.MANUAL:
        self.instance.run()

    return self.instance
def prepare_feed_url(feed_url):
    """ Try to validate an URL as much as possible. """

    feed_url = clean_url(feed_url)

    URLValidator()(feed_url)

    requests_response = requests.get(feed_url)

    if not requests_response.ok or requests_response.status_code != 200:
        raise Exception(u'Requests response is not OK/200, aborting')

    # Switch to the last hop of eventually (multiple-)redirected URLs.
    feed_url = requests_response.url

    # Be sure we get the XML result from them,
    # else FeedBurner gives us a poor HTML page…
    if u'feedburner' in feed_url and not feed_url.endswith(u'?format=xml'):
        feed_url += u'?format=xml'

    return feed_url
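A short usage sketch of prepare_feed_url(). The FeedBurner address and the surrounding try/except are illustrative assumptions; the actual result depends on live redirects, but the exception paths mirror the code above (URLValidator raises Django's ValidationError, the non-200 case raises a bare Exception).

from django.core.exceptions import ValidationError

try:
    # A FeedBurner address would come back with ``?format=xml`` appended,
    # after clean_url() stripping and redirect resolution.
    url = prepare_feed_url(u'http://feeds.feedburner.com/francaistechcrunch')

except ValidationError:
    # URLValidator() rejected the URL outright.
    raise

except Exception:
    # Covers the bare "not OK/200" exception raised above,
    # as well as network errors from requests.get().
    raise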
def create_article_from_url(url, feeds, origin):
    """ Create an article from a web url, in feeds, with an origin. """

    # TODO: find article publication date while fetching content…
    # TODO: set Title during fetch…

    try:
        new_article, created = Article.create_article(
            url=url.replace(' ', '%20'),
            title=_(u'Imported item from {0}').format(clean_url(url)),
            feeds=feeds, origin=origin)

    except:
        # NOTE: duplication handling is already
        # taken care of in Article.create_article().
        LOGGER.exception(u'Article creation from URL %s failed.', url)
        return None, False

    mutualized = created is None

    if created or mutualized:
        for feed in feeds:
            feed.recent_items_count += 1
            feed.all_items_count += 1

    for feed in feeds:
        if new_article.date_published:
            if new_article.date_published > feed.latest_item_date_published:
                feed.latest_item_date_published = new_article.date_published

        # Even if the article wasn't created, we need to create reads.
        # In the case of a mutualized article, it will be fetched only
        # once, but all subscribers of all feeds must be connected to
        # it to be able to read it.
        for subscription in feed.subscriptions.all():
            subscription.create_read(new_article, verbose=created)

    # Don't forget the parenthesis, else we return ``False`` every time.
    return new_article, created or (None if mutualized else False)
def create_feeds_from_url(feed_url, creator=None, recurse=True):
    """ Return a list of one or more tuple(s) ``(feed, created)`` from a given URL.

    If the given URL is an RSS/Atom URL, the method will create a feed
    (if not already in the database), and will return it associated with
    the ``created`` boolean, indicating whether it was created just now
    or not. For consistency, the tuple will be returned in a list, so
    that this method *always* returns a list of tuples.

    If the URL is a simple website one, it will be opened and parsed to
    discover any RSS/Atom feeds referenced in the page headers, and the
    method will return a list of tuples.

    .. todo:: parse the content body to find any RSS/Atom feeds inside.
        Will make it easy to parse http://www.bbc.co.uk/news/10628494

    :param creator: a :class:`User` that will be set as the feed(s)
        creator. This will eventually allow granting achievements to
        users, or on the contrary banning them if they pollute the DB.

    :param recurse: In case of a simple web URL, this method will be
        called recursively. Subsequent calls will be non-recursive by
        default. You can consider this argument to be "internal".
    """

    feed_url = prepare_feed_url(feed_url)

    try:
        feed = RssAtomFeed.objects.get(url=feed_url)

    except RssAtomFeed.DoesNotExist:
        # We will create it now.
        pass

    else:
        # Get the right one for the user subscription.
        if feed.duplicate_of_id:
            return [(feed.duplicate_of, False)]

        else:
            return [(feed, False)]

    http_logger = HttpResponseLogProcessor()
    parsed_feed = feedparser.parse(feed_url, handlers=[http_logger])
    feed_status = http_logger.log[-1]['status']

    # Stop on HTTP errors before stopping on feedparser errors,
    # because it is much more lenient in many conditions.
    if feed_status in (400, 401, 402, 403, 404, 500, 502, 503):
        raise FeedFetchException(u'Error {0} when fetching feed {1}'.format(
            feed_status, feed_url))

    try:
        check_feedparser_error(parsed_feed)

    except FeedIsHtmlPageException:
        if recurse:
            new_feeds = []
            urls_to_try = set(parse_feeds_urls(parsed_feed))

            for sub_url in urls_to_try:
                try:
                    new_feeds += create_feeds_from_url(
                        sub_url, creator=creator, recurse=False)

                except FeedIsHtmlPageException:
                    # We don't warn for every URL we find,
                    # most of them are CSS/JS/whatever ones.
                    pass

                except:
                    LOGGER.exception(u'Could not create a feed from '
                                     u'recursed url {0} (from {1})'.format(
                                         sub_url, feed_url))

            if new_feeds:
                # LOGGER.info(u'Returning %s created feeds.', len(new_feeds))
                return new_feeds

            # Just before giving up, try a little more with newspaper.
            # As it is quite slow, do it in the background.
            discover_feeds_urls.delay(feed_url)

            raise

        else:
            raise

    except Exception as e:
        raise Exception(u'Unparsable feed {0}: {1}'.format(feed_url, e))

    else:
        # Wow. FeedParser creates a <anything>.feed . Impressive.
        fp_feed = parsed_feed.feed
        website = WebSite.get_from_url(clean_url(
            fp_feed.get('link', feed_url)))

        defaults = {
            'name': fp_feed.get('title', u'Feed from {0}'.format(feed_url)),
            'is_good': True,

            # Try the RSS description, then the Atom subtitle.
            'description_en': fp_feed.get(
                'description', fp_feed.get('subtitle', u'')),

            'website': website
        }

        new_feed, created = RssAtomFeed.objects.get_or_create(
            url=feed_url, defaults=defaults)

        if created:
            new_feed.user = creator
            new_feed.save()

        return [(new_feed, created)]
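A hypothetical caller relying on the "always a list of tuples" contract documented above; ``some_user`` is an assumption for illustration, the exception names come straight from the function body.

try:
    results = create_feeds_from_url(u'http://www.example.org/',
                                    creator=some_user)

except FeedIsHtmlPageException:
    # The page referenced no usable RSS/Atom feed at all;
    # discover_feeds_urls() is already running in the background.
    results = []

for feed, created in results:
    LOGGER.info(u'Feed %s was %s.', feed.url,
                u'created' if created else u'already known')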
if replace_newlines:
    for repl_src in re.findall(ur'[[][^]]+[]][(]', content):
        # In link text, we replace by a space.
        repl_dst = repl_src.replace(u'\n', u' ')
        content = content.replace(repl_src, repl_dst)

for repl_src in re.findall(ur'[]][(][^)]+[)]', content):
    if replace_newlines:
        # In link URLs, we just cut out newlines.
        repl_dst = repl_src.replace(u'\n', u'')

    else:
        repl_dst = repl_src

    repl_dst = clean_url(insert_website(repl_dst))
    content = content.replace(repl_src, repl_dst)

if test:
    return content

else:
    # Everything went OK. Put back the content where it belongs.
    self.content = content

    if replace_newlines:
        self.content_type = CONTENT_TYPES.MARKDOWN

    # Disabled until more love is put inside.
    # self.find_image(commit=False, force=force)
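The two regular expressions above are compact; here is a small stand-alone illustration (assuming Python 2, to match the ur'' literals used in the fragment) of what they actually capture in Markdown content containing stray newlines. The sample content is made up.

import re

content = u'a [broken\nlink\ntext](http://example.com/a\nb) end'

# First pattern: '[link text](' including newlines inside the link text,
# which the loop above turns into spaces.
print(re.findall(ur'[[][^]]+[]][(]', content))
# [u'[broken\nlink\ntext](']

# Second pattern: '](http://…)' including newlines inside the link URL,
# which are removed before the URL goes through clean_url().
print(re.findall(ur'[]][(][^)]+[)]', content))
# [u'](http://example.com/a\nb)']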
def absolutize_url(self, requests_response=None, force=False, commit=True):
    """ Make the current article URL absolute.

    Eg. transform:

    http://feedproxy.google.com/~r/francaistechcrunch/~3/hEIhLwVyEEI/

    into:

    http://techcrunch.com/2013/05/18/hell-no-tumblr-users-wont-go-to-yahoo/  # NOQA
        ?utm_source=feeurner&utm_medium=feed&utm_campaign=Feed%3A+francaistechcrunch+%28TechCrunch+en+Francais%29  # NOQA

    and then remove all these F*G utm_* parameters to get a clean
    final URL for the current article.

    Returns ``True`` if the operation succeeded, ``False`` if the
    absolutization pointed out that the current article is a duplicate
    of another. In this case the caller should stop its processing
    because the current article will be marked for deletion.

    Can also return ``None`` if absolutizing is disabled globally
    in ``constance`` configuration.
    """

    # Another example: http://rss.lefigaro.fr/~r/lefigaro/laune/~3/7jgyrQ-PmBA/story01.htm  # NOQA

    if self.absolutize_url_must_abort(force=force, commit=commit):
        return

    if requests_response is None:
        try:
            requests_response = requests.get(self.url)

        except requests.ConnectionError as e:
            statsd.gauge('articles.counts.url_errors', 1, delta=True)

            message = u'Connection error while absolutizing “%s”: %s'
            args = (self.url, str(e), )

            self.url_error = message % args

            # Don't waste a version just for that.
            self.save_without_historical_record()

            LOGGER.error(message, *args)
            return

    if not requests_response.ok or requests_response.status_code != 200:

        message = u'HTTP Error %s while absolutizing “%s”: %s'
        args = (requests_response.status_code,
                requests_response.url,
                requests_response.reason)

        with statsd.pipeline() as spipe:
            spipe.gauge('articles.counts.url_errors', 1, delta=True)

            if requests_response.status_code in (404, ):
                self.is_orphaned = True

                # This is not handled by the post_save()
                # which acts only at article creation.
                spipe.gauge('articles.counts.orphaned', 1, delta=True)

        self.url_error = message % args

        # Don't waste a version just for that.
        self.save_without_historical_record()

        LOGGER.error(message, *args)
        return

    #
    # NOTE: we could also get it eventually from r.headers['link'],
    #       which contains '<another_url>'. We need to strip out
    #       the '<>', and re-absolutize this link, because in the
    #       example it's another redirector. Also r.links is a good
    #       candidate but in the example I used, it contains the
    #       shortlink, which must be re-resolved too.
    #
    #       So: as we already are at the final address *now*, no need
    #       to bother re-following another one which would lead us to
    #       the same final place.
    #

    final_url = clean_url(requests_response.url)

    # LOGGER.info(u'\n\nFINAL: %s vs. ORIG: %s\n\n', final_url, self.url)

    if final_url != self.url:

        # Just for displaying purposes, see below.
        old_url = self.url

        if self.url_error:
            statsd.gauge('articles.counts.url_errors', -1, delta=True)

        # Even if we are a duplicate, we came until here and everything
        # went fine. We won't need to look up the absolute URL again.
        statsd.gauge('articles.counts.absolutes', 1, delta=True)
        self.url_absolute = True
        self.url_error = None

        self.url = final_url

        try:
            if self.name.endswith(old_url):
                self.name = self.name.replace(old_url, final_url)

        except:
            LOGGER.exception(u'Could not replace URL in name of %s #%s',
                             self._meta.model.__name__, self.id)

        duplicate = False

        with transaction.atomic():
            # Without the atomic() block, saving the current article
            # (being a duplicate) will trigger the IntegrityError, but
            # will render the current SQL context unusable, unable to
            # register the duplicate, potentially leading to massive
            # inconsistencies in the caller's context.
            try:
                # Don't waste a version just for that.
                self.save_without_historical_record()

            except IntegrityError:
                duplicate = True

        if duplicate:
            params = {'%s___url' % self._meta.model.__name__: final_url}
            original = BaseItem.objects.get(**params)

            # Just to display the right “old” one in logs.
            self.url = old_url

            LOGGER.info(u'%s #%s is a duplicate of #%s, '
                        u'registering as such.',
                        self._meta.model.__name__, self.id, original.id)

            original.register_duplicate(self)
            return False

        # Any other exception will raise. This is intentional.
        else:
            LOGGER.info(u'URL of %s (#%s) successfully absolutized '
                        u'from %s to %s.', self._meta.model.__name__,
                        self.id, old_url, final_url)

    else:
        # Don't do the job twice.
        if self.url_error:
            statsd.gauge('articles.counts.url_errors', -1, delta=True)

        statsd.gauge('articles.counts.absolutes', 1, delta=True)
        self.url_absolute = True
        self.url_error = None

        # Don't waste a version just for that.
        self.save_without_historical_record()

    return True
def create_article(cls, title, url, feeds, **kwargs):
    """ Returns ``True`` if the article was created, ``False`` if it is a
    pure duplicate (it already exists in the same feed), ``None`` if it
    exists but not in the same feed. If more than one feed is given,
    only ``True`` or ``False`` is returned (the mutualized state is not
    checked).
    """

    tags = kwargs.pop('tags', [])

    if url is None:
        # We have to build a reliable orphaned URL, because orphaned
        # articles are often duplicates. RSS feeds serve us many times
        # the same article, without any URL, and we keep recording it
        # as new (but orphaned) content… Seen 20141111 on Chuck Norris
        # facts, where the content is in the title, and there is no URL.
        # We have 860k+ items, out of 1k real facts… Doomed.
        url = ARTICLE_ORPHANED_BASE + generate_orphaned_hash(title, feeds)

        defaults = {
            'name': title,
            'is_orphaned': True,

            # Skip absolutization, it's useless.
            'url_absolute': True
        }

        defaults.update(kwargs)

        article, created = cls.objects.get_or_create(url=url,
                                                     defaults=defaults)

        # HEADS UP: no statsd here, it's handled by post_save().

    else:
        url = clean_url(url)

        defaults = {'name': title}
        defaults.update(kwargs)

        article, created = cls.objects.get_or_create(url=url,
                                                     defaults=defaults)

    if created:
        created_retval = True

        LOGGER.info(u'Created %sarticle %s %s.',
                    u'orphaned ' if article.is_orphaned else u'',
                    article.id,
                    u'in feed(s) {0}'.format(_format_feeds(feeds))
                    if feeds else u'without any feed')

    else:
        created_retval = False

        if article.duplicate_of_id:
            LOGGER.info(u'Swapping duplicate %s %s for master %s on '
                        u'the fly.', article._meta.verbose_name,
                        article.id, article.duplicate_of_id)

            article = article.duplicate_of

        if len(feeds) == 1 and feeds[0] not in article.feeds.all():
            # This article is already there, but has not yet been
            # fetched for this feed. It's mutualized, and as such it
            # is considered as partly new. At least, it's not as bad
            # as being a true duplicate.
            created_retval = None

            LOGGER.info(u'Mutualized article %s in feed(s) %s.',
                        article.id, _format_feeds(feeds))

            article.create_reads(feeds=feeds)

        else:
            # No statsd, because we didn't create any record in database.
            LOGGER.info(u'Duplicate article %s in feed(s) %s.',
                        article.id, _format_feeds(feeds))

        # Special case where a mutualized article arrives from RSS
        # (with date/author) while it was already here from Twitter
        # (no date/author). Post-processing of original data will
        # handle the authors, but at least we update the date now for
        # users to have sorted articles until original data is
        # post-processed (this can take time, given the server load).
        if article.date_published is None:
            date_published = kwargs.get('date_published', None)

            if date_published is not None:
                article.date_published = date_published
                article.save()

    # Tags & feeds are ManyToMany, they
    # need the article to be saved before.

    if tags:
        try:
            with transaction.atomic():
                article.tags.add(*tags)

        except IntegrityError:
            LOGGER.exception(u'Could not add tags %s to article %s',
                             tags, article.id)

    if feeds:
        try:
            with transaction.atomic():
                article.feeds.add(*feeds)

        except:
            LOGGER.exception(u'Could not add feeds to article %s',
                             article.id)

    # Get a chance to catch the duplicate if workers were fast.
    # At the cost of another DB read, this will save some work
    # in repair scripts, and avoid some writes when creating reads.
    article = cls.objects.get(id=article.id)

    if article.duplicate_of_id:
        if settings.DEBUG:
            LOGGER.debug(u'Caught on-the-fly duplicate %s, returning '
                         u'master %s instead.', article.id,
                         article.duplicate_of_id)

        return article.duplicate_of, False

    return article, created_retval
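A hypothetical caller illustrating the three-state return value documented in the docstring above; ``title``, ``url`` and ``feed`` are assumed variables from the surrounding context.

article, created = Article.create_article(title=title, url=url, feeds=[feed])

if created:
    pass          # brand new article, row just inserted
elif created is None:
    pass          # mutualized: already existed, but is new to this feed
else:
    pass          # pure duplicate: already attached to this very feed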
def test_utm_star(self):

    good_url = u'http://test.com/mytest/'

    for bad_url in (
        u'http://test.com/mytest/?=',
        u'http://test.com/mytest/?#',
        u'http://test.com/mytest/#?=',
        u'http://test.com/mytest/?=rss',
        u'http://test.com/mytest/?=rss-450',
        u'http://test.com/mytest/?=rss-450&',
        u'http://test.com/mytest/?=rss-450&=rss',
        u'http://test.com/mytest/?utm_X',
        u'http://test.com/mytest/?utm_X&',
        u'http://test.com/mytest/?utm_X=',
        u'http://test.com/mytest/?utm_X=&',
        u'http://test.com/mytest/?utm_X=toto',
        u'http://test.com/mytest/?utm_X=toto&',
        u'http://test.com/mytest/?utm_source=toto&utm_Y',
        u'http://test.com/mytest/?utm_source=toto&utm_Y&',
        u'http://test.com/mytest/?utm_source=toto&utm_Y=',
        u'http://test.com/mytest/?utm_source=toto&utm_Y=&',
        u'http://test.com/mytest/?utm_source=toto&utm_Y=titi',
        u'http://test.com/mytest/?utm_source=toto&utm_Y=titi&',
        u'http://test.com/mytest/#xtor',
        u'http://test.com/mytest/#xtor=',
        u'http://test.com/mytest/#xtor=tata',
        u'http://test.com/mytest/#xtor&',
        u'http://test.com/mytest/#xtor=&',
        u'http://test.com/mytest/#xtor=tata&',
        u'http://test.com/mytest/?utm_X#xtor',
        u'http://test.com/mytest/?utm_X#xtor=',
        u'http://test.com/mytest/?utm_X#xtor=tata',
        u'http://test.com/mytest/?utm_campaign&#xtor',
        u'http://test.com/mytest/?utm_campaign&#xtor=',
        u'http://test.com/mytest/?utm_campaign&#xtor=tata',
        u'http://test.com/mytest/?utm_X=&#xtor',
        u'http://test.com/mytest/?utm_X=&#xtor=',
        u'http://test.com/mytest/?utm_X=&#xtor=tata',
        u'http://test.com/mytest/?utm_X=toto#xtor',
        u'http://test.com/mytest/?utm_X=toto#xtor=',
        u'http://test.com/mytest/?utm_X=toto#xtor=tata',
        u'http://test.com/mytest/?utm_X=toto&#xtor',
        u'http://test.com/mytest/?utm_X=toto&#xtor=',
        u'http://test.com/mytest/?utm_X=toto&#xtor=tata',
        u'http://test.com/mytest/?utm_X=toto&utm_Y#xtor',
        u'http://test.com/mytest/?utm_X=toto&utm_Y#xtor=',
        u'http://test.com/mytest/?utm_X=toto&utm_Y#xtor=tata',
        u'http://test.com/mytest/?utm_X=toto&utm_Y=#xtor',
        u'http://test.com/mytest/?utm_X=toto&utm_Y=#xtor=',
        u'http://test.com/mytest/?utm_X=toto&utm_Y=#xtor=tata',
        u'http://test.com/mytest/?utm_X=toto&utm_Y=titi#xtor',
        u'http://test.com/mytest/?utm_X=toto&utm_Y=titi#xtor=',
        u'http://test.com/mytest/?utm_X=toto&utm_Y=titi#xtor=tata',
        u'http://test.com/mytest/?utm_X=toto&utm_Y=titi&#xtor',
        u'http://test.com/mytest/?utm_X=toto&utm_Y=titi&#xtor=',
        u'http://test.com/mytest/?utm_X=toto&utm_Y=titi&#xtor=tata',
        u'http://test.com/mytest/?xtor',
        u'http://test.com/mytest/?xtor=',
        u'http://test.com/mytest/?xtor=tata',
        u'http://test.com/mytest/?xtor=tata&',
        u'http://test.com/mytest/?utm_X&xtor',
        u'http://test.com/mytest/?utm_X&xtor=',
        u'http://test.com/mytest/?utm_X&xtor=tata',
        u'http://test.com/mytest/?utm_X&xtor=tata&',
        u'http://test.com/mytest/?utm_X=&xtor',
        u'http://test.com/mytest/?utm_X=&xtor=',
        u'http://test.com/mytest/?utm_X=&xtor=tata',
        u'http://test.com/mytest/?utm_X=&xtor=tata&',
        u'http://test.com/mytest/?utm_X=toto&xtor',
        u'http://test.com/mytest/?utm_X=toto&xtor=',
        u'http://test.com/mytest/?utm_X=toto&xtor=tata',
        u'http://test.com/mytest/?utm_X=toto&xtor=tata&',
        u'http://test.com/mytest/?utm_X=toto&utm_Y&xtor',
        u'http://test.com/mytest/?utm_X=toto&utm_Y&xtor=',
        u'http://test.com/mytest/?utm_X=toto&utm_Y&xtor=tata',
        u'http://test.com/mytest/?utm_X=toto&utm_Y&xtor=tata&',
        u'http://test.com/mytest/?utm_X=toto&utm_Y=&xtor',
        u'http://test.com/mytest/?utm_X=toto&utm_Y=&xtor=',
        u'http://test.com/mytest/?utm_X=toto&utm_Y=&xtor=tata',
        u'http://test.com/mytest/?utm_X=toto&utm_Y=&xtor=tata&',
        u'http://test.com/mytest/?utm_X=toto&utm_Y=titi&xtor',
        u'http://test.com/mytest/?utm_X=toto&utm_Y=titi&xtor=',
        u'http://test.com/mytest/?utm_X=toto&utm_Y=titi&xtor=tata',
        u'http://test.com/mytest/?utm_X=toto&utm_Y=titi&xtor=tata&',
    ):
        self.assertEquals(clean_url(bad_url), good_url)
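The two test methods above pin down what clean_url() is expected to strip. Below is a minimal sketch of such a cleaner; it is not the project's implementation and deliberately ignores a few of the edge cases exercised above (for instance the bare '#?=' fragments), but it shows the general idea: drop utm_*/xtor tracking parameters and nameless '?=…' pseudo-parameters while leaving legitimate query strings such as '?w=1280&h=948' untouched.

import re

try:
    # Python 2, to match the u'' literals used in the tests above.
    from urlparse import urlsplit, urlunsplit
except ImportError:
    from urllib.parse import urlsplit, urlunsplit

# utm_* and xtor parameters, with or without a value.
TRACKING_RE = re.compile(r'^(utm_[^=&]*|xtor)([=&].*)?$')


def naive_clean_url(url):
    """ Simplified stand-in for clean_url(): drop tracking parameters. """

    scheme, netloc, path, query, fragment = urlsplit(url)

    kept = []

    for chunk in query.split('&'):
        if not chunk or chunk.startswith('='):
            # Nameless pseudo-parameters such as '?=' or '?=rss-450'.
            continue

        if TRACKING_RE.match(chunk):
            continue

        kept.append(chunk)

    if TRACKING_RE.match(fragment):
        # '#xtor=A-B-C' style tracking fragments.
        fragment = ''

    return urlunsplit((scheme, netloc, path, '&'.join(kept), fragment))

For instance, naive_clean_url(u'http://test.com/mytest/?utm_source=toto&utm_Y=titi#xtor=tata') yields u'http://test.com/mytest/', while the WordPress image URL ending in ?w=1280&h=948 passes through unchanged.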
def process(self, instance, parameters=None, commit=True, **kwargs):
    """ See source code. """

    instance_name = instance._meta.verbose_name
    instance_id = instance.id

    urls = URL_MATCH_REGEX.findall(instance.content)

    if not urls:
        LOGGER.info(u'url-crawler: nothing to crawl in %s %s.',
                    instance_name, instance_id)
        return

    # Start with EasyList
    adblock_rules_list = requests_get(
        # WARNING: do not .split() with no parameters, else
        # adblock will block everything due to empty rules.
        'https://easylist-downloads.adblockplus.org/easylist.txt').split('\n')

    # Append our eventual specific exclusions
    adblock_rules_list.extend(
        parameters.get(
            'integration', {}).get(
            'fetch_content_urls', {}).get(
            'adblock_rules', []))

    if re2 is None:
        # Things will be dog-slow…
        adblock_rules = AdblockRules(
            adblock_rules_list,
            max_mem=config.PROCESSING_ADBLOCK_MAXIMUM_MEMORY)

    else:
        # Things will go faster
        adblock_rules = AdblockRules(
            adblock_rules_list, use_re2=True,
            max_mem=config.PROCESSING_ADBLOCK_MAXIMUM_MEMORY)

    if isinstance(instance, models.Email):
        origin = models.ORIGINS.EMAIL

        # NOTE: there will be at least one here, else
        # accepts() would have rejected the email.
        feeds = instance.feeds.exclude(
            MailFeed___match_action=MAIL_MATCH_ACTIONS.STORE)

    else:
        origin = models.ORIGINS.CRAWLING
        feeds = instance.feeds.all()

    dupes = 0
    blocked = 0

    # LOGGER.debug('URLS: %s %s', len(urls), urls)

    for url in urls:
        if url.startswith('('):
            url = url[1:]

            if url.endswith(')'):
                # Skip Markdown's enclosing parenthesis
                # that we explicitly matched manually.
                url = url[:-1]

        # In case we've got garbage at the end of the RE.
        splitted = url.split(')')

        if len(splitted) == 1:
            pass

        elif len(splitted) == 2 and len(splitted[1]) < 4:
            # Highly probable that we got some garbage at the end.
            url = splitted[0]

        else:
            LOGGER.error(u'url-crawler: probable nasty unhandled '
                         u'URL “%s” too-greedily matched by RE.', url)

        if adblock_rules.should_block(url):
            LOGGER.info(u'url-crawler: URL %s skipped, in adblock rules.',
                        url)
            blocked += 1
            continue

        LOGGER.info('url-crawler: importing from %s.', url)

        try:
            item, created = create_item_from_url(
                url=clean_url(url),
                feeds=feeds,
                origin=origin,
            )

        except:
            LOGGER.exception(u'Could not create item from URL “%s”', url)

        else:
            if created:
                LOGGER.info(u'url-crawler: successfully imported %s from '
                            u'%s %s.', item, instance_name, instance_id)

            else:
                dupes += 1
                LOGGER.warning(u'url-crawler: %s already in database.', item)

            # Link the newly created item to the item it was found in.
            item.sources.add(instance)

    LOGGER.info(u'url-crawler: crawled %s items (%s new) from %s %s.',
                len(urls) - blocked, len(urls) - blocked - dupes,
                instance_name, instance_id)
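A stand-alone illustration of the adblock filtering used above, with two hard-coded rules instead of the downloaded EasyList. It assumes the ``adblockparser`` package, which provides the AdblockRules class and should_block() call seen in the processor; the sample URLs are made up.

from adblockparser import AdblockRules

rules = AdblockRules([u'||doubleclick.net^', u'/banners/*'])

print(rules.should_block(u'http://ad.doubleclick.net/pixel'))        # True
print(rules.should_block(u'http://example.com/banners/728x90.png'))  # True
print(rules.should_block(u'http://example.com/article/42'))          # False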
def process(self, instance, verbose=True, commit=True, **kwargs):
    """ See source code. """

    # from https://github.com/erikriver/opengraph
    #
    # site_name => YouTube
    # description => Eric Clapton and Paul McCartney perform George Harrison's "While My Guitar Gently Weeps" at the...  # NOQA
    # title => While My Guitar Gently Weeps
    # url => http://www.youtube.com/watch?v=q3ixBmDzylQ
    # image => http://i2.ytimg.com/vi/q3ixBmDzylQ/default.jpg
    # video:type => application/x-shockwave-flash
    # video:height => 224
    # video => http://www.youtube.com/v/q3ixBmDzylQ?version=3&autohide=1  # NOQA
    # video:width => 398
    # type => video

    instance_name = instance._meta.verbose_name
    instance_id = instance.id

    try:
        og_article = opengraph.OpenGraph(html=instance.content)

    except:
        # Not worth a round trip to sentry in most cases.
        # A warning will suffice. Developers can still debug
        # the article manually if wanted.
        LOGGER.warning(u'opengraph: parsing %s %s failed, aborting.',
                       instance_name, instance_id)
        return

    if not og_article.is_valid():
        LOGGER.warning(u'opengraph: invalid OpenGraph data in %s %s, '
                       u'aborting.', instance_name, instance_id)
        return

    needs_commit = False

    # ––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––– Title

    name_needs_extraction = get_processor_by_slug(
        '1fs-article-title-extract-accept-conditions').accepts(
            instance, verbose=verbose, commit=commit, **kwargs)

    if data_ok(og_article.title) and name_needs_extraction:
        if isinstance(og_article.title, list):
            # Cf. http://blog.dbth.fr/2015/03/la-liberte-de-fermer-ta-gueule-ou-du-sexisme-dans-la-musique/  # NOQA
            instance.name = og_article.title[0]

        else:
            instance.name = og_article.title

        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s name to “%s”.',
                        instance_name, instance_id, instance.name)

    # –––––––––––––––––––––––––––––––––––––––––––––––––––––––––– Date published
    #
    # http://ogp.me/#type_article
    #
    # article:published_time - datetime - When the article was first published.
    # article:modified_time - datetime - When the article was last changed.
    # article:expiration_time - datetime - When the article is out of date after.  # NOQA
    # article:author - profile array - Writers of the article.
    # article:section - string - A high-level section name. E.g. Technology
    # article:tag - string array - Tag words associated with this article.
    #
    # http://ogp.me/#type_profile (for author)

    og_pub_time = og_article.get('article__published_time', None)

    if instance.date_published is None and data_ok(og_pub_time):
        parsed_datetime = datetime_extended_parser(og_pub_time)

        if parsed_datetime is None:
            LOGGER.warning(u'OpenGraph article:published_time “%s” is '
                           u'unparseable.', og_pub_time)

        else:
            date_published = datetime(*parsed_datetime[:6])
            instance.date_published = date_published
            needs_commit = True

            LOGGER.info(u'opengraph: set %s %s published date.',
                        instance_name, instance_id)

    # ––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––– Description

    og_description = og_article.get('description', None)

    if data_ok(og_description) and not data_ok(instance.excerpt):
        instance.excerpt = og_description
        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s excerpt.',
                        instance_name, instance_id)

    # ––––––––––––––––––––––––––––––––––––––––––––––––––––––—–––––––––– Authors
    #
    # TODO
    #

    # –––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––——— Language

    og_language = og_article.get('language', None)

    if data_ok(og_language) and instance.language_id is None:
        instance.language = models.Language.get_by_code(og_language)
        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s language to %s.',
                        instance_name, instance_id, instance.language)

    # –––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––——— Tags

    og_tags = og_article.get('article__tag', None)

    if data_ok(og_tags):
        if not isinstance(og_tags, list):
            og_tags = [og_tags]

        if og_tags and not instance.tags.exists():
            instance.tags.add(
                *models.SimpleTag.get_tags_set(og_tags, origin=instance))

            if verbose:
                LOGGER.info(u'opengraph: set %s %s tag(s) to %s.',
                            instance_name, instance_id, u', '.join(og_tags))

    # ––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––– Front image

    og_image = og_article.get('image', None)

    if data_ok(og_image) and not data_ok(instance.image_url):
        if isinstance(og_image, list):
            instance.image_url = clean_url(og_image[0])

        else:
            instance.image_url = clean_url(og_image)

        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s image_url to %s.',
                        instance_name, instance_id, instance.image_url)

    # –––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––——————— Beer

    if needs_commit and commit:
        # As we changed only fields that were previously
        # unset, no need to waste a version.
        instance.save_without_historical_record()
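For reference, a tiny stand-alone illustration of the OpenGraph parsing the processor above relies on (the erikriver/opengraph package linked in its comment). The HTML snippet is made up; the access patterns mirror the ones used above, and exact behaviour depends on that package's version.

import opengraph

og = opengraph.OpenGraph(html=u"""
<html><head>
<meta property="og:title" content="While My Guitar Gently Weeps" />
<meta property="og:type" content="video" />
<meta property="og:url" content="http://www.youtube.com/watch?v=q3ixBmDzylQ" />
<meta property="og:image" content="http://i2.ytimg.com/vi/q3ixBmDzylQ/default.jpg" />
<meta property="og:description" content="Eric Clapton and Paul McCartney perform…" />
</head><body></body></html>
""")

if og.is_valid():
    print(og.title)         # u'While My Guitar Gently Weeps'
    print(og.get('image'))  # the og:image URL, later run through clean_url()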