def repair_missing_authors_migration_201411(cls):
    """ One-shot 2014-11 migration helper: re-run post-processing on
    every article created after 2014-10-31 that still has no author. """

    # from oneflow.core.tasks.migration import vacuum_analyze

    orphaned = Article.objects.filter(
        authors=None,
        date_created__gt=datetime(2014, 10, 31))

    total = orphaned.count()
    repaired = 0

    LOGGER.info(u'Starting repairing %s missing authors @%s', total, now())

    with benchmark(u'Fix missing authors on rel-DB fetched content…'):
        for orphan in orphaned:
            # Re-running the original-data post-processing
            # re-extracts (and re-attaches) the authors.
            orphan.postprocess_original_data(force=True)

            # if repaired % 25000 == 0:
            #     vacuum_analyze()

            repaired += 1
def long_in_the_past():
    """ Give a datetime far anterior to 1flow's launch. """

    return datetime(year=2007, month=1, day=1)
def process(self, instance, verbose=True, commit=True, **kwargs):
    """ Extract OpenGraph metadata from ``instance.content`` and fill
    the instance's still-unset fields (name, date_published, excerpt,
    language, tags, image_url).

    Only empty fields are filled; when something changed and ``commit``
    is True, the instance is saved without creating a new historical
    record (nothing pre-existing was overwritten).

    :param instance: a content item (eg. an Article) whose ``content``
        attribute holds HTML.
    :param verbose: when True, log each field set at INFO level.
    :param commit: when True, save the instance if modified.
    """

    # Example parse result, from https://github.com/erikriver/opengraph
    # site_name => YouTube
    # description => Eric Clapton and Paul McCartney perform George Harrison's "While My Guitar Gently Weeps" at the...  # NOQA
    # title => While My Guitar Gently Weeps
    # url => http://www.youtube.com/watch?v=q3ixBmDzylQ
    # image => http://i2.ytimg.com/vi/q3ixBmDzylQ/default.jpg
    # video:type => application/x-shockwave-flash
    # video:height => 224
    # video => http://www.youtube.com/v/q3ixBmDzylQ?version=3&autohide=1  # NOQA
    # video:width => 398
    # type => video

    instance_name = instance._meta.verbose_name
    instance_id = instance.id

    try:
        og_article = opengraph.OpenGraph(html=instance.content)

    except Exception:
        # BUGFIX: narrowed from a bare “except:” so SystemExit /
        # KeyboardInterrupt are no longer swallowed.
        #
        # Not worth a round trip to sentry in most cases.
        # A warning will suffice. Developers can still debug
        # the article manually if wanted.
        LOGGER.warning(u'opengraph: parsing %s %s failed, aborting.',
                       instance_name, instance_id)
        return

    if not og_article.is_valid():
        LOGGER.warning(u'opengraph: invalid OpenGraph data in %s %s, '
                       u'aborting.', instance_name, instance_id)
        return

    needs_commit = False

    # ————————————————————————————————————————————————————————————— Title

    name_needs_extraction = get_processor_by_slug(
        '1fs-article-title-extract-accept-conditions').accepts(
            instance, verbose=verbose, commit=commit, **kwargs)

    if data_ok(og_article.title) and name_needs_extraction:
        if isinstance(og_article.title, list):
            # Some sites repeat og:title; keep the first occurrence.
            # Cf. http://blog.dbth.fr/2015/03/la-liberte-de-fermer-ta-gueule-ou-du-sexisme-dans-la-musique/  # NOQA
            instance.name = og_article.title[0]
        else:
            instance.name = og_article.title

        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s name to “%s”.',
                        instance_name, instance_id, instance.name)

    # ———————————————————————————————————————————————————— Date published
    # http://ogp.me/#type_article
    #
    # article:published_time - datetime - When the article was first published.  # NOQA
    # article:modified_time - datetime - When the article was last changed.
    # article:expiration_time - datetime - When the article is out of date after.  # NOQA
    # article:author - profile array - Writers of the article.
    # article:section - string - A high-level section name. E.g. Technology
    # article:tag - string array - Tag words associated with this article.
    #
    # http://ogp.me/#type_profile (for author)

    og_pub_time = og_article.get('article__published_time', None)

    if instance.date_published is None and data_ok(og_pub_time):
        parsed_datetime = datetime_extended_parser(og_pub_time)

        if parsed_datetime is None:
            LOGGER.warning(u'OpenGraph article:published_time “%s” is '
                           u'unparseable.', og_pub_time)
        else:
            # Keep only year → second fields from the parsed tuple.
            date_published = datetime(*parsed_datetime[:6])
            instance.date_published = date_published
            needs_commit = True

            # CONSISTENCY: this log was previously emitted even with
            # verbose=False, unlike every other section of this method.
            if verbose:
                LOGGER.info(u'opengraph: set %s %s published date.',
                            instance_name, instance_id)

    # ——————————————————————————————————————————————————————— Description

    og_description = og_article.get('description', None)

    if data_ok(og_description) and not data_ok(instance.excerpt):
        instance.excerpt = og_description
        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s excerpt.',
                        instance_name, instance_id)

    # ——————————————————————————————————————————————————————————— Authors
    #
    # TODO
    #

    # —————————————————————————————————————————————————————————— Language

    og_language = og_article.get('language', None)

    if data_ok(og_language) and instance.language_id is None:
        instance.language = models.Language.get_by_code(og_language)
        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s language to %s.',
                        instance_name, instance_id, instance.language)

    # —————————————————————————————————————————————————————————————— Tags

    og_tags = og_article.get('article__tag', None)

    if data_ok(og_tags):
        if not isinstance(og_tags, list):
            og_tags = [og_tags]

        # Fill tags only when the instance has none yet.
        if og_tags and not instance.tags.exists():
            instance.tags.add(*models.SimpleTag.get_tags_set(
                og_tags, origin=instance))

            if verbose:
                LOGGER.info(u'opengraph: set %s %s tag(s) to %s.',
                            instance_name, instance_id,
                            u', '.join(og_tags))

    # ——————————————————————————————————————————————————————— Front image

    og_image = og_article.get('image', None)

    if data_ok(og_image) and not data_ok(instance.image_url):
        if isinstance(og_image, list):
            instance.image_url = clean_url(og_image[0])
        else:
            instance.image_url = clean_url(og_image)

        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s image_url to %s.',
                        instance_name, instance_id, instance.image_url)

    # —————————————————————————————————————————————————————————————— Beer

    if needs_commit and commit:
        # As we changed only fields that were previously
        # unset, no need to waste a version.
        instance.save_without_historical_record()
DjangoUser as User, READ_STATUS_DATA, WATCH_ATTRIBUTES_FIELDS_NAMES, ) from folder import Folder # Avoid import loop. # from subscription import Subscription, generic_check_subscriptions_method from tag import AbstractTaggedModel, SimpleTag as Tag from item import BaseItem, Poke LOGGER = logging.getLogger(__name__) MIGRATION_DATETIME = datetime(2014, 11, 1) __all__ = [ 'Read', 'ReadManager', 'ReadQuerySet', 'BOOKMARK_TYPES', ] BOOKMARK_TYPES = NamedTupleChoices( 'BOOKMARK_TYPES', ('UNDEFINED', u'U', _(u'Undefined')), ('AFTERNOON', u'A', _(u'This afternoon')), ('WEEK_END', u'W', _(u'This week-end')), # The second char will be used for user defined bookmark types.
from django.conf import settings from django.core.mail import mail_admins from ..models.nonrel import (RATINGS, Article, Feed, Subscription, Read, User as MongoUser) from ..gr_import import GoogleReaderImport from oneflow.base.utils.dateutils import (now, ftstamp, datetime, naturaldelta, naturaltime) from common import User # We don't fetch articles too far in the past, even if google has them. GR_OLDEST_DATE = datetime(2008, 1, 1) LOGGER = logging.getLogger(__name__) def get_user_from_dbs(user_id): django_user = User.objects.get(id=user_id) MongoUser.objects(django_user=django_user.id).update_one( set__django_user=django_user.id, upsert=True) return django_user, MongoUser.objects.get(django_user=django_user.id) def import_google_reader_trigger(user_id, refresh=False): """ This function allow to trigger the celery task from anywhere. just pass it a user ID. It's called from the views, and we created
def process(self, instance, parameters=None, verbose=True, force=False,
            commit=True, **kwargs):
    """ Run the Five-Filters (ftr) extractor on ``instance`` and fill
    its title, content, pages URLs, publication date, language and
    authors.

    Unless ``force`` is True, only still-unset fields are filled. When
    ``commit`` is True and something changed, the instance is saved —
    with a new history version only when ``force`` was used.

    :param parameters: optional dict; ``metadata.siteconfig`` may hold
        a siteconfig string overriding the repository one. ``None``
        (the default) means “no override”.
    """

    # Get an eventual siteconfig override from parameters. If `None`,
    # the `process()` wrapper will fetch it from repositories as usual.
    # BUGFIX: `parameters` defaults to None; the former unguarded
    # `parameters.get(…)` raised AttributeError when it was not given.
    siteconfig_string = (parameters or {}).get(
        'metadata', {}).get('siteconfig', None)

    if siteconfig_string is None:
        siteconfig = None
    else:
        try:
            siteconfig = ftr.SiteConfig(site_config_text=siteconfig_string)
        except Exception:
            # BUGFIX: narrowed from a bare “except:”.
            LOGGER.exception(
                u'ftr-extractor: unusable custom siteconfig, aborting.')
            # TODO: mail admins…
            return

        if verbose:
            LOGGER.info(u'ftr-extractor: custom siteconfig override loaded.')

    # FTR logs a lot, and it's useless if not in verbose / debug mode,
    # because siteconfigs are debugged at the lower-level, not from the
    # 1flow processor.
    logging.disable(logging.WARNING)

    try:
        try:
            # Note: in case of multiple-pages article, this should
            # bring us ALL the content, concatenated in one page.
            extractor = ftr.process(
                # HEADS UP: Email items have no `.url` field.
                url=getattr(instance, 'url', None),
                content=instance.content,
                config=siteconfig)

        except ftr.SiteConfigException:
            # No configuration for the website or syntax
            # error in siteconfig. Bail out, another
            # processor will take care of this article.
            return
    finally:
        # Always restore normal logging, even when bailing out.
        logging.disable(logging.NOTSET)

    instance_name = instance._meta.verbose_name
    instance_id = instance.id
    needs_save = False

    # General processing note:
    # We use extracted attributes only if they were the result of an
    # intended extraction. If they come from automatic extraction after
    # a failure, we discard them.

    # ————————————————————————————————————————————————————————————— Title

    if (extractor.title is not None
            and 'title' not in extractor.failures) and (
            force or instance.name is None):
        instance.name = extractor.title
        needs_save = True

        if verbose:
            LOGGER.info(u'ftr-extractor: Set %s %s title to “%s”.',
                        instance_name, instance_id, instance.name)

    # —————————————————————————————————————————————————— Body / content

    if (extractor.body is not None
            and 'body' not in extractor.failures) and (
            force or instance.content_type is models.CONTENT_TYPES.HTML):
        instance.content = extractor.body
        instance.content_type = (
            # `prune` option already produces
            # cleaned HTML and is usually sufficient.
            models.CONTENT_TYPES.CLEANED_HTML
            if extractor.config.prune
            else models.CONTENT_TYPES.HTML)
        needs_save = True

        if verbose:
            LOGGER.info(u'ftr-extractor: Set %s %s content to %s of '
                        u'genuine %s.', instance_name, instance_id,
                        naturalsize(len(instance.content)),
                        models.CONTENT_TYPES.symbolic(instance.content_type))

    # ———————————————————————————————————————————————— Multi-pages URLs
    # HEADS UP: getattr(…) because Email items have no `.pages_urls` field.

    if (bool(extractor.next_page_link)
            and 'next_page_link' not in extractor.failures) and (
            force or getattr(instance, 'pages_urls', None) in (None, [])):
        instance.pages_urls = extractor.next_page_link
        needs_save = True

        if verbose:
            LOGGER.info(u'ftr-extractor: Recorded %s multi-pages URLs '
                        u'for %s %s.', len(extractor.next_page_link),
                        instance_name, instance_id)

    # —————————————————————————————————————————————————— Date published
    # BUGFIX ×2: the failures test checked 'title' (copy/paste) instead
    # of 'date', and datetime() was called without unpacking the parsed
    # tuple — it always raised and the date was silently never set.

    if (extractor.date is not None
            and 'date' not in extractor.failures) and (
            force or instance.date_published is None):
        try:
            the_datetime = datetime(
                *datetime_extended_parser(extractor.date)[:6])
        except Exception:
            # Unparseable date: best effort only, skip it silently.
            pass
        else:
            if is_naive(the_datetime):
                the_datetime = make_aware(the_datetime, utc)

            instance.date_published = the_datetime
            needs_save = True

            if verbose:
                LOGGER.info(u'ftr-extractor: Set %s %s date to %s.',
                            instance_name, instance_id,
                            instance.date_published)

    # ———————————————————————————————————————————————————————— Language

    if (extractor.language is not None
            and 'language' not in extractor.failures) and (
            force or instance.language_id is None):
        instance.language = models.Language.get_by_code(extractor.language)
        needs_save = True

        if verbose:
            LOGGER.info(u'ftr-extractor: Set %s %s language to %s.',
                        instance_name, instance_id, instance.language)

    # ——————————————————————————————————————————————————————— Author(s)

    if (bool(extractor.author)
            and 'author' not in extractor.failures) and (
            force or not instance.authors.exists()):
        # This call attaches the authors to the article itself,
        # which is why `needs_save` is not touched here.
        authors = models.Author.get_authors_from_name_emails_and_article(
            authors=[{'name': name} for name in extractor.author],
            origin_article=instance)

        if verbose:
            LOGGER.info(u'ftr-extractor: Set %s %s author(s) to %s.',
                        instance_name, instance_id,
                        u', '.join(unicode(a) for a in authors))

    if needs_save and commit:
        # If the processing was forced, we consider keeping the previous
        # version is a safe practice, to be able to eventually go back.
        # Else, just don't clutter the system with another version.
        if force:
            instance.save()
        else:
            instance.save_without_historical_record()
def process(self, instance, verbose=True, commit=True, **kwargs):
    """ Extract OpenGraph metadata from ``instance.content`` and fill
    the instance's still-unset fields (name, date_published, excerpt,
    language, tags, image_url).

    Only empty fields are filled; when something changed and ``commit``
    is True, the instance is saved without creating a new historical
    record (nothing pre-existing was overwritten).

    :param instance: a content item (eg. an Article) whose ``content``
        attribute holds HTML.
    :param verbose: when True, log each field set at INFO level.
    :param commit: when True, save the instance if modified.
    """

    # Example parse result, from https://github.com/erikriver/opengraph
    # site_name => YouTube
    # description => Eric Clapton and Paul McCartney perform George Harrison's "While My Guitar Gently Weeps" at the...  # NOQA
    # title => While My Guitar Gently Weeps
    # url => http://www.youtube.com/watch?v=q3ixBmDzylQ
    # image => http://i2.ytimg.com/vi/q3ixBmDzylQ/default.jpg
    # video:type => application/x-shockwave-flash
    # video:height => 224
    # video => http://www.youtube.com/v/q3ixBmDzylQ?version=3&autohide=1  # NOQA
    # video:width => 398
    # type => video

    instance_name = instance._meta.verbose_name
    instance_id = instance.id

    try:
        og_article = opengraph.OpenGraph(html=instance.content)

    except Exception:
        # BUGFIX: narrowed from a bare “except:” so SystemExit /
        # KeyboardInterrupt are no longer swallowed.
        #
        # Not worth a round trip to sentry in most cases.
        # A warning will suffice. Developers can still debug
        # the article manually if wanted.
        LOGGER.warning(u'opengraph: parsing %s %s failed, aborting.',
                       instance_name, instance_id)
        return

    if not og_article.is_valid():
        LOGGER.warning(
            u'opengraph: invalid OpenGraph data in %s %s, '
            u'aborting.', instance_name, instance_id)
        return

    needs_commit = False

    # ————————————————————————————————————————————————————————————— Title

    name_needs_extraction = get_processor_by_slug(
        '1fs-article-title-extract-accept-conditions').accepts(
            instance, verbose=verbose, commit=commit, **kwargs)

    if data_ok(og_article.title) and name_needs_extraction:
        if isinstance(og_article.title, list):
            # Some sites repeat og:title; keep the first occurrence.
            # Cf. http://blog.dbth.fr/2015/03/la-liberte-de-fermer-ta-gueule-ou-du-sexisme-dans-la-musique/  # NOQA
            instance.name = og_article.title[0]
        else:
            instance.name = og_article.title

        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s name to “%s”.',
                        instance_name, instance_id, instance.name)

    # ———————————————————————————————————————————————————— Date published
    # http://ogp.me/#type_article
    #
    # article:published_time - datetime - When the article was first published.  # NOQA
    # article:modified_time - datetime - When the article was last changed.
    # article:expiration_time - datetime - When the article is out of date after.  # NOQA
    # article:author - profile array - Writers of the article.
    # article:section - string - A high-level section name. E.g. Technology
    # article:tag - string array - Tag words associated with this article.
    #
    # http://ogp.me/#type_profile (for author)

    og_pub_time = og_article.get('article__published_time', None)

    if instance.date_published is None and data_ok(og_pub_time):
        parsed_datetime = datetime_extended_parser(og_pub_time)

        if parsed_datetime is None:
            LOGGER.warning(
                u'OpenGraph article:published_time “%s” is '
                u'unparseable.', og_pub_time)
        else:
            # Keep only year → second fields from the parsed tuple.
            date_published = datetime(*parsed_datetime[:6])
            instance.date_published = date_published
            needs_commit = True

            # CONSISTENCY: this log was previously emitted even with
            # verbose=False, unlike every other section of this method.
            if verbose:
                LOGGER.info(u'opengraph: set %s %s published date.',
                            instance_name, instance_id)

    # ——————————————————————————————————————————————————————— Description

    og_description = og_article.get('description', None)

    if data_ok(og_description) and not data_ok(instance.excerpt):
        instance.excerpt = og_description
        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s excerpt.',
                        instance_name, instance_id)

    # ——————————————————————————————————————————————————————————— Authors
    #
    # TODO
    #

    # —————————————————————————————————————————————————————————— Language

    og_language = og_article.get('language', None)

    if data_ok(og_language) and instance.language_id is None:
        instance.language = models.Language.get_by_code(og_language)
        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s language to %s.',
                        instance_name, instance_id, instance.language)

    # —————————————————————————————————————————————————————————————— Tags

    og_tags = og_article.get('article__tag', None)

    if data_ok(og_tags):
        if not isinstance(og_tags, list):
            og_tags = [og_tags]

        # Fill tags only when the instance has none yet.
        if og_tags and not instance.tags.exists():
            instance.tags.add(
                *models.SimpleTag.get_tags_set(og_tags, origin=instance))

            if verbose:
                LOGGER.info(u'opengraph: set %s %s tag(s) to %s.',
                            instance_name, instance_id,
                            u', '.join(og_tags))

    # ——————————————————————————————————————————————————————— Front image

    og_image = og_article.get('image', None)

    if data_ok(og_image) and not data_ok(instance.image_url):
        if isinstance(og_image, list):
            instance.image_url = clean_url(og_image[0])
        else:
            instance.image_url = clean_url(og_image)

        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s image_url to %s.',
                        instance_name, instance_id, instance.image_url)

    # —————————————————————————————————————————————————————————————— Beer

    if needs_commit and commit:
        # As we changed only fields that were previously
        # unset, no need to waste a version.
        instance.save_without_historical_record()
from oneflow.base.utils.dateutils import now, datetime from ..common import ORIGINS, CONTENT_TYPES from ..author import Author from base import ( BaseItemQuerySet, BaseItemManager, BaseItem, baseitem_create_reads_task, ) LOGGER = logging.getLogger(__name__) MIGRATION_DATETIME = datetime(2014, 11, 1) __all__ = [ 'Tweet', 'create_tweet_from_id', 'mark_tweet_deleted', # Tasks will be added below by register_task_method(). ] def create_tweet_from_id(tweet_id, feeds=None, origin=None): """ From a Tweet ID, create a 1flow tweet via the REST API.
def process(self, instance, parameters=None, verbose=True, commit=True,
            **kwargs):
    """ Extract schema.org microdata from the instance's HTML and fill
    its unset attributes (name, date_published, excerpt, tags,
    image_url, authors, language, word_count).

    When the current content is not HTML anymore, falls back to the
    earliest known HTML history version (repair mode — existence is
    checked beforehand in accepts()).
    """

    CONTENT_TYPES = models.CONTENT_TYPES
    instance_name = instance._meta.verbose_name
    instance_id = instance.id

    # Only used in accepts() code.
    # repair = parameters.get('repair', False)

    if instance.content_type == CONTENT_TYPES.HTML:
        html_to_work_on = instance.content
    else:
        # The existence of this has already been tested in accepts().
        # We cannot run process() if the instance is not HTML or not
        # repairing it with a known HTML history version.
        html_to_work_on = instance.history.filter(
            content_type=CONTENT_TYPES.HTML).earliest(
                'history_date').content

    try:
        # The microdata parser expects an utf-8 encoded string… too bad.
        items = microdata.get_items(html_to_work_on.encode('utf-8'))
    except Exception:
        # BUGFIX: narrowed from a bare “except:”.
        LOGGER.warning(u'schema.org-extractor: could not extract microdata '
                       u'from %s %s', instance_name, instance_id)
        return

    need_save = False

    # ——————————————————————————————————————————————————————————— Extract

    attributes = OrderedDict()

    for item in items:
        schema_properties = item.props

        # LOGGER.info(u'item %s', item.json())

        # Common attributes to all types we handle in 1flow.
        name = schema_properties.get('name', None)

        # BUGFIX: the guard was inverted (`is not None`), which only
        # ever *overwrote* an already-found name and never recorded a
        # first one. Intent (per original comment): do not overwrite
        # with a less specific value if name was already set via
        # 'Article::headline'.
        if name is not None and attributes.get('name', None) is None:
            attributes['name'] = get_property(name)

        date_published = schema_properties.get('datePublished', None)

        if date_published is not None:
            attributes['date_published'] = get_property(date_published)

        excerpt = schema_properties.get('description', None)

        if excerpt is not None:
            attributes['excerpt'] = get_property(excerpt)

        tags = schema_properties.get('keywords', None)

        if tags is not None:
            attributes['tags'] = extract_tags(tags)

        image_url = schema_properties.get('thumbnailUrl', None)

        if image_url is not None:
            attributes['image_url'] = get_property(image_url)

        authors = schema_properties.get('author', None)

        # Author can be a link to the author page, which
        # will give us a Person or Organization schema.
        if authors is not None:
            found_authors = get_microdata_authors(authors, instance)

            if found_authors:
                attributes['authors'] = found_authors

        genre = schema_properties.get('genre', None)

        if genre is not None:
            if 'tags' not in attributes:
                attributes['tags'] = []

            for one_genre in genre:
                attributes['tags'].extend(extract_tags(one_genre))

        if item.type == 'http://schema.org/VideoObject':
            if instance.content_type != CONTENT_TYPES.VIDEO:
                instance.content_type = CONTENT_TYPES.VIDEO
                need_save = True

                LOGGER.info(u'schema.org-extractor: Set %s %s content type '
                            u'to VIDEO.', instance_name, instance_id)

        elif item.type in (
            'http://schema.org/Article',
            'http://schema.org/NewsArticle',
            'http://schema.org/TechArticle',
            'http://schema.org/BlogPosting',
            'http://schema.org/WebPage',
            'http://schema.org/CreativeWork',
        ):
            # HeadLine overwrites name, it's more specific.
            # NOTE(review): when 'headline' is absent this stores
            # get_property(None) over a previously found name — confirm
            # get_property(None) semantics before tightening this.
            attributes['name'] = get_property(
                schema_properties.get('headline', None))

            attributes['language'] = get_property(
                schema_properties.get('inLanguage', None))

            attributes['word_count'] = get_property(
                schema_properties.get('wordCount', None))

            creators = schema_properties.get('creator', None)

            # Author can be a link to the creator page, which
            # will give us a Person or Organization schema.
            if creators is not None:
                creators = get_microdata_authors(creators, instance)

                if creators:
                    if 'authors' in attributes:
                        attributes['authors'].extend(creators)
                    else:
                        attributes['authors'] = creators

            # TODO:
            # citation
            # comment
            # articleBody → content
            # articleSection → Tags
            #
            # News:
            # dateline → ?
            #
            # Tech:
            # dependencies
            # proficiencyLevel
            #
            # WebPage:
            # specialy → ?
            # significantLink → crawl ?
            # reviewedBy → ?
            # lastReviewed → ?
            # relatedLink → ?
            # primaryImageOfPage

    # ————————————————————————————————————————————— Transform & assign
    # turn attributes into their python / 1flow native-internals formats.

    if attributes.get('date_published', None) is not None:
        try:
            attributes['date_published'] = datetime(
                *datetime_extended_parser(
                    attributes['date_published'])[:6])
        except Exception:
            LOGGER.exception(u'schema.org-extractor: unparseable date “%s”',
                             attributes['date_published'])
            # Be sure we don't try to use it below.
            attributes['date_published'] = None

    if attributes.get('language', None) is not None:
        try:
            attributes['language'] = models.Language.get_by_code(
                attributes['language'])
        except Exception:
            LOGGER.exception(u'schema.org-extractor: unable to get '
                             u'language “%s”', attributes['language'])
            # Be sure we don't try to use it below.
            attributes['language'] = None

    if attributes.get('word_count', None) is not None:
        attributes['word_count'] = int(attributes['word_count'])

    if attributes.get('tags', None) is not None:
        # We pop() tags to avoid trying to setattr() it below.
        tags = models.SimpleTag.get_tags_set(attributes.pop('tags'),
                                             origin=instance)
        instance.tags.add(*tags)

        if verbose:
            LOGGER.info(u'schema.org-extractor: added tags %s to %s %s.',
                        u', '.join(tag.name for tag in tags),
                        instance_name, instance_id)

    if attributes.get('authors', None) is not None:
        # We pop() authors to avoid trying to setattr() it below.
        authors = attributes.pop('authors')

        # LOGGER.info(authors)

        # This will implicitely add() the author to the instance.
        authors = models.Author.get_authors_from_name_emails_and_article(
            authors, origin_article=instance)

        # LOGGER.info(authors)

        LOGGER.info(u'schema.org-extractor: added author(s) %s to %s %s.',
                    u', '.join(unicode(a) for a in authors),
                    instance_name, instance_id)

    # if verbose:
    #     LOGGER.debug(u'schema.org-extractor: %s', attributes)

    # Assign every remaining attribute, but never overwrite
    # an already-set field of the instance.
    for attribute, value in attributes.items():
        if value is None:
            continue

        if getattr(instance, attribute) is None:
            setattr(instance, attribute, value)
            need_save = True

            if verbose:
                LOGGER.info(u'schema.org-extractor: Set %s %s to %s %s.',
                            attribute, value, instance_name, instance_id)

    if need_save and commit:
        instance.save()
def process(self, instance, parameters=None, verbose=True, force=False,
            commit=True, **kwargs):
    """ Run the Five-Filters (ftr) extractor on ``instance`` and fill
    its title, content, pages URLs, publication date, language and
    authors.

    Unless ``force`` is True, only still-unset fields are filled. When
    ``commit`` is True and something changed, the instance is saved —
    with a new history version only when ``force`` was used.

    :param parameters: optional dict; ``metadata.siteconfig`` may hold
        a siteconfig string overriding the repository one. ``None``
        (the default) means “no override”.
    """

    # Get an eventual siteconfig override from parameters. If `None`,
    # the `process()` wrapper will fetch it from repositories as usual.
    # BUGFIX: `parameters` defaults to None; the former unguarded
    # `parameters.get(…)` raised AttributeError when it was not given.
    siteconfig_string = (parameters or {}).get(
        'metadata', {}).get('siteconfig', None)

    if siteconfig_string is None:
        siteconfig = None
    else:
        try:
            siteconfig = ftr.SiteConfig(site_config_text=siteconfig_string)
        except Exception:
            # BUGFIX: narrowed from a bare “except:”.
            LOGGER.exception(
                u'ftr-extractor: unusable custom siteconfig, aborting.')
            # TODO: mail admins…
            return

        if verbose:
            LOGGER.info(u'ftr-extractor: custom siteconfig override loaded.')

    # FTR logs a lot, and it's useless if not in verbose / debug mode,
    # because siteconfigs are debugged at the lower-level, not from the
    # 1flow processor.
    logging.disable(logging.WARNING)

    try:
        try:
            # Note: in case of multiple-pages article, this should
            # bring us ALL the content, concatenated in one page.
            extractor = ftr.process(
                # HEADS UP: Email items have no `.url` field.
                url=getattr(instance, 'url', None),
                content=instance.content,
                config=siteconfig)

        except ftr.SiteConfigException:
            # No configuration for the website or syntax
            # error in siteconfig. Bail out, another
            # processor will take care of this article.
            return
    finally:
        # Always restore normal logging, even when bailing out.
        logging.disable(logging.NOTSET)

    instance_name = instance._meta.verbose_name
    instance_id = instance.id
    needs_save = False

    # General processing note:
    # We use extracted attributes only if they were the result of an
    # intended extraction. If they come from automatic extraction after
    # a failure, we discard them.

    # ————————————————————————————————————————————————————————————— Title

    if (extractor.title is not None
            and 'title' not in extractor.failures) and (
            force or instance.name is None):
        instance.name = extractor.title
        needs_save = True

        if verbose:
            LOGGER.info(u'ftr-extractor: Set %s %s title to “%s”.',
                        instance_name, instance_id, instance.name)

    # —————————————————————————————————————————————————— Body / content

    if (extractor.body is not None
            and 'body' not in extractor.failures) and (
            force or instance.content_type is models.CONTENT_TYPES.HTML):
        instance.content = extractor.body
        instance.content_type = (
            # `prune` option already produces
            # cleaned HTML and is usually sufficient.
            models.CONTENT_TYPES.CLEANED_HTML
            if extractor.config.prune
            else models.CONTENT_TYPES.HTML)
        needs_save = True

        if verbose:
            LOGGER.info(
                u'ftr-extractor: Set %s %s content to %s of '
                u'genuine %s.', instance_name, instance_id,
                naturalsize(len(instance.content)),
                models.CONTENT_TYPES.symbolic(instance.content_type))

    # ———————————————————————————————————————————————— Multi-pages URLs
    # HEADS UP: getattr(…) because Email items have no `.pages_urls` field.

    if (bool(extractor.next_page_link)
            and 'next_page_link' not in extractor.failures) and (
            force or getattr(instance, 'pages_urls', None) in (None, [])):
        instance.pages_urls = extractor.next_page_link
        needs_save = True

        if verbose:
            LOGGER.info(
                u'ftr-extractor: Recorded %s multi-pages URLs '
                u'for %s %s.', len(extractor.next_page_link),
                instance_name, instance_id)

    # —————————————————————————————————————————————————— Date published
    # BUGFIX ×2: the failures test checked 'title' (copy/paste) instead
    # of 'date', and datetime() was called without unpacking the parsed
    # tuple — it always raised and the date was silently never set.

    if (extractor.date is not None
            and 'date' not in extractor.failures) and (
            force or instance.date_published is None):
        try:
            the_datetime = datetime(
                *datetime_extended_parser(extractor.date)[:6])
        except Exception:
            # Unparseable date: best effort only, skip it silently.
            pass
        else:
            if is_naive(the_datetime):
                the_datetime = make_aware(the_datetime, utc)

            instance.date_published = the_datetime
            needs_save = True

            if verbose:
                LOGGER.info(u'ftr-extractor: Set %s %s date to %s.',
                            instance_name, instance_id,
                            instance.date_published)

    # ———————————————————————————————————————————————————————— Language

    if (extractor.language is not None
            and 'language' not in extractor.failures) and (
            force or instance.language_id is None):
        instance.language = models.Language.get_by_code(extractor.language)
        needs_save = True

        if verbose:
            LOGGER.info(u'ftr-extractor: Set %s %s language to %s.',
                        instance_name, instance_id, instance.language)

    # ——————————————————————————————————————————————————————— Author(s)

    if (bool(extractor.author)
            and 'author' not in extractor.failures) and (
            force or not instance.authors.exists()):
        # This call attaches the authors to the article itself,
        # which is why `needs_save` is not touched here.
        authors = models.Author.get_authors_from_name_emails_and_article(
            authors=[{'name': name} for name in extractor.author],
            origin_article=instance)

        if verbose:
            LOGGER.info(u'ftr-extractor: Set %s %s author(s) to %s.',
                        instance_name, instance_id,
                        u', '.join(unicode(a) for a in authors))

    if needs_save and commit:
        # If the processing was forced, we consider keeping the previous
        # version is a safe practice, to be able to eventually go back.
        # Else, just don't clutter the system with another version.
        if force:
            instance.save()
        else:
            instance.save_without_historical_record()