Example No. 1
def set_dateline(item, city, source, set_date=False, text=None):
    """Set the dateline for item"""
    if not city:
        return

    cities = app.locators.find_cities()
    located = [c for c in cities if c['city'].lower() == city.lower()]
    item.setdefault('dateline', {})
    item['dateline']['located'] = located[0] if len(located) > 0 else {
        'city_code': city,
        'city': city,
        'tz': 'UTC',
        'dateline': 'city'
    }
    if set_date:
        item['dateline']['date'] = datetime.fromtimestamp(
            get_date(item['firstcreated']).timestamp(),
            tz=timezone(item['dateline']['located']['tz']))
    item['dateline']['source'] = source
    if text:
        item['dateline']['text'] = text
    else:
        item['dateline']['text'] = format_dateline_to_locmmmddsrc(
            item['dateline']['located'],
            get_date(item['firstcreated']),
            source=source)
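
A standalone sketch of the date handling above when set_date=True, assuming get_date() and timezone() wrap dateutil parsing and pytz respectively (an assumption); the item values are made up.

from datetime import datetime
from dateutil import parser
from pytz import timezone

firstcreated = '2021-03-01T08:00:00+0000'   # hypothetical item['firstcreated']
tz_name = 'Australia/Sydney'                # hypothetical located['tz']
created = parser.parse(firstcreated)        # stands in for get_date(item['firstcreated'])
local_date = datetime.fromtimestamp(created.timestamp(), tz=timezone(tz_name))
print(local_date)                           # 2021-03-01 19:00:00+11:00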
Example No. 2
    def run(self, url, start_date=None, end_date=None, page_size=None):
        try:
            if start_date:
                self.default_start_date = get_date(start_date)
                self.default_end_date = get_date(end_date)

            if page_size:
                self.default_page_size = int(page_size)

            self.export(url)
        except Exception:
            logger.exception('Failed to run the command.')
Example No. 3
    def run(self, url, start_date=None, end_date=None, page_size=None):
        try:
            if start_date:
                self.default_start_date = get_date(start_date)
                self.default_end_date = get_date(end_date)

            if page_size:
                self.default_page_size = int(page_size)

            self.export(url)
        except Exception:
            logger.exception('Failed to run the command.')
Example No. 4
    def _fill_dates(self, document, item):
        tz = 'Europe/Oslo'
        item['dates'] = {'tz': tz}

        for tag in ('startDate', 'timeStart'):
            _time = document.find(tag)
            if _time is not None:
                item['dates']['start'] = local_to_utc(tz, get_date(_time.text))
                break

        for tag in ('stopDate', 'timeEnd'):
            _time = document.find(tag)
            if _time is not None:
                item['dates']['end'] = local_to_utc(tz, get_date(_time.text))
                break
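
The tag lookup above can be exercised on its own; a sketch with a made-up XML fragment and xml.etree.ElementTree standing in for the parsed feed document (the local_to_utc() conversion is left out).

import xml.etree.ElementTree as ET

document = ET.fromstring(
    '<event><startDate>2020-06-01T10:00:00</startDate>'
    '<stopDate>2020-06-01T12:00:00</stopDate></event>')

for tag in ('startDate', 'timeStart'):
    _time = document.find(tag)
    if _time is not None:
        print('start:', _time.text)   # start: 2020-06-01T10:00:00
        break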
Example No. 5
    def ap_derive_dateline(self, item):
        """
        This function looks for a dateline in the article body and uses that.
        :param item:
        :return: item populated with a dateline
        """
        try:
            html = item.get('body_html')
            if html:
                soup = BeautifulSoup(html, "html.parser")
                pars = soup.findAll('p')
                for par in pars:
                    city, source, the_rest = par.get_text().partition(' (AP) _ ')
                    if source:
                        # sometimes the city is followed by a comma and either a date or a state
                        city = city.split(',')[0]
                        if any(char.isdigit() for char in city):
                            return
                        cities = app.locators.find_cities()
                        located = [c for c in cities if c['city'].lower() == city.lower()]
                        item.setdefault('dateline', {})
                        item['dateline']['located'] = located[0] if len(located) > 0 else {'city_code': city,
                                                                                           'city': city,
                                                                                           'tz': 'UTC',
                                                                                           'dateline': 'city'}
                        item['dateline']['source'] = item.get('original_source', 'AP')
                        item['dateline']['text'] = format_dateline_to_locmmmddsrc(item['dateline']['located'],
                                                                                  get_date(item['firstcreated']),
                                                                                  source=item.get('original_source',
                                                                                                  'AP'))
                        break

            return item
        except Exception:
            logging.exception('AP dateline extraction exception')
Example No. 6
def set_default_source(doc):
    """Set the source for the item.

    If a desk-level source is specified then use that source, otherwise fall back to the default from global settings.

    :param {dict} doc: doc where source is defined
    """

    # set the source for the article as default
    source = get_default_source()
    desk_id = doc.get('task', {}).get('desk')

    if desk_id:
        # if desk level source is specified then use that instead of the default source
        desk = get_resource_service('desks').find_one(req=None, _id=desk_id)
        source = desk.get('source') or source

    doc['source'] = source

    if not doc.get('dateline'):
        return

    doc['dateline']['source'] = source

    if not (doc['dateline'].get('located') and doc['dateline'].get('date')):
        return

    if isinstance(doc['dateline'].get('date'), str):
        doc['dateline']['date'] = get_date(doc['dateline'].get('date'))

    doc['dateline']['text'] = format_dateline_to_locmmmddsrc(
        doc['dateline'].get('located'), doc['dateline'].get('date'), source)
Example No. 7
 def derive_dateline(self, item):
     """
     Attempt to derive a dateline using the place; only if there is exactly one match on the city can we be sure we
     have the correct country.
     :param item:
     :return:
     """
     try:
         if len(item.get('place', [])) == 1:
             cities = app.locators.find_cities()
             city = item.get('place', '')[0].get('name', '')
             if city:
                 located = [c for c in cities if c['city'].lower() == city.lower()]
                 if len(located) == 1:
                     item.setdefault('dateline', {})
                     item['dateline']['located'] = located[0]
                     item['dateline']['source'] = item.get('original_source', 'EFE')
                     item['dateline']['text'] = format_dateline_to_locmmmddsrc(item['dateline']['located'],
                                                                               get_date(item['firstcreated']),
                                                                               source=item.get('original_source',
                                                                                               'EFE'))
     except Exception as ex:
         logging.exception('EFE dateline extraction exception {}'.format(ex))
     finally:
         item.pop('place', None)
Example No. 8
def set_default_source(doc):
    """Set the source for the item.

    If a desk-level source is specified then use that source, otherwise fall back to the default from global settings.

    :param {dict} doc: doc where source is defined
    """

    # set the source for the article as default
    source = get_default_source()
    desk_id = doc.get('task', {}).get('desk')

    if desk_id:
        # if desk level source is specified then use that instead of the default source
        desk = get_resource_service('desks').find_one(req=None, _id=desk_id)
        source = desk.get('source') or source

    doc['source'] = source

    if not doc.get('dateline'):
        return

    doc['dateline']['source'] = source

    if not (doc['dateline'].get('located') and doc['dateline'].get('date')):
        return

    if isinstance(doc['dateline'].get('date'), str):
        doc['dateline']['date'] = get_date(doc['dateline'].get('date'))

    doc['dateline']['text'] = format_dateline_to_locmmmddsrc(doc['dateline'].get('located'),
                                                             doc['dateline'].get('date'), source)
Example No. 9
def update_to_pass_validation(item, **kwargs):
    """
    This is a test macro that does what is required to ensure that a text item will pass publication validation.
    It is intended to be used to test auto publishing, that is publishing directly from ingest.
    At the moment virtually all content received from Reuters fails validation.
    :param item:
    :param kwargs:
    :return:
    """
    try:
        lookup = {'act': ITEM_PUBLISH, 'type': CONTENT_TYPE.TEXT}
        validators = superdesk.get_resource_service('validators').get(req=None, lookup=lookup)
        if validators.count():
            max_slugline_len = validators[0]['schema']['slugline']['maxlength']
            max_headline_len = validators[0]['schema']['headline']['maxlength']
            item['slugline'] = item['slugline'][:max_slugline_len] \
                if len(item['slugline']) > max_slugline_len else item['slugline']
            item['headline'] = item['headline'][:max_headline_len] \
                if len(item['headline']) > max_headline_len else item['headline']
        if 'dateline' not in item:
            cities = app.locators.find_cities(country_code='AU', state_code='NSW')
            located = [c for c in cities if c['city'].lower() == 'sydney']
            if located:
                item['dateline'] = {'date': item['firstcreated'], 'located': located[0]}
            item['dateline']['source'] = item['source']
            item['dateline']['text'] = format_dateline_to_locmmmddsrc(located[0], get_date(item['firstcreated']),
                                                                      source=item['source'])
        return item
    except Exception:
        logging.exception('Test update to pass validation macro exception')
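
A standalone sketch of the slugline/headline capping the macro performs, with a made-up value and a hypothetical maxlength taken from the validator schema.

max_slugline_len = 24                       # hypothetical schema['slugline']['maxlength']
slugline = 'A very long slugline that will not validate'
slugline = slugline[:max_slugline_len] if len(slugline) > max_slugline_len else slugline
print(slugline)                             # A very long slugline tha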
Example No. 10
def format_datetime_filter(date_or_string, timezone_string=None, date_format=None):
    """Convert date or string to another timezone

    :param str date_or_string:
    :param str timezone_string:
    :param str date_format:
    :return str: string representation of the date in the given format
    """
    try:
        date_time = get_date(date_or_string)

        timezone_string = timezone_string if timezone_string else config.DEFAULT_TIMEZONE
        tz = timezone(timezone_string)
        if tz:
            date_time = date_time.astimezone(tz)

        if date_format:
            return date_time.strftime(date_format)
        else:
            return str(date_time)

    except Exception:
        logger.warning('Failed to convert datetime. Arguments: Date - {} Timezone - {} format - {}.'.format(
            date_or_string, timezone_string, date_format
        ))
        return ''
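
A standalone sketch of what the filter does for a hypothetical input, with dateutil and pytz used directly instead of get_date() and config.DEFAULT_TIMEZONE (an assumption about what those wrap).

from dateutil import parser
from pytz import timezone

date_time = parser.parse('2021-03-01T08:00:00+0000')
date_time = date_time.astimezone(timezone('Europe/Prague'))
print(date_time.strftime('%d.%m.%Y %H:%M'))   # 01.03.2021 09:00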
Example No. 11
def format_datetime_filter(date_or_string,
                           timezone_string=None,
                           date_format=None):
    """
    Convert date or string to another timezone
    :param str date_or_string:
    :param str timezone_string:
    :param str date_format:
    :return str: string representation of the date in the given format
    """
    try:
        date_time = get_date(date_or_string)

        timezone_string = timezone_string if timezone_string else config.DEFAULT_TIMEZONE
        tz = timezone(timezone_string)
        if tz:
            date_time = date_time.astimezone(tz)

        if date_format:
            return date_time.strftime(date_format)
        else:
            return str(date_time)

    except Exception:
        logger.warning(
            'Failed to convert datetime. Arguments: Date - {} Timezone - {} format - {}.'
            .format(date_or_string, timezone_string, date_format))
        return ''
Example No. 12
def noise11_derive_metadata(item, **kwargs):
    """
    By definition anything from NOISE11 will be entertainment, so set the category, subject and dateline
    appropriately.
    :param item:
    :param kwargs:
    :return:
    """
    try:
        if "anpa_category" not in item:
            category_map = superdesk.get_resource_service("vocabularies").find_one(req=None, _id="categories")
            if category_map:
                map_entry = next(
                    (code for code in category_map["items"] if code["qcode"] == "e" and code["is_active"]), None
                )
                item["anpa_category"] = [{"qcode": "e", "name": map_entry["name"]}]

        if "subject" not in item:
            qcode = "01000000"
            item["subject"] = [{"qcode": qcode, "name": subject_codes[qcode]}]

        cities = find_cities(country_code="AU", state_code="NSW")
        located = [c for c in cities if c["city"].lower() == "sydney"]

        if located and "dateline" not in item:
            item["dateline"] = {"date": item["firstcreated"], "located": located[0]}
        item["dateline"]["source"] = item["source"]
        item["dateline"]["text"] = format_dateline_to_locmmmddsrc(
            located[0], get_date(item["firstcreated"]), source=item["source"]
        )

        return item
    except Exception as ex:
        logger.exception(ex)
Example No. 13
 def derive_dateline(self, item):
     """
     Attempt to derive a dateline using the place; only if there is exactly one match on the city can we be sure we
     have the correct country.
     :param item:
     :return:
     """
     try:
         if len(item.get("place", [])) == 1:
             cities = app.locators.find_cities()
             city = item.get("place", "")[0].get("name", "")
             if city:
                 located = [
                     c for c in cities if c["city"].lower() == city.lower()
                 ]
                 if len(located) == 1:
                     item.setdefault("dateline", {})
                     item["dateline"]["located"] = located[0]
                     item["dateline"]["source"] = item.get(
                         "original_source", "EFE")
                     item["dateline"][
                         "text"] = format_dateline_to_locmmmddsrc(
                             item["dateline"]["located"],
                             get_date(item["firstcreated"]),
                             source=item.get("original_source", "EFE"),
                         )
     except Exception as ex:
         logging.exception(
             "EFE dateline extraction exception {}".format(ex))
     finally:
         item.pop("place", None)
Example No. 14
    def ap_derive_dateline(self, item):
        """
        This function looks for a dateline in the article body and uses that.
        :param item:
        :return: item populated with a dateline
        """
        try:
            html = item.get('body_html')
            if html:
                soup = BeautifulSoup(html, "html.parser")
                pars = soup.findAll('p')
                for par in pars:
                    city, source, the_rest = par.get_text().partition(' (AP) _ ')
                    if source:
                        # sometimes the city is followed by a comma and either a date or a state
                        city = city.split(',')[0]
                        if any(char.isdigit() for char in city):
                            return
                        cities = app.locators.find_cities()
                        located = [c for c in cities if c['city'].lower() == city.lower()]
                        item.setdefault('dateline', {})
                        item['dateline']['located'] = located[0] if len(located) > 0 else {'city_code': city,
                                                                                           'city': city,
                                                                                           'tz': 'UTC',
                                                                                           'dateline': 'city'}
                        item['dateline']['source'] = item.get('original_source', 'AP')
                        item['dateline']['text'] = format_dateline_to_locmmmddsrc(item['dateline']['located'],
                                                                                  get_date(item['firstcreated']),
                                                                                  source=item.get('original_source',
                                                                                                  'AP'))
                        break

            return item
        except Exception:
            logging.exception('AP dateline extraction exception')
Example No. 15
 def derive_dateline(self, item):
     """
     Attempt to derive a dateline using the place; only if there is exactly one match on the city can we be sure we
     have the correct country.
     :param item:
     :return:
     """
     try:
         if len(item.get('place', [])) == 1:
             cities = app.locators.find_cities()
             city = item.get('place', '')[0].get('name', '')
             located = [
                 c for c in cities if c['city'].lower() == city.lower()
             ]
             if len(located) == 1:
                 item.setdefault('dateline', {})
                 item['dateline']['located'] = located[0]
                 item['dateline']['source'] = item.get(
                     'original_source', 'EFE')
                 item['dateline']['text'] = format_dateline_to_locmmmddsrc(
                     item['dateline']['located'],
                     get_date(item['firstcreated']),
                     source=item.get('original_source', 'EFE'))
             item.pop('place')
     except Exception:
         logging.exception('EFE dateline extraction exception')
Example No. 16
 def test_utcnow(self):
     self.assertIsInstance(utcnow(), datetime)
     date1 = get_date(datetime.now(tz=utc))
     date2 = utcnow()
     self.assertEqual(date1.year, date2.year)
     self.assertEqual(date1.month, date2.month)
     self.assertEqual(date1.day, date2.day)
     self.assertEqual(date1.hour, date2.hour)
     self.assertEqual(date1.minute, date2.minute)
     self.assertEqual(date1.second, date2.second)
Example No. 17
 def test_utcnow(self):
     self.assertIsInstance(utcnow(), datetime)
     date1 = get_date(datetime.now(tz=utc))
     date2 = utcnow()
     self.assertEqual(date1.year, date2.year)
     self.assertEqual(date1.month, date2.month)
     self.assertEqual(date1.day, date2.day)
     self.assertEqual(date1.hour, date2.hour)
     self.assertEqual(date1.minute, date2.minute)
     self.assertEqual(date1.second, date2.second)
Example No. 18
def set_dateline(item, city, source, set_date=False, text=None):
    """Set the dateline for item"""
    if not city:
        return

    cities = app.locators.find_cities()
    located = [c for c in cities if c['city'].lower() == city.lower()]
    item.setdefault('dateline', {})
    item['dateline']['located'] = located[0] if len(located) > 0 else {'city_code': city, 'city': city,
                                                                       'tz': 'UTC', 'dateline': 'city'}
    if set_date:
        item['dateline']['date'] = datetime.fromtimestamp(get_date(item['firstcreated']).timestamp(),
                                                          tz=timezone(item['dateline']['located']['tz']))
    item['dateline']['source'] = source
    if text:
        item['dateline']['text'] = text
    else:
        item['dateline']['text'] = format_dateline_to_locmmmddsrc(item['dateline']['located'],
                                                                  get_date(item['firstcreated']),
                                                                  source=source)
Example No. 19
def reuters_derive_dateline(item, **kwargs):
    """
    It seems that most locations injected into the item by the parser are Bangalore.
    This function looks for a dateline in the article body and uses that.
    :param item:
    :return:
    """
    try:
        html = item.get('body_html')
        if html:
            soup = BeautifulSoup(html, "html.parser")
            pars = soup.findAll('p')
            if len(pars) >= 2:
                if BYLINE in item and item.get(BYLINE) in pars[0].get_text():
                    first = pars[1].get_text()
                else:
                    first = pars[0].get_text()
                city, source, the_rest = first.partition(' (Reuters) - ')
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(',')[0]
                    if any(char.isdigit() for char in city):
                        return
                    cities = app.locators.find_cities()
                    located = [
                        c for c in cities if c['city'].lower() == city.lower()
                    ]
                    # if not dateline we create one
                    if 'dateline' not in item:
                        item['dateline'] = {}
                    # there is already a dateline that is not Bangalore don't do anything just return
                    elif 'located' in item['dateline'] and 'BANGALORE' != item[
                            'dateline']['located'].get('city').upper():
                        return

                    item['dateline']['located'] = located[0] if len(
                        located) > 0 else {
                            'city_code': city,
                            'city': city,
                            'tz': 'UTC',
                            'dateline': 'city'
                        }
                    item['dateline']['source'] = item.get(
                        'original_source', 'Reuters')
                    item['dateline']['text'] = format_dateline_to_locmmmddsrc(
                        item['dateline']['located'],
                        get_date(item['firstcreated']),
                        source=item.get('original_source', 'Reuters'))

        return item
    except Exception:
        logging.exception('Reuters dateline macro exception')
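
A standalone sketch of the body-text split this macro relies on, using a made-up first paragraph; the city only survives when the Reuters marker is present.

first = 'LONDON, May 3 (Reuters) - Stocks rose on Monday ...'
city, source, the_rest = first.partition(' (Reuters) - ')
if source:
    city = city.split(',')[0]               # drop a trailing date or state
print(city)                                 # LONDON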
Example No. 20
def reuters_derive_dateline(item, **kwargs):
    """
    It seems that most locations injected into the item by the parser are Bangalore.
    This function looks for a dateline in the article body and uses that.
    :param item:
    :return:
    """
    try:
        html = item.get("body_html")
        if html:
            soup = BeautifulSoup(html, "html.parser")
            pars = soup.findAll("p")
            if len(pars) >= 2:
                if BYLINE in item and item.get(BYLINE) in pars[0].get_text():
                    first = pars[1].get_text()
                else:
                    first = pars[0].get_text()
                city, source, the_rest = first.partition(" (Reuters) - ")
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(",")[0]
                    if any(char.isdigit() for char in city):
                        return
                    cities = find_cities()
                    located = [c for c in cities if c["city"].lower() == city.lower()]
                    # if not dateline we create one
                    if "dateline" not in item:
                        item["dateline"] = {}
                    # there is already a dateline that is not Bangalore don't do anything just return
                    elif (
                        "located" in item["dateline"] and "BANGALORE" != item["dateline"]["located"].get("city").upper()
                    ):
                        return

                    item["dateline"]["located"] = (
                        located[0]
                        if len(located) > 0
                        else {"city_code": city, "city": city, "tz": "UTC", "dateline": "city"}
                    )
                    item["dateline"]["source"] = item.get("original_source", "Reuters")
                    item["dateline"]["text"] = format_dateline_to_locmmmddsrc(
                        item["dateline"]["located"],
                        get_date(item["firstcreated"]),
                        source=item.get("original_source", "Reuters"),
                    )

        return item
    except:
        logging.exception("Reuters dateline macro exception")
Example No. 21
 def set_if_not_running(pipe):
     last_updated = pipe.get(key)
     if last_updated:
         last_updated = get_date(str(last_updated))
         delta = last_updated + update_schedule
         if delta < now:
             logger.warn('Overwriting running key for {}:{}'.format(name, id))
             pipe.set(key, date_to_str(now))
             return True
         else:
             logger.warn('Task {}:{} is already running. last_updated={}'.format(name, id, last_updated))
             return False
     else:
         pipe.set(key, date_to_str(now))
         return True
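
A standalone sketch of the "is it already running" window check, with plain datetimes standing in for the value read from the redis pipe and for date_to_str()/get_date().

from datetime import datetime, timedelta, timezone

now = datetime.now(timezone.utc)
update_schedule = timedelta(minutes=5)      # hypothetical schedule
last_updated = now - timedelta(minutes=2)   # pretend value read from the key

if last_updated + update_schedule < now:
    print('stale key, overwrite and run')
else:
    print('already running')                # printed here: the key is only 2 minutes old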
Example No. 22
def set_dateline(updates, original):
    """Set the dateline for the item.
    :param {dict} updates: Updates related to the doc
    :param {dict} original: Original document.
    """
    if not ((updates.get('dateline') or {}).get('located') and (updates.get('dateline') or {}).get('date')):
        return

    source = updates.get('source', original.get('source')) or get_default_source()
    updates['dateline']['source'] = source

    if isinstance(updates['dateline'].get('date'), str):
        updates['dateline']['date'] = get_date(updates['dateline'].get('date'))

    updates['dateline']['text'] = format_dateline_to_locmmmddsrc(updates['dateline'].get('located'),
                                                                 updates['dateline'].get('date'), source)
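
A standalone sketch of the guard at the top of this function: both 'located' and 'date' must be present in updates['dateline'] before the text is rebuilt (the dicts are made up).

updates = {'dateline': {'located': {'city': 'Prague'}}}   # no 'date' key
needs_rebuild = bool((updates.get('dateline') or {}).get('located') and
                     (updates.get('dateline') or {}).get('date'))
print(needs_rebuild)                        # False - the function returns without touching the text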
Example No. 23
def set_dateline(updates, original):
    """Set the dateline for the item.
    :param {dict} updates: Updates related to the doc
    :param {dict} original: Original document.
    """
    if not ((updates.get('dateline') or {}).get('located') and (updates.get('dateline') or {}).get('date')):
        return

    source = updates.get('source', original.get('source')) or get_default_source()
    updates['dateline']['source'] = source

    if isinstance(updates['dateline'].get('date'), str):
        updates['dateline']['date'] = get_date(updates['dateline'].get('date'))

    updates['dateline']['text'] = format_dateline_to_locmmmddsrc(updates['dateline'].get('located'),
                                                                 updates['dateline'].get('date'), source)
Example No. 24
 def set_if_not_running(pipe):
     last_updated = pipe.get(key)
     if last_updated:
         last_updated = get_date(str(last_updated))
         delta = last_updated + update_schedule
         if delta < now:
             logger.warn('Overwriting running key for provider {0}'.format(provider[superdesk.config.ID_FIELD]))
             pipe.set(key, date_to_str(now))
             return True
         else:
             logger.warn('Update ingest already running for provider {0}, last_updated={1}'.
                         format(provider[superdesk.config.ID_FIELD], last_updated))
             return False
     else:
         pipe.set(key, date_to_str(now))
         return True
Example No. 25
def reuters_derive_dateline(item, **kwargs):
    """
    It seems that most locations injected into the item by the parser are Bangalore.
    This function looks for a dateline in the article body and uses that.
    :param item:
    :return:
    """
    try:
        html = item.get('body_html')
        if html:
            parsed = parse_html(html, content='xml')
            pars = parsed.xpath('//p')
            if len(pars) >= 2:
                if BYLINE in item and item.get(BYLINE) in ''.join(pars[0].itertext()):
                    first = ''.join(pars[1].itertext())
                else:
                    first = ''.join(pars[0].itertext())
                city, source, the_rest = first.partition(' (Reuters) - ')
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(',')[0]
                    if any(char.isdigit() for char in city):
                        return
                    cities = app.locators.find_cities()
                    located = [c for c in cities if c['city'].lower() == city.lower()]
                    # if not dateline we create one
                    if 'dateline' not in item:
                        item['dateline'] = {}
                    # there is already a dateline that is not Bangalore don't do anything just return
                    elif 'located' in item['dateline'] and 'BANGALORE' != item['dateline']['located'].get(
                            'city').upper():
                        return

                    item['dateline']['located'] = located[0] if len(located) > 0 else {'city_code': city,
                                                                                       'city': city,
                                                                                       'tz': 'UTC',
                                                                                       'dateline': 'city'}
                    item['dateline']['source'] = item.get('original_source', 'Reuters')
                    item['dateline']['text'] = format_dateline_to_locmmmddsrc(item['dateline']['located'],
                                                                              get_date(item['firstcreated']),
                                                                              source=item.get('original_source',
                                                                                              'Reuters'))

        return item
    except Exception:
        logging.exception('Reuters dateline macro exception')
Example No. 26
def _yonhap_derive_dateline(item, **kwargs):
    """
    It seems that most locations injected into the item by the parser are Bangalore.
    This function looks for a dateline in the article body and uses that.
    :param item:
    :return:
    """
    try:
        html = item.get('body_html')
        if html:
            parsed = sd_etree.parse_html(html, content='xml')
            pars = parsed.xpath('//p')
            for par in pars:
                if not par.text:
                    continue
                city, source, the_rest = par.text.partition(' (Yonhap) -- ')
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(',')[0]
                    if any(char.isdigit() for char in city):
                        return
                    cities = app.locators.find_cities()
                    located = [
                        c for c in cities if c['city'].lower() == city.lower()
                    ]
                    # if not dateline we create one
                    if 'dateline' not in item:
                        item['dateline'] = {}

                    item['dateline']['located'] = located[0] if len(
                        located) == 1 else {
                            'city_code': city,
                            'city': city,
                            'tz': 'UTC',
                            'dateline': 'city'
                        }
                    item['dateline']['source'] = item.get('source', 'Yonhap')
                    item['dateline']['text'] = format_dateline_to_locmmmddsrc(
                        item['dateline']['located'],
                        get_date(item['firstcreated']),
                        source='Yonhap')
                    break

        return item
    except Exception:
        logging.exception('Yonhap dateline macro exception')
Example No. 27
 def set_if_not_running(pipe):
     last_updated = pipe.get(key)
     if last_updated:
         last_updated = get_date(str(last_updated))
         delta = last_updated + update_schedule
         if delta < now:
             logger.warn('Overwriting running key for {}:{}'.format(
                 name, id))
             pipe.set(key, date_to_str(now))
             return True
         else:
             logger.warn(
                 'Task {}:{} is already running. last_updated={}'.format(
                     name, id, last_updated))
             return False
     else:
         pipe.set(key, date_to_str(now))
         return True
Example No. 28
    def _get_date_range(self, input_date, days_to_process=1):
        """
        Calculate the date range to process
        :param datetime input_date:
        :param int days_to_process:
        :return:
        """
        if not input_date:
            input_date = utcnow()
        elif isinstance(input_date, str):
            input_date = get_date(input_date)
        elif not isinstance(input_date, datetime):
            raise ValueError("Invalid Input Date.")

        end_date = input_date
        start_date = (end_date - timedelta(days=int(days_to_process))).replace(hour=0, minute=0,
                                                                               second=0, microsecond=0)

        return start_date, end_date
Example No. 29
 def test_compare_repos(self):
     with self.app.app_context():
         cmd = GenerateActivityCountReport()
         items = cmd.run(get_date("2017-05-10T23:59:59+0000"))
         self.assertEqual(len(items), 3)
         user1_items = [
             item for item in items if item['user_id'] == 'user1'
         ]
         user2_items = [
             item for item in items if item['user_id'] == 'user2'
         ]
         self.assertEqual(len(user1_items), 1)
         self.assertEqual(len(user2_items), 2)
         self.assertEqual(user1_items[0]['create_count'], 1)
         self.assertEqual(user1_items[0]['update_count'], 1)
         self.assertEqual(user2_items[0]['create_count'], 1)
         self.assertEqual(user2_items[0]['update_count'], 0)
         self.assertEqual(user2_items[1]['create_count'], 1)
         self.assertEqual(user2_items[1]['update_count'], 0)
Example No. 30
def set_dateline(updates, original):
    """Set the dateline for the item.
    :param {dict} updates: Updates related to the doc
    :param {dict} original: Original document.
    """
    if not ((updates.get("dateline") or {}).get("located") and
            (updates.get("dateline") or {}).get("date")):
        return

    source = updates.get("source",
                         original.get("source")) or get_default_source()
    updates["dateline"]["source"] = source

    if isinstance(updates["dateline"].get("date"), str):
        updates["dateline"]["date"] = get_date(updates["dateline"].get("date"))

    updates["dateline"]["text"] = format_dateline_to_locmmmddsrc(
        updates["dateline"].get("located"), updates["dateline"].get("date"),
        source)
Example No. 31
    def _get_date_range(self, input_date, days_to_process=1):
        """Calculate the date range to process

        :param datetime input_date:
        :param int days_to_process:
        :return:
        """
        if not input_date:
            input_date = utcnow()
        elif isinstance(input_date, str):
            input_date = get_date(input_date)
        elif not isinstance(input_date, datetime):
            raise ValueError("Invalid Input Date.")

        end_date = input_date
        start_date = (end_date - timedelta(days=int(days_to_process))).replace(hour=0, minute=0,
                                                                               second=0, microsecond=0)

        return start_date, end_date
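
A standalone sketch of the range produced for a hypothetical input date with days_to_process=1, mirroring the timedelta/replace() arithmetic above.

from datetime import datetime, timedelta

end_date = datetime(2017, 5, 10, 23, 59, 59)
start_date = (end_date - timedelta(days=1)).replace(hour=0, minute=0, second=0, microsecond=0)
print(start_date, end_date)                 # 2017-05-09 00:00:00 2017-05-10 23:59:59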
Example No. 32
    def _process_report(self, items):
        """To insert/update the activity report

        :param list items:
        """
        service = superdesk.get_resource_service('activity_report')
        new_items = []

        for item in items:
            item['activity_date'] = get_date(item['activity_date'])
            existing_item = service.find_one(req=None,
                                             activity_date=item['activity_date'],
                                             user_id=item['user_id'])
            if existing_item:
                service.patch(existing_item['_id'], item)
            else:
                new_items.append(item)

        if new_items:
            service.post(new_items)
Example No. 33
def noise11_derive_metadata(item, **kwargs):
    """
    By definition anything from NOISE11 will be entertainment, so set the category, subject and dateline
    appropriately.
    :param item:
    :param kwargs:
    :return:
    """
    try:
        if 'anpa_category' not in item:
            category_map = superdesk.get_resource_service(
                'vocabularies').find_one(req=None, _id='categories')
            if category_map:
                map_entry = next(
                    (code for code in category_map['items']
                     if code['qcode'] == 'e' and code['is_active']), None)
                item['anpa_category'] = [{
                    'qcode': 'e',
                    'name': map_entry['name']
                }]

        if 'subject' not in item:
            qcode = '01000000'
            item['subject'] = [{'qcode': qcode, 'name': subject_codes[qcode]}]

        cities = find_cities(country_code='AU', state_code='NSW')
        located = [c for c in cities if c['city'].lower() == 'sydney']

        if located and 'dateline' not in item:
            item['dateline'] = {
                'date': item['firstcreated'],
                'located': located[0]
            }
        item['dateline']['source'] = item['source']
        item['dateline']['text'] = format_dateline_to_locmmmddsrc(
            located[0], get_date(item['firstcreated']), source=item['source'])

        return item
    except Exception as ex:
        logger.exception(ex)
Example No. 34
    def ap_derive_dateline(self, item):
        """This function looks for a dateline in the article body an uses that.

        :param item:
        :return: item populated with a dateline
        """
        try:
            html = item.get("body_html")
            if html:
                parsed = parse_html(html, content="html")
                for par in parsed.xpath("/div/child::*"):
                    if not par.text:
                        continue
                    city, source, the_rest = par.text.partition(" (AP) _ ")
                    if source:
                        # sometimes the city is followed by a comma and either a date or a state
                        city = city.split(",")[0]
                        if any(char.isdigit() for char in city):
                            return
                        cities = app.locators.find_cities()
                        located = [c for c in cities if c["city"].lower() == city.lower()]
                        item.setdefault("dateline", {})
                        item["dateline"]["located"] = (
                            located[0]
                            if len(located) == 1
                            else {"city_code": city, "city": city, "tz": "UTC", "dateline": "city"}
                        )
                        item["dateline"]["source"] = item.get("original_source", "AP")
                        item["dateline"]["text"] = format_dateline_to_locmmmddsrc(
                            item["dateline"]["located"],
                            get_date(item["firstcreated"]),
                            source=item.get("original_source", "AP"),
                        )
                        break

            return item
        except Exception:
            logging.exception("AP dateline extraction exception")
Example No. 35
def ap_weather_format(item, **kwargs):
    if not item.get('slugline', '').startswith('WEA--GlobalWeather-Ce') or not item.get('source', '') == 'AP':
        raise SuperdeskApiError.badRequestError("Article should be an AP sourced weather table")
    item['slugline'] = 'WORLD WEATHER'

    text = get_text(item['body_html'], content='html')
    lines = text.splitlines()
    if not lines[0] == 'BC-WEA--Global Weather-Celsius,<':
        raise SuperdeskApiError.badRequestError("Table should be in Celsius only")

    # tabular column max lengths are extracted into this list
    columns = []
    # map of the columns to extract and the substitutions to apply to the column
    columnMap = ({'index': 0}, {'index': 1}, {'index': 2},
                 {'index': 3, 'substitute': [('COND', 'CONDITIONS'),
                                             ('pc', 'partly cloudy'), ('clr', 'clear'),
                                             ('cdy', 'cloudy'), ('rn', 'rain'),
                                             ('sn', 'snow')]})
    # story preamble
    preamble = 'Temperatures and conditions in world centres:\r\n'
    output = StringIO()
    output.write(preamble)

    # story is always datelined New York
    city = 'New York City'
    cities = app.locators.find_cities()
    located = [c for c in cities if c['city'].lower() == city.lower()]
    if 'dateline' not in item:
        item['dateline'] = {}
    item['dateline']['located'] = located[0] if len(located) > 0 else {'city_code': city, 'city': city,
                                                                       'tz': 'UTC', 'dateline': 'city'}
    item['dateline']['date'] = datetime.fromtimestamp(get_date(item['firstcreated']).timestamp(),
                                                      tz=timezone(item['dateline']['located']['tz']))
    item['dateline']['source'] = 'AP'
    item['dateline']['text'] = format_dateline_to_locmmmddsrc(item['dateline']['located'],
                                                              get_date(item['firstcreated']),
                                                              source=item.get('original_source', 'AP'))

    item['headline'] = 'World Weather for ' + item['dateline']['date'].strftime('%b %-d')

    item['subject'] = [{"name": "weather", "qcode": "17000000"}]
    locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='locators')
    item['place'] = [x for x in locator_map.get('items', []) if x['qcode'] == 'US']

    if lines:
        # scan all the lines in the file for potential column-aligned lines and calculate the length
        # of the column
        for line in lines:
            row = re.split(r'[;<]+', line)
            # only consider it if there are more than two columns
            if len(row) > 2:
                index = 0
                for col in row:
                    # check if the column is mapped
                    map = [me for me in columnMap if me['index'] == index]
                    if len(map):
                        for sub in map[0].get('substitute', ()):
                            col = col.replace(sub[0], sub[1])
                    # if it's a new column
                    if 0 <= index < len(columns):
                        # check the length
                        if len(col) > columns[index]:
                            columns[index] = len(col)
                    else:
                        columns.append(len(col))
                    index += 1

        for line in lines:
            row = re.split(r'[;<]+', line)
            if len(row) > 2:
                index = 0
                for col in row:
                    map = [me for me in columnMap if me['index'] == index]
                    if len(map) > 0:
                        for sub in map[0].get('substitute', ()):
                            col = col.replace(sub[0], sub[1])
                        output.write(
                            '{}'.format(col.lstrip('\t').ljust(columns[map[0].get('index')] + 2)).rstrip('\r\n'))
                    index += 1
                output.write('\r\n')

        item['body_html'] = '<pre>' + output.getvalue() + '</pre>'
    return item
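
A standalone sketch of the two-pass column alignment used for the weather table, on a couple of made-up semicolon-delimited rows; the column substitutions and the len(row) > 2 filter are left out.

import re

lines = ['CITY;HI;LO;COND<', 'Amsterdam;12;6;rn<']
columns = []
for line in lines:                          # pass 1: record the widest cell per column
    for index, col in enumerate(re.split(r'[;<]+', line)):
        if index < len(columns):
            columns[index] = max(columns[index], len(col))
        else:
            columns.append(len(col))

for line in lines:                          # pass 2: pad each cell to that width
    row = re.split(r'[;<]+', line)
    print(''.join(col.ljust(width + 2) for col, width in zip(row, columns)).rstrip())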
Example No. 36
 def test_get_date(self):
     self.assertIsInstance(get_date('2012-12-12'), datetime)
     self.assertIsInstance(get_date(datetime.now()), datetime)
     self.assertIsNone(get_date(None))
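
A minimal sketch of a helper with the behaviour this test exercises (a string is parsed, a datetime passes through, None comes back as-is), assuming python-dateutil; the real get_date() implementation may differ.

from datetime import datetime
from dateutil import parser

def parse_date(value):                      # hypothetical stand-in for get_date()
    if value is None:
        return None
    if isinstance(value, datetime):
        return value
    return parser.parse(value)

print(parse_date('2012-12-12'))             # 2012-12-12 00:00:00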
Example No. 37
    def _parse_formatted_email(self, data, provider):
        """Construct an item from an email that was constructed as a notification from a google form submission.

        The google form submits to a google sheet, this sheet creates the email as a notification

        :param data:
        :param provider:
        :return: A list of 1 item
        """
        try:
            item = dict()
            item[ITEM_TYPE] = CONTENT_TYPE.TEXT
            item['versioncreated'] = utcnow()
            for response_part in data:
                if isinstance(response_part, tuple):
                    msg = email.message_from_bytes(response_part[1])
                    # Check that the subject line matches what we expect, ignore it if not
                    if self.parse_header(
                            msg['subject']) != 'Formatted Editorial Story':
                        return []

                    item['guid'] = msg['Message-ID']
                    date_tuple = email.utils.parsedate_tz(msg['Date'])
                    if date_tuple:
                        dt = datetime.datetime.utcfromtimestamp(
                            email.utils.mktime_tz(date_tuple))
                        dt = dt.replace(tzinfo=timezone('utc'))
                        item['firstcreated'] = dt

                    for part in msg.walk():
                        if part.get_content_type() == "text/plain":
                            body = part.get_payload(decode=True)
                            # if we don't know the charset just have a go!
                            if part.get_content_charset() is None:
                                json_str = body.decode().replace('\r\n',
                                                                 '').replace(
                                                                     '  ', ' ')
                            else:
                                charset = part.get_content_charset()
                                json_str = body.decode(charset).replace(
                                    '\r\n', '').replace('  ', ' ')

                            mail_item = dict(
                                (k, v[0])
                                for k, v in json.loads(json_str).items())

                            self._expand_category(item, mail_item)

                            item['original_source'] = mail_item.get(
                                'Username', '')
                            item['headline'] = mail_item.get('Headline', '')
                            item['abstract'] = mail_item.get('Abstract', '')
                            item['slugline'] = mail_item.get('Slugline', '')
                            item['body_html'] = '<p>' + mail_item.get(
                                'Body', '').replace('\n', '</p><p>') + '</p>'

                            default_source = app.config.get(
                                'DEFAULT_SOURCE_VALUE_FOR_MANUAL_ARTICLES')
                            city = mail_item.get('Dateline', '')
                            cities = app.locators.find_cities()
                            located = [
                                c for c in cities
                                if c['city'].lower() == city.lower()
                            ]
                            item.setdefault('dateline', {})
                            item['dateline']['located'] = located[0] if len(
                                located) > 0 else {
                                    'city_code': city,
                                    'city': city,
                                    'tz': 'UTC',
                                    'dateline': 'city'
                                }
                            item['dateline']['source'] = default_source
                            item['dateline'][
                                'text'] = format_dateline_to_locmmmddsrc(
                                    item['dateline']['located'],
                                    get_date(item['firstcreated']),
                                    source=default_source)

                            if mail_item.get('Priority') != '':
                                if mail_item.get('Priority', '3').isdigit():
                                    item['priority'] = int(
                                        mail_item.get('Priority', '3'))
                                else:
                                    priority_map = superdesk.get_resource_service(
                                        'vocabularies').find_one(
                                            req=None, _id='priority')
                                    priorities = [
                                        x
                                        for x in priority_map.get('items', [])
                                        if x['name'].upper() == mail_item.get(
                                            'Priority', '').upper()
                                    ]
                                    if priorities is not None and len(
                                            priorities) > 0:
                                        item['priority'] = int(
                                            priorities[0].get('qcode', '3'))
                                    else:
                                        item['priority'] = 3
                            if mail_item.get('News Value') != '':
                                item['urgency'] = int(
                                    mail_item.get('News Value', '3'))

                            # We expect the username passed corresponds to a superdesk user
                            query = {
                                'email':
                                re.compile(
                                    '^{}$'.format(mail_item.get('Username')),
                                    re.IGNORECASE)
                            }
                            user = superdesk.get_resource_service(
                                'users').find_one(req=None, **query)
                            if not user:
                                logger.error(
                                    'Failed to find user for email {}'.format(
                                        mail_item.get('Username')))
                                raise UserNotRegisteredException()
                            item['original_creator'] = user.get('_id')
                            if BYLINE in user and user.get(BYLINE, ''):
                                item['byline'] = user.get(BYLINE)
                            item[SIGN_OFF] = user.get(SIGN_OFF)

                            # attempt to match the given desk name against the defined desks
                            query = {
                                'name':
                                re.compile(
                                    '^{}$'.format(mail_item.get('Desk', '')),
                                    re.IGNORECASE)
                            }
                            desk = superdesk.get_resource_service(
                                'desks').find_one(req=None, **query)
                            if desk:
                                item['task'] = {
                                    'desk': desk.get('_id'),
                                    'stage': desk.get('incoming_stage')
                                }

                            if 'Place' in mail_item:
                                locator_map = superdesk.get_resource_service(
                                    'vocabularies').find_one(req=None,
                                                             _id='locators')
                                place = [
                                    x for x in locator_map.get('items', [])
                                    if x['qcode'] == mail_item.get(
                                        'Place', '').upper()
                                ]
                                if place is not None:
                                    item['place'] = place

                            if mail_item.get('Legal flag', '') == 'LEGAL':
                                item['flags'] = {'marked_for_legal': True}

                            break

            return [item]
        except Exception as ex:
            raise IngestEmailError.emailParseError(ex, provider)
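
A standalone sketch of the Date header handling in the parser above, with a made-up header value and pytz in place of the timezone import.

import email.utils
import datetime
from pytz import timezone

date_tuple = email.utils.parsedate_tz('Mon, 01 Mar 2021 08:00:00 +1100')
dt = datetime.datetime.utcfromtimestamp(email.utils.mktime_tz(date_tuple))
dt = dt.replace(tzinfo=timezone('utc'))
print(dt)                                   # 2021-02-28 21:00:00+00:00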
Example No. 38
def ap_weather_format(item, **kwargs):
    if not item.get('slugline', '').startswith('WEA--GlobalWeather-Ce') or not item.get('source', '') == 'AP':
        raise SuperdeskApiError.badRequestError("Article should be an AP sourced weather table")
    item['slugline'] = 'WORLD WEATHER'

    text = get_text(item['body_html'], content='html')
    lines = text.splitlines()
    if not lines[0] == 'BC-WEA--Global Weather-Celsius,<':
        raise SuperdeskApiError.badRequestError("Table should be in Celsius only")

    # tabular column max lengths are extracted into this list
    columns = []
    # map of the columns to extract and the substitutions to apply to the column
    columnMap = ({'index': 0}, {'index': 1}, {'index': 2},
                 {'index': 3, 'substitute': [('COND', 'CONDITIONS'),
                                             ('pc', 'partly cloudy'), ('clr', 'clear'),
                                             ('cdy', 'cloudy'), ('rn', 'rain'),
                                             ('sn', 'snow')]})
    # story preamble
    preamble = 'Temperatures and conditions in world centres:\r\n'
    output = StringIO()
    output.write(preamble)

    # story is always datelined New York
    city = 'New York City'
    cities = app.locators.find_cities()
    located = [c for c in cities if c['city'].lower() == city.lower()]
    if 'dateline' not in item:
        item['dateline'] = {}
    item['dateline']['located'] = located[0] if len(located) > 0 else {'city_code': city, 'city': city,
                                                                       'tz': 'UTC', 'dateline': 'city'}
    item['dateline']['date'] = datetime.fromtimestamp(get_date(item['firstcreated']).timestamp(),
                                                      tz=timezone(item['dateline']['located']['tz']))
    item['dateline']['source'] = 'AP'
    item['dateline']['text'] = format_dateline_to_locmmmddsrc(item['dateline']['located'],
                                                              get_date(item['firstcreated']),
                                                              source=item.get('original_source', 'AP'))

    item['headline'] = 'World Weather for ' + item['dateline']['date'].strftime('%b %-d')

    item['subject'] = [{"name": "weather", "qcode": "17000000"}]
    locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='locators')
    item['place'] = [x for x in locator_map.get('items', []) if x['qcode'] == 'US']

    if lines:
        # scan all the lines in the file for potential column-aligned lines and calculate the length
        # of the column
        for line in lines:
            row = re.split(r'[;<]+', line)
            # only consider it if there are more than two columns
            if len(row) > 2:
                index = 0
                for col in row:
                    # check if the column is mapped
                    map = [me for me in columnMap if me['index'] == index]
                    if len(map):
                        for sub in map[0].get('substitute', ()):
                            col = col.replace(sub[0], sub[1])
                    # if it's a new column
                    if 0 <= index < len(columns):
                        # check the length
                        if len(col) > columns[index]:
                            columns[index] = len(col)
                    else:
                        columns.append(len(col))
                    index += 1

        for line in lines:
            row = re.split(r'[;<]+', line)
            if len(row) > 2:
                index = 0
                for col in row:
                    map = [me for me in columnMap if me['index'] == index]
                    if len(map) > 0:
                        for sub in map[0].get('substitute', ()):
                            col = col.replace(sub[0], sub[1])
                        output.write(
                            '{}'.format(col.lstrip('\t').ljust(columns[map[0].get('index')] + 2)).rstrip('\r\n'))
                    index += 1
                output.write('\r\n')

        item['body_html'] = '<pre>' + output.getvalue() + '</pre>'
    return item
Example No. 39
def process_victorian_harness_racing(item, **kwargs):

    number_words_map = {1: 'One', 2: 'Two', 3: 'Three', 4: 'Four', 5: 'Five',
                        6: 'Six', 7: 'Seven', 8: 'Eight', 9: 'Nine', 10: 'Ten',
                        11: 'Eleven', 12: 'Twelve', 13: 'Thirteen', 14: 'Fourteen',
                        15: 'Fifteen', 16: 'Sixteen', 17: 'Seventeen', 18: 'Eighteen',
                        19: 'Nineteen', 20: 'Twenty', 30: 'Thirty', 40: 'Forty',
                        50: 'Fifty', 60: 'Sixty', 70: 'Seventy', 80: 'Eighty',
                        90: 'Ninety', 0: 'Zero'}

    substitution_map = OrderedDict({"second": "2nd", "third": "3rd", "fourth": "4th", "fifth": "5th", "sixth": "6th",
                                    "seventh": "7th", "eighth": "8th", "ninth": "9th", "2nd row": "second row",
                                    "2nd up": "second up", "2nd line": "second line", "2nd run": "second run",
                                    "2nd pick": "second pick", "January": "Jan", "February": "Feb", "August": "Aug",
                                    "September": "Sept", "October": "Oct", "November": "Nov", "December": "Dec",
                                    "Harold Park": "HP", "Moonee Valley": "MV"})

    def race_number_to_words(race):
        n = int(race.replace('Race', '').replace(':', ''))
        try:
            return titlecase(number_words_map[n])
        except KeyError:
            try:
                return titlecase(number_words_map[n - n % 10] + number_words_map[n % 10].lower())
            except KeyError:
                return str(n)

    content = item.get('body_html', '')
    comment_item = {
        "anpa_category": [
            {
                "qcode": "r",
                "name": "Racing (Turf)",
                "subject": "15030001"
            }
        ],
        "subject": [
            {
                "parent": "15000000",
                "name": "horse racing, harness racing",
                "qcode": "15030000"
            }
        ],
        "place": [
            {
                "state": "Victoria",
                "name": "VIC",
                "group": "Australia",
                "country": "Australia",
                "qcode": "VIC",
                "world_region": "Oceania"
            }
        ],
        FORMAT: FORMATS.HTML,
        ITEM_TYPE: CONTENT_TYPE.TEXT
    }
    selections_item = deepcopy(comment_item)
    # copy the genre of the item that we are operating on
    if 'genre' in item:
        selections_item['genre'] = deepcopy(item['genre'])

    parsed = parse_html(content, content='html')

    for tag in parsed.xpath('/html/div/child::*'):
        if tag.tag == 'p':
            if tag.text.startswith('VENUE: '):
                venue = tag.text.replace('VENUE: ', '')
            elif tag.text.startswith('DATE: '):
                try:
                    meeting_date = datetime.strptime(tag.text.replace('DATE: ', '').replace(' ', ''), '%d/%m/%y')
                except Exception:
                    logger.warning('Date format exception for {}'.format(tag.text.replace('DATE: ', '')))
                    try:
                        meeting_date = datetime.strptime(tag.text.replace('DATE: ', '').replace(' ', ''), '%d/%m/%Y')
                    except Exception:
                        logger.warning('Date format exception 2 for {}'.format(tag.text.replace('DATE: ', '')))
                        try:
                            meeting_date = get_date(tag.text.replace('DATE: ', '').replace(' ', ''))
                        except Exception:
                            logger.warning('Date format exception 3 for {}'.format(tag.text.replace('DATE: ', '')))
                            meeting_date = utcnow()

                comment_item['slugline'] = venue + ' Comment'
                comment_item['anpa_take_key'] = meeting_date.strftime('%A')
                comment_item['headline'] = venue + ' Trot Comment ' + meeting_date.strftime('%A')
                comment_item['firstcreated'] = utcnow()
                set_dateline(comment_item, 'Melbourne', 'AAP')

                selections_item['slugline'] = venue + ' Selections'
                selections_item['anpa_take_key'] = meeting_date.strftime('%A')
                selections_item['headline'] = venue + ' Trot Selections ' + meeting_date.strftime('%A')
                selections_item['firstcreated'] = utcnow()
                set_dateline(selections_item, 'Melbourne', 'AAP')
                selections_item['body_html'] = '<p>{} Selections for {}\'s {} trots.-</p>'.format(
                    selections_item.get('dateline').get('text'),
                    meeting_date.strftime('%A'), venue)
                selections_item['firstcreated'] = utcnow()
                break

    regex = r"Race ([1-9][0-9]|[1-9]):"
    for tag in parsed.xpath('/html/div/child::*'):
        if tag.tag == 'p':
            m = re.match(regex, tag.text)
            if m:
                selections_item['body_html'] += '<p>{} '.format(tag.text)
            if tag.text.startswith('SELECTIONS: '):
                sels = titlecase(tag.text.replace('SELECTIONS: ', ''))
                # In some cases there is no comma between the selections, apparently there should be!
                sels = sels.replace(') ', '), ')
                sels = re.sub(r'\s\(.*?\)', '', sels)
                # get rid of any trailing comma
                sels = re.sub(r'(, $|,$)', ' ', sels)
                selections_item['body_html'] += '{}</p>'.format(sels)
    selections_item['body_html'] += '<p>AAP SELECTIONS</p>'

    comment_item['body_html'] = ''
    overview = ''
    regex = r"Race ([1-9][0-9]|[1-9]):"
    for tag in parsed.xpath('/html/div/child::*'):
        if tag.tag == 'p':
            m = re.match(regex, tag.text)
            if m:
                comment_item['body_html'] += '<p>Race {}:</p>'.format(race_number_to_words(tag.text))
            if tag.text.startswith('EARLY SPEED: '):
                comment_item['body_html'] += '<p>{}</p>'.format(overview.rstrip())
                overview = ''
                comment_item['body_html'] += '<p>{}</p>'.format(tag.text.rstrip())
            if tag.text.startswith('OVERVIEW: '):
                overview = tag.text
            elif overview:
                overview += tag.text

    for i, j in substitution_map.items():
        comment_item['body_html'] = comment_item['body_html'].replace(i, j)
    comment_item['body_html'] += '<p>AAP COMMENT</p>'

    service = get_resource_service('archive')
    selections_item['task'] = item.get('task')
    selections_item['profile'] = item.get('profile')
    selections_item[ITEM_STATE] = CONTENT_STATE.PROGRESS
    service.post([selections_item])

    item.update(comment_item)

    return item
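
A minimal sketch isolating the race_number_to_words logic above, with a trimmed copy of number_words_map: numbers missing from the map are built by concatenating the tens word with the lower-cased units word. It assumes the titlecase package provides the titlecase() helper used in the routine above.

from titlecase import titlecase

# trimmed copy of the map defined above
number_words_map = {1: 'One', 2: 'Two', 3: 'Three', 20: 'Twenty', 30: 'Thirty'}


def race_number_to_words(race):
    n = int(race.replace('Race', '').replace(':', ''))
    try:
        return titlecase(number_words_map[n])
    except KeyError:
        try:
            # e.g. 21 -> 'Twenty' + 'one'
            return titlecase(number_words_map[n - n % 10] + number_words_map[n % 10].lower())
        except KeyError:
            return str(n)


print(race_number_to_words('Race 3:'))   # Three
print(race_number_to_words('Race 21:'))  # Twentyone
print(race_number_to_words('Race 99:'))  # 99 (90 is not in this trimmed map; the full map above handles it)
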
class ActivityReportTestCase(AAPTestCase):
    history = [{
        "operation": "create",
        "_created": get_date("2017-05-09T01:55:47+0000"),
        "item_id": "item1",
        "version": 1,
        "_updated": get_date("2017-05-09T01:55:47+0000"),
        "user_id": "user1"
    }, {
        "operation": "update",
        "_created": get_date("2017-05-09T01:55:47+0000"),
        "item_id": "item1",
        "version": 2,
        "_updated": get_date("2017-05-09T01:55:47+0000"),
        "user_id": "user1"
    }, {
        "operation": "create",
        "_created": get_date("2017-05-09T01:55:47+0000"),
        "item_id": "item2",
        "version": 1,
        "_updated": get_date("2017-05-09T01:55:47+0000"),
        "user_id": "user2"
    }, {
        "operation": "create",
        "_created": get_date("2017-05-09T01:55:47+0000"),
        "item_id": "item3",
        "version": 1,
        "_updated": get_date("2017-05-09T01:55:47+0000"),
        "user_id": "user1"
    }, {
        "operation": "update",
        "_created": get_date("2017-05-09T01:55:47.000+0000"),
        "item_id": "item3",
        "version": 2,
        "_updated": get_date("2017-05-09T01:55:47+0000"),
        "user_id": "user1"
    }, {
        "operation": "spike",
        "_created": get_date("2017-05-09T01:55:47+0000"),
        "item_id": "item3",
        "version": 3,
        "_updated": get_date("2017-05-09T01:55:47+0000"),
        "user_id": "user2"
    }, {
        "operation": "create",
        "_created": get_date("2017-05-10T01:55:47+0000"),
        "item_id": "item4",
        "version": 1,
        "_updated": get_date("2017-05-10T01:55:47+0000"),
        "user_id": "user2"
    }, {
        "operation": "create",
        "_created": get_date("2017-05-10T01:55:47+0000"),
        "item_id": "item5",
        "version": 1,
        "_updated": get_date("2017-05-10T01:55:47+0000"),
        "user_id": "user2",
        "original_item_id": "item2"
    }]

    def setUp(self):
        super().setUp()
        self.app.data.insert('archive_history', self.history)

    def test_compare_repos(self):
        with self.app.app_context():
            cmd = GenerateActivityCountReport()
            items = cmd.run(get_date("2017-05-10T23:59:59+0000"))
            self.assertEqual(len(items), 3)
            user1_items = [
                item for item in items if item['user_id'] == 'user1'
            ]
            user2_items = [
                item for item in items if item['user_id'] == 'user2'
            ]
            self.assertEqual(len(user1_items), 1)
            self.assertEqual(len(user2_items), 2)
            self.assertEqual(user1_items[0]['create_count'], 1)
            self.assertEqual(user1_items[0]['update_count'], 1)
            self.assertEqual(user2_items[0]['create_count'], 1)
            self.assertEqual(user2_items[0]['update_count'], 0)
            self.assertEqual(user2_items[1]['create_count'], 1)
            self.assertEqual(user2_items[1]['update_count'], 0)
Example no. 41
0
 def test_get_date(self):
     self.assertIsInstance(get_date('2012-12-12'), datetime)
     self.assertIsInstance(get_date(datetime.now()), datetime)
     self.assertIsNone(get_date(None))
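
The superdesk get_date helper itself is not shown here; the following is a hypothetical stand-in with the same observable behaviour as the assertions above (a string is parsed, a datetime passes through, None returns None), assuming python-dateutil is available:

from datetime import datetime
from dateutil.parser import parse as parse_date


def get_date_standin(value):
    # hypothetical stand-in, not the superdesk implementation
    if value is None:
        return None
    if isinstance(value, datetime):
        return value
    return parse_date(value)


assert isinstance(get_date_standin('2012-12-12'), datetime)
assert isinstance(get_date_standin(datetime.now()), datetime)
assert get_date_standin(None) is None
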
Example no. 42
0
    def _parse_formatted_email(self, data, provider):
        """Construct an item from an email that was constructed as a notification from a google form submission.

        The google form submits to a google sheet, this sheet creates the email as a notification

        :param data:
        :param provider:
        :return: A list of 1 item
        """
        try:
            item = dict()
            item[ITEM_TYPE] = CONTENT_TYPE.TEXT
            item['versioncreated'] = utcnow()
            for response_part in data:
                if isinstance(response_part, tuple):
                    msg = email.message_from_bytes(response_part[1])
                    # Check that the subject line matches what we expect, ignore it if not
                    if self.parse_header(msg['subject']) != 'Formatted Editorial Story':
                        return []

                    item['guid'] = msg['Message-ID']
                    date_tuple = email.utils.parsedate_tz(msg['Date'])
                    if date_tuple:
                        dt = datetime.datetime.utcfromtimestamp(
                            email.utils.mktime_tz(date_tuple))
                        dt = dt.replace(tzinfo=timezone('utc'))
                        item['firstcreated'] = dt

                    for part in msg.walk():
                        if part.get_content_type() == "text/plain":
                            body = part.get_payload(decode=True)
                            # if we don't know the charset just have a go!
                            if part.get_content_charset() is None:
                                json_str = body.decode().replace('\r\n', '').replace('  ', ' ')
                            else:
                                charset = part.get_content_charset()
                                json_str = body.decode(charset).replace('\r\n', '').replace('  ', ' ')

                            mail_item = dict((k, v[0]) for k, v in json.loads(json_str).items())

                            self._expand_category(item, mail_item)

                            item['original_source'] = mail_item.get('Username', '')
                            item['headline'] = mail_item.get('Headline', '')
                            item['abstract'] = mail_item.get('Abstract', '')
                            item['slugline'] = mail_item.get('Slugline', '')
                            item['body_html'] = '<p>' + mail_item.get('Body', '').replace('\n', '</p><p>') + '</p>'

                            default_source = app.config.get('DEFAULT_SOURCE_VALUE_FOR_MANUAL_ARTICLES')
                            city = mail_item.get('Dateline', '')
                            cities = app.locators.find_cities()
                            located = [c for c in cities if c['city'].lower() == city.lower()]
                            item.setdefault('dateline', {})
                            item['dateline']['located'] = located[0] if len(located) > 0 else {'city_code': city,
                                                                                               'city': city,
                                                                                               'tz': 'UTC',
                                                                                               'dateline': 'city'}
                            item['dateline']['source'] = default_source
                            item['dateline']['text'] = format_dateline_to_locmmmddsrc(item['dateline']['located'],
                                                                                      get_date(item['firstcreated']),
                                                                                      source=default_source)

                            if mail_item.get('Priority') != '':
                                if mail_item.get('Priority', '3').isdigit():
                                    item['priority'] = int(mail_item.get('Priority', '3'))
                                else:
                                    priority_map = superdesk.get_resource_service('vocabularies').find_one(
                                        req=None, _id='priority')
                                    priorities = [x for x in priority_map.get('items', []) if
                                                  x['name'].upper() == mail_item.get('Priority', '').upper()]
                                    if priorities is not None and len(priorities) > 0:
                                        item['priority'] = int(priorities[0].get('qcode', '3'))
                                    else:
                                        item['priority'] = 3
                            if mail_item.get('News Value') != '':
                                item['urgency'] = int(mail_item.get('News Value', '3'))

                            # We expect the username supplied to correspond to a Superdesk user
                            query = {'email': re.compile('^{}$'.format(mail_item.get('Username')), re.IGNORECASE)}
                            user = superdesk.get_resource_service('users').find_one(req=None, **query)
                            if not user:
                                logger.error('Failed to find user for email {}'.format(mail_item.get('Username')))
                                raise UserNotRegisteredException()
                            item['original_creator'] = user.get('_id')
                            if BYLINE in user and user.get(BYLINE, ''):
                                item['byline'] = user.get(BYLINE)
                            item[SIGN_OFF] = user.get(SIGN_OFF)

                            # attempt to match the given desk name against the defined desks
                            query = {'name': re.compile('^{}$'.format(mail_item.get('Desk', '')), re.IGNORECASE)}
                            desk = superdesk.get_resource_service('desks').find_one(
                                req=None, **query)
                            if desk:
                                item['task'] = {'desk': desk.get('_id'), 'stage': desk.get('incoming_stage')}

                            if 'Place' in mail_item:
                                locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None,
                                                                                                      _id='locators')
                                place = [x for x in locator_map.get('items', []) if
                                         x['qcode'] == mail_item.get('Place', '').upper()]
                                if place is not None:
                                    item['place'] = place

                            if mail_item.get('Legal flag', '') == 'LEGAL':
                                item['flags'] = {'marked_for_legal': True}

                            break

            return [item]
        except Exception as ex:
            raise IngestEmailError.emailParseError(ex, provider)
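
A minimal sketch of how the form payload is flattened above: the email body carries a JSON object whose values are single-element lists, which is reduced to a plain dict before fields such as Headline and Body are copied onto the item. The sample payload below is hypothetical.

import json

# hypothetical payload; the real one is decoded from the text/plain part of the email
json_str = '{"Headline": ["Test headline"], "Slugline": ["test"], "Body": ["First par\\nSecond par"]}'

mail_item = dict((k, v[0]) for k, v in json.loads(json_str).items())

headline = mail_item.get('Headline', '')  # 'Test headline'
body_html = '<p>' + mail_item.get('Body', '').replace('\n', '</p><p>') + '</p>'
# '<p>First par</p><p>Second par</p>'
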
Example no. 43
0
 def get_dateline_date(self, ap_item):
     if ap_item.get("firstcreated"):
         dateline_date = get_date(ap_item["firstcreated"]).replace(tzinfo=pytz.UTC)
     else:
         dateline_date = utcnow()
     return dateline_date
Example no. 44
0
    def parse(self, s_json, provider=None):
        in_item = s_json.get('data', {}).get('item')
        nitf_item = s_json.get('nitf', {})
        item = {
            'guid':
            in_item.get('altids', {}).get('itemid') + ':' +
            str(in_item.get('version'))
        }
        item['source'] = provider.get('source') if provider else 'AP'

        for copy_property in self.direct_copy_properties:
            if in_item.get(copy_property) is not None:
                item[copy_property] = in_item[copy_property]

        if in_item.get('version'):
            item['version'] = in_item['version']

        if in_item.get('versioncreated'):
            item['versioncreated'] = self.datetime(
                in_item.get('versioncreated'))

        if in_item.get('firstcreated'):
            item['firstcreated'] = self.datetime(in_item.get('firstcreated'))

        if len(in_item.get('infosource', [])):
            item['original_source'] = ','.join(
                [n.get('name') for n in in_item.get('infosource', [])])

        if in_item.get('datelinelocation'):
            cities = app.locators.find_cities()
            # Try to find a single matching city, first by city and country, then also by state if needed
            located = [
                c for c in cities
                if c['city'] == in_item.get('datelinelocation').get('city')
                and c['country'] == in_item.get('datelinelocation').get(
                    'countryname')
            ]
            if len(located) > 1:
                located = [
                    c for c in cities
                    if c['city'] == in_item.get('datelinelocation').get('city')
                    and c['country'] == in_item.get('datelinelocation').get(
                        'countryname') and c['state'] == in_item.get(
                            'datelinelocation').get('countryareaname')
                ]
            if len(located) == 1:
                item['dateline'] = dict()
                item['dateline']['located'] = located[0]
                item['dateline']['source'] = provider.get('source')
                item['dateline']['text'] = format_dateline_to_locmmmddsrc(
                    item['dateline']['located'],
                    get_date(item['firstcreated']), provider.get('source'))

        if len(in_item.get('bylines', [])):
            item['byline'] = ','.join([
                n.get('name') if n.get('name') else n.get('by', '') +
                (' ({})'.format(n.get('title')) if n.get('title') else '')
                for n in in_item.get('bylines', [])
            ])
            if item.get('byline').startswith('By '):
                item['byline'] = item['byline'][3:]

        if len(in_item.get('usageterms', [])):
            item['usageterms'] = ', '.join(
                [n for n in in_item.get('usageterms', [])])

        if in_item.get('type') == 'picture':
            if in_item.get('renditions'):
                self._parse_renditions(in_item['renditions'], item, provider)

            if in_item.get('description_caption'):
                item['description_text'] = in_item.get('description_caption')
                item['archive_description'] = in_item.get(
                    'description_caption')

            if in_item.get('description_creditline'):
                item['credit'] = in_item.get('description_creditline')

            if in_item.get('photographer', {}).get('name'):
                item['byline'] = in_item.get('photographer', {}).get('name')

        if in_item.get('type') == 'text':
            # Peel off the take key if possible
            if ',' in item['slugline']:
                item['anpa_take_key'] = item['slugline'].split(',')[1]
                item['slugline'] = item['slugline'].split(',')[0]
            if item['slugline'].startswith('BC-'):
                item['slugline'] = item['slugline'][3:]
            if item.get('ednote', '').startswith('Eds:'):
                item['ednote'] = item['ednote'][5:]
            if in_item.get('headline_extended'):
                item['abstract'] = in_item.get('headline_extended')

            self.categorisation_mapping(in_item, item)

            # Map the urgency to urgency and priority
            if in_item.get('urgency'):
                item[ITEM_URGENCY] = int(
                    in_item['urgency']) if in_item['urgency'] <= 5 else 5
                item[ITEM_PRIORITY] = self.priority_map.get(
                    in_item['urgency'], 5)

            if nitf_item.get('body_html'):
                # item['body_html'] = sd_etree.clean_html_str(nitf_item.get('body_html'))
                item['body_html'] = nitf_item.get('body_html').replace(
                    '<block id="Main">', '').replace('</block>', '')

        if s_json.get('associations'):
            self._parse_associations(s_json['associations'], item, provider)

        return item
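
A minimal sketch of the dateline narrowing above, with a hypothetical in-memory city list standing in for app.locators.find_cities(): the first filter matches on city and country, and only when that is ambiguous is the state used to narrow it down.

# hypothetical city reference data
cities = [
    {'city': 'Springfield', 'country': 'United States', 'state': 'Illinois'},
    {'city': 'Springfield', 'country': 'United States', 'state': 'Missouri'},
]
datelinelocation = {'city': 'Springfield', 'countryname': 'United States',
                    'countryareaname': 'Missouri'}

located = [c for c in cities
           if c['city'] == datelinelocation.get('city')
           and c['country'] == datelinelocation.get('countryname')]
if len(located) > 1:
    # still ambiguous, so also match on state
    located = [c for c in located
               if c['state'] == datelinelocation.get('countryareaname')]

# the dateline is only populated when exactly one candidate remains
assert len(located) == 1 and located[0]['state'] == 'Missouri'
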
Example no. 45
0
    def parse(self, s_json, provider=None):
        in_item = s_json.get("data", {}).get("item")
        nitf_item = s_json.get("nitf", {})
        item = {
            "guid":
            in_item.get("altids", {}).get("itemid") + ":" +
            str(in_item.get("version"))
        }
        item["source"] = provider.get("source") if provider else "AP"

        for copy_property in self.direct_copy_properties:
            if in_item.get(copy_property) is not None:
                item[copy_property] = in_item[copy_property]

        if in_item.get("version"):
            item["version"] = in_item["version"]

        if in_item.get("versioncreated"):
            item["versioncreated"] = self.datetime(
                in_item.get("versioncreated"))

        if in_item.get("firstcreated"):
            item["firstcreated"] = self.datetime(in_item.get("firstcreated"))

        if len(in_item.get("infosource", [])):
            item["original_source"] = ",".join(
                [n.get("name") for n in in_item.get("infosource", [])])

        if in_item.get("datelinelocation"):
            cities = app.locators.find_cities()
            # Try to find a single matching city, first by city and country, then also by state if needed
            located = [
                c for c in cities
                if c["city"] == in_item.get("datelinelocation").get("city")
                and c["country"] == in_item.get("datelinelocation").get(
                    "countryname")
            ]
            if len(located) > 1:
                located = [
                    c for c in cities
                    if c["city"] == in_item.get("datelinelocation").get("city")
                    and c["country"] == in_item.get("datelinelocation").get(
                        "countryname") and c["state"] == in_item.get(
                            "datelinelocation").get("countryareaname")
                ]
            if len(located) == 1:
                item["dateline"] = dict()
                item["dateline"]["located"] = located[0]
                item["dateline"]["source"] = provider.get("source")
                item["dateline"]["text"] = format_dateline_to_locmmmddsrc(
                    item["dateline"]["located"],
                    get_date(item["firstcreated"]), provider.get("source"))

        if len(in_item.get("bylines", [])):
            item["byline"] = ",".join([
                n.get("name") if n.get("name") else n.get("by", "") +
                (" ({})".format(n.get("title")) if n.get("title") else "")
                for n in in_item.get("bylines", [])
            ])
            if item.get("byline").startswith("By "):
                item["byline"] = item["byline"][3:]

        if len(in_item.get("usageterms", [])):
            item["usageterms"] = ", ".join(
                [n for n in in_item.get("usageterms", [])])

        if in_item.get("type") == "picture":
            if in_item.get("renditions"):
                self._parse_renditions(in_item["renditions"], item, provider)

            if in_item.get("description_caption"):
                item["description_text"] = in_item.get("description_caption")
                item["archive_description"] = in_item.get(
                    "description_caption")

            if in_item.get("description_creditline"):
                item["credit"] = in_item.get("description_creditline")

            if in_item.get("photographer", {}).get("name"):
                item["byline"] = in_item.get("photographer", {}).get("name")

        if in_item.get("type") == "text":
            # Peel off the take key if possible
            if "," in item["slugline"]:
                item["anpa_take_key"] = item["slugline"].split(",")[1]
                item["slugline"] = item["slugline"].split(",")[0]
            if item["slugline"].startswith("BC-"):
                item["slugline"] = item["slugline"][3:]
            if item.get("ednote", "").startswith("Eds:"):
                item["ednote"] = item["ednote"][5:]
            if in_item.get("headline_extended"):
                item["abstract"] = in_item.get("headline_extended")

            self.categorisation_mapping(in_item, item)

            # Map the urgency to urgency and priority
            if in_item.get("urgency"):
                item[ITEM_URGENCY] = int(
                    in_item["urgency"]) if in_item["urgency"] <= 5 else 5
                item[ITEM_PRIORITY] = self.priority_map.get(
                    in_item["urgency"], 5)

            if nitf_item.get("body_html"):
                # item['body_html'] = sd_etree.clean_html_str(nitf_item.get('body_html'))
                item["body_html"] = nitf_item.get("body_html").replace(
                    '<block id="Main">', "").replace("</block>", "")

        if s_json.get("associations"):
            self._parse_associations(s_json["associations"], item, provider)

        return item
Example no. 46
0
    def _parse_doc(self, doc):
        new_doc = {}
        new_doc['_id'] = doc['refPtr']
        new_doc['guid'] = doc['refPtr']
        try:
            new_doc['description_text'] = doc['caption']
        except KeyError:
            pass
        try:
            new_doc['headline'] = doc['headline']
        except KeyError:
            pass
        try:
            new_doc['original_source'] = new_doc['source'] = doc['credit']
        except KeyError:
            pass
        new_doc['versioncreated'] = new_doc['firstcreated'] = self._datetime(
            local_to_utc(SCANPIX_TZ, get_date(doc['archivedTime'])))
        new_doc['pubstatus'] = 'usable'
        # This must match the action
        new_doc['_type'] = 'externalsource'
        # entry that the client can use to identify the fetch endpoint
        new_doc['fetch_endpoint'] = 'scanpix'

        # the mimetype is not provided directly by the Scanpix API,
        # so we guess it from the original filename
        mimetype = mimetypes.guess_type("_{}".format(
            splitext(doc.get('originalFileName', ''))[1]))[0]
        if mimetype is None:
            # nothing found from the filename, so we try our luck with fileFormat
            try:
                format_ = doc['fileFormat'].split()[0]
            except (KeyError, IndexError):
                mimetype = None
            else:
                mimetype = mimetypes.guess_type('_.{}'.format(format_))[0]
        if mimetype is not None:
            new_doc['mimetype'] = mimetype

        main_group = doc['mainGroup']
        if main_group == 'video':
            new_doc[ITEM_TYPE] = CONTENT_TYPE.VIDEO
        elif main_group == 'graphic':
            new_doc[ITEM_TYPE] = CONTENT_TYPE.GRAPHIC
            new_doc['mimetype'] = 'image/jpeg'
        else:
            new_doc[ITEM_TYPE] = CONTENT_TYPE.PICTURE

        try:
            doc_previews = doc['previews']
        except KeyError:
            logger.warning('no preview found for item {}'.format(
                new_doc['_id']))
        else:
            # we look for the best available Scanpix preview
            available_previews = [p['type'] for p in doc_previews]
            renditions = new_doc['renditions'] = {}
            for rend, previews in REND2PREV.items():
                for prev in previews:
                    if prev in available_previews:
                        idx = available_previews.index(prev)
                        renditions[rend] = {"href": doc_previews[idx]['url']}
                        break

        new_doc['byline'] = doc['byline']
        doc.clear()
        doc.update(new_doc)
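
A minimal sketch of the mimetype fallback above, using a hypothetical Scanpix document: the extension of originalFileName is tried first, and only if that yields nothing is the first token of fileFormat used.

import mimetypes
from os.path import splitext

# hypothetical Scanpix document
doc = {'originalFileName': 'photo.unknownext', 'fileFormat': 'JPEG 2000x1500'}

mimetype = mimetypes.guess_type('_{}'.format(splitext(doc.get('originalFileName', ''))[1]))[0]
if mimetype is None:
    # nothing found from the filename, fall back to the first token of fileFormat
    try:
        format_ = doc['fileFormat'].split()[0]
    except (KeyError, IndexError):
        mimetype = None
    else:
        mimetype = mimetypes.guess_type('_.{}'.format(format_))[0]

assert mimetype == 'image/jpeg'
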
Example no. 47
0
 def _get_local_time(self, time, tz=None):
     if time is None:
         time = utcnow()
     if not tz:
         tz = self.TIMEZONE
     return utc_to_local(tz, get_date(time))
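
The helper above delegates to the superdesk utc_to_local and get_date utilities. For illustration only, the equivalent conversion with pytz alone, assuming a hypothetical 'Australia/Sydney' default timezone:

from datetime import datetime

import pytz

utc_time = datetime(2017, 5, 9, 1, 55, 47, tzinfo=pytz.UTC)
local_time = utc_time.astimezone(pytz.timezone('Australia/Sydney'))
print(local_time.isoformat())  # 2017-05-09T11:55:47+10:00
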