예제 #1
0
    def proc(record, parser):
        """Run the abstract and title unescaping helpers on one record.

        A record whose ``json`` is ``None`` is reported through ``rerror``
        and neither helper is called for it.
        """
        if record.json is not None:
            unescape_abstract(record, parser)
            unescape_titles(record, parser)
            return

        rerror('record.json is None', record)
예제 #2
0
    def proc(record, parser):
        """Unescape abstract and titles of ``record`` using ``parser``.

        When the record has no json (``record.json is None``) the problem is
        logged with ``rerror`` and nothing else happens.
        """
        json_missing = record.json is None
        if json_missing:
            rerror('record.json is None', record)
        else:
            unescape_abstract(record, parser)
            unescape_titles(record, parser)
예제 #3
0
 def proc_find(record):
     """Report a record that has an author whose values are all falsy.

     The first such author triggers an ``rerror`` call and the record id is
     appended to ``missing_authors``; the remaining authors are not checked.
     Records without json or without an ``authors`` key are ignored.
     """
     if not record.json or 'authors' not in record.json:
         return

     for author in record.json['authors']:
         # No truthy value at all means the author entry is effectively empty.
         if not any(author.values()):
             rerror('error', record)
             missing_authors.append(record.id)
             return
예제 #4
0
    def proc(record):
        """Fill in ``record_creation_year`` from ``record_creation_date``.

        Records without json are reported and skipped. Records that already
        have a ``record_creation_year`` are left untouched. When the creation
        date cannot be parsed the problem is reported and the record is
        skipped; previously execution fell through to ``date.year`` on the
        falsy parse result.
        """
        if not record.json:
            rerror('no json.', record)
            return

        if 'record_creation_year' in record.json:
            # Nothing to do, the year is already present.
            return

        date = parse_date(record.json['record_creation_date'])
        if not date:
            rerror("Date couldn't be parsed: %s" % record.json['record_creation_date'], record)
            # Without a parsed date there is no year to store.
            return

        record.json['record_creation_year'] = date.year
        # In-place changes of the json column have to be flagged explicitly.
        flag_modified(record, 'json')
예제 #5
0
    def proc(record):
        """Derive ``record_creation_year`` from ``record_creation_date``.

        The record is reported via ``rerror`` and skipped when it has no json
        or when its creation date cannot be parsed. The latter case used to
        continue and fail on ``date.year``; it now returns after logging.
        Records that already contain ``record_creation_year`` are not changed.
        """
        if not record.json:
            rerror('no json.', record)
            return

        if 'record_creation_year' not in record.json:
            date = parse_date(record.json['record_creation_date'])
            if not date:
                rerror(
                    "Date couldn't be parsed: %s" %
                    record.json['record_creation_date'], record)
                # No usable date: do not touch the record.
                return

            record.json['record_creation_year'] = date.year
            # Mark the json column as changed so the update is persisted.
            flag_modified(record, 'json')
예제 #6
0
def unescape_abstract(record, parser):
    """Unescape the value of the record's single abstract.

    The record is reported with ``rerror`` and left unchanged when it has no
    abstracts or more than one. Otherwise the abstract value is passed through
    ``parser.unescape``; if the result differs, it is stored and the json
    column is flagged as modified.
    """
    data = record.json
    if 'abstracts' not in data or len(data['abstracts']) == 0:
        rerror('Record has no abstracts.', record)
        return

    abstracts = data['abstracts']
    count = len(abstracts)
    if count > 1:
        rerror('Record has more then one abstracts (%d). Skipping.' % count, record)
        return

    entry = abstracts[0]
    before = entry['value']
    after = parser.unescape(before)
    if after == before:
        # Nothing was escaped, leave the record alone.
        return

    rinfo('Abstract changed.', record)
    entry['value'] = after
    flag_modified(record, 'json')
예제 #7
0
def unescape_abstract(record, parser):
    """Run ``parser.unescape`` over the only abstract of ``record``.

    Records with zero abstracts or with more than one abstract are reported
    through ``rerror`` and skipped. A changed abstract is written back to
    ``record.json`` and the json attribute is flagged as modified.
    """
    has_abstracts = ('abstracts' in record.json
                     and len(record.json['abstracts']) != 0)
    if not has_abstracts:
        rerror('Record has no abstracts.', record)
        return

    abstracts = record.json['abstracts']
    if len(abstracts) > 1:
        message = ('Record has more then one abstracts (%d). Skipping.' %
                   len(abstracts))
        rerror(message, record)
        return

    raw_value = abstracts[0]['value']
    clean_value = parser.unescape(raw_value)
    if clean_value != raw_value:
        rinfo('Abstract changed.', record)
        abstracts[0]['value'] = clean_value
        flag_modified(record, 'json')
예제 #8
0
def unescape_titles(record, parser):
    """Unescape the ``title`` value of every entry in ``record.json['titles']``.

    Records without titles are reported with ``rerror`` and skipped. A title
    entry without a ``title`` key is reported and kept unchanged. If at least
    one title changed, a new list is stored in ``record.json['titles']`` and
    the json column is flagged as modified.

    The previous implementation updated the title dicts in place and then
    compared the resulting list with the original list. Both lists held the
    same objects, so the comparison could never detect a change and the
    record was never flagged as modified.
    """
    titles = record.json.get('titles')
    if not titles:
        rerror('Record has no titles.', record)
        return

    changed = False
    unescaped = []

    for title in titles:
        if 'title' not in title:
            rerror('title key not in title', record)
            # Keep the malformed entry as it is; there is nothing to unescape.
            unescaped.append(title)
            continue

        new_value = parser.unescape(title['title'])
        if new_value != title['title']:
            changed = True
            # Copy instead of mutating, so the stored data only changes
            # together with the flag_modified call below.
            title = dict(title, title=new_value)
        unescaped.append(title)

    if changed:
        rinfo('Titles changed.', record)
        record.json['titles'] = unescaped
        flag_modified(record, 'json')
예제 #9
0
def unescape_titles(record, parser):
    """Pass every title of ``record`` through ``parser.unescape``.

    A record without titles is reported via ``rerror`` and skipped. Title
    entries that lack the ``title`` key are reported and carried over
    unchanged. When any title value changes, the rebuilt list replaces
    ``record.json['titles']`` and the json column is flagged as modified.

    Titles are copied before being changed. The earlier in-place update made
    the "did anything change" comparison compare the list with itself, so
    changes were never flagged and therefore never persisted.
    """
    original = record.json.get('titles')
    if not original:
        rerror('Record has no titles.', record)
        return

    unescaped = []
    for title in original:
        if 'title' not in title:
            rerror('title key not in title', record)
            # Nothing to unescape here; keep the entry and move on.
            unescaped.append(title)
            continue

        # dict(...) builds a shallow copy with the unescaped title value.
        unescaped.append(dict(title, title=parser.unescape(title['title'])))

    # Entries were copied, so this comparison reflects real value changes.
    if unescaped != original:
        rinfo('Titles changed.', record)
        record.json['titles'] = unescaped
        flag_modified(record, 'json')
예제 #10
0
def map_old_record(record, dry_run):
    """
    Maps the given record if needed to comply with the new schema.

    Following fields will be mapped:
     - page_nr will be a list of integers instead of list of strings
     - arxiv id will be put to the arxiv_eprints field
     - arxiv categories will be added if not yet present
     - "arxiv:" prefix will be removed from arxiv id
     - record_creation_date will be converted to iso format

    Following fields will be deleted at the end of the process:
     - _collections
     - report_numbers
     - files
     - local_files
     - free_keywords
     - additional_files
     - file_urls
     - earliest_date

    The result won't be saved and None will be returned in the following cases:
     - the record doesn't contain a json
     - a record fails the validation after mapping
     - both report_numbers and arxiv_eprints fields are present (shouldn't happen in the existing records)
     - there is more than one value in report_numbers field (shouldn't happen in the existing records)
     - report_numbers field is present, but there is no source subfield
     - report_numbers field is present, but it holds no arxiv id
     - no record_creation_date is present
     - record_creation_date can not be parsed
     - the record has no $schema

    Note that record.json is modified in place while mapping. The dry_run
    flag only controls whether the json column is flagged as modified.

    :param record: record object exposing the ``json`` dict to be mapped
    :param dry_run: if true, the changes are not flagged as modified
    :return: the record if it was mapped and validated, None otherwise
    """

    # if there is no json, the record is considered deleted
    if not record.json:
        rerror('no json', record)
        return

    # page_nr to list of integers
    if 'page_nr' in record.json:
        record.json['page_nr'] = [int(x) for x in record.json['page_nr']]

    # extract arxiv from report_numbers if present
    if "report_numbers" in record.json and "arxiv_eprints" in record.json:
        rerror('both report_numbers and arxiv_eprints are present. Skip record.', record)
        return

    if "report_numbers" in record.json:
        report_numbers = record.json["report_numbers"]
        if len(report_numbers) > 1:
            rerror('report_numbers has more then one element. Skip record.', record)
            return

        arxiv_id = None
        for element in report_numbers:
            source = element.get('source')
            if not source:
                rerror('report_numbers present, but no source. Skip record.', record)
                return

            if source.lower() == 'arxiv':
                arxiv_id = element.get('value')
                break

        if arxiv_id:
            arxiv_id = arxiv_id.lower().replace('arxiv:', '')
            record.json['arxiv_eprints'] = [{'value': arxiv_id}]
            rinfo('report_numbers -> arxiv_eprints', record)
        else:
            rerror('report_numbers present, but no arxiv id? Skip record.', record)
            return

    # normalise arxiv ids and add arxiv category if not yet present
    for element in record.json.get("arxiv_eprints", ()):
        if 'value' not in element:
            rerror('arxiv_eprints value missing', record)
            continue

        arxiv_id = element['value']

        # remove arxiv prefix if present
        if arxiv_id.lower().startswith('arxiv:'):
            rinfo('removing "arxiv:" prefix', record)
            arxiv_id = arxiv_id[len('arxiv:'):]
            # Write the stripped id back. Before, only the local variable was
            # changed and the stored value kept its prefix.
            element['value'] = arxiv_id

        if 'categories' not in element:
            element['categories'] = get_arxiv_categories(arxiv_id)

    # record_creation_date to isoformat
    record_creation_date = record.json.get('record_creation_date')
    if record_creation_date is None:
        rerror('no record creation date. Skip record.', record)
        return

    # NOTE(review): other callers in this code base check the parse_date
    # result for falsiness, so it presumably can fail without raising.
    parsed_date = parse_date(record_creation_date)
    if not parsed_date:
        rerror("Date couldn't be parsed: %s. Skip record." % record_creation_date, record)
        return

    new_date = parsed_date.isoformat()
    if new_date != record_creation_date:
        rinfo('update record_creation_date: %s -> %s' % (record_creation_date, new_date), record)
        record.json['record_creation_date'] = new_date

    # delete unwanted fields
    unwanted_fields = (
        '_collections',
        'report_numbers',
        'files',
        'local_files',
        'free_keywords',
        'additional_files',
        'file_urls',
        'earliest_date',
    )
    for key in unwanted_fields:
        if record.json.pop(key, None) is not None:
            rinfo('deleted %s field' % key, record)

    # validate record against the schema referenced by the record itself
    valid = False
    schema = record.json.get('$schema')
    if schema is not None:
        schema_data = requests_retry_session().get(schema).content
        schema_data = json.loads(schema_data)

        try:
            validate(record.json, schema_data)
            valid = True
        except ValidationError as err:
            rerror('Invalid record: %s' % err, record)
        except SchemaError as err:
            rerror('SchemaError during record validation! %s' % err, record)
    else:
        rerror('No schema found!', record)

    if not valid:
        return

    # mark changes if not dry_run
    if not dry_run:
        flag_modified(record, 'json')

    return record
예제 #11
0
    def proc(record):
        """Repair authors and affiliations of a record from its stored xml file.

        The first entry of ``record.json['_files']`` with filetype ``xml`` is
        loaded and parsed, and its ``ce:author-group`` elements are inspected:

        - More than one author group: the authors list is rebuilt from the
          innermost groups. Each author gets the affiliations referenced by
          its ``ce:cross-ref`` elements, or the affiliations of the group when
          no such reference exists. If an author ends up without affiliation
          the record is skipped.
        - Exactly one author group: if none of the existing authors has an
          ``affiliations`` key, every author gets all affiliations of the
          group. Other combinations are only reported.

        Problems are reported through ``rerror``; changes are flagged with
        ``flag_modified``.
        """
        rinfo('start...', record)

        if '_files' not in record.json:
            rerror('Skipping. No _files', record)
            return

        # Build a real list: a lazy ``filter`` object (Python 3) is always
        # truthy and can not be indexed.
        xml_files = [f for f in record.json['_files'] if f['filetype'] == 'xml']
        if not xml_files:
            rerror('Skipping. No xml in _files', record)
            return

        file_object = ObjectVersion.get(xml_files[0]['bucket'], xml_files[0]['key'])
        uri = file_object.file.uri
        # Close the file handle as soon as the document is parsed.
        with open(uri, 'rt') as xml_file:
            xml = parse(xml_file)
        x_author_groups = xml.getElementsByTagName('ce:author-group')

        if not x_author_groups:
            rerror('Skipping. No author groups.', record)
            return

        if len(x_author_groups) > 1:
            rinfo('Reparse all authors.', record)
            authors = []

            for x_author_group in x_author_groups:
                # skip if not deepest author-group
                if x_author_group.getElementsByTagName('ce:author-group'):
                    continue

                # extract affiliations declared on the group
                affs = []
                for x_affiliation in x_author_group.getElementsByTagName('ce:affiliation'):
                    value = x_affiliation.getElementsByTagName('ce:textfn')[0].childNodes[0].nodeValue
                    affs.append({
                        u'country': find_country(value),
                        u'value': value
                    })

                # extract authors, add affiliations
                for x_author in x_author_group.getElementsByTagName('ce:author'):
                    given_name = x_author.getElementsByTagName('ce:given-name')[0].childNodes[0].nodeValue
                    surname = x_author.getElementsByTagName('ce:surname')[0].childNodes[0].nodeValue
                    full_name = '%s, %s' % (surname, given_name)

                    # affiliations explicitly referenced by this author
                    author_affs = []
                    for ref in x_author.getElementsByTagName('ce:cross-ref'):
                        affid = ref.attributes.get('refid').value
                        if 'aff' in affid:
                            aff_value = get_aff_by_id(x_author_group, affid)
                            author_affs.append({
                                u'country': find_country(aff_value),
                                u'value': aff_value
                            })

                    if not (author_affs or affs):
                        rerror('no affs for author: %s. Skip this record.' % surname, record)
                        return

                    authors.append({
                        'full_name': full_name,
                        'given_name': given_name,
                        'surname': surname,
                        # referenced affiliations win over the group ones
                        'affiliations': author_affs or affs
                    })

            if authors:
                record.json['authors'] = authors
                flag_modified(record, 'json')
                rinfo('updated', record)
            else:
                rerror('No authors found', record)

        else:
            # Exactly one author group at this point.
            for x_author_group in x_author_groups:
                x_affiliations = x_author_group.getElementsByTagName('ce:affiliation')

                if 'authors' not in record.json:
                    # Type 1 and 3: has no authors at all.
                    # FIXME collaborations could be extracted from the xml as
                    # authors here, but that would cause a lot more problems
                    # now. The code doing so was unreachable and was removed.
                    rerror('No authors... SKIPPING', record)
                    return

                # Type 2 and 4: has authors, but no affiliations.
                authors = record.json['authors']
                aff_count = sum(1 for author in authors if 'affiliations' in author)
                if aff_count == 0:
                    # Type 4: No affiliations in data.
                    new_affs = []
                    for x_affiliation in x_affiliations:
                        value = x_affiliation.getElementsByTagName('ce:textfn')[0].childNodes[0].nodeValue
                        new_affs.append({
                            u'country': find_country(value),
                            u'value': value
                        })

                    if new_affs:
                        rinfo('New affiliations: %s' % new_affs, record)
                        # FIXME modify this, if multiple author groups should be supported
                        # FIXME (not all authors should be updated)!!!
                        # Every author receives the same affiliation list.
                        for author in authors:
                            author['affiliations'] = new_affs
                        flag_modified(record, 'json')
                    else:
                        rerror('No affiliations at all. Not fixable.', record)

                elif aff_count == len(authors):
                    empty_aff_count = sum(1 for author in authors if len(author['affiliations']) == 0)
                    if empty_aff_count == len(authors):
                        # Type 2: Only empty affiliations.
                        rinfo('Type 2. Not fixable.', record)
                    else:
                        rerror('Only SOME authors have EMPTY affiliations. What now?', record)
                else:
                    rerror('Only SOME authors have affiliations. What now?', record)

        rinfo('OK', record)
예제 #12
0
 def proc(record):
     """Replace the record's json with the result of ``utf8rec``.

     The json column is flagged as modified afterwards. A record whose json
     is ``None`` is only reported through ``rerror``.
     """
     if record.json is not None:
         record.json = utf8rec(record.json)
         flag_modified(record, 'json')
         return

     rerror('record.json is None', record)
예제 #13
0
    def proc(record):
        """Add missing affiliations to the authors of a record from its xml file.

        The first entry of ``record.json['_files']`` with filetype ``xml`` is
        loaded and parsed. Only documents with exactly one ``ce:author-group``
        are handled. If none of the record's authors has an ``affiliations``
        key, every author gets all affiliations found in that group and the
        json column is flagged as modified. All other situations are only
        reported through ``rerror`` / ``rinfo``.
        """
        rinfo('start...', record)

        if '_files' not in record.json:
            rerror('Skipping. No _files', record)
            return

        # Build a real list: a lazy ``filter`` object (Python 3) is always
        # truthy and can not be indexed.
        xml_files = [f for f in record.json['_files'] if f['filetype'] == 'xml']
        if not xml_files:
            rerror('Skipping. No xml in _files', record)
            return

        file_object = ObjectVersion.get(xml_files[0]['bucket'],
                                        xml_files[0]['key'])
        uri = file_object.file.uri
        # Close the file handle as soon as the document is parsed.
        with open(uri, 'rt') as xml_file:
            xml = parse(xml_file)
        x_author_groups = xml.getElementsByTagName('ce:author-group')

        if not x_author_groups:
            rerror('Skipping. No author groups.', record)
            return

        if len(x_author_groups) > 1:
            rerror('Skipping. MORE THEN ONE author group. Not supported.',
                   record)
            return

        # Exactly one author group at this point.
        for x_author_group in x_author_groups:
            x_affiliations = x_author_group.getElementsByTagName(
                'ce:affiliation')

            if 'authors' not in record.json:
                # Type 1 and 3: has no authors at all.
                # FIXME collaborations could be extracted from the xml as
                # authors here, but that would cause a lot more problems now.
                # The code doing so was unreachable and was removed.
                rerror('No authors... SKIPPING', record)
                return

            # Type 2 and 4: has authors, but no affiliations.
            authors = record.json['authors']
            aff_count = sum(
                1 for author in authors if 'affiliations' in author)
            if aff_count == 0:
                # Type 4: No affiliations in data.
                new_affs = [{
                    u'country':
                    get_country_for_aff(x_affiliation),
                    u'value':
                    x_affiliation.getElementsByTagName('ce:textfn')
                    [0].childNodes[0].nodeValue
                } for x_affiliation in x_affiliations]
                if new_affs:
                    rinfo('New affiliations: %s' % new_affs, record)
                    # FIXME modify this, if multiple author groups should be supported
                    # FIXME (not all authors should be updated)!!!
                    # Every author receives the same affiliation list.
                    for author in authors:
                        author['affiliations'] = new_affs
                    flag_modified(record, 'json')
                else:
                    rerror('No affiliations at all. Not fixable.', record)

            elif aff_count == len(authors):
                empty_aff_count = sum(
                    1 for author in authors
                    if len(author['affiliations']) == 0)
                if empty_aff_count == len(authors):
                    # Type 2: Only empty affiliations.
                    rinfo('Type 2. Not fixable.', record)
                else:
                    rerror(
                        'Only SOME authors have EMPTY affiliations. What now?',
                        record)
            else:
                rerror('Only SOME authors have affiliations. What now?',
                       record)

        rinfo('OK', record)
예제 #14
0
 def proc(record):
     """Run ``utf8rec`` over the record's json and store the result.

     Records with ``json`` set to ``None`` are reported with ``rerror`` and
     skipped; otherwise the json column is flagged as modified.
     """
     json_missing = record.json is None
     if json_missing:
         rerror('record.json is None', record)
     else:
         record.json = utf8rec(record.json)
         flag_modified(record, 'json')