示例#1
0
def test_get_record_by_identifier(app, document):
    """Check that a record can be looked up from a list of identifiers."""
    known_identifiers = [{'value': '111111', 'type': 'bf:Local'}]
    unknown_identifiers = [{'value': 'oai:unknown', 'type': 'bf:Identifier'}]

    # A matching identifier must return the record of `document`.
    found = DocumentRecord.get_record_by_identifier(known_identifiers)
    assert found['pid'] == document['pid']

    # An identifier that matches nothing must not return any record.
    missing = DocumentRecord.get_record_by_identifier(unknown_identifiers)
    assert not missing
示例#2
0
def test_get_record_by_identifier(app, document):
    """Test record lookup with matching and non-matching identifiers."""
    def local(value, source):
        """Build a `bf:Local` identifier with the given value and source."""
        return {'value': value, 'type': 'bf:Local', 'source': source}

    lookup = DocumentRecord.get_record_by_identifier

    # Every identifier matches: the record of `document` is returned.
    found = lookup([local('111111', 'RERO DOC'), local('R003415713', 'RERO')])
    assert found['pid'] == document['pid']

    # One identifier has a source that does not match: nothing is returned.
    partial = lookup([
        local('111111', 'Unmatching'),
        local('R003415713', 'RERO'),
    ])
    assert not partial

    # Value of one identifier combined with the source of the other one:
    # nothing is returned.
    mixed = lookup([local('R003415713', 'RERO DOC')])
    assert not mixed

    # A `bf:Identifier` entry returns nothing, presumably because only the
    # `bf:Doi` and `bf:Local` types are analyzed by the lookup.
    unknown = lookup([{'value': 'oai:unknown', 'type': 'bf:Identifier'}])
    assert not unknown
示例#3
0
def import_records(records_to_import):
    """Import records in database and index them.

    Used as celery task. "ignore_result" flag means that we don't want to
    get the status and/or the result of the task, execution is faster.

    For each entry, an existing record is looked up from its `identifiedBy`
    values and updated with the new data; when none is found a new record is
    created. The files listed under the `files` key are then attached from
    their URL. Errors are logged and do not stop the import of the remaining
    files or records.

    :param list records_to_import: List of records to import. Each entry is
        modified in place: its `files` key is removed, as well as the `url`
        and `key` entries of each file.
    :returns: List of IDs.
    """
    indexer = RecordIndexer()

    ids = []

    for data in records_to_import:
        try:
            files_data = data.pop('files', [])

            record = DocumentRecord.get_record_by_identifier(
                data.get('identifiedBy', []))

            if not record:
                record = DocumentRecord.create(data,
                                               dbcommit=False,
                                               with_bucket=True)
            else:
                record.update(data)

            for file_data in files_data:
                # Store url and key and remove it from dict to pass dict to
                # kwargs in add_file_from_url method
                url = file_data.pop('url')
                key = file_data.pop('key')

                try:
                    record.add_file_from_url(url, key, **file_data)
                except Exception as exception:
                    # `identifiedBy` is treated as optional by the lookup
                    # above, so it is read with `get`: a missing key must not
                    # raise from this handler and abort the whole record
                    # because of a single failing file.
                    current_app.logger.warning(
                        'Error during import of file {file} of record '
                        '{record}: {error}'.format(
                            file=key,
                            error=exception,
                            record=record.get('identifiedBy')))

            # Merge record in database, at this time it's not saved into DB.
            record.commit()

            # Pushing record to database, not yet persisted into DB
            db.session.flush()

            # Add ID for bulk index in elasticsearch
            ids.append(str(record.id))

            # Read with `get` for the same reason as above: at this point the
            # record is already flushed and queued for indexing, so a missing
            # key must not turn a successful import into a logged error.
            current_app.logger.info(
                'Record with reference "{reference}" imported successfully'.
                format(reference=record.get('identifiedBy')))

        except Exception as exception:
            current_app.logger.error(
                'Error during importation of record {record}: {exception}'.
                format(record=data, exception=exception))

    # Commit and index records
    db.session.commit()
    indexer.bulk_index(ids)
    indexer.process_bulk_queue()

    return ids
示例#4
0
def update_file_permissions(permissions_file, chunk_size):
    """Apply the file restrictions listed in a CSV file to the records.

    The first CSV line is a header and must have three columns. In the
    following lines, the first column is the `bf:Local` identifier value of
    the record, the second one the permissions text and the third one the
    file key (without the `.pdf` extension).

    Problems on a single line are printed in yellow and the line is skipped;
    any other error is printed in red and stops the process.

    :param permissions_file: CSV file containing files permissions. Only its
        `name` attribute is used, to open the file for reading.
    :param chunk_size: Number of processed records after which the changes
        are committed and the records re-indexed.
    """
    indexer = RecordIndexer()

    def flush_pending(record_ids):
        """Commit the database session and re-index the given records.

        :param record_ids: IDs of the records to index.
        """
        db.session.commit()
        indexer.bulk_index(record_ids)
        indexer.process_bulk_queue()

    try:
        with open(permissions_file.name, 'r') as csv_file:
            lines = csv.reader(csv_file, delimiter=',')

            # The first line is the header, only its column count is checked.
            if len(next(lines)) != 3:
                raise Exception('CSV file seems to be not well formatted.')

            # IDs collected since the last bulk save.
            pending_ids = []

            for line in lines:
                try:
                    identifier = line[0]

                    record = DocumentRecord.get_record_by_identifier([
                        {'type': 'bf:Local', 'value': identifier}
                    ])

                    # Unknown record: report it and go to the next line.
                    if not record:
                        raise Exception(
                            'Record {record} not found'.format(
                                record=identifier))

                    file_name = '{key}.pdf'.format(key=line[2])

                    # Unknown file: report it and go to the next line.
                    if file_name not in record.files:
                        raise Exception(
                            'File {file} not found in record {record}'.format(
                                file=file_name, record=identifier))

                    record_file = record.files[file_name]
                    permissions = line[1]

                    status = re.search(r'status:(\w+)$', permissions)
                    if status:
                        # A trailing status means restricted access. The
                        # "outside organisation" flag is cleared for the RERO
                        # and INTERNAL statuses and set for any other one.
                        outside_only = status.group(1) not in [
                            'RERO', 'INTERNAL'
                        ]
                        record_file['access'] = 'coar:c_16ec'
                        record_file[
                            'restricted_outside_organisation'] = outside_only
                    else:
                        # Otherwise look for a role followed, on the next
                        # line, by a date: this means embargoed access.
                        embargo = re.search(
                            r'allow roles \/\.\*,(\w+),\.\*\/\n\s+.+(\d{4}-'
                            r'\d{2}-\d{2})', permissions)

                        if embargo:
                            # The flag is cleared for the INTERNAL role only.
                            outside_only = embargo.group(1) != 'INTERNAL'
                            record_file[
                                'restricted_outside_organisation'
                            ] = outside_only
                            record_file['access'] = 'coar:c_f1cf'
                            record_file['embargo_date'] = embargo.group(2)

                    # NOTE: reached even when none of the two patterns
                    # matched, in which case the file is left unchanged.
                    record.commit()
                    db.session.flush()
                    pending_ids.append(str(record.id))

                    current_app.logger.warning(
                        'Restriction added for file {file} in record {record}.'
                        .format(file=file_name, record=record['pid']))

                    # Save and index once a full chunk has been collected.
                    if len(pending_ids) % chunk_size == 0:
                        flush_pending(pending_ids)
                        pending_ids = []

                except Exception as error:
                    click.secho(str(error), fg='yellow')

        # Save and index what remains after the last full chunk.
        flush_pending(pending_ids)

        click.secho('Process finished', fg='green')

    except Exception as error:
        message = 'An error occured during file process: {error}'.format(
            error=str(error))
        click.secho(message, fg='red')