def test_dedupe_list():
    """dedupe_list drops repeated entries while keeping first-seen order."""
    duplicated = ['foo', 'bar', 'foo']
    assert dedupe_list(duplicated) == ['foo', 'bar']
def _parse_structures(record):
    """Resolve affiliation records of *record*'s authors into structure data.

    Collects the recids referenced by every author affiliation, annotates
    each affiliation dict in place with its ``recid``, fetches the matching
    Elasticsearch records and returns their deduplicated structure data.

    :param record: a record-like dict with an ``authors`` list.
    :returns: deduplicated list of structure data dicts.
    """
    structures = []
    recids = []
    for author in record.get('authors', []):
        for affiliation in author.get('affiliations', []):
            try:
                # Single lookup: the original code called
                # get_recid_from_ref(affiliation['record']) twice.
                recid = get_recid_from_ref(affiliation['record'])
            except KeyError:
                # Affiliation without a 'record' reference: skip it.
                continue
            recids.append(str(recid))
            affiliation['recid'] = recid

    try:
        records = get_es_records('ins', recids)
    except RequestError:
        # Best-effort: an ES request error yields no structures rather
        # than aborting the whole parse.
        records = []

    # NOTE: loop variable renamed so it no longer shadows the `record`
    # parameter, which the original code clobbered here.
    for es_record in records:
        structures.append(_structure_data(es_record))

    return dedupe_list(structures)
def test_dedupe_list():
    """dedupe_list removes later duplicates, preserving original order."""
    with_dupes = ["foo", "bar", "foo"]
    deduped = dedupe_list(with_dupes)
    assert deduped == ["foo", "bar"]
def dedupe_all_lists(obj):
    """Recursively remove duplicates from all lists.

    Dicts are updated in place (and returned); lists, tuples and sets are
    rebuilt as the same type with their elements deduplicated; any other
    value passes through unchanged.
    """
    # Below this size a quadratic dedupe of the raw elements is cheap
    # enough; larger collections go through the dict-aware variant.
    squared_dedupe_len = 10

    if isinstance(obj, dict):
        # In-place mutation: the caller's mapping is updated too.
        for key, value in obj.items():
            obj[key] = dedupe_all_lists(value)
        return obj

    if isinstance(obj, (list, tuple, set)):
        elements = [dedupe_all_lists(element) for element in obj]
        if len(elements) < squared_dedupe_len:
            deduped = dedupe_list(elements)
        else:
            deduped = dedupe_list_of_dicts(elements)
        # Preserve the original container type.
        return type(obj)(deduped)

    # Scalars (and anything else) are returned untouched.
    return obj
def publication_info(self, key, value):
    """Publication info about record."""
    def _maybe_int(raw):
        # First element of the (forced) list; converted to int only when
        # it is all digits, otherwise returned as the original string.
        if raw:
            first = force_force_list(raw)[0]
            if first.isdigit():
                return int(first)
            return first
        return None

    year = _maybe_int(value.get('y'))

    # Record references resolved from the numeric subfields.
    parent_record = get_record_ref(_maybe_int(value.get('0')), 'literature')
    journal_record = get_record_ref(_maybe_int(value.get('1')), 'journals')
    conference_record = get_record_ref(_maybe_int(value.get('2')), 'conferences')

    page_start, page_end, artid = split_page_artid(value.get('c'))

    return {
        'parent_record': parent_record,
        'conference_record': conference_record,
        'journal_record': journal_record,
        'page_start': page_start,
        'page_end': page_end,
        'artid': artid,
        'journal_issue': value.get('n'),
        'conf_acronym': value.get('o'),
        'journal_title': value.get('p'),
        'reportnumber': value.get('r'),
        'confpaper_info': value.get('t'),
        'journal_volume': value.get('v'),
        'cnum': force_force_list(value.get('w')),
        'pubinfo_freetext': value.get('x'),
        'year': year,
        'isbn': value.get('z'),
        'notes': dedupe_list(force_force_list(value.get('m'))),
    }
def publication_info(self, key, value):
    """Publication info about record."""
    def _maybe_int(raw):
        # Take the first forced-list element; ints stay strings unless
        # fully numeric.
        if raw:
            first = force_force_list(raw)[0]
            return int(first) if first.isdigit() else first
        return None

    year = _maybe_int(value.get('y'))

    # Build record refs from the numeric reference subfields.
    parent_record = get_record_ref(_maybe_int(value.get('0')), 'literature')
    journal_record = get_record_ref(_maybe_int(value.get('1')), 'journals')
    conference_record = get_record_ref(_maybe_int(value.get('2')), 'conferences')

    page_start, page_end, artid = split_page_artid(value.get('c'))

    # Single-valued subfields are collapsed with force_single_element.
    single = force_single_element
    return {
        'parent_record': parent_record,
        'conference_record': conference_record,
        'journal_record': journal_record,
        'page_start': page_start,
        'page_end': page_end,
        'artid': artid,
        'journal_issue': single(value.get('n')),
        'conf_acronym': single(value.get('o')),
        'journal_title': single(value.get('p')),
        'reportnumber': single(value.get('r')),
        'confpaper_info': single(value.get('t')),
        'journal_volume': single(value.get('v')),
        'cnum': single(value.get('w')),
        'pubinfo_freetext': single(value.get('x')),
        'year': year,
        'isbn': single(value.get('z')),
        'notes': dedupe_list(force_force_list(value.get('m'))),
    }
def _parse_structures(record):
    """Resolve affiliation records of *record*'s authors into structure data.

    Gathers the recids referenced by author affiliations, annotates each
    affiliation dict in place with its ``recid``, fetches the matching
    Elasticsearch records and returns their deduplicated structure data.

    :param record: a record-like dict with an ``authors`` list.
    :returns: deduplicated list of structure data dicts.
    """
    structures = []
    recids = []
    for author in record.get('authors', []):
        for affiliation in author.get('affiliations', []):
            try:
                # Single call: the original resolved the same reference
                # twice via get_recid_from_ref.
                recid = get_recid_from_ref(affiliation['record'])
            except KeyError:
                # No 'record' reference on this affiliation: skip.
                continue
            recids.append(str(recid))
            affiliation['recid'] = recid

    try:
        records = get_es_records('ins', recids)
    except RequestError:
        # Best-effort: on an ES request error produce no structures.
        records = []

    # NOTE: loop variable renamed — the original shadowed the `record`
    # parameter here.
    for es_record in records:
        structures.append(_structure_data(es_record))

    return dedupe_list(structures)
def remove_duplicates_from_list(l):
    """Return *l* with duplicate entries removed, delegating to dedupe_list."""
    deduped = dedupe_list(l)
    return deduped
def add_citation_counts(chunk_size=500, request_timeout=120):
    """Recompute and store ``citation_count`` for every HEP record in ES.

    Scans all records that have ``references.recid``, tallies how often each
    recid is cited, maps recids to record UUIDs, then bulk-updates the
    ``citation_count`` field in Elasticsearch.

    :param chunk_size: number of update actions per ES bulk request.
    :param request_timeout: per-request timeout (seconds) for the bulk call.
    """
    def _build_recid_to_uuid_map(citations_lookup):
        # Map recid -> UUID via PersistentIdentifier rows of type 'rec',
        # keeping only recids that actually appear in the citation tally.
        pids = PersistentIdentifier.query.filter(
            PersistentIdentifier.object_type == 'rec').yield_per(1000)
        with click.progressbar(pids) as bar:
            return {
                pid.object_uuid: citations_lookup[int(pid.pid_value)]
                for pid in bar
                if int(pid.pid_value) in citations_lookup
            }

    def _get_records_to_update_generator(citations_lookup):
        # Yield ES bulk 'update' actions; `index`/`doc_type` are closed over
        # from the enclosing function. NOTE: .iteritems() implies Python 2.
        with click.progressbar(citations_lookup.iteritems()) as bar:
            for uuid, citation_count in bar:
                yield {
                    '_op_type': 'update',
                    '_index': index,
                    '_type': doc_type,
                    '_id': str(uuid),
                    'doc': {'citation_count': citation_count}
                }

    index, doc_type = schema_to_index('records/hep.json')
    citations_lookup = Counter()

    click.echo('Extracting all citations...')
    # Scroll over every record that has references.recid, fetching only
    # that field; LARGE_CHUNK_SIZE records per scroll page.
    with click.progressbar(es_scan(
            current_search_client,
            query={
                '_source': 'references.recid',
                'filter': {
                    'exists': {
                        'field': 'references.recid'
                    }
                },
                'size': LARGE_CHUNK_SIZE
            },
            scroll=u'2m',
            index=index,
            doc_type=doc_type)) as records:
        for record in records:
            # Flatten (possibly scalar-or-list) recid values and dedupe so
            # each cited record is counted at most once per citing record.
            unique_refs_ids = dedupe_list(list(chain.from_iterable(map(
                force_force_list,
                get_value(record, '_source.references.recid')))))
            for unique_refs_id in unique_refs_ids:
                citations_lookup[unique_refs_id] += 1
    click.echo('... DONE.')

    click.echo('Mapping recids to UUIDs...')
    citations_lookup = _build_recid_to_uuid_map(citations_lookup)
    click.echo('... DONE.')

    click.echo('Adding citation numbers...')
    # Best-effort bulk: individual failures are tallied, not raised.
    success, failed = es_bulk(
        current_search_client,
        _get_records_to_update_generator(citations_lookup),
        chunk_size=chunk_size,
        raise_on_exception=False,
        raise_on_error=False,
        request_timeout=request_timeout,
        stats_only=True,
    )
    click.echo('... DONE: {} records updated with success. {} failures.'.format(
        success, failed))
def add_citation_counts(chunk_size=500, request_timeout=120):
    """Recompute and store ``citation_count`` for every HEP record in ES.

    Scans all records that have ``references.recid``, tallies how often each
    recid is cited, maps recids to record UUIDs, then bulk-updates the
    ``citation_count`` field in Elasticsearch.

    :param chunk_size: number of update actions per ES bulk request.
    :param request_timeout: per-request timeout (seconds) for the bulk call.
    """
    def _build_recid_to_uuid_map(citations_lookup):
        # Map recid -> UUID via PersistentIdentifier rows of type 'rec',
        # keeping only recids that actually appear in the citation tally.
        pids = PersistentIdentifier.query.filter(
            PersistentIdentifier.object_type == 'rec').yield_per(1000)
        with click.progressbar(pids) as bar:
            return {
                pid.object_uuid: citations_lookup[int(pid.pid_value)]
                for pid in bar
                if int(pid.pid_value) in citations_lookup
            }

    def _get_records_to_update_generator(citations_lookup):
        # Yield ES bulk 'update' actions; `index`/`doc_type` are closed over
        # from the enclosing function. NOTE: .iteritems() implies Python 2.
        with click.progressbar(citations_lookup.iteritems()) as bar:
            for uuid, citation_count in bar:
                yield {
                    '_op_type': 'update',
                    '_index': index,
                    '_type': doc_type,
                    '_id': str(uuid),
                    'doc': {'citation_count': citation_count}
                }

    index, doc_type = schema_to_index('records/hep.json')
    citations_lookup = Counter()

    click.echo('Extracting all citations...')
    # Scroll over every record that has references.recid, fetching only
    # that field; LARGE_CHUNK_SIZE records per scroll page.
    with click.progressbar(es_scan(
            current_search_client,
            query={
                '_source': 'references.recid',
                'filter': {
                    'exists': {
                        'field': 'references.recid'
                    }
                },
                'size': LARGE_CHUNK_SIZE
            },
            scroll=u'2m',
            index=index,
            doc_type=doc_type)) as records:
        for record in records:
            # Flatten (possibly scalar-or-list) recid values and dedupe so
            # each cited record is counted at most once per citing record.
            unique_refs_ids = dedupe_list(list(chain.from_iterable(map(
                force_force_list,
                get_value(record, '_source.references.recid')))))
            for unique_refs_id in unique_refs_ids:
                citations_lookup[unique_refs_id] += 1
    click.echo('... DONE.')

    click.echo('Mapping recids to UUIDs...')
    citations_lookup = _build_recid_to_uuid_map(citations_lookup)
    click.echo('... DONE.')

    click.echo('Adding citation numbers...')
    # Strict bulk: any bulk exception or per-item error is raised
    # (unlike the best-effort variant elsewhere in this file's history).
    success, failed = es_bulk(
        current_search_client,
        _get_records_to_update_generator(citations_lookup),
        chunk_size=chunk_size,
        raise_on_exception=True,
        raise_on_error=True,
        request_timeout=request_timeout,
        stats_only=True,
    )
    click.echo('... DONE: {} records updated with success. {} failures.'.format(
        success, failed))