def test_dedupe_list():
    """dedupe_list drops repeated entries while keeping first-seen order."""
    duplicated = ['foo', 'bar', 'foo']
    assert dedupe_list(duplicated) == ['foo', 'bar']
def _parse_structures(record):
    """Resolve affiliation records of *record*'s authors into structure data.

    Collects the recids referenced by every author affiliation, annotates
    each affiliation dict in place with its ``recid``, fetches the matching
    Elasticsearch records and returns their deduplicated structure data.

    :param record: a record-like dict with an ``authors`` list.
    :returns: deduplicated list of structure data dicts.
    """
    structures = []
    recids = []
    for author in record.get('authors', []):
        for affiliation in author.get('affiliations', []):
            try:
                # Single lookup: the original code called
                # get_recid_from_ref(affiliation['record']) twice.
                recid = get_recid_from_ref(affiliation['record'])
            except KeyError:
                # Affiliation without a 'record' reference: skip it.
                continue
            recids.append(str(recid))
            affiliation['recid'] = recid

    try:
        records = get_es_records('ins', recids)
    except RequestError:
        # Best-effort: an ES request error yields no structures rather
        # than aborting the whole parse.
        records = []

    # NOTE: loop variable renamed so it no longer shadows the `record`
    # parameter, which the original code clobbered here.
    for es_record in records:
        structures.append(_structure_data(es_record))

    return dedupe_list(structures)
def test_dedupe_list():
    """dedupe_list removes later duplicates, preserving original order."""
    with_dupes = ["foo", "bar", "foo"]
    deduped = dedupe_list(with_dupes)
    assert deduped == ["foo", "bar"]
def dedupe_all_lists(obj):
    """Recursively remove duplicates from all lists.

    Dicts are updated in place (and returned); lists, tuples and sets are
    rebuilt as the same type with their elements deduplicated; any other
    value passes through unchanged.
    """
    # Below this size a quadratic dedupe of the raw elements is cheap
    # enough; larger collections go through the dict-aware variant.
    squared_dedupe_len = 10

    if isinstance(obj, dict):
        # In-place mutation: the caller's mapping is updated too.
        for key, value in obj.items():
            obj[key] = dedupe_all_lists(value)
        return obj

    if isinstance(obj, (list, tuple, set)):
        elements = [dedupe_all_lists(element) for element in obj]
        if len(elements) < squared_dedupe_len:
            deduped = dedupe_list(elements)
        else:
            deduped = dedupe_list_of_dicts(elements)
        # Preserve the original container type.
        return type(obj)(deduped)

    # Scalars (and anything else) are returned untouched.
    return obj
def publication_info(self, key, value):
    """Publication info about record."""
    def _maybe_int(raw):
        # First element of the (forced) list; converted to int only when
        # it is all digits, otherwise returned as the original string.
        if raw:
            first = force_force_list(raw)[0]
            if first.isdigit():
                return int(first)
            return first
        return None

    year = _maybe_int(value.get('y'))

    # Record references resolved from the numeric subfields.
    parent_record = get_record_ref(_maybe_int(value.get('0')), 'literature')
    journal_record = get_record_ref(_maybe_int(value.get('1')), 'journals')
    conference_record = get_record_ref(_maybe_int(value.get('2')), 'conferences')

    page_start, page_end, artid = split_page_artid(value.get('c'))

    return {
        'parent_record': parent_record,
        'conference_record': conference_record,
        'journal_record': journal_record,
        'page_start': page_start,
        'page_end': page_end,
        'artid': artid,
        'journal_issue': value.get('n'),
        'conf_acronym': value.get('o'),
        'journal_title': value.get('p'),
        'reportnumber': value.get('r'),
        'confpaper_info': value.get('t'),
        'journal_volume': value.get('v'),
        'cnum': force_force_list(value.get('w')),
        'pubinfo_freetext': value.get('x'),
        'year': year,
        'isbn': value.get('z'),
        'notes': dedupe_list(force_force_list(value.get('m'))),
    }
def publication_info(self, key, value):
    """Publication info about record."""
    def _maybe_int(raw):
        # Take the first forced-list element; ints stay strings unless
        # fully numeric.
        if raw:
            first = force_force_list(raw)[0]
            return int(first) if first.isdigit() else first
        return None

    year = _maybe_int(value.get('y'))

    # Build record refs from the numeric reference subfields.
    parent_record = get_record_ref(_maybe_int(value.get('0')), 'literature')
    journal_record = get_record_ref(_maybe_int(value.get('1')), 'journals')
    conference_record = get_record_ref(_maybe_int(value.get('2')), 'conferences')

    page_start, page_end, artid = split_page_artid(value.get('c'))

    # Single-valued subfields are collapsed with force_single_element.
    single = force_single_element
    return {
        'parent_record': parent_record,
        'conference_record': conference_record,
        'journal_record': journal_record,
        'page_start': page_start,
        'page_end': page_end,
        'artid': artid,
        'journal_issue': single(value.get('n')),
        'conf_acronym': single(value.get('o')),
        'journal_title': single(value.get('p')),
        'reportnumber': single(value.get('r')),
        'confpaper_info': single(value.get('t')),
        'journal_volume': single(value.get('v')),
        'cnum': single(value.get('w')),
        'pubinfo_freetext': single(value.get('x')),
        'year': year,
        'isbn': single(value.get('z')),
        'notes': dedupe_list(force_force_list(value.get('m'))),
    }
def _parse_structures(record):
    """Resolve affiliation records of *record*'s authors into structure data.

    Gathers the recids referenced by author affiliations, annotates each
    affiliation dict in place with its ``recid``, fetches the matching
    Elasticsearch records and returns their deduplicated structure data.

    :param record: a record-like dict with an ``authors`` list.
    :returns: deduplicated list of structure data dicts.
    """
    structures = []
    recids = []
    for author in record.get('authors', []):
        for affiliation in author.get('affiliations', []):
            try:
                # Single call: the original resolved the same reference
                # twice via get_recid_from_ref.
                recid = get_recid_from_ref(affiliation['record'])
            except KeyError:
                # No 'record' reference on this affiliation: skip.
                continue
            recids.append(str(recid))
            affiliation['recid'] = recid

    try:
        records = get_es_records('ins', recids)
    except RequestError:
        # Best-effort: on an ES request error produce no structures.
        records = []

    # NOTE: loop variable renamed — the original shadowed the `record`
    # parameter here.
    for es_record in records:
        structures.append(_structure_data(es_record))

    return dedupe_list(structures)
def remove_duplicates_from_list(l):
    """Return *l* with duplicate entries removed, delegating to dedupe_list."""
    deduped = dedupe_list(l)
    return deduped
def add_citation_counts(chunk_size=500, request_timeout=120):
    """Recompute and store ``citation_count`` for every HEP record in ES.

    Scans all records that have ``references.recid``, tallies how often each
    recid is cited, maps recids to record UUIDs, then bulk-updates the
    ``citation_count`` field in Elasticsearch.

    :param chunk_size: number of update actions per ES bulk request.
    :param request_timeout: per-request timeout (seconds) for the bulk call.
    """
    def _build_recid_to_uuid_map(citations_lookup):
        # Map recid -> UUID via PersistentIdentifier rows of type 'rec',
        # keeping only recids that actually appear in the citation tally.
        pids = PersistentIdentifier.query.filter(
            PersistentIdentifier.object_type == 'rec').yield_per(1000)
        with click.progressbar(pids) as bar:
            return {
                pid.object_uuid: citations_lookup[int(pid.pid_value)]
                for pid in bar
                if int(pid.pid_value) in citations_lookup
            }

    def _get_records_to_update_generator(citations_lookup):
        # Yield ES bulk 'update' actions; `index`/`doc_type` are closed over
        # from the enclosing function. NOTE: .iteritems() implies Python 2.
        with click.progressbar(citations_lookup.iteritems()) as bar:
            for uuid, citation_count in bar:
                yield {
                    '_op_type': 'update',
                    '_index': index,
                    '_type': doc_type,
                    '_id': str(uuid),
                    'doc': {'citation_count': citation_count}
                }

    index, doc_type = schema_to_index('records/hep.json')
    citations_lookup = Counter()

    click.echo('Extracting all citations...')
    # Scroll over every record that has references.recid, fetching only
    # that field; LARGE_CHUNK_SIZE records per scroll page.
    with click.progressbar(es_scan(
            current_search_client,
            query={
                '_source': 'references.recid',
                'filter': {
                    'exists': {
                        'field': 'references.recid'
                    }
                },
                'size': LARGE_CHUNK_SIZE
            },
            scroll=u'2m',
            index=index,
            doc_type=doc_type)) as records:
        for record in records:
            # Flatten (possibly scalar-or-list) recid values and dedupe so
            # each cited record is counted at most once per citing record.
            unique_refs_ids = dedupe_list(list(chain.from_iterable(map(
                force_force_list,
                get_value(record, '_source.references.recid')))))
            for unique_refs_id in unique_refs_ids:
                citations_lookup[unique_refs_id] += 1
    click.echo('... DONE.')

    click.echo('Mapping recids to UUIDs...')
    citations_lookup = _build_recid_to_uuid_map(citations_lookup)
    click.echo('... DONE.')

    click.echo('Adding citation numbers...')
    # Best-effort bulk: individual failures are tallied, not raised.
    success, failed = es_bulk(
        current_search_client,
        _get_records_to_update_generator(citations_lookup),
        chunk_size=chunk_size,
        raise_on_exception=False,
        raise_on_error=False,
        request_timeout=request_timeout,
        stats_only=True,
    )
    click.echo('... DONE: {} records updated with success. {} failures.'.format(
        success, failed))
def add_citation_counts(chunk_size=500, request_timeout=120):
    """Recompute and store ``citation_count`` for every HEP record in ES.

    Scans all records that have ``references.recid``, tallies how often each
    recid is cited, maps recids to record UUIDs, then bulk-updates the
    ``citation_count`` field in Elasticsearch.

    :param chunk_size: number of update actions per ES bulk request.
    :param request_timeout: per-request timeout (seconds) for the bulk call.
    """
    def _build_recid_to_uuid_map(citations_lookup):
        # Map recid -> UUID via PersistentIdentifier rows of type 'rec',
        # keeping only recids that actually appear in the citation tally.
        pids = PersistentIdentifier.query.filter(
            PersistentIdentifier.object_type == 'rec').yield_per(1000)
        with click.progressbar(pids) as bar:
            return {
                pid.object_uuid: citations_lookup[int(pid.pid_value)]
                for pid in bar
                if int(pid.pid_value) in citations_lookup
            }

    def _get_records_to_update_generator(citations_lookup):
        # Yield ES bulk 'update' actions; `index`/`doc_type` are closed over
        # from the enclosing function. NOTE: .iteritems() implies Python 2.
        with click.progressbar(citations_lookup.iteritems()) as bar:
            for uuid, citation_count in bar:
                yield {
                    '_op_type': 'update',
                    '_index': index,
                    '_type': doc_type,
                    '_id': str(uuid),
                    'doc': {'citation_count': citation_count}
                }

    index, doc_type = schema_to_index('records/hep.json')
    citations_lookup = Counter()

    click.echo('Extracting all citations...')
    # Scroll over every record that has references.recid, fetching only
    # that field; LARGE_CHUNK_SIZE records per scroll page.
    with click.progressbar(es_scan(
            current_search_client,
            query={
                '_source': 'references.recid',
                'filter': {
                    'exists': {
                        'field': 'references.recid'
                    }
                },
                'size': LARGE_CHUNK_SIZE
            },
            scroll=u'2m',
            index=index,
            doc_type=doc_type)) as records:
        for record in records:
            # Flatten (possibly scalar-or-list) recid values and dedupe so
            # each cited record is counted at most once per citing record.
            unique_refs_ids = dedupe_list(list(chain.from_iterable(map(
                force_force_list,
                get_value(record, '_source.references.recid')))))
            for unique_refs_id in unique_refs_ids:
                citations_lookup[unique_refs_id] += 1
    click.echo('... DONE.')

    click.echo('Mapping recids to UUIDs...')
    citations_lookup = _build_recid_to_uuid_map(citations_lookup)
    click.echo('... DONE.')

    click.echo('Adding citation numbers...')
    # Strict bulk: any bulk exception or per-item error is raised
    # (unlike the best-effort variant elsewhere in this file's history).
    success, failed = es_bulk(
        current_search_client,
        _get_records_to_update_generator(citations_lookup),
        chunk_size=chunk_size,
        raise_on_exception=True,
        raise_on_error=True,
        request_timeout=request_timeout,
        stats_only=True,
    )
    click.echo('... DONE: {} records updated with success. {} failures.'.format(
        success, failed))