def issue_ids_to_article_ids(collection, items):
    """
    Return a dictionary, like:
        {'issn': [pid, pid, ...],
         'issn': [pid, pid, ...],
         'issn': [pid, pid, ...]}
    """
    data_dict = {}
    domain = "%s:%s" % (ARTICLE_META_THRIFT_DOMAIN, ARTICLE_META_THRIFT_PORT)
    cl = ThriftClient(domain, timeout=ARTICLE_META_THRIFT_TIMEOUT)

    for issn, icodes in items.items():
        d = data_dict.setdefault(issn, [])
        for icode in icodes:
            for code in cl.documents(
                    collection=collection,
                    only_identifiers=True,
                    extra_filter='{"code_issue":"%s"}' % icode):
                if code:
                    d.append(code.code)

    return data_dict
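# Usage sketch (an assumption, not part of the original module): the collection
# acronym, ISSN and issue code below are placeholders, and a reachable
# ArticleMeta Thrift server plus the ARTICLE_META_THRIFT_* settings are assumed.
if __name__ == '__main__':
    sample = issue_ids_to_article_ids('scl', {'0001-3765': ['0001-376520130001']})
    for issn, pids in sample.items():
        print('%s: %d article ids' % (issn, len(pids)))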
def common_mode(self):
    art_meta = ThriftClient()

    logger.info("Running without differential mode")
    logger.info("Indexing in {0}".format(self.solr.url))
    for document in art_meta.documents(
        collection=self.collection,
        issn=self.issn,
        from_date=self.format_date(self.from_date),
        until_date=self.format_date(self.until_date)
    ):
        logger.debug("Loading document %s" % '_'.join(
            [document.collection_acronym, document.publisher_id]))

        try:
            xml = self.pipeline_to_xml(document)
            self.solr.update(xml, commit=False)
        except ValueError as e:
            logger.error("ValueError: {0}".format(e))
            logger.exception(e)
            continue
        except Exception as e:
            logger.error("Error: {0}".format(e))
            logger.exception(e)
            continue

    if self.delete is True:
        logger.info("Running remove records process.")
        ind_ids = set()
        art_ids = set()

        itens_query = []
        if self.collection:
            itens_query.append('in:%s' % self.collection)

        if self.issn:
            itens_query.append('issn:%s' % self.issn)

        query = '*:*' if len(itens_query) == 0 else ' AND '.join(itens_query)
        list_ids = json.loads(self.solr.select(
            {'q': query, 'fl': 'id', 'rows': 1000000}))['response']['docs']

        for id in list_ids:
            ind_ids.add(id['id'])

        # all ids in articlemeta
        for item in art_meta.documents(
            collection=self.collection,
            issn=self.issn,
            only_identifiers=True
        ):
            art_ids.add('%s-%s' % (item.code, item.collection))

        # Ids to remove (must be computed before it is counted and logged)
        remove_ids = ind_ids - art_ids
        total_to_remove = len(remove_ids)
        logger.info("Removing (%d) documents from search index." % total_to_remove)
        for ndx, to_remove_id in enumerate(remove_ids, 1):
            logger.debug("Removing (%d/%d): %s" % (ndx, total_to_remove, to_remove_id))
            self.solr.delete('id:%s' % to_remove_id, commit=False)
def registry_dispatcher_document(self, code, collection):
    """
    This task receives a document code and collection and queues the
    document for DOI registry.
    """
    articlemeta = ThriftClient(domain=ARTICLEMETA_THRIFTSERVER)

    document = articlemeta.document(code, collection)
    code = '_'.join([document.collection_acronym, document.publisher_id])
    log_title = 'Reading document: %s' % code
    logger.info(log_title)
    xml_file_name = '%s.xml' % code
    doi = document.doi or ''
    doi_prefix = document.doi.split('/')[0] if doi else ''
    now = datetime.now()

    if SUGGEST_DOI_IDENTIFICATION is True and not doi:
        doi_prefix = CROSSREF_PREFIX
        doi = '/'.join([
            CROSSREF_PREFIX,
            document.publisher_ahead_id or document.publisher_id
        ])

    depitem = Deposit(
        code=code,
        pid=document.publisher_id,
        issn=document.journal.scielo_issn,
        volume=document.issue.volume,
        number=document.issue.number,
        issue_label=document.issue.label,
        journal=document.journal.title,
        journal_acronym=document.journal.acronym,
        collection_acronym=document.collection_acronym,
        xml_file_name=xml_file_name,
        doi=doi,
        publication_year=int(document.publication_date[0:4]),
        prefix=doi_prefix,
        has_submission_xml_valid_references=False,
        submission_updated_at=now,
        submission_status='waiting',
        updated_at=now,
        started_at=now
    )

    with transactional_session() as session:
        deposit = session.query(Deposit).filter_by(code=code).first()
        if deposit:
            logger.info(
                'deposit already exists. it will be deleted and '
                're-created: "%s"', code)
            session.delete(deposit)

        session.add(depitem)
        logger.info('deposit successfully created for "%s": %s',
                    code, repr(depitem))

    chain(
        triage_deposit.s(code).set(queue='dispatcher'),
        load_xml_from_articlemeta.s().set(queue='dispatcher'),
        prepare_document.s().set(queue='dispatcher'),
        register_doi.s().set(queue='dispatcher'),
        request_doi_status.s().set(queue='releaser')).delay()
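# Dispatch sketch (an assumption, not from the source): registry_dispatcher_document
# is a bound Celery task, so it would normally be queued rather than called
# directly; the PID, collection acronym and queue name below are placeholders.
#
#     registry_dispatcher_document.apply_async(
#         args=['S0001-37652013000100001', 'scl'], queue='dispatcher')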
def get_issn_by_acron(collection, acron):
    domain = "%s:%s" % (ARTICLE_META_THRIFT_DOMAIN, ARTICLE_META_THRIFT_PORT)
    cl = ThriftClient(domain, timeout=ARTICLE_META_THRIFT_TIMEOUT)

    for journal in cl.journals(collection=collection):
        if journal.acronym == acron:
            return journal.scielo_issn
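# Usage sketch (an assumption, not part of the original module): the collection
# and journal acronym below are placeholders; the function returns None when
# no journal in the collection matches the acronym.
if __name__ == '__main__':
    print(get_issn_by_acron('scl', 'abc'))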
def __init__(self, collection, issns=None, from_date=FROM, until_date=UNTIL):
    self._articlemeta = ThriftClient(domain=os.environ.get(
        'ARTICLEMETA_THRIFTSERVER', 'articlemeta.scielo.org:11621'))
    self._depositor = Depositor()
    self.collection = collection
    self.from_date = from_date
    self.until_date = until_date
    self.issns = issns or [None]
def load_articlemeta_journals_ids(collection, issns=None):
    rc = ThriftClient(domain=ARTICLEMETA_THRIFTSERVER, admintoken=ADMINTOKEN)

    journals_pids = []
    logger.info('Loading articlemeta journals ids')
    for issn in issns or [None]:
        for journal in rc.journals(collection, issn=issn, only_identifiers=True):
            logger.debug(
                'Loading articlemeta journal id (%s)',
                '_'.join([journal.collection, journal.code])
            )
            journals_pids.append('_'.join([
                journal.collection,
                journal.code,
                journal.processing_date.replace('-', '')
            ]))

    return journals_pids
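# Usage sketch (an assumption, not part of the original module): the returned
# ids follow the '<collection>_<journal code>_<processing date>' pattern built
# above, e.g. 'scl_0001-3765_20200114'; collection and ISSN are placeholders.
if __name__ == '__main__':
    for journal_id in load_articlemeta_journals_ids('scl', issns=['0001-3765']):
        print(journal_id)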
class ExportDOI(object):

    def __init__(self, collection, issns=None, from_date=FROM, until_date=UNTIL):
        self._articlemeta = ThriftClient(domain=os.environ.get(
            'ARTICLEMETA_THRIFTSERVER', 'articlemeta.scielo.org:11621'))
        self._depositor = Depositor()
        self.collection = collection
        self.from_date = from_date
        self.until_date = until_date
        self.issns = issns or [None]

    def run(self):
        logger.info(
            'started collecting articles with processing dates '
            'between "%s" and "%s"', self.from_date, self.until_date)

        count = 0
        for issn in self.issns:
            for document in self._articlemeta.documents(
                    collection=self.collection, issn=issn,
                    from_date=self.from_date, until_date=self.until_date,
                    only_identifiers=True):
                code = '_'.join([document.collection, document.code])
                logger.info('collecting document for deposit: %s', code)
                self._depositor.deposit_by_pids([code])
                count += 1

        logger.info('finished collecting documents. total: %d', count)
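# Usage sketch (an assumption, not part of the original module): the collection,
# ISSN and date range below are placeholders; a configured ArticleMeta Thrift
# server and Depositor backend are assumed to be reachable.
if __name__ == '__main__':
    ExportDOI('scl', issns=['0001-3765'],
              from_date='2020-01-01', until_date='2020-12-31').run()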
def get_issns_by_acrons(collection, acrons):
    issn_list = []
    domain = "%s:%s" % (ARTICLE_META_THRIFT_DOMAIN, ARTICLE_META_THRIFT_PORT)
    cl = ThriftClient(domain, timeout=ARTICLE_META_THRIFT_TIMEOUT)
    acrons = set(acrons)

    for journal in cl.journals(collection=collection):
        if not acrons:
            break

        if journal.acronym in acrons:
            acrons.remove(journal.acronym)
            issn_list.append(journal.scielo_issn)

    return issn_list
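# Usage sketch (an assumption, not part of the original module): the acronyms
# below are placeholders; acronyms not found in the collection are simply left
# out of the returned list.
if __name__ == '__main__':
    print(get_issns_by_acrons('scl', ['abc', 'xyz']))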
def issue_labels_to_ids(collection, items):
    """
    Return a dictionary, like:
        {'issn': set([id, id]),
         'issn': set([id, id]),
         'issn': set([id, id])}
    """
    data_dict = {}
    domain = "%s:%s" % (ARTICLE_META_THRIFT_DOMAIN, ARTICLE_META_THRIFT_PORT)
    cl = ThriftClient(domain, timeout=ARTICLE_META_THRIFT_TIMEOUT)

    for issn, labels in items.items():
        d = data_dict.setdefault(issn, set())
        for label in labels:
            code = cl.get_issue_code_from_label(label, issn, collection)
            if code:
                d.add(code)

    return data_dict
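# Usage sketch (an assumption, not part of the original module): the ISSN and
# issue label below are placeholders; labels that cannot be resolved to an
# issue code are silently skipped.
if __name__ == '__main__':
    print(issue_labels_to_ids('scl', {'0001-3765': ['v85n1']}))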
def main():
    usage = """Populates the journals table used by the SUSHI API for COUNTER reports"""

    parser = argparse.ArgumentParser(usage)

    parser.add_argument(
        '-u', '--matomo_db_uri',
        default=MATOMO_DATABASE_STRING,
        dest='matomodb_uri',
        help='SQL database connection string in the format '
             'mysql://username:password@host1:port/database'
    )

    parser.add_argument(
        '-t', '--use_thrift',
        dest='use_thrift',
        default=False,
        action='store_true',
        help='Use the ArticleMeta Thrift client instead of the RestfulClient'
    )

    parser.add_argument(
        '--logging_level',
        choices=['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG', 'NOTSET'],
        dest='logging_level',
        default=LOGGING_LEVEL,
        help='Logging level'
    )

    params = parser.parse_args()
    logging.basicConfig(level=params.logging_level)

    if not params.use_thrift:
        articlemeta = RestfulClient()
    else:
        articlemeta = ThriftClient()

    populate(articlemeta=articlemeta, db_session=SESSION_FACTORY())
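# Hypothetical invocation (the script name is an assumption, not from the source):
#
#     python populate_journals.py --use_thrift --logging_level INFO
#     python populate_journals.py -u mysql://user:pass@localhost:3306/matomo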
def run(self):
    client = ThriftClient()

    logger.info('Creating zip file: %s', self.zip_name)
    logger.info('XML Format: %s', self.xml_format)
    with zipfile.ZipFile(
            self.zip_name, 'w',
            compression=zipfile.ZIP_DEFLATED,
            allowZip64=True) as thezip:
        for pid, collection, document in self.items():
            logger.debug('Loading XML file for %s', '_'.join([collection, pid]))
            collection = trans_acronym.get(collection, collection)
            issn = pid[1:10]
            xml_file = '{0}/{1}/{2}.xml'.format(collection, issn, pid)
            thezip.writestr(xml_file, bytes(document.encode('utf-8')))

        # append the README template, stamped with the dump date
        with open(os.path.dirname(__file__) + '/templates/dumparticle_readme.txt', 'r') as readme_template:
            readmef = readme_template.read()

        readme = '{0}\r\n* Documents updated at: {1}\r\n'.format(
            readmef, datetime.datetime.now().isoformat())
        thezip.writestr("README.txt", bytes(readme.encode('utf-8')))

        if self.xml_format == 'xmlwos':
            xsd = getschema()
            if xsd:
                thezip.writestr(
                    "schema/ThomsonReuters_publishing.xsd",
                    bytes(xsd.encode('utf-8')))

    logger.info('Zip created: %s', self.zip_name)
    logger.info('Processing finished')
def run(collection, issns, full_rebuild=False, force_delete=False,
        bulk_size=BULK_SIZE):
    rc = ThriftClient(domain=ARTICLEMETA_THRIFTSERVER, admintoken=ADMINTOKEN)

    logger.info('Running Isis2mongo')
    logger.debug('Thrift Server: %s', ARTICLEMETA_THRIFTSERVER)
    logger.debug('Admin Token: %s', ADMINTOKEN)

    logger.info('Loading ArticleMeta identifiers for collection: %s', collection)
    articlemeta_documents = set(
        load_articlemeta_documents_ids(collection, issns))
    articlemeta_issues = set(
        load_articlemeta_issues_ids(collection, issns))
    articlemeta_journals = set(
        load_articlemeta_journals_ids(collection, issns))

    if full_rebuild is True:
        articlemeta_documents = set([])
        articlemeta_issues = set([])
        articlemeta_journals = set([])

    with DataBroker(uuid.uuid4()) as ctrl:
        update_issue_id = ''
        fields_to_update_after_loading_documents = []
        bulk = {}
        bulk_count = 0
        for coll, record in load_isis_records(collection, issns):
            bulk_count += 1
            bulk.setdefault(coll, [])
            bulk[coll].append(record)
            if bulk_count == bulk_size:
                bulk_count = 0
                ctrl.bulk_data(dict(bulk))
                bulk = {}
            # ctrl.write_record(coll, record)

            # Write field 4 in issue database
            rec_type = record.get('v706', [{'_': ''}])[0]['_']
            if rec_type == 'h':
                if update_issue_id == record['v880'][0]['_'][1:18]:
                    continue
                fields_to_update_after_loading_documents.append([
                    'issues',
                    'v4',
                    record['v4'][0]['_'],
                    record['v880'][0]['_'][1:18]
                ])

        # bulk residual data
        ctrl.bulk_data(dict(bulk))

        logger.info('Updating fields metadata')
        total_fields_to_update = len(fields_to_update_after_loading_documents)
        for ndx, item in enumerate(fields_to_update_after_loading_documents, 1):
            logger.debug("Updating (%d, %d) %s", ndx, total_fields_to_update, str(item))
            ctrl.update_field(*item)

        logger.info('Loading legacy identifiers')
        legacy_documents = set(ctrl.articles_ids)
        legacy_issues = set(ctrl.issues_ids)
        legacy_journals = set(ctrl.journals_ids)

        logger.info('Producing lists of differences between ArticleMeta and Legacy databases')
        new_documents = list(legacy_documents - articlemeta_documents)
        new_issues = list(legacy_issues - articlemeta_issues)
        new_journals = list(legacy_journals - articlemeta_journals)

        am_document_pids_only = set([i[0:27] for i in articlemeta_documents])
        lg_document_pids_only = set([i[0:27] for i in legacy_documents])
        to_remove_documents = list(am_document_pids_only - lg_document_pids_only)

        am_issue_pids_only = set([i[0:21] for i in articlemeta_issues])
        lg_issue_pids_only = set([i[0:21] for i in legacy_issues])
        to_remove_issues = list(am_issue_pids_only - lg_issue_pids_only)

        am_journals_pids_only = set([i[0:13] for i in articlemeta_journals])
        lg_journals_pids_only = set([i[0:13] for i in legacy_journals])
        to_remove_journals = list(am_journals_pids_only - lg_journals_pids_only)

        # Including and Updating Documents
        logger.info(
            'Documents being included into articlemeta (%d)',
            len(new_documents)
        )
        for ndx, item in enumerate(new_documents, 1):
            item = item.split('_')
            try:
                document_meta = ctrl.load_document(item[0], item[1])
            except:
                logger.error(
                    'Fail to load document into Articlemeta (%s)',
                    '_'.join([item[0], item[1]])
                )
                continue

            if not document_meta:
                logger.error(
                    'Fail to load document into Articlemeta (%s)',
                    '_'.join([item[0], item[1]])
                )
                continue

            try:
                rc.add_document(json.dumps(document_meta))
            except ServerError:
                logger.error(
                    'Fail to load document into Articlemeta (%s)',
                    '_'.join([item[0], item[1]])
                )
                continue

            logger.debug(
                'Document (%d, %d) loaded into Articlemeta (%s)',
                ndx, len(new_documents), '_'.join([item[0], item[1]])
            )

        # Removing Documents
        total_to_remove_documents = len(to_remove_documents)
        logger.info(
            'Documents to be removed from articlemeta (%d)',
            total_to_remove_documents
        )
        skip_deletion = True
        if total_to_remove_documents > SECURE_ARTICLE_DELETIONS_NUMBER:
            logger.info('Too many documents to be removed')
            if force_delete is False:
                skip_deletion = True
                logger.info('force_delete is set to %s, the remove task will be skipped', force_delete)
            else:
                skip_deletion = False
        else:
            skip_deletion = False

        for ndx, item in enumerate(to_remove_documents, 1):
            item = item.split('_')
            if skip_deletion is True:
                logger.debug(
                    'Document remove task (%d, %d) will be skipped (%s)',
                    ndx, total_to_remove_documents, '_'.join([item[0], item[1]])
                )
                continue
            try:
                rc.delete_document(item[1], item[0])
                logger.debug(
                    'Document (%d, %d) removed from Articlemeta (%s)',
                    ndx, total_to_remove_documents, '_'.join([item[0], item[1]])
                )
            except UnauthorizedAccess:
                logger.warning('Unauthorized access to remove items, check the ArticleMeta admin token')

        # Including and Updating Journals
        logger.info(
            'Journals being included into articlemeta (%d)',
            len(new_journals)
        )
        for ndx, item in enumerate(new_journals, 1):
            item = item.split('_')
            try:
                journal_meta = ctrl.load_journal(item[0], item[1])
            except:
                logger.error(
                    'Fail to load journal into Articlemeta (%s)',
                    '_'.join([item[0], item[1]])
                )
                continue

            if not journal_meta:
                logger.error(
                    'Fail to load journal into Articlemeta (%s)',
                    '_'.join([item[0], item[1]])
                )
                continue

            try:
                rc.add_journal(json.dumps(journal_meta))
            except ServerError:
                logger.error(
                    'Fail to load journal into Articlemeta (%s)',
                    '_'.join([item[0], item[1]])
                )
                continue

            logger.debug(
                'Journal (%d, %d) loaded into Articlemeta (%s)',
                ndx, len(new_journals), '_'.join([item[0], item[1]])
            )

        # Removing Journals
        total_to_remove_journals = len(to_remove_journals)
        logger.info(
            'Journals to be removed from articlemeta (%d)',
            total_to_remove_journals
        )
        skip_deletion = True
        if total_to_remove_journals > SECURE_JOURNAL_DELETIONS_NUMBER:
            logger.info('Too many journals to be removed')
            if force_delete is False:
                skip_deletion = True
                logger.info('force_delete is set to %s, the remove task will be skipped', force_delete)
            else:
                skip_deletion = False
        else:
            skip_deletion = False

        for ndx, item in enumerate(to_remove_journals, 1):
            item = item.split('_')
            if skip_deletion is True:
                logger.debug(
                    'Journal remove task (%d, %d) will be skipped (%s)',
                    ndx, total_to_remove_journals, '_'.join([item[0], item[1]])
                )
                continue
            try:
                rc.delete_journal(item[1], item[0])
                logger.debug(
                    'Journal (%d, %d) removed from Articlemeta (%s)',
                    ndx, total_to_remove_journals, '_'.join([item[0], item[1]])
                )
            except UnauthorizedAccess:
                logger.warning('Unauthorized access to remove items, check the ArticleMeta admin token')

        # Including and Updating Issues
        logger.info(
            'Issues being included into articlemeta (%d)',
            len(new_issues)
        )
        for ndx, item in enumerate(new_issues, 1):
            item = item.split('_')
            try:
                issue_meta = ctrl.load_issue(item[0], item[1])
            except:
                logger.error(
                    'Fail to load issue into Articlemeta (%s)',
                    '_'.join([item[0], item[1]])
                )
                continue

            if not issue_meta:
                logger.error(
                    'Fail to load issue into Articlemeta (%s)',
                    '_'.join([item[0], item[1]])
                )
                continue

            try:
                rc.add_issue(json.dumps(issue_meta))
            except ServerError:
                logger.error(
                    'Fail to load issue into Articlemeta (%s)',
                    '_'.join([item[0], item[1]])
                )
                continue

            logger.debug(
                'Issue (%d, %d) loaded into Articlemeta (%s)',
                ndx, len(new_issues), '_'.join([item[0], item[1]])
            )

        # Removing Issues
        total_to_remove_issues = len(to_remove_issues)
        logger.info(
            'Issues to be removed from articlemeta (%d)',
            total_to_remove_issues
        )
        skip_deletion = True
        if total_to_remove_issues > SECURE_ISSUE_DELETIONS_NUMBER:
            logger.info('Too many issues to be removed')
            if force_delete is False:
                skip_deletion = True
                logger.info('force_delete is set to %s, the remove task will be skipped', force_delete)
            else:
                skip_deletion = False
        else:
            skip_deletion = False

        for ndx, item in enumerate(to_remove_issues, 1):
            item = item.split('_')
            if skip_deletion is True:
                logger.debug(
                    'Issue remove task (%d, %d) will be skipped (%s)',
                    ndx, total_to_remove_issues, '_'.join([item[0], item[1]])
                )
                continue
            try:
                rc.delete_issue(item[1], item[0])
                logger.debug(
                    'Issue (%d, %d) removed from Articlemeta (%s)',
                    ndx, total_to_remove_issues, '_'.join([item[0], item[1]])
                )
            except UnauthorizedAccess:
                logger.warning('Unauthorized access to remove items, check the ArticleMeta admin token')

    logger.info('Process Isis2mongo Finished')
def run(self):
    """
    Run the process for update article in Solr.
    """
    art_meta = ThriftClient()

    if self.delete:
        self.solr.delete(self.delete, commit=True)
    else:
        logger.info("Indexing in {0}".format(self.solr.url))
        for document in art_meta.documents(
                collection=self.collection,
                issn=self.issn,
                from_date=self.format_date(self.from_date),
                until_date=self.format_date(self.until_date)):
            logger.debug("Loading document %s" % '_'.join(
                [document.collection_acronym, document.publisher_id]))

            try:
                xml = self.pipeline_to_xml(document)
                self.solr.update(xml, commit=True)
            except ValueError as e:
                logger.error("ValueError: {0}".format(e))
                logger.exception(e)
                continue
            except Exception as e:
                logger.error("Error: {0}".format(e))
                logger.exception(e)
                continue

        if self.sanitization is True:
            logger.info("Running sanitization process")
            ind_ids = set()
            art_ids = set()

            itens_query = []
            if self.collection:
                itens_query.append('in:%s' % self.collection)

            if self.issn:
                itens_query.append('issn:%s' % self.issn)

            query = '*:*' if len(itens_query) == 0 else ' AND '.join(itens_query)
            list_ids = json.loads(self.solr.select(
                {'q': query, 'fl': 'id', 'rows': 1000000}))['response']['docs']

            for id in list_ids:
                ind_ids.add(id['id'])

            # all ids in articlemeta
            for item in art_meta.documents(
                    collection=self.collection,
                    issn=self.issn,
                    only_identifiers=True):
                art_ids.add('%s-%s' % (item.code, item.collection))

            # Ids to remove
            remove_ids = ind_ids - art_ids

            for id in remove_ids:
                logger.debug("Removing id: %s" % id)
                self.solr.delete('id:%s' % id, commit=True)

    # optimize the index
    self.solr.commit()
    self.solr.optimize()
}

ROBOTS = [
    i.strip() for i in open(utils.settings.get('robots_file', 'robots.txt'))
]
APACHE_LOG_FORMAT = utils.settings.get(
    'log_format',
    r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"')
COMPILED_ROBOTS = [re.compile(i.lower()) for i in ROBOTS]
REGEX_ISSN = re.compile(r"^[0-9]{4}-[0-9]{3}[0-9xX]$")
REGEX_ISSUE = re.compile(r"^[0-9]{4}-[0-9]{3}[0-9xX][0-2][0-9]{3}[0-9]{4}$")
REGEX_ARTICLE = re.compile(
    r"^[0-9]{4}-[0-9]{3}[0-9xX][0-2][0-9]{3}[0-9]{4}[0-9]{5}$")
REGEX_FBPE = re.compile(r"^[0-9]{4}-[0-9]{3}[0-9xX]\([0-9]{2}\)[0-9]{8}$")

am_client = ThriftClient(domain='articlemeta.scielo.org:11621')


def _allowed_collections():
    """Get the list of collections from ArticleMeta."""
    allowed_collections = []
    try:
        collections = am_client.collections()
    except:
        logger.error('Fail to retrieve collections from thrift server')
        return allowed_collections

    return [i.code for i in collections]
def load_xml_from_articlemeta(self, code):
    articlemeta = ThriftClient(domain=ARTICLEMETA_THRIFTSERVER)

    exc_log_title = ''
    with transactional_session() as session:
        deposit = session.query(Deposit).filter_by(code=code).first()

        log_title = 'Loading XML document from ArticleMeta (%s)' % code
        log_event(
            session, {
                'title': log_title,
                'type': 'submission',
                'status': 'info',
                'deposit_code': code
            })

        try:
            xml = articlemeta.document(
                deposit.pid, deposit.collection_acronym, fmt='xmlcrossref')
        except Exception as exc:
            logger.info('could not fetch Crossref XML for "%s": %s',
                        code, str(exc))
            logger.exception(exc)
            deposit.submission_status = 'error'
            deposit.submission_updated_at = datetime.now()
            deposit.updated_at = datetime.now()
            log_title = 'Fail to load XML document from ArticleMeta (%s)' % code
            log_event(
                session, {
                    'title': log_title,
                    'body': str(exc),
                    'type': 'submission',
                    'status': 'error',
                    'deposit_code': code
                })
            exc_log_title = log_title
        else:
            deposit.submission_status = 'waiting'
            deposit.submission_xml = xml
            deposit.submission_updated_at = datetime.now()
            deposit.updated_at = datetime.now()
            log_title = 'XML Document loaded from ArticleMeta (%s)' % code
            log_event(
                session, {
                    'title': log_title,
                    'type': 'submission',
                    'status': 'success',
                    'deposit_code': code
                })

    if exc_log_title:
        raise self.retry(exc=ComunicationError(exc_log_title))

    return code
def client(self):
    """
    Return a new ThriftClient instance.
    """
    client = ThriftClient(domain=self._domain, timeout=self.timeout)

    return client
def run(self):
    """
    Run the process for update article in Solr.
    """
    art_meta = ThriftClient()

    if self.args.delete:
        self.solr.delete(self.args.delete, commit=True)
    elif self.args.sanitization:
        # set of index ids
        ind_ids = set()
        # set of articlemeta ids
        art_ids = set()

        # all ids in index
        list_ids = json.loads(self.solr.select(
            {'q': '*:*', 'fl': 'id', 'rows': 1000000}))['response']['docs']

        for id in list_ids:
            ind_ids.add(id['id'])

        # all ids in articlemeta
        for item in art_meta.documents(only_identifiers=True):
            if item.collection not in ALLOWED_COLLECTION:
                continue
            art_ids.add('%s-%s' % (item.code, item.collection))

        # Ids to remove
        remove_ids = ind_ids - art_ids

        for id in remove_ids:
            self.solr.delete('id:%s' % id, commit=True)

        logger.info("List of removed ids: %s" % remove_ids)
    else:
        # Get article identifiers
        logger.info("Indexing in {0}".format(self.solr.url))
        for document in art_meta.documents(
            collection=self.args.collection,
            issn=self.args.issn,
            from_date=self.format_date(self.args.from_date),
            until_date=self.format_date(self.args.until_date)
        ):
            try:
                xml = self.pipeline_to_xml(document)
                self.solr.update(xml, commit=True)
            except ValueError as e:
                logger.error("ValueError: {0}".format(e))
                logger.exception(e)
                continue
            except Exception as e:
                logger.error("Error: {0}".format(e))
                logger.exception(e)
                continue

    # optimize the index
    self.solr.commit()
    self.solr.optimize()
def differential_mode(self):
    art_meta = ThriftClient()

    logger.info("Running with differential mode")
    ind_ids = set()
    art_ids = set()

    # all ids in search index
    logger.info("Loading Search Index ids.")
    itens_query = []
    if self.collection:
        itens_query.append('in:%s' % self.collection)

    if self.issn:
        itens_query.append('issn:%s' % self.issn)

    query = '*:*' if len(itens_query) == 0 else ' AND '.join(itens_query)
    list_ids = json.loads(self.solr.select(
        {'q': query, 'fl': 'id,scielo_processing_date', 'rows': 1000000}))['response']['docs']

    for id in list_ids:
        ind_ids.add('%s-%s' % (id['id'], id.get('scielo_processing_date', '1900-01-01')))

    # all ids in articlemeta
    logger.info("Loading ArticleMeta ids.")
    for item in art_meta.documents(
        collection=self.collection,
        issn=self.issn,
        only_identifiers=True
    ):
        art_ids.add('%s-%s-%s' % (item.code, item.collection, item.processing_date))

    # Ids to remove
    if self.delete is True:
        logger.info("Running remove records process.")
        remove_ids = set([i[:27] for i in ind_ids]) - set([i[:27] for i in art_ids])
        logger.info("Removing (%d) documents from search index." % len(remove_ids))
        total_to_remove = len(remove_ids)
        if total_to_remove > 0:
            for ndx, to_remove_id in enumerate(remove_ids, 1):
                logger.debug("Removing (%d/%d): %s" % (ndx, total_to_remove, to_remove_id))
                self.solr.delete('id:%s' % to_remove_id, commit=False)

    # Ids to include
    logger.info("Running include records process.")
    include_ids = art_ids - ind_ids
    logger.info("Including (%d) documents to search index." % len(include_ids))
    total_to_include = len(include_ids)
    if total_to_include > 0:
        for ndx, to_include_id in enumerate(include_ids, 1):
            logger.debug("Including (%d/%d): %s" % (ndx, total_to_include, to_include_id))
            code = to_include_id[:23]
            collection = to_include_id[24:27]
            processing_date = to_include_id[:-11]
            document = art_meta.document(code=code, collection=collection)
            try:
                xml = self.pipeline_to_xml(document)
                self.solr.update(xml, commit=False)
            except ValueError as e:
                logger.error("ValueError: {0}".format(e))
                logger.exception(e)
                continue
            except Exception as e:
                logger.error("Error: {0}".format(e))
                logger.exception(e)
                continue
def run(self):
    """
    Run the process for update article in Solr.
    """
    art_meta = ArticleMetaThriftClient()
    art_accesses = AccessThriftClient(domain="ratchet.scielo.org:11660")

    logger.info("Loading Solr available document ids")
    itens_query = []
    if self.collection:
        itens_query.append('in:%s' % self.collection)

    if self.issn:
        itens_query.append('issn:%s' % self.issn)

    query = '*:*' if len(itens_query) == 0 else ' AND '.join(itens_query)

    available_ids = set([
        i['id'] for i in json.loads(
            self.solr.select({
                'q': query,
                'fl': 'id',
                'rows': 1000000
            }))['response']['docs']
    ])

    logger.info("Recording accesses for documents in {0}".format(self.solr.url))

    for document in art_meta.documents(collection=self.collection, issn=self.issn):
        solr_id = '-'.join([document.publisher_id, document.collection_acronym])

        if solr_id not in available_ids:
            continue

        logger.debug("Loading accesses for document %s" % solr_id)

        total_accesses = int(
            art_accesses.document(
                document.publisher_id,
                document.collection_acronym
            ).get('access_total', {'value': 0})['value'])

        xml = self.set_accesses(solr_id, total_accesses)

        try:
            result = self.solr.update(xml, commit=False)
        except ValueError as e:
            logger.error("ValueError: {0}".format(e))
            logger.exception(e)
            continue
        except Exception as e:
            logger.error("Error: {0}".format(e))
            logger.exception(e)
            continue

    # optimize the index
    self.solr.commit()
    self.solr.optimize()
def articlemeta(domain='articlemeta.scielo.org:11621'):
    return ThriftClient(domain=domain)
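# Usage sketch (an assumption, not part of the original module): builds a client
# against the default public ArticleMeta Thrift endpoint and lists collection codes.
if __name__ == '__main__':
    am = articlemeta()
    for collection in am.collections():
        print(collection.code)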