def num_claims(app=app, n_days=7, test=False):
    """
    Reporting function; checks the postgres database for:
        - number of unique ORCID IDs who have created claims in the given range of time
            - if a single user creates a number of claims in the time period, that user is reported only once here
            - counts claims of all types
        - number of claims on a single bibcode by a single user in the given range of time
            - if a user claims 5 separate records in the given time period, the number of claims reported is 5
            - if a user claims a record multiple times in the given time period, the number of claims reported is 1
            - counts claims of type claimed, updated, and removed
        - total number of claims in the given time period
            - does not remove duplicates
            - meant to be compared to Kibana reports on number of rejected claims

    :param n_days: number of days backwards to look, starting from now
    :return: None (outputs to logs)
    """
    if test:
        logger = setup_logging('test_num_claimed')
    else:
        logger = setup_logging('reporting')

    now = datetime.datetime.now(tzutc())
    beginning = now - datetime.timedelta(days=n_days)

    with app.session_scope() as session:
        status_count = session.query(func.count(distinct(ClaimsLog.orcidid)), ClaimsLog.status).\
            filter(and_(ClaimsLog.created >= beginning, ClaimsLog.created <= now)).\
            group_by(ClaimsLog.status).all()

        for i in range(len(status_count)):
            logger.info('Number of unique ORCID IDs generating claims of type {} in last {} days: {}'.
                        format(status_count[i][1], n_days, status_count[i][0]))

        statuses = ['claimed', 'removed', 'updated']
        for s in statuses:
            #claims = session.query(func.count(distinct(ClaimsLog.bibcode)).
            #                       filter(and_(ClaimsLog.created >= beginning, ClaimsLog.created <= now,
            #                                   ClaimsLog.status == s))).all()

            claims = session.query(ClaimsLog).distinct(ClaimsLog.bibcode, ClaimsLog.orcidid).\
                filter(and_(ClaimsLog.created >= beginning, ClaimsLog.created <= now, ClaimsLog.status == s)).all()

            logger.info('Number of unique claims by a unique bibcode+ORCID ID pair that have been {} in the last {} days: {}'.
                        format(s, n_days, len(claims)))

        total_claims = session.query(ClaimsLog).\
            filter(and_(ClaimsLog.created >= beginning, ClaimsLog.created <= now, ClaimsLog.status.in_(statuses))).all()

        logger.info('Total number of non-unique claims with status {} in the last {} days, to compare with logging on rejected claims: {}'.
                    format(statuses, n_days, len(total_claims)))
def test_setup_logging(self):
    with patch('adsputils.ConcurrentRotatingFileHandler') as cloghandler:
        adsputils.setup_logging('app')
        f = os.path.abspath(os.path.join(os.path.abspath(__file__), '../../..'))
        self.assertEqual(
            "call(backupCount=5, encoding=u'UTF-8', filename=u'{filename}/logs/app.log', maxBytes=2097152, mode=u'a')".format(filename=f),
            str(cloghandler.call_args))
def test_setup_logging(self):
    with patch('adsputils.ConcurrentRotatingFileHandler') as cloghandler:
        adsputils.setup_logging('app')
        f = os.path.abspath(os.path.join(os.path.abspath(__file__), '../../..'))
        if sys.version_info > (3, ):
            test_data = "call(backupCount=10, encoding='UTF-8', filename='{filename}/logs/app.log', maxBytes=10485760, mode='a')".format(filename=f)
        else:
            test_data = "call(backupCount=10, encoding=u'UTF-8', filename=u'{filename}/logs/app.log', maxBytes=10485760, mode=u'a')".format(filename=f)
        self.assertEqual(test_data, str(cloghandler.call_args))
def __init__(self, app_name, *args, **kwargs):
    Celery.__init__(self, *args, **kwargs)
    self._config = adsputils.load_config()
    self._session = None
    self._engine = None
    self._app_name = app_name
    self.logger = adsputils.setup_logging(app_name)  # default logger
def __init__(self, sqlachemy_url, group_changes_in_chunks_of=1, sqlalchemy_echo=False,
             schema_prefix="citation_capture_", force=False):
    """
    Initializes the class and prepares DB connection.

    :param sqlachemy_url: URL to connect to the DB.
    :param group_changes_in_chunks_of: Number of citation changes to be grouped when iterating.
    :param sqlalchemy_echo: Print every SQL statement.
    :param schema_prefix: Data is stored in schemas that correspond to a prefix + file last access date.
    :param force: If tables already exist in the DB, drop them and re-ingest.
    """
    self.engine = create_engine(sqlachemy_url, echo=sqlalchemy_echo)
    self.connection = self.engine.connect()
    self.session = sessionmaker(bind=self.engine)()
    #
    self.logger = setup_logging(__name__)
    self.logger.propagate = False
    #
    self.table_name = RawCitation.__tablename__
    self.expanded_table_name = "expanded_" + self.table_name
    self.recreated_previous_expanded_table_name = "recreated_previous_expanded_" + self.table_name
    self.missing_previous_expanded_table_name = "not_processed_" + self.table_name
    self.joint_table_name = CitationChanges.__tablename__
    self.schema_prefix = schema_prefix
    self.schema_name = None
    self.previous_schema_name = None
    self.input_refids_filename = None
    self.group_changes_in_chunks_of = group_changes_in_chunks_of
    self.offset = 0
    self.n_changes = 0
    self.force = force
    self.last_modification_date = None
def __init__(self, fields, ignore_fields, new_fields):
    self.fields = fields
    self.ignore_fields = ignore_fields
    self.new_fields = new_fields
    self.logger = setup_logging('validate', 'INFO')
    self.config = {}
    self.config.update(load_config())
def __init__(self, file_):
    self._file = file_
    self.read_count = 0  # needed for logging
    self.logger = setup_logging('AdsDataSqlSync', 'DEBUG')
    self.logger.info('nonbib file ingest, file {}'.format(self._file))
    self.config = {}
    self.config.update(load_config())
    self._iostream = open(file_, 'r')
def __init__(self, schema_='metrics'):
    self.logger = setup_logging('AdsDataSqlSync', 'INFO')
    self.schema = schema_
    self.table = models.MetricsTable()
    self.table.schema = self.schema
    # used to buffer writes
    self.upserts = []
    self.tmp_update_buffer = []
    self.tmp_count = 0
    self.config = {}
    self.config.update(load_config())
def num_refused_claims(n_days=7, test=False):
    """
    Queries logs via Kibana to get the number of refused claims over a given time period.

    :param n_days: Number of days backwards to look, starting from now
    :return: None (outputs to logs)
    """
    if test:
        logger = setup_logging('test_kibana')
    else:
        logger = setup_logging('reporting')

    query = '"+@log_group:\\"backoffice-logs\\" "+@log_group:\\"fluent-bit-backoffice_prod_orcid_pipeline_1\\" +@message:\\"Claim refused\\""'

    # don't need the full set of results as the total is passed separately
    resp = query_Kibana(query=query, n_days=n_days, rows=5)

    total = resp['responses'][0]['hits']['total']

    logger.info('Number of claims rejected in the last {} days: {}'.format(n_days, total))
def num_missing_profile(n_days=7, test=False):
    """
    Queries logs via Kibana to get the number of profiles reported missing over a given time period.

    :param n_days: Number of days backwards to look, starting from now
    :return: None (outputs to logs)
    """
    if test:
        logger = setup_logging('test_kibana')
    else:
        logger = setup_logging('reporting')

    query = '"+@log_group:\\"backoffice-logs\\" "+@log_group:\\"fluent-bit-backoffice_prod_orcid_pipeline_1\\" +@message:\\"Missing profile for\\""'

    resp = query_Kibana(query=query, n_days=n_days, rows=5)

    total = resp['responses'][0]['hits']['total']

    logger.info('Number of missing profile errors in the last {} days: {}'.format(n_days, total))
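# The two reporting helpers above call a query_Kibana() helper that is not shown in
# this excerpt. A minimal sketch of what such a helper could look like, assuming the
# logs sit in Elasticsearch behind Kibana's _msearch endpoint (the URL, index pattern
# and request shape below are assumptions, not the pipeline's actual API):
import json
import datetime
import requests


def query_kibana_sketch(query, n_days=7, rows=5,
                        msearch_url='https://kibana.example.org/elasticsearch/_msearch'):
    """Hypothetical helper: run a lucene query string over the last n_days of logs."""
    now = datetime.datetime.utcnow()
    beginning = now - datetime.timedelta(days=n_days)
    header = {'index': ['logstash-*']}
    body = {'size': rows,
            'query': {'bool': {'must': [
                {'query_string': {'query': query, 'analyze_wildcard': True}},
                {'range': {'@timestamp': {'gte': beginning.isoformat(), 'lte': now.isoformat()}}}]}}}
    payload = json.dumps(header) + '\n' + json.dumps(body) + '\n'
    resp = requests.post(msearch_url, data=payload,
                         headers={'Content-Type': 'application/x-ndjson'})
    # callers above read resp['responses'][0]['hits']['total']
    return resp.json()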
def __init__(self, sqlachemy_url, group_changes_in_chunks_of=1, sqlalchemy_echo=False,
             schema_prefix="citation_capture_", force=False):
    """
    Initializes the class and prepares DB connection.

    :param sqlachemy_url: URL to connect to the DB.
    :param group_changes_in_chunks_of: Number of citation changes to be grouped when iterating.
    :param sqlalchemy_echo: Print every SQL statement.
    :param schema_prefix: Data is stored in schemas that correspond to a prefix + file last access date.
    :param force: If tables already exist in the DB, drop them and re-ingest.
    """
    self.engine = create_engine(sqlachemy_url, echo=sqlalchemy_echo)
    self.connection = self.engine.connect()
    self.session = sessionmaker(bind=self.engine)()
    #
    # - Use app logger:
    #import logging
    #self.logger = logging.getLogger('ads-citation-capture')
    # - Or individual logger for this file:
    from adsputils import setup_logging, load_config
    proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), '../'))
    config = load_config(proj_home=proj_home)
    self.logger = setup_logging(__name__, proj_home=proj_home,
                                level=config.get('LOGGING_LEVEL', 'INFO'),
                                attach_stdout=config.get('LOG_STDOUT', False))
    #
    self.table_name = RawCitation.__tablename__
    self.expanded_table_name = "expanded_" + self.table_name
    self.recreated_previous_expanded_table_name = "recreated_previous_expanded_" + self.table_name
    self.missing_previous_expanded_table_name = "not_processed_" + self.table_name
    self.joint_table_name = CitationChanges.__tablename__
    self.schema_prefix = schema_prefix
    self.schema_name = None
    self.previous_schema_name = None
    self.input_refids_filename = None
    self.group_changes_in_chunks_of = group_changes_in_chunks_of
    self.offset = 0
    self.n_changes = 0
    self.force = force
    self.last_modification_date = None
def __init__(self, fields, ignore_fields, new_fields):
    self.fields = fields
    self.ignore_fields = ignore_fields
    self.new_fields = new_fields
    # - Use app logger:
    # import logging
    # self.logger = logging.getLogger('master-pipeline')
    # - Or individual logger for this file:
    from adsputils import setup_logging, load_config
    proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), '../'))
    self.config = load_config(proj_home=proj_home)
    self.logger = setup_logging(__name__, proj_home=proj_home,
                                level=self.config.get('LOGGING_LEVEL', 'INFO'),
                                attach_stdout=self.config.get('LOG_STDOUT', False))
def main():
    global config
    config.update(load_config())
    global logger
    logger = setup_logging('ADSData', config.get('LOG_LEVEL', 'INFO'))

    parser = argparse.ArgumentParser(description='generate nonbib data')
    ars = parser.parse_args()

    load(config)
    # compute metrics for a bibcode
    compute_metrics('2012ApJS..199...26H')
    # lots_of_metrics(config)

    logger.info('end of program')
def __init__(self, schema_='nonbib'):
    self.schema = schema_
    self.meta = MetaData()
    self.table = models.NonBibTable()
    self.table.schema = self.schema
    # - Use app logger:
    #import logging
    #logger = logging.getLogger('ads-data')
    # - Or individual logger for this file:
    proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), '../'))
    config = load_config(proj_home=proj_home)
    self.logger = setup_logging(__name__, proj_home=proj_home,
                                level=config.get('LOGGING_LEVEL', 'INFO'),
                                attach_stdout=config.get('LOG_STDOUT', False))
def __init__(self, file_):
    self._file = file_
    self.read_count = 0  # needed for logging
    # - Use app logger:
    #import logging
    #logger = logging.getLogger('ads-data')
    # - Or individual logger for this file:
    proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), '../'))
    self.config = load_config(proj_home=proj_home)
    self.logger = setup_logging(__name__, proj_home=proj_home,
                                level=self.config.get('LOGGING_LEVEL', 'INFO'),
                                attach_stdout=self.config.get('LOG_STDOUT', False))
    self.logger.info('nonbib file ingest, file {}'.format(self._file))
    self._iostream = open(file_, 'r')
def test_logging(self):
    logdir = os.path.abspath(os.path.join(os.path.dirname(__file__), '../../logs'))
    foo_log = logdir + '/foo.bar.log'
    if os.path.exists(foo_log):
        os.remove(foo_log)
    logger = adsputils.setup_logging('foo.bar')
    logger.warn('first')
    frameinfo = getframeinfo(currentframe())
    logger.handlers[0].stream.flush()
    #print foo_log

    self.assertTrue(os.path.exists(foo_log))
    c = _read_file(foo_log)
    j = json.loads(c)
    self.assertEqual(j['message'], 'first')
    self.assertTrue('hostname' in j)

    # verify warning has filename and linenumber
    self.assertEqual(os.path.basename(frameinfo.filename), j['filename'])
    self.assertEqual(j['lineno'], frameinfo.lineno - 1)

    time.sleep(0.01)
    # now multiline message
    logger.warn(u'second\nthird')
    logger.warn('last')
    c = _read_file(foo_log)

    found = False
    msecs = False
    for x in c.strip().split('\n'):
        j = json.loads(x)
        self.assertTrue(j)
        if j['message'] == u'second\nthird':
            found = True
        t = adsputils.get_date(j['asctime'])
        if t.microsecond > 0:
            msecs = True
    self.assertTrue(found)
    self.assertTrue(msecs)
def __init__(self, blocks=None, logger=None,
             merger_rules=_config['MERGER_RULES'],
             priorities=_config['PRIORITIES'],
             references_always_append=_config['REFERENCES_ALWAYS_APPEND']):
    self.blocks = blocks
    self.logger = logger
    self.block = {}
    self.altpublications = []
    self.eL = enforce_schema.Enforcer().ensureList
    self.merger_rules = merger_rules
    self.priorities = priorities
    self.references_always_append = references_always_append
    if blocks:
        # Assert that there is only one block type being merged
        assert len(set([i['tempdata']['type'] for i in blocks])) == 1
        self.blocktype = blocks[0]['tempdata']['type']
    if not self.logger:
        self.logger = utils.setup_logging('merger')
def __init__(self, schema_='metrics'):
    # - Use app logger:
    #import logging
    #logger = logging.getLogger('ads-data')
    # - Or individual logger for this file:
    proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), '../'))
    self.config = load_config(proj_home=proj_home)
    self.logger = setup_logging(__name__, proj_home=proj_home,
                                level=self.config.get('LOGGING_LEVEL', 'INFO'),
                                attach_stdout=self.config.get('LOG_STDOUT', False))
    self.schema = schema_
    self.table = models.MetricsTable()
    self.table.schema = self.schema
    # used to buffer writes
    self.upserts = []
    self.tmp_update_buffer = []
    self.tmp_count = 0
def __init__(self, app_name, *args, **kwargs):
    """
    :param: app_name - string, name of the application (can be anything)
    :keyword: local_config - dict, configuration that should be applied
        over the default config (that is loaded from config.py and local_config.py)
    """
    proj_home = None
    if 'proj_home' in kwargs:
        proj_home = kwargs.pop('proj_home')
    self.config = load_config(extra_frames=1, proj_home=proj_home, app_name=app_name)

    local_config = None
    if 'local_config' in kwargs and kwargs['local_config']:
        local_config = kwargs.pop('local_config')
        self.config.update(local_config)  # our config
    if not proj_home:
        proj_home = self.config.get('PROJ_HOME', None)
    self.logger = setup_logging(app_name, proj_home=proj_home,
                                level=self.config.get('LOGGING_LEVEL', 'INFO'),
                                attach_stdout=self.config.get('LOG_STDOUT', False))
def init_app(self, config=None):
    """This function must be called before you start working with the application
    (or worker, script etc)

    :return None
    """
    if self._session is not None:  # the app was already instantiated
        return

    if config:
        self._config.update(config)  # our config
        self.conf.update(config)  # celery's config (devs should be careful to avoid clashes)

    self.logger = adsputils.setup_logging(self._app_name, self._config.get('LOGGING_LEVEL', 'INFO'))
    self._engine = create_engine(config.get('SQLALCHEMY_URL', 'sqlite:///'),
                                 echo=config.get('SQLALCHEMY_ECHO', False))
    self._session_factory = sessionmaker()
    self._session = scoped_session(self._session_factory)
    self._session.configure(bind=self._engine)
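# A minimal usage sketch for init_app() above, assuming it belongs to the same
# Celery-based application class whose constructor was shown earlier (the class name
# ADSCelery and every config value below are illustrative assumptions, not the
# pipeline's actual settings):
app = ADSCelery('my-pipeline')
app.init_app(config={'LOGGING_LEVEL': 'DEBUG',
                     'SQLALCHEMY_URL': 'sqlite:///',
                     'SQLALCHEMY_ECHO': False})
app.logger.info('application and database session initialized')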
import sys
import os
import utils
import json
import ptree
import traceback
from stat import ST_MTIME
from datetime import datetime
from dateutil.parser import parse

from adsputils import setup_logging
from adsft.utils import get_filenames

logger = setup_logging(__name__)


def file_last_modified_time(file_input):
    """
    Stats the given file to find the last modified time

    :param file_input: path to file
    :return: date time object of the last modified time
    """
    mtime = os.stat(file_input)[ST_MTIME]
    return datetime.fromtimestamp(mtime)


def create_meta_path(dict_input, extract_path):
import sys
import time
import argparse
import logging
import traceback
import requests
import warnings
from requests.packages.urllib3 import exceptions
warnings.simplefilter('ignore', exceptions.InsecurePlatformWarning)

from adsputils import setup_logging, get_date
from ADSOrcid import updater, tasks
from ADSOrcid.models import ClaimsLog, KeyValue, Records, AuthorInfo

app = tasks.app
logger = setup_logging('run.py')


def reindex_claims(since=None, orcid_ids=None, **kwargs):
    """
    Re-runs all claims, both from the pipeline and from the orcid-service storage.

    :param: since - RFC889 formatted string
    :type: str

    :return: no return
    """
    if orcid_ids:
        for oid in orcid_ids:
            tasks.task_index_orcid_profile.delay({
from adsputils import setup_logging
import spacy

logger = setup_logging(__name__)


def get_facilities(model, text):
    """
    purpose: to identify facilities within the text
    input: model loaded from disk, text to process
    return: list of facilities identified with custom spacy ner model
    """
    doc = model(text)
    facilities = []
    for ent in doc.ents:
        facilities.append(ent.text)
    return facilities


def load_model(dir):
    return spacy.load(dir)
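# Usage sketch for the two helpers above; the model directory is a hypothetical path
# (any directory produced by spaCy training, or a stock model such as en_core_web_sm,
# is loaded the same way):
if __name__ == '__main__':
    ner_model = load_model('/path/to/custom_facilities_model')
    sample = 'Observations were obtained with the Very Large Telescope.'
    logger.info('facilities found: {}'.format(get_facilities(ner_model, sample)))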
def main():
    parser = argparse.ArgumentParser(description='process column files into Postgres')
    parser.add_argument('-t', '--rowViewBaselineSchemaName', default='nonbibstaging', help='name of old postgres schema, used to compute delta')
    parser.add_argument('-d', '--diagnose', default=False, action='store_true', help='run simple test')
    parser.add_argument('-f', '--filename', default='bibcodes.txt', help='name of file containing the list of bibcode for metrics comparison')
    parser.add_argument('-m', '--metricsSchemaName', default='metrics', help='name of the postgres metrics schema')
    parser.add_argument('-n', '--metricsSchemaName2', default='', help='name of the postgres metrics schema for comparison')
    parser.add_argument('-r', '--rowViewSchemaName', default='nonbib', help='name of the postgres row view schema')
    parser.add_argument('-s', '--batchSize', default=100, help='used when queuing data')
    parser.add_argument('-b', '--bibcodes', default='', help='comma separated list of bibcodes to send to master pipeline')
    parser.add_argument('command', default='help', nargs='?',
                        help='ingest | verify | createIngestTables | dropIngestTables | renameSchema '
                        + ' | createJoinedRows | createMetricsTable | dropMetricsTable '
                        + ' | populateMetricsTable | createDeltaRows | populateMetricsTableDelta '
                        + ' | runRowViewPipeline | runMetricsPipeline | createNewBibcodes '
                        + ' | runRowViewPipelineDelta | runMetricsPipelineDelta '
                        + ' | runPipelines | runPipelinesDelta | nonbibToMasterPipeline | nonbibDeltaToMasterPipeline'
                        + ' | metricsToMasterPipeline | metricsDeltaToMasterPipeline | metricsCompare')
    args = parser.parse_args()

    config.update(load_config())
    global logger
    logger = setup_logging('AdsDataSqlSync', config.get('LOG_LEVEL', 'INFO'))
    logger.info('starting AdsDataSqlSync.app with {}'.format(args.command))

    nonbib_connection_string = config.get('INGEST_DATABASE', 'postgresql://postgres@localhost:5432/postgres')
    nonbib_db_engine = create_engine(nonbib_connection_string)
    nonbib_db_conn = nonbib_db_engine.connect()

    metrics_connection_string = config.get('METRICS_DATABASE', 'postgresql://postgres@localhost:5432/postgres')
    metrics_db_engine = create_engine(metrics_connection_string)
    metrics_db_conn = metrics_db_engine.connect()

    sql_sync = nonbib.NonBib(args.rowViewSchemaName)
    if args.command == 'help' and args.diagnose:
        diagnose_nonbib()
        diagnose_metrics()
    elif args.command == 'createIngestTables':
        sql_sync.create_column_tables(nonbib_db_engine)
    elif args.command == 'dropIngestTables':
        sql_sync.drop_column_tables(nonbib_db_engine)
    elif args.command == 'createJoinedRows':
        sql_sync.create_joined_rows(nonbib_db_conn)
    elif args.command == 'createMetricsTable' and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.create_metrics_table(metrics_db_engine)
    elif args.command == 'dropMetricsTable' and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.drop_metrics_table(metrics_db_engine)
    elif args.command == 'populateMetricsTable' and args.rowViewSchemaName and args.metricsSchemaName:
        m = metrics.Metrics()
        m.update_metrics_all(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)
    elif args.command == 'populateMetricsTableDelta' and args.rowViewSchemaName and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.update_metrics_changed(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)
    elif args.command == 'renameSchema' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        sql_sync.rename_schema(nonbib_db_conn, args.rowViewBaselineSchemaName)
    elif args.command == 'createDeltaRows' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        sql_sync.create_delta_rows(nonbib_db_conn, args.rowViewBaselineSchemaName)
    elif args.command == 'createNewBibcodes' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        sql_sync.build_new_bibcodes(nonbib_db_conn, args.rowViewBaselineSchemaName)
    elif args.command == 'logDeltaReasons' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        sql_sync.log_delta_reasons(nonbib_db_conn, args.rowViewBaselineSchemaName)
    elif args.command == 'runRowViewPipeline' and args.rowViewSchemaName:
        # drop tables, create tables, load data, create joined view
        sql_sync.drop_column_tables(nonbib_db_engine)
        sql_sync.create_column_tables(nonbib_db_engine)
        load_column_files(config, nonbib_db_engine, nonbib_db_conn, sql_sync)
    elif args.command == 'runMetricsPipeline' and args.rowViewSchemaName and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.drop_metrics_table(metrics_db_engine)
        m.create_metrics_table(metrics_db_engine)
        m.update_metrics_all(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)
    elif args.command == 'runRowViewPipelineDelta' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        # we delete the old data
        baseline_sql_sync = nonbib.NonBib(args.rowViewBaselineSchemaName)
        baseline_engine = create_engine(nonbib_connection_string)
        baseline_sql_sync.drop_column_tables(baseline_engine)
        # rename the current to be the old (for later comparison)
        sql_sync.rename_schema(nonbib_db_conn, args.rowViewBaselineSchemaName)

        # create the new and populate
        baseline_sql_sync = None
        sql_sync.create_column_tables(nonbib_db_engine)
        load_column_files(config, nonbib_db_engine, nonbib_db_conn, sql_sync)

        # compute delta between old and new
        sql_sync.create_delta_rows(nonbib_db_conn, args.rowViewBaselineSchemaName)
        sql_sync.log_delta_reasons(nonbib_db_conn, args.rowViewBaselineSchemaName)
    elif args.command == 'runMetricsPipelineDelta' and args.rowViewSchemaName and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.update_metrics_changed(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)
    elif args.command == 'runPipelines' and args.rowViewSchemaName and args.metricsSchemaName:
        # drop tables, create tables, load data, compute metrics
        sql_sync.drop_column_tables(nonbib_db_engine)
        sql_sync.create_column_tables(nonbib_db_engine)
        load_column_files(config, nonbib_db_engine, nonbib_db_conn, sql_sync)
        m = metrics.Metrics(args.metricsSchemaName)
        m.drop_metrics_table(metrics_db_engine)
        m.create_metrics_table(metrics_db_engine)
        m.update_metrics_all(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)
    elif args.command == 'runPipelinesDelta' and args.rowViewSchemaName and args.metricsSchemaName and args.rowViewBaselineSchemaName:
        # drop tables, rename schema, create tables, load data, compute delta, compute metrics
        baseline_sql_sync = nonbib.NonBib(args.rowViewBaselineSchemaName)
        baseline_engine = create_engine(nonbib_connection_string)
        baseline_sql_sync.drop_column_tables(baseline_engine)
        sql_sync.rename_schema(nonbib_db_conn, args.rowViewBaselineSchemaName)

        baseline_sql_sync = None
        sql_sync.create_column_tables(nonbib_db_engine)
        load_column_files(config, nonbib_db_engine, nonbib_db_conn, sql_sync)
        sql_sync.create_delta_rows(nonbib_db_conn, args.rowViewBaselineSchemaName)
        sql_sync.log_delta_reasons(nonbib_db_conn, args.rowViewBaselineSchemaName)

        m = metrics.Metrics(args.metricsSchemaName)
        m.update_metrics_changed(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)
    elif args.command == 'nonbibToMasterPipeline' and args.diagnose:
        diagnose_nonbib()
    elif args.command == 'nonbibToMasterPipeline' and args.bibcodes:
        bibcodes = args.bibcodes.split(',')
        nonbib_bibs_to_master_pipeline(nonbib_db_engine, args.rowViewSchemaName, bibcodes)
    elif args.command == 'nonbibToMasterPipeline' and args.filename:
        bibcodes = []
        with open(args.filename, 'r') as f:
            for line in f:
                bibcodes.append(line.strip())
                if len(bibcodes) > 100:
                    nonbib_bibs_to_master_pipeline(nonbib_db_engine, args.rowViewSchemaName, bibcodes)
                    bibcodes = []
        if len(bibcodes) > 0:
            nonbib_bibs_to_master_pipeline(nonbib_db_engine, args.rowViewSchemaName, bibcodes)
    elif args.command == 'nonbibToMasterPipeline':
        nonbib_to_master_pipeline(nonbib_db_engine, args.rowViewSchemaName, int(args.batchSize))
    elif args.command == 'nonbibDeltaToMasterPipeline':
        nonbib_delta_to_master_pipeline(nonbib_db_engine, args.rowViewSchemaName, int(args.batchSize))
    elif args.command == 'metricsToMasterPipeline' and args.diagnose:
        diagnose_metrics()
    elif args.command == 'metricsToMasterPipeline' and args.bibcodes:
        bibcodes = args.bibcodes.split(',')
        metrics_bibs_to_master_pipeline(metrics_db_engine, args.metricsSchemaName, bibcodes)
    elif args.command == 'metricsToMasterPipeline':
        metrics_to_master_pipeline(metrics_db_engine, args.metricsSchemaName, int(args.batchSize))
    elif args.command == 'metricsDeltaToMasterPipeline':
        metrics_delta_to_master_pipeline(metrics_db_engine, args.metricsSchemaName, nonbib_db_engine, args.rowViewSchemaName, int(args.batchSize))
    elif args.command == 'metricsCompare':
        # compare the values in two metrics postgres tables
        # useful to compare results from the new pipeline to the production pipeline
        # read metrics records from both databases and compare
        metrics_logger = setup_logging('metricsCompare', 'INFO')
        metrics1 = metrics.Metrics(args.metricsSchemaName)
        Session = sessionmaker(bind=metrics_db_engine)
        session = Session()
        if args.metricsSchemaName:
            session.execute('set search_path to {}'.format(args.metricsSchemaName))

        metrics2 = metrics.Metrics(args.metricsSchemaName2)
        metrics_connection_string2 = config.get('METRICS_DATABASE2', 'postgresql://postgres@localhost:5432/postgres')
        metrics_db_engine2 = create_engine(metrics_connection_string2)
        Session2 = sessionmaker(bind=metrics_db_engine2)
        session2 = Session2()
        if args.metricsSchemaName2:
            session2.execute('set search_path to {}'.format(args.metricsSchemaName2))
        print 'm2', metrics_connection_string2
        print 'm2 schema', args.metricsSchemaName2

        with open(args.filename) as f:
            for line in f:
                bibcode = line.strip()
                m1 = metrics1.get_by_bibcode(session, bibcode)
                m2 = metrics2.get_by_bibcode(session2, bibcode)
                mismatch = metrics.Metrics.metrics_mismatch(line.strip(), m1, m2, metrics_logger)
                if mismatch:
                    metrics_logger.error('{} MISMATCHED FIELDS: {}'.format(bibcode, mismatch))
                    print '{} MISMATCHED FIELDS: {}'.format(bibcode, mismatch)
        session.close()
        session2.close()
    else:
        print 'app.py: illegal command or missing argument, command = ', args.command
        print '        row view schema name = ', args.rowViewSchemaName
        print '        row view baseline schema name = ', args.rowViewBaselineSchemaName
        print '        metrics schema name = ', args.metricsSchemaName

    if nonbib_db_conn:
        nonbib_db_conn.close()
    if metrics_db_conn:
        metrics_db_conn.close()
    logger.info('completed {}'.format(args.command))
def claimed_records(debug=False, test=False):
    """
    Reporting function; checks SOLR for the following:
        - number of records that have been claimed by at least one ORCID ID, in orcid_pub,
          orcid_user, orcid_other (each reported separately)
        - total number of accepted claims of each of orcid_pub, orcid_user, orcid_other
          (i.e. if a single record has two separate authors who have successfully created a
          claim, the number reported here is 2)
        - total number of bibcodes that have been claimed, of any type

    The report is designed to be run regularly, and the results compared to previous report runs (via logs)

    :return: None (output to logs)
    """
    if test:
        logger = setup_logging('test_claimed')
    else:
        logger = setup_logging('reporting')
    config = {}
    config.update(load_config())

    # the first 7 digits of ORCID IDs are zero padding
    orcid_wild = '000000*'
    resp_pub = query_solr(config['SOLR_URL'], 'orcid_pub:"' + orcid_wild + '"', rows=10, sort="bibcode desc", fl='bibcode')
    resp_user = query_solr(config['SOLR_URL'], 'orcid_user:"' + orcid_wild + '"', rows=10, sort="bibcode desc", fl='bibcode')
    resp_other = query_solr(config['SOLR_URL'], 'orcid_other:"' + orcid_wild + '"', rows=10, sort="bibcode desc", fl='bibcode')

    logger.info('Number of records with an orcid_pub: {}'.format(resp_pub['response']['numFound']))
    logger.info('Number of records with an orcid_user: {}'.format(resp_user['response']['numFound']))
    logger.info('Number of records with an orcid_other: {}'.format(resp_other['response']['numFound']))

    start = 0
    rows = 1000

    results = resp_pub['response']['docs']

    num_orcid_pub = 0
    num_orcid_user = 0
    num_orcid_other = 0

    bibcode_pub = set()
    bibcode_user = set()
    bibcode_other = set()

    while results:
        results = query_records(start=start, rows=rows)
        for i in range(len(results)):
            try:
                results[i]['orcid_pub']
            except KeyError:
                pass
            else:
                num_p = len(fnmatch.filter(results[i].get('orcid_pub'), '0000*'))
                num_orcid_pub += num_p
                bibcode_pub.add(results[i].get('bibcode'))

            try:
                results[i]['orcid_user']
            except KeyError:
                pass
            else:
                num_u = len(fnmatch.filter(results[i].get('orcid_user'), '0000*'))
                num_orcid_user += num_u
                bibcode_user.add(results[i].get('bibcode'))

            try:
                results[i]['orcid_other']
            except KeyError:
                pass
            else:
                num_o = len(fnmatch.filter(results[i].get('orcid_other'), '0000*'))
                num_orcid_other += num_o
                bibcode_other.add(results[i].get('bibcode'))

        if debug:
            if (start + rows) % 10000 == 0:
                logger.info('Number of results processed so far: {}'.format(start + rows))

        if test:
            break
        else:
            start += rows

    logger.info('Total number of orcid_pub claims: {}'.format(num_orcid_pub))
    logger.info('Total number of orcid_user claims: {}'.format(num_orcid_user))
    logger.info('Total number of orcid_other claims: {}'.format(num_orcid_other))

    orcid_bibcodes = bibcode_pub.union(bibcode_user).union(bibcode_other)
    logger.info('Total number of records with any ORCID claims: {}'.format(len(orcid_bibcodes)))
import os
import requests
import json

from adsputils import setup_logging

logger = setup_logging('docmatch_log')


def get_matches(metadata, doctype, mustmatch=False, match_doctype=None):
    """

    :param metadata:
    :param doctype:
    :param mustmatch:
    :param match_doctype: list of doctypes, if specified only this type of doctype is matched
    :return:
    """
    try:
        payload = {'abstract': metadata['abstract'].replace('\n', ' '),
                   'title': metadata['title'].replace('\n', ' '),
                   'author': metadata['authors'],
                   'year': metadata['pubdate'][:4],
                   'doctype': doctype,
                   'bibcode': metadata['bibcode'],
                   'doi': metadata.get('doi', None),
                   'mustmatch': mustmatch,
                   'match_doctype': match_doctype}
    except KeyError as e:
        return (metadata['bibcode'], None, e)
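# Illustration of the metadata dict that get_matches() above expects; the values are
# made up, but the keys mirror the ones read when the payload is built (the call is
# left commented out because the rest of the function is not shown in this excerpt):
example_metadata = {
    'bibcode': '2021arXiv210100000A',
    'abstract': 'We present a hypothetical study ...',
    'title': 'A Hypothetical Title',
    'authors': 'Accomazzi, A.; Kurtz, M. J.',
    'pubdate': '2021-01-00',
    'doi': ['10.0000/example.doi'],
}
# matches = get_matches(example_metadata, 'eprint', match_doctype=['article'])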
import gzip
import time
import argparse
from collections import OrderedDict

from sqlalchemy.orm import load_only

from aip.classic import read_records
from adsputils import setup_logging
from aip.models import Records, ChangeLog
from aip import tasks

import pyingest.parsers.aps as aps
import pyingest.parsers.arxiv as arxiv

app = tasks.app
logger = setup_logging('run.py')


def read_bibcodes(files):
    """Reads contents of the BIBFILES into memory; basically bibcode:json_fingerprint
    pairs.

    @param files: list of files to read from
    @return: OrderedDict instance
    """
    start = time.time()
    records = OrderedDict()

    for f in files:
        logger.debug('...loading %s' % f)
    from ads.ADSCachedExports import ADSRecords, init_lookers_cache
    from ads.ADSCachedExports import LOGGER as export_logger
    from aip.classic import conversions
except ImportError:
    sys.path.append('/proj/ads/soft/python/lib/site-packages')  # TODO: make it configurable
    try:
        from ads.ADSCachedExports import ADSRecords, init_lookers_cache
        from ads.ADSCachedExports import LOGGER as export_logger
        from aip.classic import conversions
        INIT_LOOKERS_CACHE = init_lookers_cache
    except ImportError:
        print "Unable to import ads.ADSExports.ADSRecords!"
        print "We will be unable to query ADS-classic for records!"

logger = utils.setup_logging('read_records')


def canonicalize_records(records, targets=None, ignore_fingerprints=False):
    '''
    Takes a dict of {bibcode:fingerprint} and resolves each bibcode to its canonical.

    Finds all alternates associated with that bibcode and constructs the full
    JSON_fingerprint from all of these associated records

    Note: Pops from the input dict with no attempt to copy/deepcopy it.
    '''
    # TODO(rca): getAlternates is called multiple times unnecessarily
    start = time.time()
    results = []
from adsputils import load_config, setup_logging
from ADSOrcid import tasks
from ADSOrcid.models import ClaimsLog
from levenshtein_default import query_solr
from sqlalchemy import func, and_, distinct
from dateutil.tz import tzutc
import fnmatch
import datetime
import cachetools
import time
import pytz
import urllib3
import requests

app = tasks.app
logger = setup_logging('reporting')

records_cache = cachetools.TTLCache(maxsize=1024, ttl=3600, timer=time.time, missing=None, getsizeof=None)


@cachetools.cached(records_cache)
def query_records(start=0, rows=1000):
    """
    Function to query SOLR for a set of records and return the response.
    Kept as a separate function in order to use a cache.

    :param start: Row number to start with; default=0
    :param rows: Number of rows to retrieve; default=1000
    :return response: Response from the query
    """
from __future__ import print_function

from future import standard_library
standard_library.install_aliases()
from builtins import str
import json
import sys
import os
import requests
import argparse
import json
import pickle

# python compare_solrs.py --solr-endpoints http://adsqb.cfa.harvard.edu:9983/solr/BumblebeeETL/select http://adsqb.cfa.harvard.edu:9983/solr/collection1/select --bibcode stdin fields < testBibcodes.txt

from adsputils import setup_logging
logger = setup_logging('compare-solr', level='DEBUG')

SOLR1_PATH = 'http://localhost:9000/solr/select/'
SOLR2_PATH = 'http://localhost:8900/solr/select/'


def query_solr(
    endpoint,
    query,
    start=0,
    rows=200,
    sort='date desc',
    fl=None,
):
    d = {
        'q': query,
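# Usage sketch for query_solr() above (its body is truncated in this excerpt); this
# assumes it returns the parsed JSON response from the given endpoint, as the default
# localhost paths suggest:
if __name__ == '__main__':
    r1 = query_solr(SOLR1_PATH, 'bibcode:2012ApJS..199...26H', rows=10, fl=['bibcode'])
    r2 = query_solr(SOLR2_PATH, 'bibcode:2012ApJS..199...26H', rows=10, fl=['bibcode'])
    logger.debug('solr1 found %s docs, solr2 found %s docs',
                 r1['response']['numFound'], r2['response']['numFound'])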
def __init__(self, schema_='nonbib'):
    self.schema = schema_
    self.meta = MetaData()
    self.table = models.NonBibTable()
    self.table.schema = self.schema
    self.logger = setup_logging('AdsDataSqlSync', 'INFO')
from kombu import Exchange, Queue, BrokerConnection
import datetime

# ============================= INITIALIZATION ==================================== #

app = app_module.create_app()
exch = Exchange(app.conf.get('CELERY_DEFAULT_EXCHANGE', 'ADSWorker'),
                type=app.conf.get('CELERY_DEFAULT_EXCHANGE_TYPE', 'topic'))
app.conf.CELERY_QUEUES = (
    Queue('errors', exch, routing_key='errors', durable=False, message_ttl=24*3600*5),
    Queue('some-queue', exch, routing_key='check-orcidid'),
)
logger = adsputils.setup_logging('ADSWorker', app.conf.get('LOGGING_LEVEL', 'INFO'))

# connection to the other virtual host (for sending data out)
forwarding_connection = BrokerConnection(
    app.conf.get('OUTPUT_CELERY_BROKER',
                 '%s/%s' % (app.conf.get('CELRY_BROKER', 'pyamqp://'),
                            app.conf.get('OUTPUT_EXCHANGE', 'other-pipeline'))))


class MyTask(Task):
    def on_failure(self, exc, task_id, args, kwargs, einfo):
        logger.error('{0!r} failed: {1!r}'.format(task_id, exc))


# ============================= TASKS ============================================= #

@app.task(base=MyTask, queue='some-queue')
import json
import os
from adsputils import date2solrstamp
import sys
import time
from collections import OrderedDict

# ============================= INITIALIZATION ==================================== #
# - Use app logger:
#import logging
#logger = logging.getLogger('master-pipeline')
# - Or individual logger for this file:
from adsputils import setup_logging, load_config
proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), '../'))
config = load_config(proj_home=proj_home)
logger = setup_logging(__name__, proj_home=proj_home,
                       level=config.get('LOGGING_LEVEL', 'INFO'),
                       attach_stdout=config.get('LOG_STDOUT', False))

# =============================== FUNCTIONS ======================================= #


def extract_metrics_pipeline(data, solrdoc):
    citation = data.get('citations', [])
    return dict(citation=citation)


def extract_data_pipeline(data, solrdoc):
    reader = data.get('readers', [])
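# Usage sketch for the extractors above; the nonbib 'data' dict is illustrative only
# (extract_data_pipeline is truncated in this excerpt, so only the metrics extractor
# is exercised):
sample_data = {'citations': ['2003AJ....125..525J', '2010ApJ...713L..87J']}
logger.debug('metrics fragment: %s', extract_metrics_pipeline(sample_data, solrdoc=None))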
def main():
    parser = argparse.ArgumentParser(description='process column files into Postgres')
    parser.add_argument('-t', '--rowViewBaselineSchemaName', default='nonbibstaging', help='name of old postgres schema, used to compute delta')
    parser.add_argument('-d', '--diagnose', default=False, action='store_true', help='run simple test')
    parser.add_argument('-f', '--filename', default=None, help='name of file containing the list of bibcode for metrics comparison')
    parser.add_argument('-m', '--metricsSchemaName', default='metrics', help='name of the postgres metrics schema')
    parser.add_argument('-n', '--metricsSchemaName2', default='', help='name of the postgres metrics schema for comparison')
    parser.add_argument('-r', '--rowViewSchemaName', default='nonbib', help='name of the postgres row view schema')
    parser.add_argument('-s', '--batchSize', default=100, help='used when queuing data')
    parser.add_argument('-b', '--bibcodes', default='', help='comma separated list of bibcodes to send to master pipeline')
    parser.add_argument('command', default='help', nargs='?',
                        help='ingest | verify | createIngestTables | dropIngestTables | renameSchema '
                        + ' | createJoinedRows | createMetricsTable | dropMetricsTable '
                        + ' | populateMetricsTable | createDeltaRows | populateMetricsTableDelta '
                        + ' | runRowViewPipeline | runMetricsPipeline | createNewBibcodes '
                        + ' | runRowViewPipelineDelta | runMetricsPipelineDelta '
                        + ' | runPipelines | runPipelinesDelta | nonbibToMasterPipeline | nonbibDeltaToMasterPipeline'
                        + ' | metricsToMasterPipeline | metricsDeltaToMasterPipeline | metricsCompare'
                        + ' | resetNonbib')
    args = parser.parse_args()

    logger.info('starting AdsDataSqlSync.app with {}'.format(args.command))

    nonbib_connection_string = config.get('INGEST_DATABASE', 'postgresql://postgres@localhost:5432/postgres')
    nonbib_db_engine = create_engine(nonbib_connection_string)
    nonbib_db_conn = nonbib_db_engine.connect()

    metrics_connection_string = config.get('METRICS_DATABASE', 'postgresql://postgres@localhost:5432/postgres')
    metrics_db_engine = create_engine(metrics_connection_string, pool_size=30)
    metrics_db_conn = metrics_db_engine.connect()

    sql_sync = nonbib.NonBib(args.rowViewSchemaName)
    if args.command == 'help' and args.diagnose:
        diagnose_nonbib()
        diagnose_metrics()
    elif args.command == 'resetNonbib':
        # detect if pipeline didn't complete and reset postgres tables
        if not nonbib_db_engine.has_table('rowviewm', schema='nonbib'):
            print 'merged table not found, resetting database'
            nonbib_db_engine.execute('drop schema if exists nonbib cascade')
            nonbib_db_engine.execute('alter schema nonbibstaging rename to nonbib')
            print 'reset complete'
        else:
            print 'merged output table found, reset not needed'
    elif args.command == 'createIngestTables':
        sql_sync.create_column_tables(nonbib_db_engine)
    elif args.command == 'dropIngestTables':
        sql_sync.drop_column_tables(nonbib_db_engine)
    elif args.command == 'createJoinedRows':
        sql_sync.create_joined_rows(nonbib_db_conn)
    elif args.command == 'createMetricsTable' and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.create_metrics_table(metrics_db_engine)
    elif args.command == 'dropMetricsTable' and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.drop_metrics_table(metrics_db_engine)
    elif args.command == 'populateMetricsTable' and args.rowViewSchemaName and args.metricsSchemaName and args.filename:
        m = metrics.Metrics(args.metricsSchemaName)
        with open(args.filename, 'r') as f:
            for line in f:
                bibcode = line.strip()
                if bibcode:
                    m.update_metrics_bibcode(bibcode, metrics_db_conn, nonbib_db_conn)
    elif args.command == 'populateMetricsTable' and args.rowViewSchemaName and args.metricsSchemaName:
        m = metrics.Metrics()
        m.update_metrics_all(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)
    elif args.command == 'populateMetricsTableDelta' and args.rowViewSchemaName and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.update_metrics_changed(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)
    elif args.command == 'renameSchema' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        sql_sync.rename_schema(nonbib_db_conn, args.rowViewBaselineSchemaName)
    elif args.command == 'createDeltaRows' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        sql_sync.create_delta_rows(nonbib_db_conn, args.rowViewBaselineSchemaName)
    elif args.command == 'createNewBibcodes' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        sql_sync.build_new_bibcodes(nonbib_db_conn, args.rowViewBaselineSchemaName)
    elif args.command == 'logDeltaReasons' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        sql_sync.log_delta_reasons(nonbib_db_conn, args.rowViewBaselineSchemaName)
    elif args.command == 'runRowViewPipeline' and args.rowViewSchemaName:
        # drop tables, create tables, load data, create joined view
        sql_sync.drop_column_tables(nonbib_db_engine)
        sql_sync.create_column_tables(nonbib_db_engine)
        load_column_files(config, nonbib_db_engine, nonbib_db_conn, sql_sync)
    elif args.command == 'runMetricsPipeline' and args.rowViewSchemaName and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.drop_metrics_table(metrics_db_engine)
        m.create_metrics_table(metrics_db_engine)
        m.update_metrics_all(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)
    elif args.command == 'runRowViewPipelineDelta' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        # we delete the old data
        baseline_sql_sync = nonbib.NonBib(args.rowViewBaselineSchemaName)
        baseline_engine = create_engine(nonbib_connection_string)
        baseline_sql_sync.drop_column_tables(baseline_engine)
        # rename the current to be the old (for later comparison)
        sql_sync.rename_schema(nonbib_db_conn, args.rowViewBaselineSchemaName)

        # create the new and populate
        baseline_sql_sync = None
        sql_sync.create_column_tables(nonbib_db_engine)
        load_column_files(config, nonbib_db_engine, nonbib_db_conn, sql_sync)

        # compute delta between old and new
        sql_sync.create_delta_rows(nonbib_db_conn, args.rowViewBaselineSchemaName)
        sql_sync.log_delta_reasons(nonbib_db_conn, args.rowViewBaselineSchemaName)
    elif args.command == 'runMetricsPipelineDelta' and args.rowViewSchemaName and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.update_metrics_changed(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)
    elif args.command == 'runPipelines' and args.rowViewSchemaName and args.metricsSchemaName:
        # drop tables, create tables, load data, compute metrics
        sql_sync.drop_column_tables(nonbib_db_engine)
        sql_sync.create_column_tables(nonbib_db_engine)
        load_column_files(config, nonbib_db_engine, nonbib_db_conn, sql_sync)
        m = metrics.Metrics(args.metricsSchemaName)
        m.drop_metrics_table(metrics_db_engine)
        m.create_metrics_table(metrics_db_engine)
        m.update_metrics_all(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)
    elif args.command == 'runPipelinesDelta' and args.rowViewSchemaName and args.metricsSchemaName and args.rowViewBaselineSchemaName:
        # drop tables, rename schema, create tables, load data, compute delta, compute metrics
        baseline_sql_sync = nonbib.NonBib(args.rowViewBaselineSchemaName)
        baseline_engine = create_engine(nonbib_connection_string)
        baseline_sql_sync.drop_column_tables(baseline_engine)
        sql_sync.rename_schema(nonbib_db_conn, args.rowViewBaselineSchemaName)

        baseline_sql_sync = None
        sql_sync.create_column_tables(nonbib_db_engine)
        load_column_files(config, nonbib_db_engine, nonbib_db_conn, sql_sync)
        sql_sync.create_delta_rows(nonbib_db_conn, args.rowViewBaselineSchemaName)
        sql_sync.log_delta_reasons(nonbib_db_conn, args.rowViewBaselineSchemaName)

        m = metrics.Metrics(args.metricsSchemaName)
        m.update_metrics_changed(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)
    elif args.command == 'nonbibToMasterPipeline' and args.diagnose:
        diagnose_nonbib()
    elif args.command == 'nonbibToMasterPipeline' and args.bibcodes:
        bibcodes = args.bibcodes.split(',')
        nonbib_to_master_pipeline(nonbib_db_engine, args.rowViewSchemaName, int(args.batchSize), source=bibcodes)
    elif args.command == 'nonbibToMasterPipeline' and args.filename:
        bibcodes = []
        with open(args.filename, 'r') as f:
            for line in f:
                bibcodes.append(line.strip())
        nonbib_to_master_pipeline(nonbib_db_engine, args.rowViewSchemaName, int(args.batchSize), source=bibcodes)
    elif args.command == 'nonbibToMasterPipeline':
        nonbib_to_master_pipeline(nonbib_db_engine, args.rowViewSchemaName, int(args.batchSize), source="models.NonBibTable")
    elif args.command == 'nonbibDeltaToMasterPipeline':
        nonbib_to_master_pipeline(nonbib_db_engine, args.rowViewSchemaName, int(args.batchSize), source="models.NonBibDeltaTable")
    elif args.command == 'metricsToMasterPipeline' and args.diagnose:
        diagnose_metrics()
    elif args.command == 'metricsToMasterPipeline' and args.filename:
        bibcodes = []
        with open(args.filename, 'r') as f:
            for line in f:
                bibcodes.append(line.strip())
                if len(bibcodes) > 100:
                    metrics_bibs_to_master_pipeline(metrics_db_engine, args.metricsSchemaName, bibcodes)
                    bibcodes = []
        if len(bibcodes) > 0:
            metrics_bibs_to_master_pipeline(metrics_db_engine, args.metricsSchemaName, bibcodes)
    elif args.command == 'metricsToMasterPipeline' and args.bibcodes:
        bibcodes = args.bibcodes.split(',')
        metrics_bibs_to_master_pipeline(metrics_db_engine, args.metricsSchemaName, bibcodes)
    elif args.command == 'metricsToMasterPipeline':
        metrics_to_master_pipeline(metrics_db_engine, args.metricsSchemaName, int(args.batchSize))
    elif args.command == 'metricsDeltaToMasterPipeline':
        metrics_delta_to_master_pipeline(metrics_db_engine, args.metricsSchemaName, nonbib_db_engine, args.rowViewSchemaName, int(args.batchSize))
    elif args.command == 'metricsCompare':
        # compare the values in two metrics postgres tables
        # useful to compare results from the new pipeline to the production pipeline
        # read metrics records from both databases and compare
        metrics_logger = setup_logging('metricsCompare', 'INFO')
        metrics1 = metrics.Metrics(args.metricsSchemaName)
        Session = sessionmaker(bind=metrics_db_engine)
        session = Session()
        if args.metricsSchemaName:
            session.execute('set search_path to {}'.format(args.metricsSchemaName))

        metrics2 = metrics.Metrics(args.metricsSchemaName2)
        metrics_connection_string2 = config.get('METRICS_DATABASE2', 'postgresql://postgres@localhost:5432/postgres')
        metrics_db_engine2 = create_engine(metrics_connection_string2)
        Session2 = sessionmaker(bind=metrics_db_engine2)
        session2 = Session2()
        if args.metricsSchemaName2:
            session2.execute('set search_path to {}'.format(args.metricsSchemaName2))
        print 'm2', metrics_connection_string2
        print 'm2 schema', args.metricsSchemaName2

        with open(args.filename) as f:
            for line in f:
                bibcode = line.strip()
                m1 = metrics1.get_by_bibcode(session, bibcode)
                m2 = metrics2.get_by_bibcode(session2, bibcode)
                mismatch = metrics.Metrics.metrics_mismatch(line.strip(), m1, m2, metrics_logger)
                if mismatch:
                    metrics_logger.error('{} MISMATCHED FIELDS: {}'.format(bibcode, mismatch))
                    print '{} MISMATCHED FIELDS: {}'.format(bibcode, mismatch)
        session.close()
        session2.close()
    else:
        print 'app.py: illegal command or missing argument, command = ', args.command
        print '        row view schema name = ', args.rowViewSchemaName
        print '        row view baseline schema name = ', args.rowViewBaselineSchemaName
        print '        metrics schema name = ', args.metricsSchemaName

    if nonbib_db_conn:
        nonbib_db_conn.close()
    if metrics_db_conn:
        metrics_db_conn.close()
    logger.info('completed {}'.format(args.command))
import requests
import json
import os
import sys
import re
import traceback

from adsputils import setup_logging, get_date, date2solrstamp
from aip.classic import enforce_schema

logger = setup_logging('solr_adapter')

ARTICLE_TYPES = set(['eprint', 'article', 'inproceedings', 'inbook'])
AUTHOR_TYPES = set(['regular', 'collaboration'])


def get_date_by_datetype(ADS_record):
    """computes the standard pubdate by selecting the appropriate value
    from the ADS_record and formatting it as YYYY-MM-DD"""
    dates = ADS_record['metadata']['general']['publication']['dates']
    for datetype in ['date-published', 'date-thesis', 'date-preprint']:
        try:
            return next(i['content'] for i in dates if i['type'].lower() == datetype)
        except StopIteration:
            pass
    return None


def _normalize_author_name(strname):
    if not strname:
        return None
from datetime import datetime, timedelta
from os import remove
from shutil import move
import subprocess
import os

from adsputils import setup_logging, load_config

logger = setup_logging('AutomatedIngestReport')
conf = load_config(proj_home='./')


# enums used to generate file names
class FileType:
    CANONICAL = 'CANONICAL'
    SOLR = 'SOLR'
    FULLTEXT = 'FULLTEXT'


class FileAdjective:
    MISSING = 'MISSING'
    DELETED = 'DELETED'
    EXTRA = 'EXTRA'
    NEW = 'NEW'


class Date:
    TODAY = 1
    YESTERDAY = 2
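# Sketch of how the enum-like classes above can be combined into report file names;
# the naming pattern, the AIR_DATA_DIRECTORY config key and the helper name are
# assumptions for illustration, not the report's actual conventions:
def _report_filename(adjective, file_type, date=None):
    # e.g. _report_filename(FileAdjective.MISSING, FileType.CANONICAL) ->
    # '/tmp/MISSING_CANONICAL_20240101.txt' for a run on 2024-01-01
    date = date or datetime.now()
    return '{}/{}_{}_{}.txt'.format(conf.get('AIR_DATA_DIRECTORY', '/tmp'),
                                    adjective, file_type, date.strftime('%Y%m%d'))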