def __init__(self):

    self.args = self.parser.parse_args()

    solr_url = os.environ.get('SOLR_URL')

    if not solr_url and not self.args.solr_url:
        raise argparse.ArgumentTypeError(
            '--url or the ``SOLR_URL`` environment variable must be set, see --help.'
        )

    # the environment variable takes precedence over the command-line value
    if not solr_url:
        self.solr = Solr(self.args.solr_url, timeout=10)
    else:
        self.solr = Solr(solr_url, timeout=10)

    if self.args.period:
        self.args.from_date = datetime.now() - timedelta(days=self.args.period)
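# A minimal, self-contained sketch of the endpoint resolution above: the
# SOLR_URL environment variable takes precedence over the command-line value,
# and at least one of the two must be present. The helper name below is
# hypothetical; the real code runs inside __init__.
import os

def resolve_solr_url(cli_url=None):
    solr_url = os.environ.get('SOLR_URL')
    if not solr_url and not cli_url:
        raise ValueError('--url or the SOLR_URL environment variable must be set')
    return solr_url or cli_url

os.environ['SOLR_URL'] = 'http://localhost:8983/solr/articles'
print(resolve_solr_url(cli_url='http://other:8983/solr'))  # env var wins

del os.environ['SOLR_URL']
print(resolve_solr_url(cli_url='http://other:8983/solr'))  # falls back to --url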
def __init__(self, period=None, from_date=None, until_date=None,
             collection=None, issn=None, delete=False, differential=False,
             load_indicators=False):
    self.delete = delete
    self.collection = collection
    self.from_date = from_date
    self.until_date = until_date
    self.differential = differential
    self.load_indicators = load_indicators
    self.issn = issn
    self.solr = Solr(SOLR_URL, timeout=10)

    # a period in days takes precedence over an explicit from_date
    if period:
        self.from_date = datetime.now() - timedelta(days=period)
def __init__(self, period=None, from_date=None, until_date=None,
             collection=None, issn=None, delete=False, sanitization=False):
    self.delete = delete
    self.sanitization = sanitization
    self.collection = collection
    self.from_date = from_date
    self.until_date = until_date
    self.issn = issn
    self.solr = Solr(SOLR_URL, timeout=10)

    if period:
        self.from_date = datetime.now() - timedelta(days=period)
def __init__(self, collection=None, issn=None):
    self.collection = collection
    self.issn = issn
    self.solr = Solr(SOLR_URL, timeout=10)
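# The constructors above share the same ``period`` shortcut: a number of days
# is converted into an absolute ``from_date``, overriding any explicit
# ``from_date`` argument. A self-contained sketch of that rule (the helper
# name is hypothetical):
from datetime import datetime, timedelta

def resolve_from_date(period=None, from_date=None):
    # a period in days wins over an explicit from_date, as in the __init__s above
    if period:
        return datetime.now() - timedelta(days=period)
    return from_date

print(resolve_from_date(period=7))                        # seven days ago
print(resolve_from_date(from_date=datetime(2015, 1, 1)))  # passed through unchanged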
import argparse
import csv
import logging
from datetime import datetime

from SolrAPI import Solr  # assumed client import; the snippets above use the same Solr class

log = logging.getLogger(__name__)  # module-level logger assumed by the original code


def main(settings, *args, **xargs):

    solr = Solr(settings['endpoints']['solr'],
                timeout=int(settings['request']['timeout']))

    parser = argparse.ArgumentParser(
        description='Script to handle article duplication on article index')
    parser.add_argument('-d', '--debug', action='store_true',
                        help='execute the script in DEBUG mode (don\'t update the index)')
    parser.add_argument('-v', '--version', action='version', version='%(prog)s 0.1')
    args = parser.parse_args()

    if args.debug:
        log.setLevel(logging.DEBUG)

    log.info('Starting the find-duplication script')

    # set up the csv file used to register duplicated articles
    csv_filename = '{0}-{1}.csv'.format(settings['csv']['filename_prefix'],
                                        datetime.now().strftime('%Y-%m-%d'))
    csv_file = open(csv_filename, 'wb')
    csv_writer = csv.writer(csv_file, quoting=csv.QUOTE_MINIMAL)

    total_duplicated = 0
    offset = 0
    fail_list = []

    while True:
        try:
            duplication_lst = get_duplication_list(solr, offset)

            total_for_process = len(duplication_lst)
            if total_for_process == 0:
                break

            log.info('Processing {0} duplication entries'.format(total_for_process))
            offset += int(settings['params']['limit_offset'])

            for dup_code in duplication_lst:
                # ignore the partial-update duplication signature (SOLR-4016)
                if dup_code[0] == '0000000000000000':
                    continue

                process_list = get_duplication_articles(solr, dup_code[0])

                if process_list:
                    main_article = [article['id'] for article in process_list
                                    if article['in'][0] == 'scl']

                    # only process if exactly one main article from the SCL
                    # collection is identified
                    if len(main_article) == 1:
                        for update_article in process_list:
                            update_id = update_article['id']

                            # if it is the main article (SCL collection), update
                            # the index; otherwise delete the duplicate
                            if update_id == main_article[0]:
                                log.info('Updating collection element of article: {0}'.format(update_id))
                                save_csv_entry(csv_writer, update_article, 'updated')
                                if not args.debug:
                                    status = update_main_article(solr, update_id, process_list)
                            else:
                                log.info('Deleting duplicated article: {0}'.format(update_id))
                                save_csv_entry(csv_writer, update_article, 'duplication deleted')
                                if not args.debug:
                                    delete_query = 'id:"{0}"'.format(update_id)
                                    status = solr.delete(delete_query)
                                    if status != 0:
                                        log.error('Unable to delete article {0}, code:{1}'.format(
                                            update_id, status))
                                total_duplicated += 1

                            # check the solr status of the update or delete
                            if not args.debug and status != 0:
                                log.error('Unable to update article {0}, code:{1}'.format(
                                    update_id, status))
                                fail_list.append(update_id)

                    # skip groups without a single identifiable main article
                    else:
                        log.debug('Skipping articles due to missing main article of the SCL collection: {0}'.format(
                            [art['id'].encode('utf-8') for art in process_list]))

                        # save the list of ignored articles to the csv file
                        for art in process_list:
                            save_csv_entry(csv_writer, art, 'ignored due to missing main article')

                # write an empty line to separate the next group of duplicated articles
                csv_writer.writerow([' '])

        except Exception as e:
            log.critical('Unexpected error: {0}'.format(e))

    # commit at the end to avoid gaps in the offset processing
    commit(solr, debug=args.debug)

    # script summary
    summary(total_duplicated, fail_list, args.debug)
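# The helpers used by main() -- get_duplication_list, get_duplication_articles,
# update_main_article, save_csv_entry, commit and summary -- are defined
# elsewhere in the module. As an illustration only, get_duplication_list could
# be built on a Solr facet over the de-duplication signature field; the field
# name 'signature', the page size, and the client's select(params) method are
# assumptions here, not the project's confirmed implementation.
import json

def get_duplication_list(solr, offset, limit=100):
    params = {
        'q': '*:*',
        'rows': 0,                   # only the facet counts are needed
        'facet': 'true',
        'facet.field': 'signature',  # assumed de-duplication signature field
        'facet.mincount': 2,         # a signature seen twice or more is a duplicate
        'facet.offset': offset,
        'facet.limit': limit,
    }
    response = json.loads(solr.select(params))
    counts = response['facet_counts']['facet_fields']['signature']
    # Solr returns a flat [value, count, value, count, ...] list; pair it up
    # into (signature, count) tuples, matching how main() reads dup_code[0].
    return list(zip(counts[::2], counts[1::2]))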