示例#1
0
    def __init__(self):
        """Parse command-line arguments and build the Solr client.

        The Solr endpoint comes from the ``SOLR_URL`` environment variable
        when set, falling back to the command-line argument; raises
        ``argparse.ArgumentTypeError`` when neither is available.
        Assumes ``self.parser`` was configured before instantiation —
        presumably by the enclosing class; confirm against the caller.
        """
        self.args = self.parser.parse_args()

        solr_url = os.environ.get('SOLR_URL')

        if not solr_url and not self.args.solr_url:
            raise argparse.ArgumentTypeError(
                '--url or ``SOLR_URL`` environment variable must be set, '
                'use --help.'
            )

        # The environment variable takes precedence over the CLI argument
        # (same selection the original if/else pair performed).
        self.solr = Solr(solr_url or self.args.solr_url, timeout=10)

        # --period is a relative shortcut: start the window N days ago.
        if self.args.period:
            self.args.from_date = datetime.now() - timedelta(
                days=self.args.period)
示例#2
0
 def __init__(self, period=None, from_date=None, until_date=None,
              collection=None, issn=None, delete=False, differential=False,
              load_indicators=False):
     """Configure the run window and options and connect to Solr.

     When *period* is given it overrides *from_date*: the window start
     becomes now() minus *period* days.
     """
     self.collection = collection
     self.issn = issn
     self.delete = delete
     self.differential = differential
     self.load_indicators = load_indicators
     self.until_date = until_date
     self.from_date = (datetime.now() - timedelta(days=period)
                       if period else from_date)
     self.solr = Solr(SOLR_URL, timeout=10)
示例#3
0
 def __init__(self,
              period=None,
              from_date=None,
              until_date=None,
              collection=None,
              issn=None,
              delete=False,
              sanitization=False):
     """Configure the processing window and flags and connect to Solr.

     A non-empty *period* overrides *from_date*: the window start is
     recomputed as now() minus *period* days.
     """
     self.collection = collection
     self.issn = issn
     self.delete = delete
     self.sanitization = sanitization
     self.until_date = until_date
     self.from_date = (datetime.now() - timedelta(days=period)
                       if period else from_date)
     self.solr = Solr(SOLR_URL, timeout=10)
 def __init__(self, collection=None, issn=None):
     """Store the optional *collection*/*issn* filters and connect to Solr."""
     self.collection = collection
     self.issn = issn
     # Same 10-second client timeout used by the sibling classes.
     self.solr = Solr(SOLR_URL, timeout=10)
示例#5
0
def main(settings, *args, **xargs):
    """Find and resolve duplicated articles in the Solr article index.

    For each duplication group, keeps the single "main" article from the
    SCL collection (updating its index entry) and deletes the remaining
    duplicates.  Every action is recorded in a dated CSV report.  With
    ``--debug`` the index is left untouched (the report is still written).
    Finishes with a single commit and a run summary.
    """
    solr = Solr(settings['endpoints']['solr'],
                timeout=int(settings['request']['timeout']))

    parser = argparse.ArgumentParser(description='Script to handle article duplication on article index')

    parser.add_argument('-d', '--debug',
                        action='store_true',
                        help='execute the script in DEBUG mode (don\'t update the index)')

    parser.add_argument('-v', '--version',
                        action='version',
                        version='%(prog)s 0.1')

    args = parser.parse_args()

    if args.debug:
        log.setLevel(logging.DEBUG)

    log.info('Start find duplication script')

    # CSV report registering every updated/deleted/ignored article.
    csv_filename = '{0}-{1}.csv'.format(settings['csv']['filename_prefix'],
                                        datetime.now().strftime('%Y-%m-%d'))

    total_duplicated = 0
    offset = 0
    fail_list = []

    # csv.writer requires a text-mode file opened with newline='' on
    # Python 3 — the previous 'wb' mode raised TypeError on the first
    # writerow.  The context manager also guarantees the report is
    # flushed and closed.
    with open(csv_filename, 'w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file, quoting=csv.QUOTE_MINIMAL)

        while True:
            try:
                duplication_lst = get_duplication_list(solr, offset)
                total_for_process = len(duplication_lst)

                if total_for_process == 0:
                    break

                log.info('Processing {0} duplication entries'.format(total_for_process))

                offset += int(settings['params']['limit_offset'])

                for dup_code in duplication_lst:

                    # ignore partial upgrade duplication signature (SOLR-4016)
                    if dup_code[0] == '0000000000000000':
                        continue

                    process_list = get_duplication_articles(solr, dup_code[0])

                    if process_list:
                        main_article = [article['id'] for article in process_list
                                        if article['in'][0] == 'scl']

                        # only process when exactly one main article from the
                        # SCL collection was identified
                        if len(main_article) == 1:

                            for update_article in process_list:
                                update_id = update_article['id']
                                # the main article (SCL collection) gets its
                                # index entry updated; every other duplicate
                                # is deleted
                                if update_id == main_article[0]:
                                    log.info('Updating colection element of article: {0}'.format(update_id))
                                    save_csv_entry(csv_writer, update_article, 'updated')

                                    if not args.debug:
                                        status = update_main_article(solr, update_id, process_list)

                                else:
                                    log.info('Deleting duplicated article: {0}'.format(update_id))
                                    save_csv_entry(csv_writer, update_article, 'duplication deleted')

                                    if not args.debug:
                                        delete_query = 'id:"{0}"'.format(update_id)
                                        status = solr.delete(delete_query)
                                        total_duplicated += 1
                                        if status != 0:
                                            log.error('Unable to delete article {0}, code:{1}'.format(
                                                update_id, status))

                                # check the Solr status of the update or delete
                                if not args.debug and status != 0:
                                    log.error('Unable to update article {0}, code:{1}'.format(
                                            update_id, status))
                                    fail_list.append(update_id)

                        # skip
                        else:
                            log.debug('Skipping articles due missing main article of SCL collection :{0}'.format(
                                [art['id'].encode('utf-8') for art in process_list]))

                            # save list of ignored articles to csv file
                            for art in process_list:
                                save_csv_entry(csv_writer, art, 'ignored due missing main article')

                    # blank line separating each group of duplicated articles
                    csv_writer.writerow([' '])

            except Exception as e:
                # NOTE(review): the error is logged and the loop retries with
                # the same offset, so a persistent failure loops forever —
                # confirm whether a break/raise is wanted here instead.
                log.critical('Unexpected error: {0}'.format(e))

    # commit at end to avoid offset process gap
    commit(solr, debug=args.debug)
    # script summary
    summary(total_duplicated, fail_list, args.debug)