Example #1
import sys
import time
import json
import argparse
from collections import deque

# NOTE: this excerpt also relies on module-level names from the surrounding
# pipeline code (MONGO, BIBCODE_FILES, BIBCODES_PER_JOB, logger,
# readBibcodesFromFile, ReadRecords, MongoConnection, UpdateRecords,
# SolrUpdater, RabbitMQWorker, psettings, publish).
def main(MONGO=MONGO, *args):
  if args:
    # Any forwarded argument list is appended to sys.argv before parsing
    sys.argv.extend(*args)

  parser = argparse.ArgumentParser()

  parser.add_argument(
    '--target-bibcodes',
    nargs='*',
    default=[],
    dest='targetBibcodes',
    help='Only analyze the specified bibcodes, and ignore their JSON fingerprints. Only works when --async is not set. Use the syntax @filename.txt to read these from file (1 bibcode per line)'
    )

  parser.add_argument(
    '--async',
    default=False,
    action='store_true',
    dest='use_async',  # 'async' is a keyword in Python 3, so args.async would be a syntax error
    help='start in async mode'
    )

  parser.add_argument(
    '--dont-init-lookers-cache',
    default=False,
    action='store_true',
    dest='dont_init_lookers_cache',
    help="don't call ADSExports2.init_lookers_cache()"
    )

  parser.add_argument(
    '--load-records-from-pickle',
    nargs='*',
    default=None,
    dest='load_records_from_pickle',
    help='Load XML records from a pickle instead of ADSExports',
    )

  parser.add_argument(
    '--dump-output-to-file',
    nargs=1,
    type=str,
    default=None,
    dest='outfile',
    help='Output records to a file'
    )

  parser.add_argument(
    '--ignore-json-fingerprints',
    default=False,
    action='store_true',
    dest='ignore_json_fingerprints',
    help='ignore JSON fingerprints when finding new records to update (i.e., force update)',
    )

  parser.add_argument(
    '--process-deletions',
    default=False,
    action='store_true',
    dest='process_deletions',
    help='Find orphaned bibcodes in the mongodb, then send these bibcodes to delete via rabbitMQ. No updates will be processed when this flag is set.',
    )

  parser.add_argument(
    '--max-deletions',
    default=2000,
    type=int,
    dest='max_deletions',
    help='Maximum number of deletions to attempt; if over this limit, exit and log an error',
    )

  args = parser.parse_args()

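  # Warm the ADSExports lookers cache unless explicitly disabled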
  if not args.dont_init_lookers_cache:
    start = time.time()
    logger.info("Calling init_lookers_cache()")
    ReadRecords.INIT_LOOKERS_CACHE()
    logger.info("init_lookers_cache() returned in %0.1f sec" % (time.time()-start))

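  # Read the full canonical bibcode list, then optionally narrow it to the
  # explicitly requested target bibcodes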
  records = readBibcodesFromFile(BIBCODE_FILES)
  targets = None
  if args.targetBibcodes:
    if args.targetBibcodes[0].startswith('@'):
      with open(args.targetBibcodes[0].replace('@','')) as fp:
        targetBibcodes = deque([L.strip() for L in fp.readlines() if L and not L.startswith('#')])
    else:
      targetBibcodes = args.targetBibcodes
    targets = {bibcode: records[bibcode] for bibcode in targetBibcodes}

  # Canonicalize; each entry becomes a (bibcode, JSON fingerprint) tuple
  records = deque(ReadRecords.canonicalize_records(records, targets))
  total = float(len(records))  # Saved to report progress later

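  # Overwriting each stored fingerprint with 'ignore' makes every record look
  # changed downstream, forcing a full update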
  if args.ignore_json_fingerprints:
    records = deque([(r[0],'ignore') for r in records])

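  # Deletion mode: diff mongo's contents against the canonical record list,
  # publish the orphaned bibcodes to the deletion queue, and exit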
  if args.process_deletions:
    start = time.time()
    logger.info("Processing deletions. This will block for several hours until the database is compared, then exit.")
    logger.warning("No updates will be processed when --process-deletions is set")
    mongo = MongoConnection.PipelineMongoConnection(**MONGO)
    results = mongo.getAllBibcodes()
    count = mongo.db[mongo.collection].count()
    mongo.close()
    if len(results) != count:
      logger.warning("len getAllBibcodes (%s) != len count (%s). Continue anyways." % (len(results), count))
    records = [i[0] for i in records]
    payload = list(set(results).difference(set(records)))
    if len(payload) > args.max_deletions:
      logger.critical("|".join(payload))
      logger.critical("Too many deletions: {} > {}".format(len(payload), args.max_deletions))
      sys.exit(1)
    w = RabbitMQWorker()   
    w.connect(psettings.RABBITMQ_URL)
    publish(w,payload,routing_key='DeletionRoute')
    logger.info("Found %s orphaned bibcodes in %0.1f seconds." % (len(payload),time.time()-start))
    sys.exit(0)


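  # Synchronous path: find, read, merge, and upsert the records in-process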
  if not args.use_async:
    mongo = MongoConnection.PipelineMongoConnection(**MONGO)
    records = mongo.findNewRecords(records)
    if args.load_records_from_pickle:
      records = ReadRecords.readRecordsFromPickles(records,args.load_records_from_pickle)
    else:
      records = ReadRecords.readRecordsFromADSExports(records)
    merged = UpdateRecords.mergeRecords(records)
    if args.outfile:
      with open(args.outfile[0],'w') as fp:
        r = {'merged': merged, 'nonmerged': records}
        json.dump(r,fp,indent=1)
    else:
      bibcodes = mongo.upsertRecords(merged)
      #SolrUpdater.solrUpdate(bibcodes)
  # Asynchronous path: batch the remaining records and publish them to RabbitMQ
  else:
    w = RabbitMQWorker()   
    w.connect(psettings.RABBITMQ_URL)
    lastLogged = None
    while records:
      payload = []
      while len(payload) < BIBCODES_PER_JOB:
        try:
          payload.append( records.popleft() )
        except IndexError:
          break
      percent = round((1 - len(records)/total) * 100.0)
      # Log progress at 5% milestones, once each
      if not percent % 5 and percent != lastLogged:
        lastLogged = percent
        logger.info("There are %s records left (%0.1f%% completed)" % (len(records), percent))
      publish(w, payload)
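
A minimal sketch of how this entry point might be driven, assuming the module above is importable and that MONGO is its settings dict; the flag values here are illustrative, not from the source:

if __name__ == '__main__':
  # main() appends the forwarded argument list to sys.argv before parsing,
  # so flags can be supplied programmatically as well as on the command line
  main(MONGO, ['--ignore-json-fingerprints', '--dump-output-to-file', 'out.json'])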