def test_canonicalize_records(self):
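    # canonicalize_records should map each alternate bibcode onto its canonical
    # bibcode and join the associated fingerprints with ';' (sorted order here),
    # as the expected list below shows.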
    from collections import OrderedDict
    from lib import ReadRecords

    records = OrderedDict([
      ('2014arXiv1401.2993T','b'), #This is an alternate to 'f'
      ('2014MNRAS.439.1884T','f'), #This is the canonical of 'b'

      ('2013MNRAS.434.1889H','d'), #This is the canonical of 'g'
      ('2013arXiv1306.3186H','g'), #This is the alternate of 'd'

      ('1978Natur.275..624M','c'), #No alternates, already canonical

      ('1988ESASP.281b.287G','x1'), #Canonical, the following are alternates
      ('1988IUE88...2..287G','a1'),
      ('1988IUES....1..287G','a2'),
      ('1988uvai....2..287G','a3'),
      
      ('2014PhRvD..90d4013F','h'), #This is the canonical of 'k'
      ('2013arXiv1311.6899F','k'), #This is the alternate of 'h'
    ])
    expected =  [
      ('2014MNRAS.439.1884T', 'b;f'),
      ('2013MNRAS.434.1889H', 'd;g'),
      ('1978Natur.275..624M', 'c'),
      ('1988ESASP.281b.287G','a1;a2;a3;x1'),
      ('2014PhRvD..90d4013F','h;k'),
    ]
    
    results = ReadRecords.canonicalize_records(OrderedDict((k,v) for k,v in records.iteritems()))
    self.assertEqual(results, expected)
Example 2
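
# main() below relies on module-level imports not shown in this snippet;
# roughly (project-local names are inferred from usage and may differ):
#   import sys, time, json, argparse
#   from collections import deque
#   from lib import ReadRecords, MongoConnection, UpdateRecords
#   ...plus readBibcodesFromFile, publish, RabbitMQWorker, psettings, logger,
#   and the MONGO / BIBCODE_FILES / BIBCODES_PER_JOB settings values.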
def main(MONGO=MONGO,*args):
  if args:
    sys.argv.extend(*args)

  parser = argparse.ArgumentParser()

  parser.add_argument(
    '--target-bibcodes',
    nargs='*',
    default=[],
    dest='targetBibcodes',
    help='Only analyze the specified bibcodes, ignoring their JSON fingerprints. Only applies when --async is not set. Use the syntax @filename.txt to read bibcodes from a file (one bibcode per line)'
    )

  parser.add_argument(
    '--async',
    default=False,
    action='store_true',
    dest='async',
    help='start in async mode'
    )

  parser.add_argument(
    '--dont-init-lookers-cache',
    default=False,
    action='store_true',
    dest='dont_init_lookers_cache',
    help='do not call ADSExports2.init_lookers_cache()'
    )

  parser.add_argument(
    '--load-records-from-pickle',
    nargs='*',
    default=None,
    dest='load_records_from_pickle',
    help='Load XML records from a pickle instead of ADSExports',
    )

  parser.add_argument(
    '--dump-output-to-file',
    nargs=1,
    type=str,
    default=None,
    dest='outfile',
    help='Output records to a file'
    )

  parser.add_argument(
    '--ignore-json-fingerprints',
    default=False,
    action='store_true',
    dest='ignore_json_fingerprints',
    help='ignore json fingerprints when finding new records to update (ie, force update)',
    )

  parser.add_argument(
    '--process-deletions',
    default=False,
    action='store_true',
    dest='process_deletions',
    help='Find orphaned bibcodes in MongoDB, then send those bibcodes for deletion via RabbitMQ. No updates will be processed when this flag is set.',
    )

  parser.add_argument(
    '--max-deletions',
    default=2000,
    type=int,
    dest='max_deletions',
    help='Maximum number of deletions to attempt; if over this limit, log an error and exit',
    )

  args = parser.parse_args()

  if not args.dont_init_lookers_cache:
    start = time.time()
    logger.info("Calling init_lookers_cache()")
    ReadRecords.INIT_LOOKERS_CACHE()
    logger.info("init_lookers_cache() returned in %0.1f sec" % (time.time()-start))

  records = readBibcodesFromFile(BIBCODE_FILES)
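  # records is presumably an ordered mapping of bibcode -> JSON fingerprint
  # built from the files listed in BIBCODE_FILES; the targets lookup below
  # indexes it by bibcode.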
  targets = None
  if args.targetBibcodes:
    if args.targetBibcodes[0].startswith('@'):
      with open(args.targetBibcodes[0].replace('@','')) as fp:
        targetBibcodes = deque([line.strip() for line in fp if line.strip() and not line.startswith('#')])
    else:
      targetBibcodes = args.targetBibcodes
    targets = {bibcode:records[bibcode] for bibcode in targetBibcodes}
  
  records = deque(ReadRecords.canonicalize_records(records,targets))
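  # The canonicalization above collapses alternate bibcodes onto their
  # canonical record and merges the fingerprints (see the unit test above);
  # passing targets presumably restricts it to the selected bibcodes. The deque
  # allows cheap popleft() batching in the async branch below.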
  total = float(len(records)) #Save to print later

  if args.ignore_json_fingerprints:
    records = deque([(r[0],'ignore') for r in records])

  if args.process_deletions:
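    # Deletion pass: every bibcode found in mongo but missing from the
    # canonical input set is treated as orphaned and published with routing
    # key 'DeletionRoute'; updates are skipped entirely.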
    start = time.time()
    logger.info("Processing deletions. This will block for several hours until the database is compared, then exit.")
    logger.warning("No updates will be processed when --process-deletions is set")
    mongo = MongoConnection.PipelineMongoConnection(**MONGO)
    results = mongo.getAllBibcodes()
    if len(results) != mongo.db[mongo.collection].count():
      logger.warning("len getAllBibcodes (%s) != len count (%s). Continuing anyway." % (len(results),mongo.db[mongo.collection].count()))
    mongo.close()
    records = [i[0] for i in records]
    payload = list(set(results).difference(set(records)))
    if len(payload) > args.max_deletions:
      logger.critical("|".join(payload))
      logger.critical("Too many deletions: {} > {}".format(len(payload), args.max_deletions))
      sys.exit(1)
    w = RabbitMQWorker()   
    w.connect(psettings.RABBITMQ_URL)
    publish(w,payload,routing_key='DeletionRoute')
    logger.info("Found %s orphaned bibcodes in %0.1f seconds." % (len(payload),time.time()-start))
    sys.exit(0)


  if not args.async:
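    # Synchronous path: find the records that need updating, read them from
    # ADSExports (or pickles), merge, then either dump to a file or upsert
    # into mongo.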
    mongo = MongoConnection.PipelineMongoConnection(**MONGO)
    records = mongo.findNewRecords(records)
    if args.load_records_from_pickle:
      records = ReadRecords.readRecordsFromPickles(records,args.load_records_from_pickle)
    else:
      records = ReadRecords.readRecordsFromADSExports(records)
    merged = UpdateRecords.mergeRecords(records)
    if args.outfile:
      with open(args.outfile[0],'w') as fp:
        r = {'merged': merged, 'nonmerged': records}
        json.dump(r,fp,indent=1)
    else:
      bibcodes = mongo.upsertRecords(merged)
      #SolrUpdater.solrUpdate(bibcodes)
  elif args.async:
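    # Async path: publish bibcodes to RabbitMQ in payloads of BIBCODES_PER_JOB
    # for the pipeline workers to process, logging progress roughly every 5%.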
    w = RabbitMQWorker()   
    w.connect(psettings.RABBITMQ_URL)
    lastLogged = None
    while records:
      payload = []
      while len(payload) < BIBCODES_PER_JOB:
        try:
          payload.append( records.popleft() )
        except IndexError:
          break
      percent = round((1-len(records)/total)*100.0)
      if not percent % 5 and percent!=lastLogged:
        lastLogged=percent
        logger.info("There are %s records left (%0.1f%% completed)" % (len(records),percent))
      publish(w,payload)
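
# Example invocations (script name is assumed; flags are defined above):
#   python run.py --async
#   python run.py --target-bibcodes @bibcodes.txt --dump-output-to-file out.json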