Example #1
0
def publish(records, url=psettings.RABBITMQ_URL, exchange='MergerPipelineExchange', routing_key='FindNewRecordsRoute'):
  # It's OK that we create/tear down this connection many times within this script; it is not a
  # bottleneck and likely slightly increases the stability of the workflow.
  w = RabbitMQWorker()
  w.connect(url)
  w.channel.basic_publish(exchange, routing_key, json.dumps(records))
  w.connection.close()
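A minimal usage sketch for the publisher above, assuming psettings, RabbitMQWorker, and json are importable exactly as in the snippet. The publish_in_batches helper, the batch size, and the record contents are illustrative placeholders rather than parts of the original pipeline.

# Hypothetical caller: chunk a large record list and publish each batch on the
# default exchange and routing key. Batch size and record contents are placeholders.
def publish_in_batches(records, batch_size=100):
  for start in range(0, len(records), batch_size):
    publish(records[start:start + batch_size])

publish_in_batches([{'bibcode': 'bibcode-%05d' % i} for i in range(250)])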
Example #2
0
def publish(bibcodes, url=psettings.RABBITMQ_URL, exchange='MergerPipelineExchange', routing_key='SolrUpdateRoute'):
  w = RabbitMQWorker()
  w.connect(url)
  
  payload = json.dumps(bibcodes)
  w.channel.basic_publish(exchange,routing_key,payload)
  w.connection.close()
Example #3
0
    def connect_publisher(self):
        """
        Makes a connection between the worker and the RabbitMQ instance, and
        sets up a channel as an attribute of the worker.

        :return: no return
        """

        self.publish_worker = RabbitMQWorker()
        self.ret_queue = self.publish_worker.connect(psettings.RABBITMQ_URL)
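A hedged sketch of how the connect_publisher pattern might be reused outside the test class it comes from. The PublisherHarness name and its send method are placeholders; the only assumption carried over from the snippets above is that RabbitMQWorker.connect sets up a pika-style channel attribute on the worker.

# Hypothetical harness reusing the pattern above: connect once, keep the worker
# around, and publish through its channel attribute. Names are placeholders.
class PublisherHarness(object):

    def connect_publisher(self):
        self.publish_worker = RabbitMQWorker()
        self.ret_queue = self.publish_worker.connect(psettings.RABBITMQ_URL)

    def send(self, exchange, routing_key, payload):
        self.publish_worker.channel.basic_publish(exchange, routing_key, payload)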
Example #4
0
def publish(records, max_queue_size=30, url=psettings.RABBITMQ_URL, exchange='MergerPipelineExchange', routing_key='FindNewRecordsRoute', LOGGER=LOGGER):
  # It's OK that we create/tear down this connection many times within this script; it is not a
  # bottleneck and likely slightly increases the stability of the workflow.

  w = RabbitMQWorker()
  w.connect(url)

  # Hold onto the message if publishing it would cause the number of queued messages to exceed
  # max_queue_size.
  responses = [w.channel.queue_declare(queue=i, passive=True) for i in ['UpdateRecordsQueue', 'ReadRecordsQueue']]
  while any(r.method.message_count >= max_queue_size for r in responses):
    LOGGER.debug(">%s messages in the relevant queue(s). I will wait 15s while they get consumed." % max_queue_size)
    time.sleep(15)
    responses = [w.channel.queue_declare(queue=i, passive=True) for i in ['UpdateRecordsQueue', 'ReadRecordsQueue']]

  payload = json.dumps(records)
  w.channel.basic_publish(exchange, routing_key, payload)
  LOGGER.debug("Published payload with hash: %s" % hash(payload))
  w.connection.close()
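The passive queue_declare call above is what makes this back-pressure check possible: declaring a queue with passive=True never creates it, but the returned method frame still reports message_count. Below is a minimal standalone sketch of the same check written directly against pika; the AMQP URL, queue name, and function name are placeholders rather than pipeline values.

# Standalone sketch of the back-pressure check using plain pika.
# The URL, queue name, and polling interval here are placeholders.
import time
import pika

def wait_until_queue_drains(amqp_url, queue_name, max_queue_size=30, poll_seconds=15):
  connection = pika.BlockingConnection(pika.URLParameters(amqp_url))
  channel = connection.channel()
  try:
    # passive=True raises a channel-level error if the queue does not exist, and never creates it
    frame = channel.queue_declare(queue=queue_name, passive=True)
    while frame.method.message_count >= max_queue_size:
      time.sleep(poll_seconds)
      frame = channel.queue_declare(queue=queue_name, passive=True)
  finally:
    connection.close()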
Example #5
0
    def connect_publisher(self):
        """
        Makes a connection between the worker and the RabbitMQ instance, and
        sets up a channel as an attribute of the worker.

        :return: no return
        """

        self.publish_worker = RabbitMQWorker()
        self.ret_queue = self.publish_worker.connect(psettings.RABBITMQ_URL)
Example #6
0
class TestGeneric(unittest.TestCase):
    """
    Generic test class. Used as the primary class that implements a standard
    integration test. Also contains a range of helper functions, and the correct
    tearDown method when interacting with RabbitMQ.
    """

    def setUp(self):
        """
        Sets up the parameters for the RabbitMQ workers, and also the workers
        themselves. Generates all the queues that should be in place for testing
        the RabbitMQ workers.

        :return: no return
        """

        # Build the link files
        build_links(test_name='integration')

        # Load the extraction worker
        check_params = psettings.WORKERS['CheckIfExtractWorker']
        standard_params = psettings.WORKERS['StandardFileExtractWorker']
        writer_params = psettings.WORKERS['WriteMetaFileWorker']
        error_params = psettings.WORKERS['ErrorHandlerWorker']
        proxy_params = psettings.WORKERS['ProxyPublishWorker']

        for params in [check_params, standard_params, writer_params,
                       error_params, proxy_params]:
            params['RABBITMQ_URL'] = psettings.RABBITMQ_URL
            params['ERROR_HANDLER'] = psettings.ERROR_HANDLER
            params['extract_key'] = 'FULLTEXT_EXTRACT_PATH_UNITTEST'
            params['TEST_RUN'] = True
            params['PDF_EXTRACTOR'] = psettings.PDF_EXTRACTOR
            params['PROXY_PUBLISH'] = psettings.PROXY_PUBLISH

        self.params = params
        self.check_worker = CheckIfExtractWorker(params=check_params)
        self.standard_worker = StandardFileExtractWorker(params=standard_params)

        self.standard_worker.logger.debug('params: {0}'.format(standard_params))

        self.meta_writer = WriteMetaFileWorker(params=writer_params)
        self.error_worker = ErrorHandlerWorker(params=error_params)
        self.proxy_worker = ProxyPublishWorker(params=proxy_params)
        self.meta_path = ''
        self.channel_list = None

        # Queues and routes are switched on so that workers can connect
        TM = TaskMaster(psettings.RABBITMQ_URL, psettings.RABBITMQ_ROUTES,
                        psettings.WORKERS)
        TM.initialize_rabbitmq()

        self.connect_publisher()
        self.purge_all_queues()

    def connect_publisher(self):
        """
        Makes a connection between the worker and the RabbitMQ instance, and
        sets up a channel as an attribute of the worker.

        :return: no return
        """

        self.publish_worker = RabbitMQWorker()
        self.ret_queue = self.publish_worker.connect(psettings.RABBITMQ_URL)

    def purge_all_queues(self):
        """
        Purges all the content from all the queues existing in psettings.py.

        :return: no return
        """
        for queue in psettings.RABBITMQ_ROUTES['QUEUES']:
            _q = queue['queue']
            self.publish_worker.channel.queue_purge(queue=_q)

    def tearDown(self):
        """
        General tearDown of the class. Purges the queues and then sleeps so that
        the next set of tests is not contaminated.

        :return: no return
        """

        self.purge_all_queues()
        time.sleep(5)

    def helper_get_details(self, test_publish):
        """
        Generates relevant information about the stub data being used and
        stores it on correspondingly named attributes.

        :param test_publish: path to the test stub file
        :return: no return
        """

        with open(os.path.join(PROJ_HOME, test_publish), "r") as f:
            lines = f.readlines()
            self.nor = len(lines)

        self.bibcode, self.ft_source, self.provider = \
            lines[0].strip().split('\t')
        self.bibcode_list = [i.strip().split('\t')[0] for i in lines]

        self.test_expected = check_if_extract.create_meta_path(
            {'bibcode': self.bibcode},
            extract_key='FULLTEXT_EXTRACT_PATH_UNITTEST'
        )

        self.meta_list = \
            [check_if_extract.create_meta_path(
                {"bibcode": j},
                extract_key='FULLTEXT_EXTRACT_PATH_UNITTEST'
            ).replace('meta.json', '') for j in self.bibcode_list]

        self.meta_path = self.test_expected.replace('meta.json', '')

        self.number_of_PDFs = len(
            list(
                filter(lambda x: x.lower().endswith('.pdf'),
                       [i.strip().split("\t")[-2] for i in lines])
            )
        )

        self.number_of_standard_files = self.nor - self.number_of_PDFs

    def calculate_expected_folders(self, full_text_links):
        """
        Determines the paths that should exist if the test data was extracted.

        :param full_text_links: file that contains the full text links stub data
        :return: list of expected paths that would be created when the full text
        was extracted
        """

        with open(os.path.join(PROJ_HOME, full_text_links), "r") as inf:
            lines = inf.readlines()

        expected_paths = \
            [check_if_extract.create_meta_path(
                {CONSTANTS['BIBCODE']: line.strip().split('\t')[0]},
                extract_key='FULLTEXT_EXTRACT_PATH_UNITTEST'
            ).replace('meta.json', '') for line in lines]

        return expected_paths

    def clean_up_path(self, paths):
        """
        Takes the given paths and deletes any content that should have been
        created when the full text was extracted.

        :param paths: list of paths whose content should be cleaned up
        :return: no return
        """

        for path in paths:
            if os.path.exists(path):
                meta = os.path.join(path, 'meta.json')
                fulltext = os.path.join(path, 'fulltext.txt')
                dataset = os.path.join(path, 'dataset.txt')
                acknowledgements = os.path.join(path, 'acknowledgements.txt')

                file_list = [meta, fulltext, dataset, acknowledgements]
                for file_ in file_list:
                    if os.path.exists(file_):
                        os.remove(file_)
                os.rmdir(path)

                print('deleted: {0} and its content'.format(path))
            else:
                print('Could not delete {0}, does not exist'.format(path))
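A hedged sketch of how a concrete integration test might build on TestGeneric. The subclass name, the stub file path, and the assertion are placeholders showing how helper_get_details, calculate_expected_folders, and clean_up_path are meant to be combined; the step that actually publishes and consumes messages is elided.

# Hypothetical integration test built on TestGeneric; names and paths are placeholders.
class TestExtractionOfStubData(TestGeneric):

    def test_stub_data_round_trip(self):
        # Placeholder path; a real test would point at one of the project's stub link files
        test_publish = 'tests/test_integration/stub_data/fulltext.links'
        self.helper_get_details(test_publish)

        # ... publish the stub links and run the workers here ...

        expected = self.calculate_expected_folders(test_publish)
        try:
            # One expected output folder per line of stub data
            self.assertEqual(len(expected), self.nor)
        finally:
            self.clean_up_path(expected)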
Example #7
0
class TestGeneric(unittest.TestCase):
    """
    Generic test class. Used as the primary class that implements a standard
    integration test. Also contains a range of helper functions, and the correct
    tearDown method when interacting with RabbitMQ.
    """
    def setUp(self):
        """
        Sets up the parameters for the RabbitMQ workers, and also the workers
        themselves. Generates all the queues that should be in place for testing
        the RabbitMQ workers.

        :return: no return
        """

        # Build the link files
        build_links(test_name='integration')

        # Load the extraction worker
        check_params = psettings.WORKERS['CheckIfExtractWorker']
        standard_params = psettings.WORKERS['StandardFileExtractWorker']
        writer_params = psettings.WORKERS['WriteMetaFileWorker']
        error_params = psettings.WORKERS['ErrorHandlerWorker']
        proxy_params = psettings.WORKERS['ProxyPublishWorker']

        for params in [
                check_params, standard_params, writer_params, error_params,
                proxy_params
        ]:
            params['RABBITMQ_URL'] = psettings.RABBITMQ_URL
            params['ERROR_HANDLER'] = psettings.ERROR_HANDLER
            params['extract_key'] = 'FULLTEXT_EXTRACT_PATH_UNITTEST'
            params['TEST_RUN'] = True
            params['PDF_EXTRACTOR'] = psettings.PDF_EXTRACTOR
            params['PROXY_PUBLISH'] = psettings.PROXY_PUBLISH

        self.params = params
        self.check_worker = CheckIfExtractWorker(params=check_params)
        self.standard_worker = StandardFileExtractWorker(
            params=standard_params)

        self.standard_worker.logger.debug(
            'params: {0}'.format(standard_params))

        self.meta_writer = WriteMetaFileWorker(params=writer_params)
        self.error_worker = ErrorHandlerWorker(params=error_params)
        self.proxy_worker = ProxyPublishWorker(params=proxy_params)
        self.meta_path = ''
        self.channel_list = None

        # Queues and routes are switched on so that workers can connect
        TM = TaskMaster(psettings.RABBITMQ_URL, psettings.RABBITMQ_ROUTES,
                        psettings.WORKERS)
        TM.initialize_rabbitmq()

        self.connect_publisher()
        self.purge_all_queues()

    def connect_publisher(self):
        """
        Makes a connection between the worker and the RabbitMQ instance, and
        sets up a channel as an attribute of the worker.

        :return: no return
        """

        self.publish_worker = RabbitMQWorker()
        self.ret_queue = self.publish_worker.connect(psettings.RABBITMQ_URL)

    def purge_all_queues(self):
        """
        Purges all the content from all the queues existing in psettings.py.

        :return: no return
        """
        for queue in psettings.RABBITMQ_ROUTES['QUEUES']:
            _q = queue['queue']
            self.publish_worker.channel.queue_purge(queue=_q)

    def tearDown(self):
        """
        General tearDown of the class. Purges the queues and then sleeps so that
        the next set of tests is not contaminated.

        :return: no return
        """

        self.purge_all_queues()
        time.sleep(5)

    def helper_get_details(self, test_publish):
        """
        Generates relevant information about the stub data being used and
        stores it on correspondingly named attributes.

        :param test_publish: path to the test stub file
        :return: no return
        """

        with open(os.path.join(PROJ_HOME, test_publish), "r") as f:
            lines = f.readlines()
            self.nor = len(lines)

        self.bibcode, self.ft_source, self.provider = \
            lines[0].strip().split('\t')
        self.bibcode_list = [i.strip().split('\t')[0] for i in lines]

        self.test_expected = check_if_extract.create_meta_path(
            {'bibcode': self.bibcode},
            extract_key='FULLTEXT_EXTRACT_PATH_UNITTEST')

        self.meta_list = \
            [check_if_extract.create_meta_path(
                {"bibcode": j},
                extract_key='FULLTEXT_EXTRACT_PATH_UNITTEST'
            ).replace('meta.json', '') for j in self.bibcode_list]

        self.meta_path = self.test_expected.replace('meta.json', '')

        self.number_of_PDFs = len(
            list(
                filter(lambda x: x.lower().endswith('.pdf'),
                       [i.strip().split("\t")[-2] for i in lines])))

        self.number_of_standard_files = self.nor - self.number_of_PDFs

    def calculate_expected_folders(self, full_text_links):
        """
        Determines the paths that should exist if the test data was extracted.

        :param full_text_links: file that contains the full text links stub data
        :return: list of expected paths that would be created when the full text
        was extracted
        """

        with open(os.path.join(PROJ_HOME, full_text_links), "r") as inf:
            lines = inf.readlines()

        expected_paths = \
            [check_if_extract.create_meta_path(
                {CONSTANTS['BIBCODE']: line.strip().split('\t')[0]},
                extract_key='FULLTEXT_EXTRACT_PATH_UNITTEST'
            ).replace('meta.json', '') for line in lines]

        return expected_paths

    def clean_up_path(self, paths):
        """
        Takes the given paths and deletes any content that should have been
        created when the full text was extracted.

        :param paths: list of paths whose content should be cleaned up
        :return: no return
        """

        for path in paths:
            if os.path.exists(path):
                meta = os.path.join(path, 'meta.json')
                fulltext = os.path.join(path, 'fulltext.txt')
                dataset = os.path.join(path, 'dataset.txt')
                acknowledgements = os.path.join(path, 'acknowledgements.txt')

                file_list = [meta, fulltext, dataset, acknowledgements]
                for file_ in file_list:
                    if os.path.exists(file_):
                        os.remove(file_)
                os.rmdir(path)

                print('deleted: {0} and its content'.format(path))
            else:
                print('Could not delete {0}, does not exist'.format(path))
Example #8
0
def main(MONGO=MONGO,*args):
  if args:
    sys.argv.extend(*args)

  parser = argparse.ArgumentParser()

  parser.add_argument(
    '--target-bibcodes',
    nargs='*',
    default=[],
    dest='targetBibcodes',
    help='Only analyze the specified bibcodes, and ignore their JSON fingerprints. Only works when --async=False. Use the syntax @filename.txt to read these from file (one bibcode per line)'
    )

  parser.add_argument(
    '--async',
    default=False,
    action='store_true',
    dest='async',
    help='start in async mode'
    )

  parser.add_argument(
    '--dont-init-lookers-cache',
    default=False,
    action='store_true',
    dest='dont_init_lookers_cache',
    help="don't call ADSExports2.init_lookers_cache()"
    )

  parser.add_argument(
    '--load-records-from-pickle',
    nargs='*',
    default=None,
    dest='load_records_from_pickle',
    help='Load XML records from a pickle instead of ADSExports',
    )

  parser.add_argument(
    '--dump-output-to-file',
    nargs=1,
    type=str,
    default=None,
    dest='outfile',
    help='Output records to a file'
    )

  parser.add_argument(
    '--ignore-json-fingerprints',
    default=False,
    action='store_true',
    dest='ignore_json_fingerprints',
    help='ignore json fingerprints when finding new records to update (ie, force update)',
    )

  parser.add_argument(
    '--process-deletions',
    default=False,
    action='store_true',
    dest='process_deletions',
    help='Find orphaned bibcodes in the mongodb, then send these bibcodes for deletion via RabbitMQ. No updates will be processed when this flag is set.',
    )

  parser.add_argument(
    '--max-deletions',
    default=2000,
    type=int,
    dest='max_deletions',
    help='Maximum number of deletions to attempt; If over this limit, exit and log an error',
    )

  args = parser.parse_args()

  if not args.dont_init_lookers_cache:
    start = time.time()
    logger.info("Calling init_lookers_cache()")
    ReadRecords.INIT_LOOKERS_CACHE()
    logger.info("init_lookers_cache() returned in %0.1f sec" % (time.time()-start))

  records = readBibcodesFromFile(BIBCODE_FILES)
  targets = None
  if args.targetBibcodes:
    if args.targetBibcodes[0].startswith('@'):
      with open(args.targetBibcodes[0].replace('@','')) as fp:
        targetBibcodes = deque([L.strip() for L in fp.readlines() if L and not L.startswith('#')])
    else:
        targetBibcodes = args.targetBibcodes
    targets = {bibcode:records[bibcode] for bibcode in targetBibcodes}
  
  records = deque(ReadRecords.canonicalize_records(records,targets))
  total = float(len(records)) #Save to print later

  if args.ignore_json_fingerprints:
    records = deque([(r[0],'ignore') for r in records])

  if args.process_deletions:
    start = time.time()
    logger.info("Processing deletions. This will block for several hours until the database is compared, then exit.")
    logger.warning("No updates will be processed when --process-deletions is set")
    mongo = MongoConnection.PipelineMongoConnection(**MONGO)
    results = mongo.getAllBibcodes()
    if len(results) != mongo.db[mongo.collection].count():
      logger.warning("len getAllBibcodes (%s) != len count (%s). Continuing anyway." % (len(results), mongo.db[mongo.collection].count()))
    mongo.close()
    records = [i[0] for i in records]
    payload = list(set(results).difference(set(records)))
    if len(payload) > args.max_deletions:
      logger.critical("|".join(payload))
      logger.critical("Too many deletions: {} > {}".format(len(payload), args.max_deletions))
      sys.exit(1)
    w = RabbitMQWorker()   
    w.connect(psettings.RABBITMQ_URL)
    publish(w,payload,routing_key='DeletionRoute')
    logger.info("Found %s orphaned bibcodes in %0.1f seconds." % (len(payload),time.time()-start))
    sys.exit(0)


  if not args.async:
    mongo = MongoConnection.PipelineMongoConnection(**MONGO)
    records = mongo.findNewRecords(records)
    if args.load_records_from_pickle:
      records = ReadRecords.readRecordsFromPickles(records,args.load_records_from_pickle)
    else:
      records = ReadRecords.readRecordsFromADSExports(records)
    merged = UpdateRecords.mergeRecords(records)
    if args.outfile:
      with open(args.outfile[0],'w') as fp:
        r = {'merged': merged, 'nonmerged': records}
        json.dump(r,fp,indent=1)
    else:
      bibcodes = mongo.upsertRecords(merged)
      #SolrUpdater.solrUpdate(bibcodes)
  elif args.async:
    w = RabbitMQWorker()   
    w.connect(psettings.RABBITMQ_URL)
    lastLogged = None
    while records:
      payload = []
      while len(payload) < BIBCODES_PER_JOB:
        try:
          payload.append( records.popleft() )
        except IndexError:
          break
      percent = round((1-len(records)/total)*100.0)
      if not percent % 5 and percent!=lastLogged:
        lastLogged=percent
        logger.info("There are %s records left (%0.1f%% completed)" % (len(records),percent))
      publish(w,payload)
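A hedged sketch of how this entry point might be driven programmatically. The file name and flag combinations are placeholders; the only mechanism assumed is the *args pass-through into sys.argv at the top of main().

# Hypothetical invocations of main(). The extra flags are forwarded into sys.argv
# through the *args pass-through; file names and flag choices are placeholders.
if __name__ == '__main__':
  # Synchronous run that writes merged/non-merged records to a file instead of upserting to Mongo
  main(MONGO, ['--dump-output-to-file', 'out.json', '--ignore-json-fingerprints'])

  # Asynchronous run that fans records out to RabbitMQ in BIBCODES_PER_JOB-sized batches
  # (left commented out because each call to main() extends sys.argv again):
  # main(MONGO, ['--async'])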