Example #1
File: aws_reader.py, Project: cmluria/emmaa
def read_pmids(pmids, date):
    """Return extracted INDRA Statements per PMID after running reading on AWS.

    Parameters
    ----------
    pmids : list[str]
        A list of PMIDs to read.
    date : datetime.datetime
        The date and time used to name the PMID file that is uploaded
        for reading.

    Returns
    -------
    dict[str, list[indra.statements.Statement]]
        A dict of PMIDs and the list of Statements extracted for the given
        PMID by reading.
    """
    date_str = date.strftime('%Y-%m-%d-%H-%M-%S')
    pmid_fname = 'pmids-%s.txt' % date_str
    with open(pmid_fname, 'wt') as fh:
        fh.write('\n'.join(pmids))
    job_list = submit_reading('emmaa', pmid_fname, ['reach'])
    wait_for_complete('run_reach_queue',
                      job_list,
                      idle_log_timeout=600,
                      kill_on_log_timeout=True)
    pmid_stmts = {}
    for pmid in pmids:
        reach_json_str = get_reader_json_str('reach', pmid)
        rp = reach.process_json_str(reach_json_str)
        if not rp:
            pmid_stmts[pmid] = []
        else:
            pmid_stmts[pmid] = rp.statements
    return pmid_stmts
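
A minimal usage sketch for read_pmids, assuming AWS credentials and the reading infrastructure it relies on (submit_reading, wait_for_complete, reach) are already configured; the PMIDs below are placeholders for illustration:

import datetime

# Hypothetical call: read two placeholder PMIDs, naming the uploaded PMID
# file with the current UTC time, then report how many Statements each
# PMID yielded.
stmts_by_pmid = read_pmids(['12345678', '23456789'],
                           datetime.datetime.utcnow())
for pmid, stmts in stmts_by_pmid.items():
    print(pmid, len(stmts))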
Example #2
    def _run_reading(self, db, trids, max_refs=5000):
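        # Cap the submission at fewer than 1000 batch jobs (each job reads
        # up to max_refs text refs); larger updates must be split manually.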
        if len(trids) / max_refs >= 1000:
            raise ReadingUpdateError("Too many IDs for one submission. "
                                     "Break it up and do it manually.")

        logger.info("Producing readings on aws for %d text refs with new "
                    "content not read by %s." % (len(trids), self.reader.name))
        job_prefix = ('%s_reading_%s' %
                      (self.reader.name.lower(),
                       self.run_datetime.strftime('%Y%m%d_%H%M%S')))
        with open(job_prefix + '.txt', 'w') as f:
            f.write('\n'.join(['trid:%s' % trid for trid in trids]))
        logger.info("Submitting jobs...")
        job_ids = submit_db_reading(job_prefix,
                                    job_prefix + '.txt',
                                    readers=[self.reader.name.lower()],
                                    start_ix=0,
                                    end_ix=None,
                                    pmids_per_job=max_refs,
                                    num_tries=2,
                                    force_read=False,
                                    force_fulltext=False,
                                    read_all_fulltext=False,
                                    project_name=self.project_name)
        logger.info("Waiting for complete...")
        wait_for_complete('run_db_reading_queue',
                          job_list=job_ids,
                          job_name_prefix=job_prefix,
                          idle_log_timeout=1200,
                          kill_on_log_timeout=True,
                          stash_log_method='s3')
        return
Example #3
def test_handler():
    """Test the lambda handler locally."""
    dts = make_date_str()
    key = f'models/test/test_model_{dts}.pkl'
    event = {'Records': [{'s3': {'object': {'key': key}}}]}
    context = None
    res = lambda_handler(event, context)
    print(res)
    assert res['statusCode'] == 200, res
    assert res['result'] == 'SUCCESS', res
    assert res['job_id'], res
    job_id = res['job_id']

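    # Wait for the submitted Batch job to finish and verify it succeeded.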
    results = {}
    wait_for_complete(QUEUE,
                      job_list=[{
                          'jobId': job_id
                      }],
                      result_record=results)
    print(results)
    assert job_id in [job_def['jobId'] for job_def in results['succeeded']], \
        results['failed']

    s3 = get_s3_client()
    s3_res = s3.list_objects(Bucket='emmaa', Prefix='results/test/' + dts[:10])
    print(s3_res.keys())
    assert s3_res, s3_res
Example #4
def run_machine(model_path, pmids, belief_threshold, search_genes=None,
                ndex_cred=None, twitter_cred=None, grounding_map=None):
    start_time_local = datetime.datetime.now(tzlocal.get_localzone())
    date_str = make_date_str()

    # Save PMIDs in file and send for remote reading
    if aws_available:
        pmid_fname = 'pmids-%s.txt' % date_str
        all_pmids = []
        for v in pmids.values():
            all_pmids += v
        all_pmids = list(set(all_pmids))

        with open(pmid_fname, 'wt') as fh:
            for pmid in all_pmids:
                fh.write('%s\n' % pmid)
        # Submit reading
        job_list = submit_reading('rasmachine', pmid_fname, ['reach'])

        # Wait for reading to complete
        wait_for_complete('run_reach_queue', job_list, idle_log_timeout=600,
                          kill_on_log_timeout=True)

    # Load the model
    logger.info(time.strftime('%c'))
    logger.info('Loading original model.')
    inc_model_file = os.path.join(model_path, 'model.pkl')
    model = IncrementalModel(inc_model_file)
    # Include search genes as prior genes
    if search_genes:
        model.prior_genes = search_genes
    stats = {}
    logger.info(time.strftime('%c'))
    logger.info('Preassembling original model.')
    model.preassemble(filters=global_filters, grounding_map=grounding_map)
    logger.info(time.strftime('%c'))

    # Original statistics
    stats['orig_stmts'] = len(model.get_statements())
    stats['orig_assembled'] = len(model.assembled_stmts)
    orig_stmts = filter_db_highbelief(model.assembled_stmts, ['bel', 'biopax'],
                                      belief_threshold)
    orig_stmts = ac.filter_top_level(orig_stmts)
    stats['orig_final'] = len(orig_stmts)
    logger.info('%d final statements' % len(orig_stmts))

    # Extend the model with PMIDs
    logger.info('----------------')
    logger.info(time.strftime('%c'))
    logger.info('Extending model.')
    stats['new_papers'], stats['new_abstracts'], stats['existing'] = \
        extend_model(model_path, model, pmids, start_time_local)
    # Having added new statements, we preassemble the model
    model.preassemble(filters=global_filters, grounding_map=grounding_map)

    # New statistics
    stats['new_stmts'] = len(model.get_statements())
    stats['new_assembled'] = len(model.assembled_stmts)
    new_stmts = filter_db_highbelief(model.assembled_stmts, ['bel', 'biopax'],
                                     belief_threshold)
    new_stmts = ac.filter_top_level(new_stmts)
    stats['new_final'] = len(new_stmts)
    logger.info('%d final statements' % len(new_stmts))

    check_pmids(model.get_statements())

    # Save model
    logger.info(time.strftime('%c'))
    logger.info('Saving model')
    model.save(inc_model_file)
    logger.info(time.strftime('%c'))

    # Save a time stamped version of the pickle for backup/diagnostic purposes
    if not aws_available:
        inc_model_bkp_file = os.path.join(model_path,
                                          'model-%s.pkl' % date_str)
        model.save(inc_model_bkp_file)
    else:
        key = 'rasmachine/%s/model-%s.pkl' % (model_path.replace('/', '_'),
                                              date_str)
        s3 = boto3.client('s3')
        s3.upload_file(inc_model_file, 'bigmech', key)

    # Upload the new, final statements to NDEx
    if ndex_cred:
        upload_new_ndex(model_path, new_stmts, ndex_cred)

    # Print and tweet the status message
    logger.info('--- Final statistics ---')
    for k, v in sorted(stats.items(), key=lambda x: x[0]):
        logger.info('%s: %s' % (k, v))
    logger.info('------------------------')

    msg_str = make_status_message(stats)
    if msg_str is not None:
        logger.info('Status message: %s' % msg_str)
        if twitter_cred:
            logger.info('Now tweeting: %s' % msg_str)
            twitter_client.update_status(msg_str, twitter_cred)
Example #5
        type=int,
        help=('If the logs are not updated for %(metavar)s seconds, '
              'print a warning. If the `--kill_on_timeout` flag is set, then '
              'the offending jobs will be automatically terminated.'))
    parser.add_argument(
        '--kill_on_timeout',
        '-K',
        action='store_true',
        help='If a log times out, terminate the offending job.')
    parser.add_argument(
        '--stash_log_method',
        '-l',
        choices=['s3', 'local'],
        metavar='METHOD',
        help=('Select a method from: [%(choices)s] to store the job logs. '
              'If no method is specified, the logs will not be '
              'loaded off of AWS. If \'s3\' is specified, then '
              '`job_name_prefix` must also be given, as this will indicate '
              'where on s3 to store the logs.'))
    args = parser.parse_args()

    from indra.tools.reading.submit_reading_pipeline import wait_for_complete

    job_list = None
    if args.job_list is not None:
        job_list = [{'jobId': jid} for jid in args.job_list]

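    # Forward the parsed options to wait_for_complete positionally: queue
    # name, job list, job name prefix, poll interval, idle log timeout,
    # the kill-on-timeout flag, and the log stash method.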
    wait_for_complete(args.queue_name, job_list, args.job_name_prefix,
                      args.poll_interval, args.timeout, args.kill_on_timeout,
                      args.stash_log_method)
Example #6
def run_reading(pmid_fname):
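    # 'basen' (the base name used to label this reading run) is assumed to
    # be defined elsewhere in the module.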
    job_list = submit_reading(basen, pmid_fname, ['reach'], pmids_per_job=2000)
    reading_res = wait_for_complete(job_list)
    combine_res = submit_combine(basen, job_list)