Example #1
def worker_process_func(job_holder, status_holder):
    """Worker process.

    :type job_holder: JobHolder
    :type status_holder: StatusHolder
    """
    # Install the SIGTERM handler so the worker can be asked to shut down.
    signal.signal(signal.SIGTERM, Controller.slave_signal_handler)
    while not Controller.quit_flag:
        a_job = job_holder.get_job()
        if a_job is None:
            # No job available yet; back off briefly before polling again.
            gevent.sleep(1)
            continue

        runner = JobRunner(a_job, job_holder, status_holder)
        runner.start()
Example #2
def main(argv):
    if len(argv) == 0:
        argv = [
            'bigquery-e2e', 'publicdata', 'samples', 'shakespeare',
            'bigquery-e2e', '3', '/tmp/bigquery'
        ]
    if len(argv) < 6:
        # Wrong number of args, print the usage and quit.
        arg_names = [
            sys.argv[0], '<project_id>', '<source_project_id>',
            '<source_dataset_id>', '<source_table_id>', '<destination_bucket>',
            '<partition_count>', '[output_directory]'
        ]
        print 'Usage: %s' % (' '.join(arg_names))
        print 'Got: %s' % (argv, )
        return
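    # Positional arguments (argv excludes the program name):
    #   0=project_id  1=source_project_id  2=source_dataset_id
    #   3=source_table_id  4=destination_bucket  5=partition_count
    #   6=output_directory (optional)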
    gcs_bucket = argv[4]
    job_runner = JobRunner(project_id=argv[0])

    partition_count = int(argv[5])
    download_dir = argv[6] if len(argv) > 6 else None
    gcs_readers = []
    for index in range(partition_count):
        # Note: a separate GCS reader is required per partition.
        gcs_reader = GcsReader(gcs_bucket=gcs_bucket,
                               download_dir=download_dir)
        gcs_readers.append(gcs_reader)

    run_partitioned_extract_job(job_runner,
                                gcs_readers,
                                source_project_id=argv[1],
                                source_dataset_id=argv[2],
                                source_table_id=argv[3])
Example #3
def run_bigquery_job(job_id_prefix, job_type, config):
    '''Run a bigquery job and update pipeline status.'''
    global g_state
    runner = JobRunner(PROJECT_ID,
                       job_id_prefix + '_' + job_type,
                       client=bigquery)
    runner.start_job({job_type: config})
    with g_state_lock:
        g_state[job_type + '_job_id'] = runner.job_id
    job_state = 'STARTED'
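    # Poll the job every five seconds until BigQuery reports it as DONE,
    # mirroring each intermediate result into the shared pipeline state.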
    while job_state != 'DONE':
        time.sleep(5)
        result = runner.get_job()
        job_state = result['status']['state']
        with g_state_lock:
            g_state[job_type + '_result'] = pre(json.dumps(result, indent=2))

    if 'errorResult' in result['status']:
        raise RuntimeError(
            json.dumps(result['status']['errorResult'], indent=2))
Example #4
def build(args):
    jr = JobRunner(cpus=args.j)

    epigenomes = WebEpigenomesLoader(args)
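    # Queue one job per ENCODE experiment, plus a normalization job for any
    # ROADMAP bigWig that has not been normalized yet.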
    for assembly in ["hg19", "mm10"]:
        for assays in ["H3K27ac", "DNase"]:
            epis = epigenomes.GetByAssemblyAndAssays(assembly, assays)
            for epi in epis.epis:
                for exp in epi.exps():
                    if exp.encodeID.startswith("EN"):
                        # print exp.encodeID
                        jr.append([[
                            os.path.realpath(__file__), "--job", exp.encodeID,
                            "--assembly", assembly, "--process"
                        ]])
                    else:
                        # ROADMAP
                        bigWig = exp.files[0]
                        if not bigWig:
                            print "missing", exp
                        else:
                            if not os.path.exists(bigWig.normFnp()):
                                jr.append([[
                                    normBin, "--assembly=" + bigWig.assembly,
                                    "--bwFnp=" + bigWig.normFnp(),
                                    bigWig.fnp()
                                ]])
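    # --test runs a single queued job, --local runs the queue in this process,
    # and the default path submits everything to the cluster scheduler.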
    if args.test:
        return jr.runOne()

    if args.local:
        return jr.run()

    jobOptions = {"mem": 64000, "time": "3:59", "cores": 2, "queue": "short"}

    jr.cluster("/project/umw_zhiping_weng/encyc/norm", jobOptions)
Example #5
from platform import uname
import sys
from job_runner import JobRunner

IS_MAC = (uname()[0] == 'Darwin')
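# On a Mac, use a hard-coded job name and 8 cores per node; otherwise both
# come from the command line.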
if IS_MAC:
    cores_per_node = 8
    runner = JobRunner('Fifteen6', cores_per_node)
else:
    cores_per_node = int(sys.argv[1])
    runner = JobRunner(sys.argv[2], cores_per_node)

runner.run_jobs()

Example #6
def main(argv):
    logging.basicConfig()
    parser = ArgumentParser(description='Read BigQuery table into a text file')
    parser.add_argument('-a',
                        '--service_account',
                        required=True,
                        help='Big Query service account name')
    parser.add_argument(
        '-s',
        '--client_secret',
        required=True,
        help='Path to client_secrets.json file required for API login')
    parser.add_argument(
        '-c',
        '--credentials',
        help=('Path to credentials file (e.g. bigquery_credentials.dat) '
              'required for API login. If the file is not present, a browser '
              'window will be shown and you will be asked to authenticate'))
    parser.add_argument('-k',
                        '--keyfile',
                        help='Path to the key file (e.g., key.p12)')
    parser.add_argument('-o',
                        '--download_dir',
                        default='.',
                        help='The directory where the output will be exported')
    parser.add_argument('-p',
                        '--project_id',
                        required=True,
                        help='BigQuery source project ID')
    parser.add_argument('-d',
                        '--dataset_id',
                        required=True,
                        help='BigQuery source dataset ID')
    parser.add_argument('-t', '--table_id', help='Source table ID')
    parser.add_argument('-b',
                        '--gcs_bucket',
                        help='Google Cloud Storage destination bucket')
    parser.add_argument('-n',
                        '--partition_count',
                        help='Partition count for partitioned reader',
                        type=int)
    parser.add_argument('--partitioned',
                        dest="partitioned",
                        help='Use partitioned reader',
                        required=False,
                        action='store_true')
    parser.set_defaults(partitioned=False)
    args = parser.parse_args()

    job_runner = JobRunner(project_id=args.project_id)
    auth = BigQuery_Auth(service_acc=args.service_account,
                         client_secrets=args.client_secret,
                         credentials=args.credentials,
                         key_file=args.keyfile)
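    # With --partitioned, the extract is fanned out across one GcsReader per
    # partition; otherwise a single reader downloads the whole table.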
    if args.partitioned:
        gcs_readers = []
        for index in range(int(args.partition_count)):
            # Note: a separate GCS reader is required per partition.
            gcs_readers.append(
                GcsReader(auth=auth,
                          gcs_bucket=args.gcs_bucket,
                          download_dir=args.download_dir))
        run_partitioned_extract_job(job_runner,
                                    gcs_readers,
                                    source_project_id=args.project_id,
                                    source_dataset_id=args.dataset_id,
                                    source_table_id=args.table_id)
    else:
        reader = SimpleReader
        gcs_reader = GcsReader(auth=auth,
                               gcs_bucket=args.gcs_bucket,
                               download_dir=args.download_dir)
        reader.run_extract_job(job_runner,
                               gcs_reader,
                               source_project_id=args.project_id,
                               source_dataset_id=args.dataset_id,
                               source_table_id=args.table_id)
Example #7
# released into the public domain by the authors.
'''Runs Python commands used in Chapter 12'''

import auth
gcs_bucket = 'bigquery-e2e'
project_id = 'bigquery-e2e'

# Using GcsReader
from gcs_reader import GcsReader
GcsReader(gcs_bucket=gcs_bucket,
          download_dir='/tmp/bigquery').read('shakespeare.json')

# Extracting the publicdata:samples.shakespeare table and reading.
from job_runner import JobRunner
import extract_and_read
extract_and_read.run_extract_job(JobRunner(project_id=project_id),
                                 GcsReader(gcs_bucket=gcs_bucket,
                                           download_dir='/tmp/bigquery'),
                                 source_project_id='publicdata',
                                 source_dataset_id='samples',
                                 source_table_id='shakespeare')

# Partitioned extract and parallel read.
from extract_and_partitioned_read import run_partitioned_extract_job
run_partitioned_extract_job(JobRunner(project_id=project_id), [
    GcsReader(gcs_bucket=gcs_bucket, download_dir='/tmp/bigquery')
    for x in range(3)
],
                            source_project_id='publicdata',
                            source_dataset_id='samples',
                            source_table_id='shakespeare')
Example #8
import numpy as np
import pandas as pd
from seaborn import jointplot, lmplot, hls_palette
import matplotlib.pyplot as plt
from matplotlib import gridspec
import os
import sys
from matplotlib import cm

from job_runner import JobRunner

export_plots = True
show_plots = False

job_name = 'Fifteen6'
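# readonly=True: this runner is only used to read completed results for
# plotting; it does not schedule new jobs.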
runner = JobRunner(job_name, readonly=True)

figure_folder = os.path.join(os.path.sep, 'Users', 'Jason', 'Desktop',
                             'TempFig', job_name)
if not os.path.exists(figure_folder):
    os.makedirs(figure_folder)

runner.cursor.execute(
    "SELECT * FROM job_results WHERE objective_value IS NOT NULL AND job_name=\"{0}\""
    .format(job_name))
completed_job_data = runner.cursor.fetchall()
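# Collect the objective value (Y) and the values of the parameters being
# optimized (X) for every completed job.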
Y_all = []
X_all = []
for row in completed_job_data:
    Y_all.append(row['objective_value'])
    X_all.append([row[key] for key in runner.parameters_to_optimize])