Example #1
    def _get_default_args_list(self):
        """ Return the base list of args for sherlock that remains unchanged
        across multiple invocations of this worker's process_msg.
        """
        args = [
            # First arg, does not matter
            "ingest_multiple_dates.py",

            # We do not want to record status in redshift
            "--skip-progress-in-redshift",

            # run parallelism in the worker instead
            "--serial-stepper",
        ]

        cur_base_dir = os.getcwd()
        if self._should_run_local:
            private_file = os.path.join(
                cur_base_dir,
                staticconf.read_string("run_local.private", "private.yaml")
            )
            args.extend(["-r"])  # For run-local in sherlock
        else:
            private_file = staticconf.read_string("run_service.private")

        if self._config_override_loc is not None:
            args.extend(["--config-override", self._config_override_loc])
        args.extend(["--private", private_file])
        args.extend(["--config", self._config_loc])

        return args
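For context, a minimal sketch of the configuration these reads expect to be loaded beforehand. The key names come from the calls above; the values and the use of DictConfiguration are placeholder assumptions, not how the worker actually loads its config.

import staticconf

# Placeholder values for illustration only; the real worker loads these keys
# from its YAML config files rather than an inline dict.
staticconf.DictConfiguration({
    'run_local': {'private': 'private.yaml'},  # joined with os.getcwd() when running locally
    'run_service': {'private': '/etc/mycroft/private.yaml'},  # assumed path
})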
Example #2
def create_emr_args(date_with_slashes, cores, infile_prefix, local):
    """creates a string containing arguments for mr job

    inputs:
        date_with_slashes -- a date string of the form 'YYYY/MM/DD'
        cores -- the number of cores to use for a conversion
        infile_prefix -- the prefix to the search bucket
        local -- True to use the run_local mrjob argument template, False for the service template

    outputs:
        string containing arguments used by ET mr job"""

    input_file = infile_prefix + date_with_slashes +\
        read_string('pipeline.et_step.s3_input_suffix')
    user_prefix = get_s3_output_user_prefix()
    output_file = os.path.join(user_prefix, date_with_slashes)

    if int(cores) > MAX_CORES:
        cores = MAX_CORES

    extractions = pipeline_yaml_schema_file_path()
    delimiter = read_string('redshift_column_delimiter')
    if local:
        template = read_string('run_local.mrjob_arg_template')
    else:
        template = read_string('run_service.mrjob_arg_template')

    return template.format(input_file, output_file, cores, extractions,
                           delimiter)
Example #3
def test_create_emr_args(input_date, dev, cores, pipeline_yaml):
    print "just starting"
    load_package_config('config.yaml')
    YamlConfiguration(pipeline_yaml)

    input_prefix = read_list('pipeline.et_step.s3_prefixes')[0]
    input_file = input_prefix + input_date + '/part-*.gz'

    expected_args = EXPECTED_DEV_ARGS if dev else EXPECTED_AWS_ARGS
    expected_out_file = read_string('pipeline.s3_output_prefix')
    delimiter = read_string('redshift_column_delimiter')
    with mock.patch.dict(os.environ, {'LOGNAME': 'testuser', 'YELPCODE': '.'}):
        logname = os.environ['LOGNAME']
        expected_out_file = os.path.join(
            expected_out_file.format(logname=logname),
            input_date
        )
        extractions = pipeline_yaml_schema_file_path()
        formatted_args = expected_args.format(input_file,
                                              expected_out_file,
                                              cores,
                                              extractions,
                                              delimiter)
        output_under_test = create_emr_args(input_date, 10,
                                            input_prefix, dev)
        assert output_under_test == formatted_args
Example #4
def create_emr_args(date_with_slashes, cores, infile_prefix, local):
    """creates a string containing arguments for mr job

    inputs:
        date_with_slashes -- a date string of the form 'YYYY/MM/DD'
        cores -- the number of cores to use for a conversion
        infile_prefix -- the prefix to the search bucket
        local -- True to use the run_local mrjob argument template, False for the service template

    outputs:
        string containing arguments used by ET mr job"""

    input_file = infile_prefix + date_with_slashes +\
        read_string('pipeline.et_step.s3_input_suffix')
    user_prefix = get_s3_output_user_prefix()
    output_file = os.path.join(user_prefix, date_with_slashes)

    if int(cores) > MAX_CORES:
        cores = MAX_CORES

    extractions = pipeline_yaml_schema_file_path()
    delimiter = read_string('redshift_column_delimiter')
    if local:
        template = read_string('run_local.mrjob_arg_template')
    else:
        template = read_string('run_service.mrjob_arg_template')

    return template.format(
        input_file, output_file, cores, extractions, delimiter
    )
Example #5
def test_setup_config_cluster(cluster, pool, scheduler, tag,
                              mock_config_files):
    args = argparse.Namespace(
        env_config_path='/nail/etc/config.yaml',
        cluster=cluster,
        pool=pool,
        scheduler=scheduler,
        signals_branch_or_tag=tag,
    )
    with mock.patch(
            'clusterman.config.load_cluster_pool_config',
            autospec=True,
    ) as mock_pool_load, mock.patch('clusterman.config._load_module_configs',
                                    ) as mock_load_module_configs:

        config.setup_config(args)

        assert mock_load_module_configs.call_args == mock.call(
            '/nail/etc/config.yaml')
        assert staticconf.read_string('aws.region') == 'us-test-3'
        if pool:
            assert mock_pool_load.call_args == mock.call(
                cluster, pool, scheduler, tag)
        else:
            assert mock_pool_load.call_count == 0
            if tag:
                assert staticconf.read_string(
                    'autoscale_signal.branch_or_tag') == tag
Example #6
def main():
    staticconf.YamlConfiguration(CONFIG_FILE)
    auth = OAuthHandler(
        staticconf.read_string('twitter.consumer_key'),
        staticconf.read_string('twitter.consumer_secret'),
    )
    auth.set_access_token(
        staticconf.read_string('twitter.access_token'),
        staticconf.read_string('twitter.access_token_secret'),
    )
    api = API(auth)

    big_ben_ids = [723360691457945600, 723375789467553793, 723390890664824834, 723405988221489154, 
                   723421087703261186, 723436186644025344, 723451541563138052, 723466386304057344, 
                   723481486737985536, 723497089410457600, 723511939465392128, 723528048931430400,
                   723541884208091137, 723556981991202816, 723572081485615104, 723587184276721665,
                   723602282374414338, 723617381017374720, 723632480964759553, 723647581516124160,
                   723662932664524800, 723678284538589184, 723693384272121857, 723709493939453952,
                   723723076614164480]

    for startid, endid in zip(big_ben_ids, big_ben_ids[1:]):
        for tweet in limit_handled(Cursor(api.search,
                q=' OR '.join('qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM1234567890'),
                since_id=str(startid),
                max_id=str(endid),
                lang="en").items(2500)):
            print(json.dumps(tweet._json))
Example #7
    def _get_default_args_list(self):
        """ Return the base list of args for sherlock that remains unchanged
        across multiple invocations of this worker's process_msg.
        """
        args = [
            # First arg, does not matter
            "ingest_multiple_dates.py",

            # We do not want to record status in redshift
            "--skip-progress-in-redshift",

            # run parallelism in the worker instead
            "--serial-stepper",
        ]

        cur_base_dir = os.getcwd()
        if self._should_run_local:
            private_file = os.path.join(
                cur_base_dir,
                staticconf.read_string("run_local.private", "private.yaml"))
            args.extend(["-r"])  # For run-local in sherlock
        else:
            private_file = staticconf.read_string("run_service.private")

        if self._config_override_loc is not None:
            args.extend(["--config-override", self._config_override_loc])
        args.extend(["--private", private_file])
        args.extend(["--config", self._config_loc])

        return args
Example #8
def main():
    staticconf.YamlConfiguration(CONFIG_FILE)
    auth = OAuthHandler(
        staticconf.read_string('twitter.consumer_key'),
        staticconf.read_string('twitter.consumer_secret'),
    )
    auth.set_access_token(
        staticconf.read_string('twitter.access_token'),
        staticconf.read_string('twitter.access_token_secret'),
    )
    api = API(auth)

    big_ben_ids = [
        723360691457945600, 723375789467553793, 723390890664824834,
        723405988221489154, 723421087703261186, 723436186644025344,
        723451541563138052, 723466386304057344, 723481486737985536,
        723497089410457600, 723511939465392128, 723528048931430400,
        723541884208091137, 723556981991202816, 723572081485615104,
        723587184276721665, 723602282374414338, 723617381017374720,
        723632480964759553, 723647581516124160, 723662932664524800,
        723678284538589184, 723693384272121857, 723709493939453952,
        723723076614164480
    ]

    for startid, endid in zip(big_ben_ids, big_ben_ids[1:]):
        for tweet in limit_handled(
                Cursor(
                    api.search,
                    q=' OR '.join(
                        'qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM1234567890'
                    ),
                    since_id=str(startid),
                    max_id=str(endid),
                    lang="en").items(2500)):
            print(json.dumps(tweet._json))
Example #9
def main():
    staticconf.YamlConfiguration(CONFIG_FILE)
    auth = OAuthHandler(
        staticconf.read_string('twitter.consumer_key'),
        staticconf.read_string('twitter.consumer_secret'),
    )
    auth.set_access_token(
        staticconf.read_string('twitter.access_token'),
        staticconf.read_string('twitter.access_token_secret'),
    )
    api = API(auth)

    big_ben_ids = [
        727256357607464960, 727271714187522048, 727287317912817664,
        727302414039158785, 727317768509480960, 727332108876705794,
        727347714380419072, 727362055750176768, 727377660742123520,
        727393264060534784, 727407354162122753, 727422705876762624,
        727437555210293248, 727452651210809344, 727468761842876416,
        727483610119413760, 727498961741856768, 727513051440762881,
        727528910452305921, 727543248458149888, 727559107612422144,
        727574712830857221, 727588550133288961, 727603646221914113,
        727619000348348416
    ]

    for startid, endid in zip(big_ben_ids[13:], big_ben_ids[14:]):
        for tweet in limit_handled(
                Cursor(
                    api.search,
                    q=' OR '.join(
                        'qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM1234567890'
                    ),
                    since_id=str(startid),
                    max_id=str(endid),
                    lang="en").items(2500)):
            print(json.dumps(tweet._json))
Example #10
def init():
        # Read the config values in advance and cache them to avoid repeated processing
        global S3_BUCKET
        global S3_LOG_PREFIX
        global PATH_RE
        S3_BUCKET = staticconf.read_string('s3_bucket')
        S3_LOG_PREFIX = staticconf.read_string('s3_log_prefix')
        PATH_RE = re.compile(PATH_RE_PREFIX.format(S3_LOG_PREFIX))
Example #11
 def __init__(self, cluster: str, pool: str) -> None:
     super().__init__(cluster, pool)
     kubernetes.config.load_kube_config(
         staticconf.read_string(f'clusters.{cluster}.kubeconfig_path'))
     self._core_api = kubernetes.client.CoreV1Api()
     self._safe_to_evict_annotation = staticconf.read_string(
         f'clusters.{cluster}.pod_safe_to_evict_annotation',
         default='cluster-autoscaler.kubernetes.io/safe-to-evict',
     )
Example #12
 def __init__(self, cluster: str, pool: Optional[str]) -> None:
     super().__init__(cluster, pool)
     self.kubeconfig_path = staticconf.read_string(
         f'clusters.{cluster}.kubeconfig_path')
     self._safe_to_evict_annotation = staticconf.read_string(
         f'clusters.{cluster}.pod_safe_to_evict_annotation',
         default='cluster-autoscaler.kubernetes.io/safe-to-evict',
     )
     self._nodes_by_ip = {}
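The f-string keys above assume a clusters.<name> block is already present in staticconf. A minimal sketch of providing one, following the DictConfiguration pattern from Example #36; the cluster name and path are placeholders.

import staticconf

# Hypothetical cluster entry for illustration; real deployments load this from
# cluster config files instead of an inline dict.
staticconf.DictConfiguration({
    'clusters': {
        'example-cluster': {
            'kubeconfig_path': '/etc/kubernetes/kubeconfig.yaml',
            # optional; the constructor falls back to the default annotation
            'pod_safe_to_evict_annotation': 'cluster-autoscaler.kubernetes.io/safe-to-evict',
        },
    },
})

staticconf.read_string('clusters.example-cluster.kubeconfig_path')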
Example #13
def et_scanner_main(args):
    """ Create an instance of ETScanner and run it once.
    """
    setup_config(args, 'ETScanner')
    sqs_scanner_queue = SQSWrapper(read_string("sqs.et_scanner_queue_name"))
    sqs_worker_queue = SQSWrapper(read_string("sqs.et_queue_name"))
    scanner = ETScanner(TableConnection.get_connection('ScheduledJobs'),
                        sqs_scanner_queue, sqs_worker_queue, Mailer(args.run_local))
    scanner.run()
Example #14
def _init_session():
    global _session

    if not _session:
        _session = boto3.session.Session(
            staticconf.read_string('accessKeyId', namespace=CREDENTIALS_NAMESPACE),
            staticconf.read_string('secretAccessKey', namespace=CREDENTIALS_NAMESPACE),
            region_name=staticconf.read_string('aws.region')
        )
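The namespaced reads above only resolve if CREDENTIALS_NAMESPACE has already been populated. Example #36 shows the loading side; a condensed sketch of that wiring, with the file path and namespace name as placeholders:

import staticconf

CREDENTIALS_NAMESPACE = 'boto_cfg'  # assumed name; defined elsewhere in the real module

# Load the JSON credentials file into its own namespace so that
# 'accessKeyId' and 'secretAccessKey' resolve when _init_session() runs.
staticconf.JSONConfiguration('/etc/boto_cfg/credentials.json',
                             namespace=CREDENTIALS_NAMESPACE)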
Example #15
def et_scanner_main(args):
    """ Create an instance of ETScanner and run it once.
    """
    setup_config(args, 'ETScanner')
    sqs_scanner_queue = SQSWrapper(read_string("sqs.et_scanner_queue_name"))
    sqs_worker_queue = SQSWrapper(read_string("sqs.et_queue_name"))
    scanner = ETScanner(TableConnection.get_connection('ScheduledJobs'),
                        sqs_scanner_queue, sqs_worker_queue,
                        Mailer(args.run_local))
    scanner.run()
Example #16
 def __init__(self, cluster_name: str) -> None:
     self.client = sqs
     self.cluster = cluster_name
     self.drain_queue_url = staticconf.read_string(
         f'clusters.{cluster_name}.drain_queue_url')
     self.termination_queue_url = staticconf.read_string(
         f'clusters.{cluster_name}.termination_queue_url')
     self.draining_host_ttl_cache: Dict[str, arrow.Arrow] = {}
     self.warning_queue_url = staticconf.read_string(
         f'clusters.{cluster_name}.warning_queue_url',
         default=None,
     )
Example #17
def fetch_creds():
    '''
    Return a dictionary holding temporary credentials from the metadata server.
    This function will block up to the timeout specified in the config file.
    Do not call this method unless config.yaml has been loaded.
    '''
    url = '{url_root}/{name}'.format(
        url_root=staticconf.read_string('instance_profile_creds_url'),
        name=staticconf.read_string('instance_profile_name'))
    in_stream = urllib2.urlopen(
        url,
        timeout=staticconf.read_int(
            'instance_profile_creds_timeout_in_seconds', default=4))
    return simplejson.load(in_stream)
Example #18
def rs_check_schema(rs_mgmt, args):
    yaml_data = load_from_file(args.schema)
    tables = RedShiftLogSchema(safe_load(yaml_data)).tables()

    db = read_string('pipeline.redshift_database')
    log_stream = read_string('pipeline.load_step.s3_to_redshift_stream')
    pipe_strm_lgr = PipelineStreamLogger(
        log_stream,
        True,
        'rs_check_schema'
    )
    psql = RedshiftPostgres(pipe_strm_lgr, args.credentials, run_local=True)
    rs_check_table_def(psql, db, tables, args.redshift_schema)
    rs_check_table_rows(psql, db, tables, args.redshift_schema)
Example #19
def main():
    """Connects to the stream and starts threads to write them to a file."""
    staticconf.YamlConfiguration(CONFIG_FILE)
    listener = QueueListener()
    auth = OAuthHandler(
        staticconf.read_string('twitter.consumer_key'),
        staticconf.read_string('twitter.consumer_secret'),
    )
    auth.set_access_token(
        staticconf.read_string('twitter.access_token'),
        staticconf.read_string('twitter.access_token_secret'),
    )

    writer_thread = threading.Thread(target=worker, args=(listener,))
    writer_thread.start()

    stream = Stream(auth, listener)

    print_status(listener)

    try:
        while True:
            try:
                stream.sample(languages=['en'])  # blocking!
            except KeyboardInterrupt:
                print('KEYBOARD INTERRUPT', file=sys.stderr)
                return
            except (socket.error, httplib.HTTPException):
                global tcpip_delay
                print(
                    'TCP/IP Error: Restarting after {delay} seconds.'.format(
                        delay=tcpip_delay,
                    ),
                    file=sys.stderr,
                )
                time.sleep(min(tcpip_delay, MAX_TCPIP_TIMEOUT))
                tcpip_delay += 0.25
    finally:
        print('Disconnecting stream', file=sys.stderr)
        stream.disconnect()
        print('Waiting for last tweets to finish processing', file=sys.stderr)
        # Send poison pill to writer thread and wait for it to exit
        listener.queue.put(None)
        listener.queue.join()
        print('Waiting for writer thread to finish', file=sys.stderr)
        writer_thread.join()
        print('Exit successful', file=sys.stderr)
Example #20
def main():
    """Connects to the stream and starts threads to write them to a file."""
    staticconf.YamlConfiguration(CONFIG_FILE)
    listener = QueueListener()
    auth = OAuthHandler(
        staticconf.read_string('twitter.consumer_key'),
        staticconf.read_string('twitter.consumer_secret'),
    )
    auth.set_access_token(
        staticconf.read_string('twitter.access_token'),
        staticconf.read_string('twitter.access_token_secret'),
    )

    writer_thread = threading.Thread(target=worker, args=(listener,))
    writer_thread.start()

    stream = Stream(auth, listener)

    print_status(listener)

    try:
        while True:
            try:
                # stream.sample()  # blocking!
                stream.filter(track=["#airpodsmax"])
            except KeyboardInterrupt:
                print('KEYBOARD INTERRUPT', file=sys.stderr)
                return
            except (socket.error):
                global tcpip_delay
                print(
                    'TCP/IP Error: Restarting after {delay} seconds.'.format(
                        delay=tcpip_delay,
                    ),
                    file=sys.stderr,
                )
                time.sleep(min(tcpip_delay, MAX_TCPIP_TIMEOUT))
                tcpip_delay += 0.25
    finally:
        print('Disconnecting stream', file=sys.stderr)
        stream.disconnect()
        print('Waiting for last tweets to finish processing', file=sys.stderr)
        # Send poison pill to writer thread and wait for it to exit
        listener.queue.put(None)
        # listener.queue.join()
        print('Waiting for writer thread to finish', file=sys.stderr)
        writer_thread.join()
        print('Exit successful', file=sys.stderr)
Example #21
 def test_setup_config_with_env_vars(self):
     args = parse_cmd_args(['program', '--config=./config.yaml',
                            '--config-override=config-env-dev.yaml', '-r'])
     with staticconf.testing.MockConfiguration(MOCK_CONFIG):
         setup_config(args, 'test_worker')
         # pick some key and ensure it was loaded from config
         assert read_string('log_stream_name', 'default') != 'default'
Example #22
def process_queues(cluster_name: str) -> None:
    draining_client = DrainingClient(cluster_name)
    mesos_master_url = staticconf.read_string(
        f'clusters.{cluster_name}.mesos_master_fqdn')
    mesos_secret_path = staticconf.read_string(
        f'mesos.mesos_agent_secret_path', default=None)
    operator_client = operator_api(mesos_master_url, mesos_secret_path)
    logger.info('Polling SQS for messages every 5s')
    while True:
        draining_client.clean_processing_hosts_cache()
        draining_client.process_warning_queue()
        draining_client.process_drain_queue(
            mesos_operator_client=operator_client, )
        draining_client.process_termination_queue(
            mesos_operator_client=operator_client, )
        time.sleep(5)
Example #23
    def __init__(self,
                 config_loc,
                 config_override_loc,
                 run_local,
                 emailer,
                 dummy_run=False):
        super(ImdWorker, self).__init__(
            config_loc,
            config_override_loc,
            emailer,
            num_processes=3,
        )
        for key in self.KEYS_TO_LOAD:
            self.__setattr__(key, staticconf.read_string(key))
        if dummy_run:
            log("Dummy worker! Skip the real etl process. Just for test.")
            import mycroft.backend.worker.fake_ingest_multiple_dates as ingest_multiple_dates
        else:
            import sherlock.batch.ingest_multiple_dates as ingest_multiple_dates
        self._should_run_local = run_local
        self.dummy_run = dummy_run
        self.ingest_multiple_dates = ingest_multiple_dates.ingest_multiple_dates_main
        self.queue_name = staticconf.get_string("sqs.et_queue_name")
        self.scanner_queue_name = staticconf.get_string(
            "sqs.et_scanner_queue_name")

        log("ImdWorker initialization")
        log(dict((k, str(v)) for k, v in vars(self).iteritems()))
Example #24
    def run(self):
        while self.running:
            time.sleep(splay_event_time(
                self.run_interval,
                self.get_name() + staticconf.read_string('aws.region'),
            ))

            now = arrow.utcnow()
            with self.metrics_client.get_writer(METADATA) as writer:
                try:
                    with suppress_request_limit_exceeded():
                        self.write_prices(now, writer)
                except socket.timeout:
                    # We don't really care if we miss a few spot price changes so just continue here
                    logger.warn(f'Timed out getting spot prices:\n\n{format_exc()}')
                    continue

            # Report successful run to Sensu.
            sensu_args = dict(
                check_name='check_clusterman_spot_prices_running',
                output='OK: clusterman spot_prices was successful',
                check_every='1m',
                source=self.options.aws_region,
                ttl='10m',
                noop=self.options.disable_sensu,
            )
            sensu_checkin(**sensu_args)
Example #25
def fetch_creds():
    '''
    Return a dictionary holding temporary credentials from the metadata server.
    This function will block up to the timeout specified in the config file.
    Do not call this method unless config.yaml has been loaded.
    '''
    url = '{url_root}/{name}'.format(
        url_root=staticconf.read_string('instance_profile_creds_url'),
        name=staticconf.read_string('instance_profile_name'))
    in_stream = urllib2.urlopen(
        url,
        timeout=staticconf.read_int(
            'instance_profile_creds_timeout_in_seconds', default=4
        )
    )
    return simplejson.load(in_stream)
Example #26
def test__get_key_name():
    log_name = 'x'
    log_version = 'y'
    return_value = _get_key_name(log_name, log_version)
    s3_log_prefix = staticconf.read_string('s3_log_prefix')
    path_re = re.compile(PATH_RE_PREFIX.format(s3_log_prefix))
    assert path_re.match(return_value) is not None
Example #27
    def configure_initial(self) -> None:
        setup_config(self.options)

        # Since we want to collect metrics for all the pools, we need to call setup_config
        # first to load the cluster config path, and then read all the entries in that directory
        self.pools: MutableMapping[str, List[str]] = {}
        for scheduler in {'mesos', 'kubernetes'}:
            self.pools[scheduler] = get_pool_name_list(self.options.cluster,
                                                       scheduler)
        for scheduler, pools in self.pools.items():
            for pool in pools:
                self.config.watchers.append({
                    f'{pool}.{scheduler}':
                    get_pool_config_path(self.options.cluster, pool,
                                         scheduler),
                })
                load_cluster_pool_config(self.options.cluster, pool, scheduler,
                                         None)

        self.region = staticconf.read_string('aws.region')
        self.run_interval = staticconf.read_int(
            'batches.cluster_metrics.run_interval_seconds')
        self.logger = logger

        self.metrics_client = ClustermanMetricsBotoClient(
            region_name=self.region)
Example #28
def _get_logger(run_local, tag):
    try:
        return PipelineStreamLogger(staticconf.read_string("log_stream_name"),
                                    run_local, tag)
    except:
        logger.write_msg("Error creating a pipeline stream logger!")
        return logger  # Return existing logger instance in case of errors
Example #29
def test__get_key_name():
    log_name = 'x'
    log_version = 'y'
    return_value = _get_key_name(log_name, log_version)
    s3_log_prefix = staticconf.read_string('s3_log_prefix')
    path_re = re.compile(PATH_RE_PREFIX.format(s3_log_prefix))
    assert path_re.match(return_value) is not None
Example #30
 def parse_config(self, config_file_path):
     super(S3Feeder, self).parse_config(config_file_path)
     self.s3_event_notifications_queue_name = staticconf.read(
         's3_event_notifications_queue_name')
     self.number_messages = staticconf.read('number_messages', default=1)
     self.aws_region = staticconf.read('aws_region', default=None)
     self.owner_account_id = staticconf.read_string('owner_account_id')
     self.role_arn = staticconf.read('role_arn', default=None)
Example #31
def get_scanner_queue(etl_type):
    """
    Return the scanner SQS queue that jobs post a message to when a job is
    submitted, in order to wake up the scanner.
    :param etl_type: et or load
    :type etl_type: string in ['et', 'load']
    """
    return SQSWrapper(read_string("sqs.{0}_scanner_queue_name".format(etl_type)))
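A short usage note, grounded in the docstring above: the etl_type string selects which scanner queue name is read from config (a hypothetical call shown below).

# Requires 'sqs.et_scanner_queue_name' to be present in the loaded config.
scanner_queue = get_scanner_queue('et')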
Example #32
def get_sqs_connection():
    '''
    :returns: sqs connection
    '''
    return boto.sqs.connect_to_region(
        read_string('aws_config.region'),
        **get_boto_creds()
    )
Example #33
def dynamodb_table_names():
    '''
    :returns: iterable of strings, each a DynamoDB table name used in mycroft
    '''
    table_names = []
    # append other table resources required by mycroft
    table_names.append(staticconf.read_string('aws_config.scheduled_jobs_table'))
    return table_names
Example #34
def get_dynamodb_connection():
    '''
    :returns: dynamodb2 connection
    '''
    return boto.dynamodb2.connect_to_region(
        read_string('aws_config.region'),
        **get_boto_creds()
    )
Example #35
def main():
    staticconf.YamlConfiguration(CONFIG_FILE)
    auth = OAuthHandler(
        staticconf.read_string('twitter.consumer_key'),
        staticconf.read_string('twitter.consumer_secret'),
    )
    auth.set_access_token(
        staticconf.read_string('twitter.access_token'),
        staticconf.read_string('twitter.access_token_secret'),
    )
    api = API(auth)

    big_ben_ids = [727256357607464960, 
                   727271714187522048, 
                   727287317912817664, 
                   727302414039158785, 
                   727317768509480960, 
                   727332108876705794, 
                   727347714380419072, 
                   727362055750176768, 
                   727377660742123520, 
                   727393264060534784, 
                   727407354162122753, 
                   727422705876762624,
                   727437555210293248, 
                   727452651210809344, 
                   727468761842876416, 
                   727483610119413760,
                   727498961741856768, 
                   727513051440762881, 
                   727528910452305921, 
                   727543248458149888,
                   727559107612422144, 
                   727574712830857221, 
                   727588550133288961, 
                   727603646221914113,
                   727619000348348416]

    for startid, endid in zip(big_ben_ids[13:], big_ben_ids[14:]):
        for tweet in limit_handled(Cursor(api.search,
                q=' OR '.join('qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM1234567890'),
                since_id=str(startid),
                max_id=str(endid),
                lang="en").items(2500)):
            print(json.dumps(tweet._json))
Example #36
def setup_config(args: argparse.Namespace) -> None:
    # load_default_config merges the 'module_config' key from the first file
    # and the 'module_env_config' key from the second file to configure packages.
    # This allows us to configure packages differently in different hiera envs by
    # changing 'module_env_config'. We use the same file for both keys.
    _load_module_configs(args.env_config_path)

    signals_branch_or_tag = getattr(args, 'signals_branch_or_tag', None)
    cluster_config_directory = getattr(args, 'cluster_config_directory',
                                       None) or DEFAULT_CLUSTER_DIRECTORY
    staticconf.DictConfiguration(
        {'cluster_config_directory': cluster_config_directory})

    aws_region = getattr(args, 'aws_region', None)
    cluster = getattr(args, 'cluster', None)
    pool = getattr(args, 'pool', None)
    scheduler = getattr(args, 'scheduler', None)
    if aws_region and cluster:
        raise argparse.ArgumentError(
            None, 'Cannot specify both cluster and aws_region')

    # If there is a cluster specified via --cluster, load cluster-specific attributes
    # into staticconf.  These values are not specified using hiera in srv-configs because
    # we might want to be operating on a cluster in one region while running from a
    # different region.
    elif cluster:
        aws_region = staticconf.read_string(f'clusters.{cluster}.aws_region',
                                            default=None)
        if pool:
            load_cluster_pool_config(cluster, pool, scheduler,
                                     signals_branch_or_tag)

    staticconf.DictConfiguration({'aws': {'region': aws_region}})

    boto_creds_file = staticconf.read_string('aws.access_key_file',
                                             default=None)
    if boto_creds_file:
        staticconf.JSONConfiguration(boto_creds_file,
                                     namespace=CREDENTIALS_NAMESPACE)

    if signals_branch_or_tag:
        staticconf.DictConfiguration(
            {'autoscale_signal': {
                'branch_or_tag': signals_branch_or_tag
            }})
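Examples #5 and #41 exercise this function from tests; a minimal non-test sketch of the same call pattern, with the cluster, pool, and paths as placeholders:

import argparse

import staticconf

args = argparse.Namespace(
    env_config_path='/nail/etc/config.yaml',
    cluster='example-cluster',
    pool='example-pool',
    scheduler='mesos',
    signals_branch_or_tag=None,
)
setup_config(args)
print(staticconf.read_string('aws.region', default=None))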
Example #37
def _get_logger(run_local, tag):
    try:
        return PipelineStreamLogger(
            staticconf.read_string("log_stream_name"),
            run_local, tag
        )
    except:
        logger.write_msg("Error creating a pipeline stream logger!")
        return logger  # Return existing logger instance in case of errors
Example #38
 def __init__(self, cluster: str, pool: str) -> None:
     super().__init__(cluster, pool)
     mesos_master_fqdn = staticconf.read_string(f'clusters.{self.cluster}.mesos_master_fqdn')
     self.non_batch_framework_prefixes = self.pool_config.read_list(
         'non_batch_framework_prefixes',
         default=['marathon'],
     )
     self.api_endpoint = f'http://{mesos_master_fqdn}:5050/'
     logger.info(f'Connecting to Mesos masters at {self.api_endpoint}')
Example #39
def rs_cluster_restore(rs_mgmt, args):
    """ restore cluster from snapshot
    Output can be appended to a YAML config file
    """

    if not args.subnet_group_name:
        args.subnet_group_name = read_string('redshift_cluster_subnet_group_name')
    if not args.vpc_security_group:
        args.vpc_security_group = read_string('security_group_id')
    rs_mgmt.restore_from_cluster_snapshot(
        args.cluster_name,
        args.snapshot,
        args.parameter_group,
        args.vpc_security_group,
        args.subnet_group_name,
    )
    cluster_info = rs_mgmt.get_cluster_info(args.cluster_name)
    return cluster_info['Endpoint']['Address'], cluster_info['Endpoint']['Port']
Example #40
def test_load_cluster_pool_config(cluster, pool, pool_other_config,
                                  mock_config_files):
    config.load_cluster_pool_config(cluster, pool, 'mesos', None)

    pool_namespace = POOL_NAMESPACE.format(pool=pool, scheduler='mesos')
    assert staticconf.read_int('other_config',
                               namespace=pool_namespace) == pool_other_config
    assert staticconf.read_string(f'resource_groups',
                                  namespace=pool_namespace) == cluster
Example #41
def test_setup_config_region(mock_load_module_configs, mock_config_files):
    args = argparse.Namespace(
        env_config_path='/nail/etc/config.yaml',
        aws_region='fake-region-A',
    )
    config.setup_config(args)
    assert staticconf.read_string('aws.region') == 'fake-region-A'
    assert mock_load_module_configs.call_args == mock.call(
        '/nail/etc/config.yaml')
Example #42
def ensure_account_id(cluster) -> None:
    current_account_id = sts.get_caller_identity()['Account']
    cluster_account_id = staticconf.read_string(
        f'clusters.{cluster}.aws_account_number')

    if (current_account_id != cluster_account_id):
        raise AccountNumberMistmatchError(
            f'ACCOUNT ID MISMATCH! Current account id: {current_account_id}. Cluster account id: {cluster_account_id}'
        )
Example #43
def setup_private(input_args):
    """
    setup_private sets up the aws credentials required to run on the server
    in the appropriate environment variables

    Args:
    input_args -- input yaml file with aws access_key_id and secret_access_key

    Returns:
    nothing; the credentials are exported via environment variables
    """

    YamlConfiguration(input_args, optional=True)
    os.environ['AWS_ACCESS_KEY_ID'] = read_string('emr_aws_access_key_id')
    os.environ['AWS_SECRET_ACCESS_KEY'] = \
        read_string('emr_aws_secret_access_key')
Example #44
def get_scanner_queue(etl_type):
    """
    Return the scanner SQS queue that jobs post a message to when a job is
    submitted, in order to wake up the scanner.
    :param etl_type: et or load
    :type etl_type: string in ['et', 'load']
    """
    return SQSWrapper(
        read_string("sqs.{0}_scanner_queue_name".format(etl_type)))
Example #45
def dynamodb_table_names():
    '''
    :returns: iterable of strings, each a DynamoDB table name used in mycroft
    '''
    table_names = []
    # append other table resources required by mycroft
    table_names.append(
        staticconf.read_string('aws_config.scheduled_jobs_table'))
    return table_names
Example #46
def setup_private(input_args):
    """
    setup_private sets up the aws credentials required to run on the server
    in the appropriate environment variables

    Args:
    input_args -- input yaml file with aws access_key_id and secret_access_key

    Returns:
    nothing; the credentials are exported via environment variables
    """

    YamlConfiguration(input_args, optional=True)
    os.environ['AWS_ACCESS_KEY_ID'] = read_string('emr_aws_access_key_id')
    os.environ['AWS_SECRET_ACCESS_KEY'] = \
        read_string('emr_aws_secret_access_key')
Example #47
def s3_to_psv_main(args):

    mrjob = read_string('pipeline.et_step.mrjob')
    stream_name = read_string('pipeline.et_step.s3_to_s3_stream')
    DATABASE = read_string('pipeline.redshift_database')

    LOG_STREAM = PipelineStreamLogger(
        stream_name,
        args.run_local,
        mrjob,
        input_date=args.date
    )

    day_to_run = setup_dates_to_check(args.date, args.run_local, LOG_STREAM)

    try:
        if not args.run_local:
            setup_private(args.private)
        # Create a psql instance based on args
        if args.skip_progress_in_redshift:
            status_table = DynamoDbStatusTable(
                LOG_STREAM, run_local=args.run_local
            )
        else:
            status_table = RedshiftStatusTable(
                RedshiftPostgres(
                    LOG_STREAM, args.private, run_local=args.run_local
                )
            )
        load_msg = __load_data_from_s3(
            status_table,
            read_list('pipeline.et_step.s3_prefixes'),
            day_to_run,
            mrjob,
            args.run_local,
            DATABASE,
            LOG_STREAM,
            force_et=args.force_et
        )
        LOG_STREAM.write_msg("complete", extra_msg=load_msg)

    finally:
        clear_env(args.run_local)
Example #48
def pipeline_yaml_schema_file_path():
    """Return the full path of the yaml schema file for the pipeline. Do
    nothing if the path is already an S3 path
    """
    yaml_schema_file_path = read_string('pipeline.yaml_schema_file')
    if is_s3_path(yaml_schema_file_path):
        return yaml_schema_file_path
    return '{directory}/{filename}'.format(
        directory=os.environ['YELPCODE'],
        filename=yaml_schema_file_path,
    )
Example #49
def fetch_creds_from_file():
    '''
    Returns a dictionary holding credentials from a file defined in config.yaml
    '''
    with open(staticconf.read_string('run_local.session_file'), 'r') as creds:
        if os.fstat(creds.fileno()).st_size == 0:
            raise Exception("session file is empty")
        creds_dict = simplejson.load(creds)
        creds_dict['Expiration'] = creds_dict.get('Expiration', MAX_UNIX_TIME)
        for optional_key in ['Token', 'LastUpdated']:
            creds_dict[optional_key] = creds_dict.get(optional_key)
        return creds_dict
Example #50
    def mail_result(self, final_status, msg, additional_info=None):
        link = self.link_temp.format(msg['uuid'])
        content = self.template.format(
            msg['uuid'], final_status, msg['log_name'], msg['log_schema_version'],
            msg['s3_path'], msg['redshift_id'], msg['start_date'], msg['end_date'], link,
            additional_info
        )

        new_msg = MIMEText(content)
        new_msg['Subject'] = self.subject.format(msg['uuid'])
        new_msg['From'] = self.address
        new_msg['To'] = ','.join(msg['contact_emails'])

        smtp_host = staticconf.read_string('smtp_host', 'localhost')
        smtp_port = staticconf.read_string('smtp_port', None)
        smtp_login = staticconf.read_string('smtp_login', None)
        smtp_password = staticconf.read_string('smtp_password', None)
        smtp_security = staticconf.read_string('smtp_security', None)

        if smtp_port is not None:
            smtp_host = "{0}:{1}".format(smtp_host, smtp_port)

        if smtp_security is not None:
            smtp_security = smtp_security.upper()

        if smtp_security == 'SSL':
            s = smtplib.SMTP_SSL(smtp_host)
            s.login(smtp_login, smtp_password)
        elif smtp_security == 'TLS':
            s = smtplib.SMTP(smtp_host)
            s.ehlo()
            s.starttls()
            s.login(smtp_login, smtp_password)
        else:
            s = smtplib.SMTP(smtp_host)

        s.sendmail(self.address, msg['contact_emails'], new_msg.as_string())
        s.quit()
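The SMTP settings above are all optional reads with defaults; a minimal sketch of supplying them via staticconf (the values are placeholders, and DictConfiguration is used here purely for illustration):

import staticconf

# Hypothetical SMTP settings; leaving smtp_security unset makes mail_result
# fall back to a plain smtplib.SMTP connection to smtp_host.
staticconf.DictConfiguration({
    'smtp_host': 'smtp.example.com',
    'smtp_port': '587',
    'smtp_login': 'mailer',
    'smtp_password': 'secret',
    'smtp_security': 'TLS',
})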
Example #51
def search_log_source_by_keyword(request_body):
    disabled_logfinder = staticconf.read_bool('disable_logfinder_service')
    if disabled_logfinder:
        return {'logs': []}

    # send HTTP request
    search_endpoint = staticconf.read_string('log_finder_search_end_point')
    response = requests.post(search_endpoint, request_body)

    # if we get a bad HTTP status, raise an exception
    response.raise_for_status()

    content = response.json()
    return content
Example #52
    def __init__(self, logstrm, psql_auth_file, run_local=False):

        self.run_local = run_local
        self.host = staticconf.read_string('redshift_host')
        self.port = staticconf.read_int('redshift_port')
        private_dict = YamlConfiguration(psql_auth_file)
        self.user = private_dict['redshift_user']
        self.password = private_dict['redshift_password']
        self.log_stream = logstrm
        self._aws_key = ''
        self._aws_secret = ''
        self._aws_token = ''
        self._aws_token_expiry = datetime.utcnow()
        self._whitelist = ['select', 'create', 'insert', 'update']
        self._set_aws_auth()
        psycopg2.extensions.set_wait_callback(wait_select_inter)
Example #53
def get_log_meta_data(bucket_name, log_name):
    if bucket_name is None or log_name is None:
        return None

    if staticconf.read_bool('disable_logfinder_service'):
        return None

    # send HTTP request
    endpoint = staticconf.read_string('log_finder_buckets_end_point') \
        + '/' + bucket_name + '/' + log_name
    response = requests.get(endpoint)

    # if we get a bad HTTP status, raise an exception
    response.raise_for_status()

    return response.json()
Example #54
def copy_table(psql_helper, db_name, ddate, log_tuple, ttl_days, logstream):
    s3_log, rs_table = log_tuple
    namespaced_table_name = get_namespaced_tablename(rs_table)
    table_start = time.time()
    extra_msg = "from s3 log: {0}".format(s3_log)
    logstream.write_msg('starting', extra_msg=extra_msg)

    # about to load new day, remove oldest
    rows_deleted = None
    if ttl_days is not None:
        rows_deleted = \
            delete_old_data(psql_helper, db_name, rs_table, ttl_days - 1)
    if rows_deleted:
        logstream.write_msg('delete_ok',
                            extra_msg="{0} rows".format(rows_deleted))

    # Try to reclaim disk space.  If not needed, it will be fast.
    # Calling here and not in the 'if rows_deleted' code to prevent
    # scenario where rows were deleted but compact failed. Then on retry
    # there will be nothing to delete but since space is not reclaimed
    # there may not be enough for a new load, resulting in failure forever.
    if ttl_days is not None:
        compact_table(psql_helper, db_name, namespaced_table_name)

    delimiter = read_string('redshift_column_delimiter')
    delimiter = delimiter.decode("string_escape")
    if delimiter not in string.printable:
        delimiter = '\\' + oct(ord(delimiter))

    copy_sql = LOAD % (namespaced_table_name, s3_log, delimiter)
    result = psql_helper.run_sql(
        copy_sql,
        db_name, " copying from " + s3_log,
        s3_needed=True,
        time_est_secs=read_int('pipeline.load_step.copy_time_est_secs')
    )
    if result is not False:
        logstream.write_msg('complete', job_start_secs=table_start,
                            extra_msg=extra_msg)
    return result
Example #55
 def get_connection(cls, table_object_name):
     if table_object_name not in cls._connection_dict:
         if cls._region_conn is None:
             cls._region_conn = get_dynamodb_connection()
         table_properties = cls._TABLE_NAME_TO_PROPERTIES[table_object_name]
         avro_schema = get_avro_schema(table_properties['avro_schema'])
         table_name = read_string(table_properties['physical_id_key'])
         table = Table(
             table_name,
             connection=cls._region_conn
         )
         try:
             results = table.describe()
             raw_indexes = results['Table'].get('GlobalSecondaryIndexes', [])
             table.global_indexes = introspect_global_indexes(raw_indexes)
         except Exception:
             log_exception("Table Connection Failed")
         cls._connection_dict[table_object_name] = table_properties['class'](
             table,
             avro_schema
         )
     return cls._connection_dict[table_object_name]
Example #56
    def __init__(self, config_loc, config_override_loc, run_local, emailer, dummy_run=False):
        super(ImdWorker, self).__init__(
            config_loc,
            config_override_loc,
            emailer,
            num_processes=3,
        )
        for key in self.KEYS_TO_LOAD:
            self.__setattr__(key, staticconf.read_string(key))
        if dummy_run:
            log("Dummy worker! Skip the real etl process. Just for test.")
            import mycroft.backend.worker.fake_ingest_multiple_dates as ingest_multiple_dates
        else:
            import sherlock.batch.ingest_multiple_dates as ingest_multiple_dates
        self._should_run_local = run_local
        self.dummy_run = dummy_run
        self.ingest_multiple_dates = ingest_multiple_dates.ingest_multiple_dates_main
        self.queue_name = staticconf.get_string("sqs.et_queue_name")
        self.scanner_queue_name = staticconf.get_string("sqs.et_scanner_queue_name")

        log("ImdWorker initialization")
        log(dict((k, str(v)) for k, v in vars(self).iteritems()))
Example #57
def get_redshift_schema():
    # note we lowercase for backward compatibility
    return read_string('redshift_schema', DEFAULT_NAMESPACE).lower()
Example #58
def s3_to_redshift_main(args):

    db = read_string('pipeline.redshift_database')
    s3_log_prefix = read_string('pipeline.s3_output_prefix').format(
        logname=os.environ.get('LOGNAME', 'unknown')
    )

    # setup logging
    stream_name = read_string('pipeline.load_step.s3_to_redshift_stream')
    LOG_STREAM = PipelineStreamLogger(
        stream_name,
        args.run_local,
        's3_to_redshift',
        job_name='load'
    )

    # handle to redshift db
    loader_psql = RedshiftPostgres(
        LOG_STREAM, args.private, run_local=args.run_local
    )

    if args.skip_progress_in_redshift:
        status_table = DynamoDbStatusTable(
            LOG_STREAM, run_local=args.run_local
        )
    else:
        status_table = RedshiftStatusTable(loader_psql)

    create_tuples = get_table_creates(args.db_file, LOG_STREAM)

    data_candidates = dates_from_rs_status(
        status_table,
        db,
        LOG_STREAM,
        args.retry_errors,
        args.date,
    )
    if data_candidates:
        try:
            update_database_schema(
                loader_psql,
                db,
                data_candidates[0],
                s3_log_prefix,
                args.db_file,
                LOG_STREAM
            )
        except Exception as e:
            status_table.update_status(
                db,
                data_candidates[0],
                get_yaml_table_versions(pipeline_yaml_schema_file_path()),
                "error",
                start_time_secs=time.time(), error_msg=repr(e)
            )
            raise
    elif args.date is not None:
        raise IOError("{0} data is either already loaded \
or has not yet completed ET step".format(args.date))

    logs_to_copy = []
    for input_date in data_candidates:
        LOG_STREAM = PipelineStreamLogger(
            stream_name,
            args.run_local,
            's3_to_redshift',
            job_name='load',
            input_date=input_date
        )
        logs_to_copy = [
            (join(s3_log_prefix, input_date, table), table)
            for (table, _) in create_tuples
        ]
        copy_tables(loader_psql, status_table, db, input_date, logs_to_copy,
                    args.ttl_days, LOG_STREAM)
Example #59
def analyze_tables(psql, db, tables, schemaname=DEFAULT_NAMESPACE):
    num_failures = 0
    for tbl_name in tables:
        tbl_name = get_namespaced_tablename(tbl_name, schemaname)
        try:
            analyze_table(psql, db, tbl_name)
        except:
            num_failures += 1
    if num_failures:
        raise RuntimeError(
            'failed to analyze {0} tables, see log'.format(num_failures)
        )


if __name__ == "__main__":
    args = get_cmd_line_args()
    run_local = args.run_local
    merge_configs(args.config)
    db = read_string('pipeline.redshift_database')
    log_stream = read_string('pipeline.load_step.s3_to_redshift_stream')
    logstream = PipelineStreamLogger(log_stream, run_local, 'redshift_maint')
    psql = RedshiftPostgres(logstream, args.credentials, run_local=run_local)

    yaml = load_from_file(args.schema)
    schema = RedShiftLogSchema(safe_load(yaml))

    if args.compact:
        compact_tables(psql, db, schema.tables(), args.redshift_schema)
    analyze_tables(psql, db, schema.tables(), args.redshift_schema)