def __init__(self, *args, **kwargs):
    # load the defaults
    super(Settings, self).update(defaults)
    # override with the settings file
    path = kwargs.get('settings_file') or self['settings_file']
    if path and os.path.exists(path):
        try:
            import yaml
            with open(path) as settings_file:
                self.update(yaml.safe_load(settings_file))
        except Exception:
            pass  # the settings file is optional; ignore it if it can't be read
    # final overrides
    super(Settings, self).update(overrides)
    super(Settings, self).__init__(*args, **kwargs)
    # set up ddfs and disco
    if not self['server'].startswith('disco://'):
        self['server'] = 'disco://' + self['server']
    if 'ddfs' not in self:
        self['ddfs'] = DDFS(self['server'])
    self['server'] = Disco(self['server'])
    # set up the worker: import the module named by 'worker_class'
    # and instantiate the class it points to
    if 'worker' not in self:
        worker_mod, _, worker_class = self['worker_class'].rpartition('.')
        mod = __import__(worker_mod, {}, {}, worker_mod)
        self['worker'] = getattr(mod, worker_class)()
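The worker setup above imports a class from a dotted-path string. A standalone sketch of that pattern, using a stdlib class ('collections.OrderedDict') purely as a stand-in for a real worker class:

worker_mod, _, worker_class = 'collections.OrderedDict'.rpartition('.')
mod = __import__(worker_mod, {}, {}, worker_mod)  # imports the 'collections' module
worker = getattr(mod, worker_class)()             # instantiates OrderedDict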
def get_disco_handle(server):
    from disco.core import Disco
    from disco.ddfs import DDFS
    if server and not server.startswith('disco://'):
        server = 'disco://' + server
    return Disco(server), DDFS(server)
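A minimal usage sketch for the helper above; the master address 'localhost:8989' is an assumption, not something the snippet specifies:

disco, ddfs = get_disco_handle('localhost:8989')  # hypothetical master address
print(disco.master)  # the helper prepends the scheme: disco://localhost:8989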
def submit(master, jobpack):
    import json
    from disco.settings import DiscoSettings
    from disco.core import Disco
    settings = DiscoSettings()
    dmaster = Disco(master)
    print "Submitting job to", master
    status, response = json.loads(dmaster.request('/disco/job/new', jobpack))
    if status != 'ok':
        # errmsg is assumed to be defined elsewhere in the enclosing module
        errmsg('Failed to start job. Server replied: %s' % response)
    print response
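A hedged usage sketch for submit(): the jobpack filename is hypothetical, and the jobpack bytes are assumed to have been built elsewhere:

with open('example.jobpack', 'rb') as f:  # hypothetical prebuilt jobpack
    submit('disco://localhost:8989', f.read())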
""" Predict the closest clusters for the datapoints in input. """ job = master.new_job(name='kcluster_predict', input=input, map_reader=map_reader, map=predict_map, params=Params(centers=centers, **center), nr_reduces=0) return job.wait() if __name__ == '__main__': parser = OptionParser(usage='%prog [options] inputs') parser.add_option('--disco-master', default=getenv('DISCO_MASTER'), help='Disco master') parser.add_option('--iterations', default=10, help='Numbers of iteration') parser.add_option('--clusters', default=10, help='Numbers of clusters') (options, input) = parser.parse_args() master = Disco(options.disco_master) centers = estimate(master, input, mean_point_center, int(options.clusters), int(options.iterations)) res = predict(master, input, mean_point_center, centers) print '\n'.join(res)
def disco(self):
    return Disco(self.disco_master_url)
def disco(self):
    from disco.core import Disco
    return Disco(self.settings['DISCO_MASTER'])
input = inputs or [maybe_list(line.split())
                   for line in fileinput.input(inputs)]
job = reify(jobclass)(program.disco, name)
try:
    params = job.params
except AttributeError:
    params = Params()
params.__dict__.update(**dict(program.options.params))
job.run(input=input, **program.option_parser.jobdict)
print job.name

@Disco.command
def wait(program, jobname):
    """Usage: jobname

    Wait for the named job to complete.
    """
    program.disco.wait(jobname)

if __name__ == '__main__':
    Disco(option_parser=DiscoOptionParser()).main()

    # Workaround for "disco test" in Python 2.5, which doesn't shut down
    # the test_server thread properly.
    sys.exit(0)  # XXX still needed?
def disco(self):
    from disco.core import Disco
    return Disco(settings=self.settings)
import sys
from disco.core import Disco, result_iterator
from disco.settings import DiscoSettings

def map(line, params):
    for word in line.split():
        yield word, 1

def reduce(iter, params):
    from disco.util import kvgroup
    for word, counts in kvgroup(sorted(iter)):
        yield word, sum(counts)

disco = Disco(DiscoSettings()['DISCO_MASTER'])
print "Starting Disco job.."
print "Go to %s to see status of the job." % disco.master
results = disco.new_job(name="wordcount",
                        input=["http://discoproject.org/media/text/chekhov.txt"],
                        map=map,
                        reduce=reduce,
                        save=True).wait()
print "Job done. Results:"
for word, count in result_iterator(results):
    print word, count
def __init__(self, name=None, master=None, worker=None, settings=None):
    from disco.core import Disco
    self.name = name or type(self).__name__
    # accept either an existing Disco handle or a master URL string
    self.disco = master if isinstance(master, Disco) else Disco(master)
    self.worker = worker or self.Worker()
    self.settings = settings or DiscoSettings()
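A sketch of how this constructor can be called; the subclass name and master URL are assumptions, and the parent class is taken to be the Job-like class defining the __init__ above:

class WordCountJob(Job):  # hypothetical subclass
    pass

job_a = WordCountJob(master='disco://localhost')         # master given as a URL string
job_b = WordCountJob(master=Disco('disco://localhost'))  # or as an existing Disco handle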
def fit_predict(training_data, fitting_data, tau=1, samples_per_job=0,
                save_results=True, show=False):
    """
    training_data - training samples
    fitting_data - dataset to be fitted to training data.
    tau - controls how quickly the weight of a training sample falls off
        with the distance of its x(i) from the query point x.
    samples_per_job - number of samples processed in a single mapreduce
        job. If 0, the algorithm calculates the number of samples per job.
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.core import Disco

    try:
        tau = float(tau)
        if tau <= 0:
            raise Exception("Parameter tau should be greater than 0.")
    except ValueError:
        raise Exception("Parameter tau should be numerical.")

    if fitting_data.params["id_index"] == -1:
        raise Exception("Predict data should have id_index set.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=fitting_data.params["input_chain"],
                           init=simple_init,
                           process=map_predict))]
    job.params = fitting_data.params
    job.run(name="lwlr_read_data", input=fitting_data.params["data_tag"])

    samples = {}
    results = []
    tau = float(2 * tau**2)  # calculate tau once
    counter = 0
    for test_id, x in result_iterator(job.wait(show=show)):
        if samples_per_job == 0:
            # calculate the number of samples per job
            if len(x) <= 100:  # if there are at most 100 attributes
                samples_per_job = 100  # at most 100 samples per job
            else:  # there are more than 100 attributes
                samples_per_job = len(x) * -25 / 900. + 53  # linear function
        samples[test_id] = x
        if counter == samples_per_job:
            results.append(
                _fit_predict(training_data, samples, tau, save_results, show))
            counter = 0
            samples = {}
        counter += 1

    if len(samples) > 0:  # if there are samples left in the dictionary
        results.append(
            _fit_predict(training_data, samples, tau, save_results, show))

    # merge the results of every iteration into a single tag
    ddfs = Disco().ddfs
    ddfs.tag(job.name, [[list(ddfs.blobs(tag))[0][0]] for tag in results])

    return ["tag://" + job.name]
def disco(self):
    return Disco(settings=self.settings)
from discodex import settings
from discodex.mapreduce import (Indexer, DiscoDBIterator)
from discodex.objects import (DataSet, IChunks, Indices, Index, Results, Dict)

from disco.core import Disco
from disco.ddfs import DDFS
from disco.error import DiscoError
from disco.util import flatten, parse_dir

discodex_settings = settings.DiscodexSettings()
disco_master_url = discodex_settings['DISCODEX_DISCO_MASTER']
disco_prefix = discodex_settings['DISCODEX_DISCO_PREFIX']
index_prefix = discodex_settings['DISCODEX_INDEX_PREFIX']
purge_file = discodex_settings['DISCODEX_PURGE_FILE']
disco_master = Disco(disco_master_url)
ddfs = DDFS(disco_master_url)

NOT_FOUND, OK, ACTIVE, DEAD = 'unknown job', 'ready', 'active', 'dead'

class IndexCollection(Collection):
    allowed_methods = ('GET', 'POST')

    def delegate(self, request, *args, **kwargs):
        name = str(kwargs.pop('name'))
        return IndexResource(name)(request, *args, **kwargs)

    @property
    def names(self):
        return ddfs.list(index_prefix)
parser.add_option('--load-method', default=1,
                  help='Load method (default=1): 1. ODOT; '
                       '2. Online ODAT; 3. Offline dim')
parser.add_option('--post-fix', default=1,
                  help='Perform post-fixing for ODAT? (default=1): 1. Yes; 2. No')
parser.add_option('--go-live', default=1,
                  help='Load offline dim data to the DW DBMS? (default=1): 1. Yes; 2. No')
parser.add_option('--profile', default=False,
                  help='Profile (default=False)')
parser.add_option('--config', default='conf/config.py',
                  help='The path to config.py (default=conf/config.py)')

(options, input_paths) = parser.parse_args()
master = Disco("disco://" + options.disco_master)

load_method = odotetlmr
seq_process = None
post_fixing = -1
load_step = int(options.load_step)

if options.load_method == '2':
    load_method = odatetlmr
    if load_step == 1:
        post_fixing = int(options.post_fix)
        seq_process = multiprocessing.Process(target=seq_server)
        seq_process.start()
elif options.load_method == '3':
    load_method = offdimetlmr

input_file_urls = []
def data(self):
    return Disco(self.master).jobpack(self.jobname)