Example #1
    def __init__(self, *args, **kwargs):
        # load the defaults
        super(Settings, self).update(defaults)

        # override with the settings file
        path = kwargs.get('settings_file') or self['settings_file']
        if path and os.path.exists(path):
            try:
                import yaml
                with open(path) as f:
                    self.update(yaml.safe_load(f))
            except Exception:
                pass  # the settings file is optional; ignore load failures

        # final overrides
        super(Settings, self).update(overrides)
        super(Settings, self).__init__(*args, **kwargs)

        # set up ddfs and disco
        if not self['server'].startswith('disco://'):
            self['server'] = 'disco://' + self['server']

        if 'ddfs' not in self:
            self['ddfs'] = DDFS(self['server'])
        self['server'] = Disco(self['server'])

        # set up worker
        if 'worker' not in self:
            worker_mod, _, worker_class = self['worker_class'].rpartition('.')
            mod = __import__(worker_mod, {}, {}, worker_mod)
            self['worker'] = getattr(mod, worker_class)()
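The constructor above layers configuration in increasing precedence: class defaults, then the YAML settings file, then module-level overrides, then the constructor arguments. A minimal sketch of that precedence chain, assuming (as the calls above suggest) that Settings subclasses dict and that defaults and overrides are module-level dicts:

# Sketch only: `defaults` and `overrides` are assumed module-level dicts.
defaults = {'server': 'localhost:8989', 'settings_file': None}
overrides = {}

class Settings(dict):
    def __init__(self, *args, **kwargs):
        self.update(defaults)                  # 1. lowest precedence
        # 2. ...the settings file would be merged here...
        self.update(overrides)                 # 3. explicit overrides
        dict.__init__(self, *args, **kwargs)   # 4. highest precedence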
Example #2
def get_disco_handle(server):
    from disco.core import Disco
    from disco.ddfs import DDFS

    if server and not server.startswith('disco://'):
        server = 'disco://' + server

    return Disco(server), DDFS(server)
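A possible call of the helper above; the master address is an assumption:

# Hypothetical usage; substitute your own Disco master address.
disco, ddfs = get_disco_handle('localhost:8989')
print disco.master  # should carry the 'disco://' prefix added above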
Example #3
def submit(master, jobpack):
    import json
    from disco.settings import DiscoSettings
    from disco.core import Disco
    settings = DiscoSettings()
    dmaster = Disco(master)
    print "Submitting job to", master
    status, response = json.loads(dmaster.request('/disco/job/new', jobpack))
    if status != 'ok':
        errmsg('Failed to start job. Server replied: %s' % response)
    print response
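submit posts a pre-built jobpack to the master's /disco/job/new endpoint and decodes the (status, response) pair the server returns as JSON. A hedged sketch of calling it; the jobpack file name is hypothetical:

# Hypothetical usage: submit a jobpack that was serialized to disk earlier.
with open('wordcount.jobpack', 'rb') as f:
    submit('disco://localhost:8989', f.read())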
Example #4
    """
    Predict the closest clusters for the datapoints in input.
    """
    job = master.new_job(name='kcluster_predict',
                         input=input,
                         map_reader=map_reader,
                         map=predict_map,
                         params=Params(centers=centers, **center),
                         nr_reduces=0)

    return job.wait()


if __name__ == '__main__':
    parser = OptionParser(usage='%prog [options] inputs')
    parser.add_option('--disco-master',
                      default=getenv('DISCO_MASTER'),
                      help='Disco master')
    parser.add_option('--iterations', default=10, help='Number of iterations')
    parser.add_option('--clusters', default=10, help='Number of clusters')

    (options, input) = parser.parse_args()
    master = Disco(options.disco_master)

    centers = estimate(master, input, mean_point_center, int(options.clusters),
                       int(options.iterations))

    res = predict(master, input, mean_point_center, centers)

    print '\n'.join(res)
Example #5
 def disco(self):
     return Disco(self.disco_master_url)
Example #6
 def disco(self):
     from disco.core import Disco
     return Disco(self.settings['DISCO_MASTER'])
Example #7
    input = inputs or [
        maybe_list(line.split()) for line in fileinput.input(inputs)
    ]
    job = reify(jobclass)(program.disco, name)

    try:
        params = job.params
    except AttributeError:
        params = Params()
    params.__dict__.update(**dict(program.options.params))

    job.run(input=input, **program.option_parser.jobdict)
    print job.name


@Disco.command
def wait(program, jobname):
    """Usage: jobname

    Wait for the named job to complete.
    """
    program.disco.wait(jobname)


if __name__ == '__main__':
    Disco(option_parser=DiscoOptionParser()).main()

    # Workaround for "disco test" in Python2.5 which doesn't shutdown the
    # test_server thread properly.
    sys.exit(0)  # XXX still needed?
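Commands in this CLI are plain functions registered with the @Disco.command decorator and receive the program object plus their positional arguments. A hypothetical extra command in the same style, reusing the wait call shown above:

@Disco.command
def urls(program, jobname):
    """Usage: jobname

    Wait for the named job and print its result urls.
    (Hypothetical example command, modeled on wait above.)
    """
    for url in program.disco.wait(jobname):
        print url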
Example #8
File: cli.py Project: tpeng/disco
 def disco(self):
     from disco.core import Disco
     return Disco(settings=self.settings)
Example #9
import sys
from disco.core import Disco, result_iterator
from disco.settings import DiscoSettings


def map(line, params):
    for word in line.split():
        yield word, 1


def reduce(iter, params):
    from disco.util import kvgroup
    for word, counts in kvgroup(sorted(iter)):
        yield word, sum(counts)


disco = Disco(DiscoSettings()['DISCO_MASTER'])
print "Starting Disco job.."
print "Go to %s to see status of the job." % disco.master
results = disco.new_job(
    name="wordcount",
    input=["http://discoproject.org/media/text/chekhov.txt"],
    map=map,
    reduce=reduce,
    save=True).wait()
print "Job done. Results:"
for word, count in result_iterator(results):
    print word, count
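The reduce above leans on disco.util.kvgroup, which walks a sorted stream of (key, value) pairs and yields each key once together with an iterator over its values, roughly like this sketch built on itertools.groupby:

from itertools import groupby
from operator import itemgetter

def kvgroup_sketch(sorted_kvs):
    # Illustrative approximation of disco.util.kvgroup.
    for key, group in groupby(sorted_kvs, key=itemgetter(0)):
        yield key, (v for k, v in group)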
Example #10
 def __init__(self, name=None, master=None, worker=None, settings=None):
     from disco.core import Disco
     from disco.settings import DiscoSettings
     self.name = name or type(self).__name__
     self.disco = master if isinstance(master, Disco) else Disco(master)
     self.worker = worker or self.Worker()
     self.settings = settings or DiscoSettings()
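Since the constructor accepts either a Disco instance or a master address, both calls below would produce an equivalent self.disco (the enclosing class is assumed to be Job, and the address is an assumption):

# Hypothetical instantiations of the class above.
job_a = Job(master='disco://localhost')          # address gets wrapped
job_b = Job(master=Disco('disco://localhost'))   # instance reused as-is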
Example #11
def fit_predict(training_data,
                fitting_data,
                tau=1,
                samples_per_job=0,
                save_results=True,
                show=False):
    """
    training_data - training samples.
    fitting_data - dataset to be fitted to the training data.
    tau - controls how quickly the weight of a training sample falls off with
          the distance of its x(i) from the query point x.
    samples_per_job - number of samples processed in a single map/reduce job.
                      If 0, the algorithm calculates it automatically.
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.core import Disco

    try:
        tau = float(tau)
        if tau <= 0:
            raise Exception("Parameter tau should be > 0.")
    except ValueError:
        raise Exception("Parameter tau should be numerical.")

    if fitting_data.params["id_index"] == -1:
        raise Exception("Predict data should have id_index set.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=fitting_data.params["input_chain"],
                           init=simple_init,
                           process=map_predict))]
    job.params = fitting_data.params
    job.run(name="lwlr_read_data", input=fitting_data.params["data_tag"])

    samples = {}
    results = []
    tau = float(2 * tau**2)  # calculate tau once
    counter = 0

    for test_id, x in result_iterator(job.wait(show=show)):
        if samples_per_job == 0:
            # calculate the number of samples per job
            if len(x) <= 100:  # at most 100 attributes
                samples_per_job = 100  # at most 100 samples per job
            else:
                # more than 100 attributes: shrink the batch size linearly
                samples_per_job = len(x) * -25 / 900. + 53

        samples[test_id] = x
        if counter == samples_per_job:
            results.append(
                _fit_predict(training_data, samples, tau, save_results, show))
            counter = 0
            samples = {}
        counter += 1

    if len(samples) > 0:  # if there are samples left in the dictionary
        results.append(
            _fit_predict(training_data, samples, tau, save_results, show))

    # merge results of every iteration into a single tag
    ddfs = Disco().ddfs
    ddfs.tag(job.name, [[list(ddfs.blobs(tag))[0][0]] for tag in results])

    return ["tag://" + job.name]
Example #12
 def disco(self):
     return Disco(settings=self.settings)
Example #13
from discodex import settings
from discodex.mapreduce import (Indexer, DiscoDBIterator)
from discodex.objects import (DataSet, IChunks, Indices, Index, Results, Dict)

from disco.core import Disco
from disco.ddfs import DDFS
from disco.error import DiscoError
from disco.util import flatten, parse_dir

discodex_settings = settings.DiscodexSettings()
disco_master_url = discodex_settings['DISCODEX_DISCO_MASTER']
disco_prefix = discodex_settings['DISCODEX_DISCO_PREFIX']
index_prefix = discodex_settings['DISCODEX_INDEX_PREFIX']
purge_file = discodex_settings['DISCODEX_PURGE_FILE']
disco_master = Disco(disco_master_url)
ddfs = DDFS(disco_master_url)

NOT_FOUND, OK, ACTIVE, DEAD = 'unknown job', 'ready', 'active', 'dead'


class IndexCollection(Collection):
    allowed_methods = ('GET', 'POST')

    def delegate(self, request, *args, **kwargs):
        name = str(kwargs.pop('name'))
        return IndexResource(name)(request, *args, **kwargs)

    @property
    def names(self):
        return ddfs.list(index_prefix)
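The names property simply lists DDFS tags under the configured index prefix. A standalone equivalent; the master url and prefix value here are assumptions:

from disco.ddfs import DDFS
# Hypothetical standalone call; url and prefix are assumptions.
ddfs = DDFS('disco://localhost')
print ddfs.list('discodex:index:')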
Example #14
	                  2. Online ODAT; 3. Offline dim')
	parser.add_option('--post-fix',
	                  default=1,
	                  help='Do post-fixing for ODAT? (default=1): 1. Yes; 2. No')
	parser.add_option('--go-live',
	                  default=1,
	                  help='Load offline dim data to the DW DBMS? (default=1): 1. Yes; 2. No')
	parser.add_option('--profile',
	                  default=False,
	                  help='Profile (default=False)')
	parser.add_option('--config',
	                  default='conf/config.py',
	                  help='The path to config.py (default=conf/config.py)')

	(options, input_paths) = parser.parse_args()
	master = Disco("disco://" + options.disco_master)

	load_method = odotetlmr
	seq_process = None
	post_fixing = -1
	load_step = int(options.load_step)
	if options.load_method == '2':
		load_method = odatetlmr
		if load_step == 1:
			post_fixing = int(options.post_fix)
			seq_process = multiprocessing.Process(target=seq_server)
			seq_process.start()
	elif options.load_method == '3':
		load_method = offdimetlmr

	input_file_urls = []
Example #15
 def data():
     return Disco(self.master).jobpack(self.jobname)