def _work(job_queue, result_queue, remote):
    # Worker loop: pull base64-encoded pickled jobs off the queue, fetch any
    # missing remote files, run the job, and push the pickled result back.
    # Assumes module-level `import base64, os, traceback`, a pickle module
    # bound as `_pickle`, and a `fetch()` helper for remote file transfer.
    while True:
        job = None
        try:
            job = _pickle.loads(base64.b64decode(job_queue.get()))
            if remote:
                if hasattr(type(job), 'package_path'):
                    if not os.path.exists(job.package_path):
                        fetch(job.package_path)
                for task_input in job.task.input.values():
                    if isinstance(task_input, list):
                        input_list = task_input
                    else:
                        input_list = [task_input]
                    for path in input_list:
                        if not os.path.exists(path):
                            fetch(path)
            result = job.run()
            result_queue.put((job.id, _pickle.dumps(result)))
        except KeyboardInterrupt:
            pass
        except Exception as e:
            # ship the failure back to the scheduler with its traceback attached
            result_id = job.id if job is not None else None
            traceback.print_exc()
            e.traceback = traceback.format_exc()
            result_queue.put((result_id, _pickle.dumps(e)))

def _work(job_queue, result_queue):
    # Simpler worker loop: no remote file fetching, jobs arrive as raw pickles.
    while True:
        job = None
        try:
            # need something to create this task and map arguments properly
            job = _pickle.loads(job_queue.get())
            result = job.run()
            result_queue.put((job.id, _pickle.dumps(result)))
        except KeyboardInterrupt:
            pass
        except Exception as e:
            result_id = job.id if job is not None else None
            traceback.print_exc()
            e.traceback = traceback.format_exc()
            result_queue.put((result_id, _pickle.dumps(e)))
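
# A minimal driver sketch for the worker loops above. The Job class and the
# single-worker setup are assumptions for illustration; the original source
# does not show how jobs are constructed or queued.
import multiprocessing
import pickle as _pickle

class Job(object):
    def __init__(self, job_id, func, args):
        self.id = job_id
        self.func = func
        self.args = args

    def run(self):
        return self.func(*self.args)

if __name__ == "__main__":
    job_queue = multiprocessing.Queue()
    result_queue = multiprocessing.Queue()
    worker = multiprocessing.Process(target=_work, args=(job_queue, result_queue))
    worker.daemon = True
    worker.start()
    job_queue.put(_pickle.dumps(Job(1, pow, (2, 10))))
    job_id, payload = result_queue.get()   # blocks until the worker replies
    print(_pickle.loads(payload))          # 1024
    worker.terminate()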

def register(self, function, server):
    """
    Register the function with the server.
    Returns a function id used by the local invoker to proxy the call.
    """
    registrar_uri = "PYRO:remote_registrar@{0}:8007".format(server)
    remote_registrar = Pyro4.Proxy(registrar_uri)
    pickled_function = cloudpickle.dumps(function)
    try:
        if self._metadata_manager is None:
            self._metadata_manager = remote_registrar.get_metadata()
        # Check to see if function has been registered already
        pickled_function_hash = hash(pickled_function)
        if pickled_function_hash in self._metadata_manager:
            return self._metadata_manager[pickled_function_hash]
        else:
            # If not registered already, register it
            return remote_registrar.deserialize_and_register(
                function.func_name, pickled_function)
    except Pyro4.errors.CommunicationError as comm_error:
        print "Communication error:", comm_error
        print "Did you run the easyrpc start script?"
        sys.exit(1)
    except Exception as reg_error:
        print "Remote registration error:", reg_error
        sys.exit(1)
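
# Hedged sketch of the server side this client expects: a Pyro4 daemon
# exposing an object with id "remote_registrar" on port 8007. The
# RemoteRegistrar class and its storage scheme are assumptions; only the
# method names and the URI come from the client code above. Assumes
# Python 2, where the pickled payload is a plain str that Pyro4's default
# serializer can carry.
import Pyro4
import cloudpickle

@Pyro4.expose
class RemoteRegistrar(object):
    def __init__(self):
        self._functions = {}   # function_id -> callable
        self._metadata = {}    # hash of pickled payload -> function_id

    def get_metadata(self):
        return self._metadata

    def deserialize_and_register(self, name, pickled_function):
        function = cloudpickle.loads(pickled_function)
        function_id = "{0}-{1}".format(name, len(self._functions))
        self._functions[function_id] = function
        self._metadata[hash(pickled_function)] = function_id
        return function_id

daemon = Pyro4.Daemon(host="0.0.0.0", port=8007)
daemon.register(RemoteRegistrar(), objectId="remote_registrar")
daemon.requestLoop()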

def to_s3(self, obj, bucket_id, key, protocol=0, **kwargs):
    """
    Save object to Amazon S3 as a compressed string representation.

    Parameters:
        obj: the object to be saved
        bucket_id: the name of the bucket where the object is located
        key: the key to assign to the saved object
        protocol: passed to pickle.dumps
        **kwargs: passed to boto's `set_contents_from_string`
    """
    if type(obj) == str:
        obj_string = zlib.compress(obj)
    else:
        try:
            obj_string = zlib.compress(dumps(obj, protocol))
        except Exception:
            raise Exception("obj could not be pickled")
    try:
        bucket = self.conn_s3.get_bucket(bucket_id)
    except Exception:
        bucket = self.conn_s3.create_bucket(bucket_id)
    k = Key(bucket)
    k.key = key
    _ = k.set_contents_from_string(obj_string, **kwargs)
    return key
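
# Companion reader (an assumption; not part of the original class):
# invert to_s3's compression and pickling to get the object back.
import zlib
from pickle import loads

def from_s3(self, bucket_id, key, unpickle=True):
    """Load an object saved by to_s3; pass unpickle=False for raw strings."""
    bucket = self.conn_s3.get_bucket(bucket_id)
    k = bucket.get_key(key)
    obj_string = zlib.decompress(k.get_contents_as_string())
    return loads(obj_string) if unpickle else obj_string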

def savenetwork(self, filename):
    # cloudpickle (from picloud/cloud) is used because the built-in
    # pickle won't do the job for this object
    from cloud.serialization.cloudpickle import dumps
    with open(filename, "wb") as f:
        f.write(dumps(self.res))
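
# Matching loader (an assumption; not shown in the original). Output
# written by cloudpickle is a standard pickle stream, so the built-in
# module can read it back.
def loadnetwork(self, filename):
    import pickle
    with open(filename, "rb") as f:
        self.res = pickle.load(f)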

def pickle(self, job):
    try:
        pickled = _pickle.dumps(job)
        return pickled
    except Exception as e:
        # record the serialization failure and cascade it to dependent jobs
        self.results[job.id] = e
        job.failed = True
        self.completed_jobs[job.id] = job
        self.fail_downstream(job)
        return None

def dispatch(self, job):
    try:
        # hand the pickled job to the worker queue and track it as pending
        self.job_queue.put(_pickle.dumps(job))
        self.pending_jobs[job.id] = job
    except Exception as e:
        self.results[job.id] = e
        job.failed = True
        self.completed_jobs[job.id] = job
        self.fail_downstream(job)
    # either way the job is no longer a dispatch candidate
    del self.candidate_jobs[job.id]
    self.num_cores_in_use += job.num_cores
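
# The pickle/dispatch methods above imply scheduler state along these lines
# (a hedged reconstruction; the attribute names come from the method bodies,
# the initializer itself is not in the original):
class Scheduler(object):
    def __init__(self, job_queue):
        self.job_queue = job_queue    # queue of pickled jobs for the workers
        self.candidate_jobs = {}      # id -> job, ready to dispatch
        self.pending_jobs = {}        # id -> job, dispatched, awaiting a result
        self.completed_jobs = {}      # id -> job, finished or failed
        self.results = {}             # id -> result object or exception
        self.num_cores_in_use = 0     # cores claimed by dispatched jobs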

def map_sync(self, f, *args):
    res = []
    exceptions = []
    for group in zip(*args):
        # simulate network roundtrip
        group = loads(dumps(group))
        tries = 0
        while tries < self.tries:
            tries += 1
            try:
                res.append(f(*group))
            except:
                if tries < self.tries:
                    pass  # retry
                else:
                    exceptions.append(unwrap_exception(wrap_exception()))
                    break
            else:
                break  # success; stop retrying this group
    if exceptions:
        raise CompositeError("Mock Composite error", exceptions)
    else:
        return res
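
# Hypothetical use of the mock view above (the MockView name and its
# constructor are assumptions; only map_sync and self.tries appear in the
# original). Each argument group is pickled and unpickled to mimic what a
# real cluster transport would do to the data.
view = MockView(tries=3)
products = view.map_sync(lambda x, y: x * y, [1, 2, 3], [4, 5, 6])
# products == [4, 10, 18]; a group that fails on all 3 tries is instead
# collected into the CompositeError raised at the end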

def test_lambda_cp(self):
    import cloud.serialization.cloudpickle as cp
    # lambdas can't be pickled by the stdlib, but cloudpickle serializes
    # them by value
    f = lambda filename: open(filename, 'r').close()
    f_dill = cp.loads(cp.dumps(f))
    f_dill("test.txt")

def main(load, train):
    if train:
        # open the tmx file
        tmx_file = open('../memoire_en-US_es-ES.tmx', 'r')
        # Parse the TMX file into python objects
        tmx_tree = etree.parse(tmx_file)
        # create a list of the "seg" elements, where our segments are contained
        tree = [e for e in tmx_tree.iter("seg")]
        # Pair these segments up and put them into a list
        pairs = zip(tree, tree[1:])[::2]
        corpus = []
        count = 0
        # import the source and target language stopwords
        stopwords_en = nltk.corpus.stopwords.words('english')
        stopwords_es = nltk.corpus.stopwords.words('spanish')
        # iterate through the segment pairs
        for e, k in pairs:
            # eliminate any segments that have non-alpha terms and terms
            # which are 1 char long
            if (e.text and len(e.text) > 1) and (k.text and len(k.text) > 1):
                # split the words of the segment into a list; lowercase all
                # the tokens and eliminate any stopwords
                e_token = [w.lower() for w in nltk.word_tokenize(unicode(e.text)) if w.lower() not in stopwords_en]
                k_token = [x.lower() for x in nltk.word_tokenize(unicode(k.text)) if x.lower() not in stopwords_es]
                # add the token list to the corpus
                if ''.join(e_token).isalpha() and ''.join(k_token).isalpha():
                    if len(e_token) > 0 and len(k_token) > 0:
                        corpus.append(AlignedSent(e_token, k_token, tuid=e.getparent().getparent().attrib['tuid']))
        # second pass: realign by part of speech before adding to the corpus
        for e, k in pairs:
            if (e.text and len(e.text) > 1) and (k.text and len(k.text) > 1):
                e_token = [w.lower() for w in nltk.word_tokenize(unicode(e.text)) if w.lower() not in stopwords_en]
                k_token = [x.lower() for x in nltk.word_tokenize(unicode(k.text)) if x.lower() not in stopwords_es]
                if ''.join(e_token).isalpha() and ''.join(k_token).isalpha():
                    if len(e_token) > 0 and len(k_token) > 0:
                        source_out, target_out = pos_realign(" ".join(e_token), " ".join(k_token))
                        corpus.append(AlignedSent(source_out, target_out, tuid=e.getparent().getparent().attrib['tuid']))
        # train the aligned corpus to figure out which pairs of words match
        model = dumps(IBMModel2(corpus, 1))
        with open("models/ibm2.p", "w") as dest:
            dest.write(model)
        result = dumps(corpus)
        with open("models/corpus", "w") as dest:
            dest.write(result)
    else:
        with open("models/ibm2.p") as source:
            result = source.read()
        model = loads(result)
        with open("models/corpus") as source:
            result = source.read()
        corpus = loads(result)
    if csv:
        # iterate through the model
        print "%s,%s,%s" % ("source", "target", "precision")
        for k, v in model.probabilities.items():
            # print the term pair if the precision is >= 0.5 and not the
            # same string
            if max(v.values()) >= 0.5:
                if k.encode('utf-8') != max(v, key=v.get).encode('utf-8'):
                    print "%s,%s,%s" % (k.encode('utf-8'), max(v, key=v.get).encode('utf-8'), max(v.values()))
    # get sent alignments with TUID property
    if debug:
        aligned_corpus = {}
        for sent in corpus:
            aligned = model.align(sent)
            for word in aligned.words:
                v = model.probabilities[word]
                if word not in aligned_corpus:
                    if (max(v.values()) >= 0.4) and max(v, key=v.get):
                        if word.encode('utf-8') != max(v, key=v.get).encode('utf-8'):
                            aligned_corpus[word] = {}
                            aligned_corpus[word]['target'] = max(v, key=v.get).encode('utf-8')
                            aligned_corpus[word]['tuid'] = aligned.tuid
                            aligned_corpus[word]['precision'] = max(v.values())
                            aligned_corpus[word]['source_sent'] = " ".join(aligned.words).encode('utf-8')
                            aligned_corpus[word]['target_sent'] = " ".join(aligned.mots).encode('utf-8')
                            aligned_corpus[word]['alignment'] = str(aligned.alignment)
        print json.dumps(aligned_corpus, indent=4, sort_keys=False)

def create_script(self, func, func_kwargs=None, mask="anaconda", aptget=None, custom=None):
    """
    Create a custom python script to run a function on EC2.

    Parameters
    ----------
    * func: an arbitrary function
    * func_kwargs: a dictionary of keyword arguments to feed to func
    * mask: a key in lib_dicts indicating which Python modules should be
      assumed loaded on EC2
    * aptget: a list of package names to install on EC2 via apt-get (not tested)
    * custom: a custom script to run in the EC2 shell before starting
      python (not tested)

    Results are uploaded to the s3 bucket named by self.collection_bucket.
    """
    # get function dependencies
    setup_specs = self.get_objects(func, mask=mask)
    # start script
    script = "#!/usr/bin/env python\n"
    # always install these modules
    script += "try:\n import cPickle as pickle\n"
    script += "except:\n import pickle\n\n"
    script += "try:\n from cStringIO import StringIO\n"
    script += "except:\n from StringIO import StringIO\n\n"
    script += "import sys, os, time, uuid, inspect, imp, dis, subprocess, zlib\n\n"
    script += "from cloud.serialization.cloudpickle import dumps\n"
    script += "import boto.s3.connection as s3\n"
    script += "from boto.s3.key import Key\n"
    # helper that masks quote characters so code survives a repr() round trip
    script += "def quotes(s, remove=True):\n"
    script += " empty = chr(32)[:0]\n"
    script += " double = [100, 111, 117, 98, 108, 101]\n"
    script += " middle = [95, 95, 95, 113, 95, 95, 95]\n"
    script += " single = [115, 105, 110, 103, 108, 101]\n"
    script += " grave = [103, 114, 97, 118, 101]\n"
    script += " double_r = empty.join([chr(x) for x in double+middle+double])\n"
    script += " single_r = empty.join([chr(x) for x in single+middle+single])\n"
    script += " grave_r = empty.join([chr(x) for x in grave+middle+grave])\n"
    script += " if remove:\n"
    script += "  return s.replace(chr(34), double_r).replace(chr(39), single_r\n"
    script += "   ).replace(chr(96), grave_r)\n"
    script += " else:\n"
    script += "  return s.replace(double_r, chr(34)).replace(single_r, chr(39)\n"
    script += "   ).replace(grave_r, chr(96))\n\n"
    # helper that builds a module object from source shipped as a string
    script += "def manual_module(path, name, from_string=True):\n"
    script += " new_module = imp.new_module(name)\n"
    script += " if not from_string:\n"
    script += "  with open(path, 'r') as f:\n"
    script += "   new_module_code = f.read()\n"
    script += " else:\n"
    script += "  new_module_code = path\n"
    script += " exec new_module_code in new_module.__dict__\n"
    script += " sys.modules[name] = new_module\n\n"
    # helper that uploads the function's output, mirroring to_s3 above
    script += "def to_s3(obj, conn, bucket_id, key, **kwargs):\n"
    script += " if type(obj)==str:\n"
    script += "  obj_string = zlib.compress(obj)\n"
    script += " else:\n"
    script += "  try:\n"
    script += "   obj_string = zlib.compress(dumps(obj))\n"
    script += "  except:\n"
    script += "   raise Exception('obj could not be pickled')\n"
    script += " if type(conn) is s3.S3Connection:\n"
    script += "  try:\n"
    script += "   bucket = conn.get_bucket(bucket_id)\n"
    script += "  except:\n"
    script += "   bucket = conn.create_bucket(bucket_id)\n"
    script += " elif type(conn) is s3.bucket.Bucket:\n"
    script += "  bucket = conn\n"
    script += " k = Key(bucket)\n"
    script += " k.key = key\n"
    script += " _ = k.set_contents_from_string(obj_string, **kwargs)\n"
    script += " return key\n\n"
    # establish an s3 connection
    script += "conn_s3 = s3.S3Connection('%(aki)s', '%(sak)s')\n" % {
        "aki": self.access_key_id,
        "sak": self.secret_access_key,
    }
    # install any non-python programs necessary
    if aptget is not None:
        script += "subprocess.call([%s, %s])\n" % (repr("apt-get"), repr("upgrade"))
        for a in aptget:
            script += "subprocess.call([%s, %s, %s, %s])\n" % (
                repr("apt-get"), repr("install"), repr("-y"), repr(a),
            )
    # install pip then install modules (pip itself has no -y flag)
    if len(setup_specs.installs) > 0:
        if mask is not None:
            if "python-pip" not in mask:
                script += "subprocess.call([%s, %s, %s])\n" % (repr("apt-get"), repr("install"), repr("python-pip"))
        for p in setup_specs.installs:
            script += "subprocess.call([%s, %s, %s])\n" % (repr("pip"), repr("install"), repr(p))
    # do anything else before opening the python interpreter
    if custom is not None:
        for c in custom:
            script += "subprocess.call([%s])\n" % repr(c)
    # import custom modules (not on path)
    if len(setup_specs.full_files) > 0:
        for item in setup_specs.full_files:
            module_name = item["name"]
            with open(item["path"], "r") as f:
                module_code = f.read()
            module_code = self.quotes(module_code)
            script += "manual_module(quotes(%s, remove=False), %s, from_string=True)\n" % (
                repr(module_code), repr(module_name),
            )
            script += "import %s\n" % module_name
            script += "from %s import *\n" % module_name
    # import any other modules necessary for the function to run
    if len(setup_specs.imports) > 0:
        for i in setup_specs.imports:
            script += "%s\n" % i
    # recreate and run the function
    dumped_code = dumps(func)
    dumped_code = self.quotes(dumped_code)
    script += "do_func_code = quotes(%s, remove=False)\n" % repr(dumped_code)
    script += "do_func = pickle.loads(do_func_code)\n"
    script += "f_glob = do_func.func_globals\n"
    script += "exports = [x for x in f_glob if not x.startswith(chr(95)*2)]\n"
    script += "for name in exports:\n"
    script += " globals()[name] = f_glob[name]\n"
    if func_kwargs is None:
        script += "output = do_func()\n"
    else:
        script += "kwargs = quotes(%s, remove=False)\n" % (repr(self.quotes(dumps(func_kwargs))))
        script += "kwargs = pickle.loads(kwargs)\n"
        script += "output = do_func(**kwargs)\n"
    # transfer results to s3
    script += "key_id = str(uuid.uuid1())\n"
    script += "to_s3(output, conn=conn_s3, bucket_id=%s, key=key_id)\n" % (repr(self.collection_bucket.name))
    return script.strip("\n")

def get_objects(self, func, mask="anaconda"):
    """
    Return a class with three attributes:
    * imports: python modules that need to be imported for a function
      to be run on EC2
    * installs: python modules that will first need to be installed
      (via apt-get) in order for a function to be run on EC2
    * full_files: local paths to scripts that will need to be imported
      as custom modules in order for a function to be run on EC2
    """
    imports_list = self.extract_code_dependencies(func)
    inter = [k for k, v in imports_list.items() if self.flag_interactive_objects(v)]
    del v
    # keep re-scanning interactively defined objects until no new
    # dependencies turn up
    n_inter = len(inter)
    new_n_inter = n_inter * 2
    while new_n_inter > n_inter:
        for k in inter:
            new_imports = self.extract_code_dependencies(imports_list[k])
            imports_list.update(new_imports)
        inter = [k for k, v in imports_list.items() if self.flag_interactive_objects(v)]
        del v
        n_inter = new_n_inter
        new_n_inter = len(inter)
    try:
        parent_module = func.__module__
    except:
        parent_module = "__main__"
    not_in_keys = parent_module not in imports_list.keys()
    not_in_main = parent_module != "__main__"
    if not_in_keys & not_in_main:
        imports_list[parent_module] = sys.modules[parent_module]
    line_items = []
    apt_get = []
    load_files = []
    for alias, obj in imports_list.items():
        if inspect.ismodule(obj):
            line_item = "import %s" % obj.__name__
            root_module = obj.__file__.split("/")
        elif obj.__module__ != "__main__":
            line_item = "from %s import %s" % (obj.__module__, obj.__name__)
            root_module = inspect.getsourcefile(obj).split("/")
        else:
            # interactively defined object: ship it by value instead
            pickled_obj = dumps(obj)
            line_item = "%s = pickle.loads(quotes(%s, remove=False))\n" % (alias, repr(self.quotes(pickled_obj)))
            root_module = None
        if obj.__name__ != alias:
            line_item += " as %s" % alias
        ind = None
        if root_module is not None:
            try:
                ind = root_module.index("site-packages")
            except:
                try:
                    _ = root_module.index("python%d.%d" % (sys.version_info[0:2]))
                except:
                    # not stdlib and not site-packages: ship the source file
                    source_file = inspect.getsourcefile(obj)
                    load_files.append({"name": alias, "path": source_file})
        if ind is not None:
            root_module = root_module[ind + 1]
            apt_get.append(root_module)
        line_items.append(line_item)
    if mask is not None:
        apt_get = [item for item in apt_get
                   if not any([item.startswith(x) for x in self.lib_dict[mask]])]

    class script_setup(object):
        imports = list(set(line_items))
        installs = list(set(apt_get))
        full_files = load_files

    return script_setup
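
# Hypothetical call pattern (launcher and my_analysis_func are stand-ins;
# not from the source): inspect what the dependency scan found for a
# function before building the EC2 script.
specs = launcher.get_objects(my_analysis_func, mask="anaconda")
print(specs.imports)     # e.g. ['import numpy']
print(specs.installs)    # package names to install on the instance
print(specs.full_files)  # [{'name': ..., 'path': ...}] local modules to ship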

def dump_featurevector(filename, fv):
    with open(filename, 'wb') as fil:
        whole = pickle.dumps(fv, protocol=1)
        fil.write(whole)
        fil.flush()
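
# Counterpart loader (an assumption; not in the original). The pickle
# protocol is recorded in the stream itself, so it need not be repeated.
import pickle

def load_featurevector(filename):
    with open(filename, 'rb') as fil:
        return pickle.loads(fil.read())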

def dump(self, Object, destinationPath, fileName):
    """Dumps the object at a specified location."""
    with open(destinationPath + fileName, 'wb') as outputFile:
        outputFile.write(dumps(Object))
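
# Counterpart load method (an assumption, mirroring dump above). Note that
# dump concatenates path and file name directly, so callers must include
# the trailing separator; os.path.join would avoid that.
def load(self, sourcePath, fileName):
    """Loads an object previously written by dump."""
    import pickle
    with open(sourcePath + fileName, 'rb') as inputFile:
        return pickle.loads(inputFile.read())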