def get_memory_usage(self):
    """
    Report the sizes (in bytes) of the four models held by the classifier.

    The measurement is taken while holding ``self.lock``. The result is a
    4-tuple of ``asizeof.asizeof`` values in the order
    ``(hc, htc, tc, thc)``.
    """
    with self.lock:
        hc_size = asizeof.asizeof(self.hc)
        htc_size = asizeof.asizeof(self.htc)
        tc_size = asizeof.asizeof(self.tc)
        thc_size = asizeof.asizeof(self.thc)
        return hc_size, htc_size, tc_size, thc_size
def _get_lines(self, file1, file2, lines, file1_dest, file2_dest):
    """
    Copy selected lines from two parallel files to two destination files.

    `lines` is a sorted iterable of zero-based line indices. Both input
    files are read in lockstep; whenever the current index is the next
    requested one, the stripped line of each file is buffered. The buffers
    are handed to ``self._dump_bufs_to`` whenever their combined
    ``asizeof`` size exceeds ``self.mem_limit`` and once more at the end.

    Both input files are closed when reading finishes (the previous
    version leaked the handles). An empty `lines` no longer raises
    ``IndexError``; the (empty) buffers are simply dumped.
    """
    pending = deque(lines)
    buf1, buf2 = [], []

    if pending:
        target_line = pending.popleft()
        with open(file1, 'r') as f1, open(file2, 'r') as f2:
            for line_counter, (f1_line, f2_line) in enumerate(zip(f1, f2)):
                if line_counter != target_line:
                    continue
                buf1.append(f1_line.strip())
                buf2.append(f2_line.strip())
                if asizeof.asizeof(buf1) + asizeof.asizeof(buf2) > \
                        self.mem_limit:
                    self._dump_bufs_to(
                        [file1_dest, file2_dest], [buf1, buf2])
                # Stop reading as soon as every requested line was seen.
                if not pending:
                    break
                target_line = pending.popleft()

    self._dump_bufs_to([file1_dest, file2_dest], [buf1, buf2])
def cleanse(self, src_lang_file, tar_lang_file):
    """
    Lowercase two parallel text files and drop lines of unsuitable length.

    Both files are read in lockstep so that line-to-line correspondence is
    kept: a line pair is written only when the word count of *both* lines
    is strictly between ``self.min_len`` and ``self.max_len``. Output goes
    to ``<destdir><name>.cleansed`` for each input. Nothing is done when
    both destination files already exist.

    The input files are now opened with a context manager so their handles
    are closed once the pass is finished.
    """
    self._validate_file(src_lang_file)
    self._validate_file(tar_lang_file)

    src_dest_file = self.destdir + \
        utilities.strip_filename_from_path(src_lang_file) + ".cleansed"
    tar_dest_file = self.destdir + \
        utilities.strip_filename_from_path(tar_lang_file) + ".cleansed"

    if utilities.files_exist([src_dest_file, tar_dest_file]):
        return
    utilities.wipe_files([src_dest_file, tar_dest_file])

    self._print("""Cleaning data. Ensuring uniformity of data...""")

    src_buf, tar_buf = [], []
    with open(src_lang_file) as src_in, open(tar_lang_file) as tar_in:
        for src_line, tar_line in zip(src_in, tar_in):
            src_words = src_line.lower().split()
            tar_words = tar_line.lower().split()

            if self.min_len < len(src_words) < self.max_len and \
                    self.min_len < len(tar_words) < self.max_len:
                src_buf.append(' '.join(src_words))
                tar_buf.append(' '.join(tar_words))

            # Flush to disk whenever the buffers outgrow the memory budget.
            if asizeof.asizeof(src_buf) + asizeof.asizeof(tar_buf) > \
                    self.mem_limit:
                self._dump_bufs_to(
                    [src_dest_file, tar_dest_file], [src_buf, tar_buf])

    self._dump_bufs_to([src_dest_file, tar_dest_file], [src_buf, tar_buf])
    self._print("Done\n")
def test_copy_features_does_not_copy_entityset(es):
    """Copying features must share, not duplicate, the entityset."""
    # NOTE: the local names below are part of what `asizeof(locals())`
    # measures, so no extra locals are introduced before the measurements.
    agg = Sum(es['log']['value'], es['sessions'])
    agg_where = Sum(es['log']['value'], es['sessions'],
                    where=IdentityFeature(es['log']['value']) == 2)
    agg_use_previous = Sum(es['log']['value'], es['sessions'],
                           use_previous='4 days')
    agg_use_previous_where = Sum(es['log']['value'], es['sessions'],
                                 where=IdentityFeature(es['log']['value']) == 2,
                                 use_previous='4 days')
    features = [agg, agg_where, agg_use_previous, agg_use_previous_where]

    in_memory_size = asizeof(locals())
    copied = [f.copy() for f in features]
    new_in_memory_size = asizeof(locals())
    # Copies that duplicated the entityset would at least double the size.
    assert new_in_memory_size < 2 * in_memory_size

    for original, duplicate in zip(features, copied):
        assert original.entityset
        assert duplicate.entityset
        assert id(original.entityset) == id(duplicate.entityset)
        if original.where:
            assert duplicate.where
            assert id(original.where.entityset) == id(duplicate.where.entityset)
        base_pairs = zip(original.base_features, duplicate.base_features)
        for base, base_duplicate in base_pairs:
            assert id(base.entityset) == id(base_duplicate.entityset)
            if base.where:
                assert base_duplicate.where
                assert id(base.where.entityset) == id(base_duplicate.where.entityset)
def test_slots_being_used():
    """
    The slotted class really uses __slots__ and otherwise behaves like the
    regular class.
    """
    non_slot_instance = C1(x=1, y="test")
    slot_instance = C1Slots(x=1, y="test")

    slot_attrs = dir(slot_instance)
    assert "__dict__" not in slot_attrs
    assert "__slots__" in slot_attrs

    non_slot_attrs = dir(non_slot_instance)
    assert "__dict__" in non_slot_attrs
    assert "__slots__" not in non_slot_attrs

    assert {"x", "y"} == set(slot_instance.__slots__)

    if has_pympler:
        assert asizeof(slot_instance) < asizeof(non_slot_instance)

    # Only the instance without slots accepts new attributes.
    non_slot_instance.t = "test"
    with pytest.raises(AttributeError):
        slot_instance.t = "test"

    assert 1 == non_slot_instance.method()
    assert 1 == slot_instance.method()

    assert attr.fields(C1Slots) == attr.fields(C1)
    assert attr.asdict(slot_instance) == attr.asdict(non_slot_instance)
def process_response(self, request, response):
    """
    Print a memory diff for every request that is not a static/media asset.

    For requests whose path contains neither 'static' nor 'media', the
    objects currently known to muppy are stored in ``self.end_objects``,
    diffed against ``self.start_objects`` and printed together with the
    asizeof sizes of the response and of both object lists. The response is
    returned unchanged.

    Every print uses a single parenthesised argument, which prints the same
    text under the Python 2 print statement this code was written for.
    """
    path = request.META['PATH_INFO']
    if 'static' not in path and 'media' not in path:
        print(path)
        self.end_objects = muppy.get_objects()
        diff = summary.get_diff(summary.summarize(self.start_objects),
                                summary.summarize(self.end_objects))
        summary.print_(diff)
        # A refbrowser.ConsoleBrowser tree dump of the response used to be
        # printed here; it is disabled.
        print('~~~~~~~~~')
        response_size = asizeof(response)
        print('Total size of response object in kB: %s' %
              str(response_size / 1024.0))
        print('~~~~~~~~~')
        end_size = asizeof(self.end_objects)
        print('Total size of end_objects in MB: %s' %
              str(end_size / 1048576.0))
        start_size = asizeof(self.start_objects)
        print('Total size of start_objects in MB: %s' %
              str(start_size / 1048576.0))
        print('~~~~~~~~~')
    return response
def push(self, msg):
    """Pickle `msg`, print its size before/after, and emit the pickle."""
    from pympler.asizeof import asizeof

    serialized_msg = pickle.dumps(msg)
    size_report = 'unpickled: {}, pickled: {}'.format(
        asizeof(msg), asizeof(serialized_msg))
    print(size_report)
    self.output(serialized_msg)
def test_adict(self):
    '''Test asizeof.adict(): registering PseudoDict keeps the size stable.
    '''
    pdict = PseudoDict()
    size_before = asizeof.asizeof(pdict)
    asizeof.adict(PseudoDict)
    size_after = asizeof.asizeof(pdict)
    # TODO: come up with useful assertions
    self.assertEqual(size_before, size_after)
def test_methods(self):
    '''Test sizing methods and functions: a bound method, the same method
    looked up on the class, and a plain local function, all with code=True.
    '''
    def foo():
        pass

    # The sizes are only computed; nothing is asserted about them here.
    bound_method_size = asizeof.asizeof(self.test_methods, code=True)
    class_method_size = asizeof.asizeof(TypesTest.test_methods, code=True)
    function_size = asizeof.asizeof(foo, code=True)
def test_globals(self):
    '''Test globals examples'''
    # NOTE: no helper locals are introduced here on purpose, because
    # locals() itself is one of the objects being sized below.

    # Size the module globals as a single object.
    self._printf(
        '%sasizeof(%s, limit=%s, code=%s) ... %s',
        os.linesep, 'globals()', 'MAX', False, '-glob[als]'
    )
    asizeof.asizeof(globals(), limit=self.MAX, code=False, stats=1)
    self._print_functions(globals(), 'globals()', opt='-glob[als]')

    # Size globals and locals individually, then with full detail.
    self._printf(
        '%sasizesof(%s, limit=%s, code=%s) ... %s',
        os.linesep, 'globals(), locals()', 'MAX', False, '-glob[als]'
    )
    asizeof.asizesof(
        globals(), locals(), limit=self.MAX, code=False, stats=1
    )
    asizeof.asized(
        globals(), align=0, detail=self.MAX, limit=self.MAX, code=False,
        stats=1
    )
def push(self, msg):
    """zlib-compress `msg`, print its size before/after, and emit it."""
    from pympler.asizeof import asizeof

    compressed_msg = zlib.compress(msg)
    size_report = 'uncompressed: {}, compressed: {}'.format(
        asizeof(msg), asizeof(compressed_msg))
    print(size_report)
    self.output(compressed_msg)
def run(dicp="~/dev/kaggle/fb5/pdic.map", datap="~/dev/kaggle/fb5/train.tab", lr=1., numbats=100, epochs=10):
    """
    Load the dictionary and the first 10000 training rows, then train a
    SpatialEmb model with adagrad and cross-entropy on a validation split.

    Prints the dictionary size and the asizeof size / gold dtype of the
    loaded data. The prints pass a single parenthesised value so they emit
    the same text as the original Python 2 print statements.
    """
    dic, revdic = loaddict(expanduser(dicp))
    print(len(dic))

    traindata, golddata = loaddata(expanduser(datap), top=10000)
    print("%s %s" % (asizeof(traindata), golddata.dtype))

    m = SpatialEmb(dim=len(dic))
    trainer = m.train([traindata], golddata).adagrad(lr=lr).cross_entropy()
    validated = trainer.split_validate(splits=100, random=True)
    validated.cross_entropy().accuracy().train(numbats, epochs)
def getSizeOfMgrs(self):
    """ Return the asizeof sizes of the application-global managers. """
    appGlobal = config['pylons.app_globals']
    result = {
        'threadmgr': asizeof(appGlobal.threadMgr),
        'packagemgr': asizeof(appGlobal.packageMgr),
        # NOTE(review): 'montior' looks like a typo for 'monitor', but it
        # is the key callers receive today, so it is kept as is.
        'montior': asizeof(appGlobal.agentMonitor),
        'all': asizeof(appGlobal),
    }
    return doneResult(request, response, result = result, controller = self)
def add_results_data(self, results):
    """
    Add `results` to the current task, optionally enforcing a size cap.

    Without SIZE_CONTROL the results are always added. With SIZE_CONTROL,
    they are added only while the asizeof size of the stored results plus
    the new ones stays below 15000000; the first time that is exceeded
    ``self.MEM_LIMIT`` is set and later calls add nothing.
    """
    if not SIZE_CONTROL:
        self._add_results(results)
        return

    if self.MEM_LIMIT:
        # The cap was already hit earlier: drop the new results.
        return

    mem_size = asizeof(self.current_task.results)
    add_size = asizeof(results)
    if (mem_size + add_size) < 15000000:
        self._add_results(results)
    else:
        self.MEM_LIMIT = True
def test_asizer(self):
    '''Test Asizer properties (running total and duplicate count).
    '''
    sizer = asizeof.Asizer()
    obj = 'unladen swallow'
    mutable = [obj]

    # A single object: the sizer's total matches the module-level function.
    sizer.asizeof(obj)
    expected_single = asizeof.asizeof(obj)
    self.assertEqual(sizer.total, expected_single)

    # The same list passed twice is counted as one duplicate.
    sizer.asizeof(mutable, mutable)
    self.assertEqual(sizer.duplicate, 1)
    expected_total = asizeof.asizeof(obj, mutable)
    self.assertEqual(sizer.total, expected_total)
def test_private_slots(self):
    '''A slot with a private (double-underscore) name is still sized.'''
    class PrivateSlot(object):
        __slots__ = ('__data',)

        def __init__(self, data):
            self.__data = data

    data = [42] * 100
    container = PrivateSlot(data)
    container_size = asizeof.asizeof(container)
    data_size = asizeof.asizeof(data)
    # The container must account for the list it references.
    self.assertTrue(container_size > data_size,
                    (container_size, data_size))
def dumpMonitorValues(self):
    ''' Build a dict holding every monitor value plus size diagnostics.

    Each ``(service, mname)`` key of the monitor values is flattened to
    ``'service.mname'``. The asizeof sizes of the internal monitor
    structures and the string form of the monitor messages are added under
    fixed keys.
    '''
    result = {}
    for monitor_key in self.__monitorValues:
        (service, mname) = monitor_key
        flat_key = '%s.%s' % (service, mname)
        result[flat_key] = self.__monitorValues[(service, mname)]

    # Size diagnostics of the internal structures.
    result['values'] = asizeof(self.__monitorValues)
    result['tasks'] = asizeof(self.__monitorTasks)
    result['messages'] = asizeof(self.__monitorMessages)
    result['tags'] = asizeof(self.__monitorTags)
    result['messagekeys'] = '%s' % self.__monitorMessages
    return result
def test_asizeof(self):
    '''Test asizeof.asizeof(): no arguments gives 0, and the combined size
    of several objects equals the sum of their individual sizes.
    '''
    self.assertEqual(asizeof.asizeof(), 0)

    objs = [Foo(42), ThinFoo("spam"), OldFoo(67)]
    total = asizeof.asizeof(*objs)
    sizes = list(asizeof.asizesof(*objs))
    summed = 0
    for size in sizes:
        summed = summed + size
    self.assertEqual(total, summed, (total, summed))
def test_closure(self):
    '''Test sizing closures: a closure is at least as big as the data it
    captures when sized with code=True.
    '''
    def outer(x):
        def inner():
            return x
        return inner

    data = [1] * 1000
    closure = outer(data)

    size_data = asizeof.asizeof(data)
    size_closure = asizeof.asizeof(closure, code=True)
    self.assertTrue(size_closure >= size_data, (size_closure, size_data))
def split_train_tune_test(self, src_file, src_piv_file, piv_tar_file, tar_file, train_split, test_split):
    """
    Randomly split four parallel data files into train, tune and test sets.

    The four files are read in lockstep (with ``zip_longest``) and every
    line tuple is assigned to exactly one set, so line N of every file
    always lands in the same set. For each tuple a uniform sample ``x`` is
    drawn: ``x < train_split`` goes to train, the next band of width
    `test_split` goes to tune and the remainder goes to test.

    NOTE(review): the band sized by `test_split` feeds the *tune* set and
    the remainder feeds the *test* set. That is the existing behaviour and
    is preserved here; confirm it is what callers expect.

    Nothing is done when all output files already exist. Buffers are
    flushed whenever their combined asizeof size exceeds
    ``self.mem_limit`` and once more at the end. The input files are now
    closed when the pass finishes (they used to be leaked).
    """
    utilities.make_dir(self.traindir)
    utilities.make_dir(self.tunedir)
    utilities.make_dir(self.testdir)

    self._validate_file(src_file)
    self._validate_file(src_piv_file)
    self._validate_file(piv_tar_file)
    self._validate_file(tar_file)
    assert train_split + test_split <= 1 , "Invalid size for train, tune, and test splits"

    train_files, tune_files, test_files = self._ttt_filenames(
        src_file, src_piv_file, piv_tar_file, tar_file)
    if utilities.ttt_files_exist(train_files, tune_files, test_files):
        return
    utilities.ttt_wipe_files(train_files, tune_files, test_files)

    self._print("""Splitting data into train, tune, and test sets...""")

    # One buffer per input file, for each of the three sets.
    train, tune, test = [[], [], [], []], [[], [], [], []], [[], [], [], []]

    with open(src_file) as src_in, open(src_piv_file) as src_piv_in, \
            open(piv_tar_file) as piv_tar_in, open(tar_file) as tar_in:
        for parallel_lines in zip_longest(src_in, src_piv_in,
                                          piv_tar_in, tar_in):
            x = numpy.random.sample()
            if x < train_split:
                target = train
            elif x < train_split + test_split:
                target = tune
            else:
                target = test

            for buf, line in zip(target, parallel_lines):
                self._add_line_to(buf, line)

            if asizeof.asizeof(train) + asizeof.asizeof(tune) + \
                    asizeof.asizeof(test) > self.mem_limit:
                self._dump_ttt_bufs_to(train, tune, test,
                                       train_files, tune_files, test_files)

    self._dump_ttt_bufs_to(train, tune, test,
                           train_files, tune_files, test_files)
    self._print("Done\n")
def print_memory_profiles(sm, tr, tr_sm, LOGFILE = None):
    '''
    Prints report on memory profiles

    IN:
        sm - SeriesModel - SeriesModel object for this run
        tr - SummaryTracker - SummaryTracker object for the whole run
        tr_sm - ClassTrackers - ClassTracker object of SeriesModel
        LOGFILE - file obj - Open logfile for print output
    OUT: None

    The previous version wrote ``tr_sm.stats.print_summary() >> LOGFILE``
    and ``tr.print_diff() >> LOGFILE``. Both calls return None, so the
    shift raised TypeError and the report never completed. The reports are
    now printed to the console and, when LOGFILE is given, written there
    too. The tracker diff is computed once so both outputs show the same
    data.
    '''
    import sys
    from pympler import summary

    ptf( '\nSERIESMODEL profiling', LOGFILE)
    ptf( 'Look at size of seriesmodel object', LOGFILE)
    ptf( asizeof.asizeof(sm), LOGFILE)
    ptf( asizeof.asized(sm, detail=1).format(), LOGFILE)

    ptf( 'Look at how the SeriesModel class is doing', LOGFILE)
    tr_sm.create_snapshot()
    stats = tr_sm.stats
    stats.print_summary()
    if LOGFILE is not None:
        # NOTE(review): assumes the stats object writes to its `stream`
        # attribute (as pympler's ConsoleStats does) - confirm.
        console_stream = stats.stream
        stats.stream = LOGFILE
        try:
            stats.print_summary()
        finally:
            stats.stream = console_stream

    ptf( 'PROFILING', LOGFILE)
    ptf( 'Look at memory leaks up to this point', LOGFILE)
    leak_diff = tr.diff()
    if LOGFILE is not None:
        # summary.print_ prints to stdout, so point stdout at the logfile
        # for the duration of the call.
        saved_stdout = sys.stdout
        sys.stdout = LOGFILE
        try:
            summary.print_(leak_diff)
        finally:
            sys.stdout = saved_stdout
    summary.print_(leak_diff)
def test_weakref(self):
    '''Test sizing weak references, both to a live and to a dead object.
    '''
    alive = Foo('alive')
    aref = weakref.ref(alive)
    dead = Foo('dead')
    dref = weakref.ref(dead)
    del dead

    # A live reference is sized together with its referent.
    aref_size = asizeof.asizeof(aref)
    alive_size = asizeof.asizeof(alive)
    self.assertTrue(aref_size > alive_size, aref_size)
    refs = asizeof.named_refs(aref)
    # TODO: Should a weakref return ('ref', obj)?

    # A dead reference still has a size, but a different one.
    dref_size = asizeof.asizeof(dref)
    self.assertTrue(dref_size > 0, dref_size)
    self.assertNotEqual(dref_size, aref_size)
    refs = asizeof.named_refs(dref)
def test_pickle_features_with_custom_primitive(es):
    """
    Features built with a custom aggregation primitive survive a
    save/load round trip, and the saved file is smaller than the entityset
    (both in memory and pickled).

    The files written next to this test module are removed in a `finally`
    block, so a failing assertion no longer leaves them on disk.
    """
    NewMean = make_agg_primitive(
        np.nanmean,
        name="NewMean",
        input_types=[Numeric],
        return_type=Numeric,
        description="Calculate means ignoring nan values")
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Mean, NewMean],
                                   trans_primitives=[],
                                   max_features=20)
    features_no_pickle = dfs_obj.build_features()
    assert any(isinstance(feat, NewMean) for feat in features_no_pickle)

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    try:
        # pickle entityset
        save_obj_pickle(es, es_filepath)

        ft.save_features(features_no_pickle, filepath)
        features_pickle = ft.load_features(filepath)
        for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
            assert feat_1.hash() == feat_2.hash()
            assert feat_1.entityset == feat_2.entityset

        # file is smaller than entityset in memory
        assert os.path.getsize(filepath) < asizeof(es)

        # file is smaller than entityset pickled
        assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    finally:
        for path in (filepath, es_filepath):
            if os.path.exists(path):
                os.remove(path)
def test_pickle_features(es):
    """
    Features survive a save/load round trip, and the saved file is smaller
    than the entityset (both in memory and pickled).

    The files written next to this test module are removed in a `finally`
    block, so a failing assertion no longer leaves them on disk.
    """
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Mean],
                                   trans_primitives=[],
                                   max_features=20)
    features_no_pickle = dfs_obj.build_features()

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    try:
        # pickle entityset
        save_obj_pickle(es, es_filepath)

        ft.save_features(features_no_pickle, filepath)
        features_pickle = ft.load_features(filepath)
        for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
            assert feat_1.hash() == feat_2.hash()
            assert feat_1.entityset == feat_2.entityset

        # file is smaller than entityset in memory
        assert os.path.getsize(filepath) < asizeof(es)

        # file is smaller than entityset pickled
        assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    finally:
        for path in (filepath, es_filepath):
            if os.path.exists(path):
                os.remove(path)
def run(p="../../../data/atis/atis.pkl", wordembdim=70, lablembdim=70, innerdim=300, lr=0.05, numbats=100, epochs=20, validinter=1, wreg=0.0003, depth=1):
    """
    Train and evaluate a SimpleSeqTransDec sequence tagger on ATIS.

    Loads ``(train, test, dics)`` from the pickle at `p`, builds padded
    int32 data/gold matrices and float32 masks, trains with adagrad and
    5-fold cross validation, then decodes the test set and prints the
    evaluation result.

    The pickle file is now opened with a context manager so its handle is
    closed after loading. This is Python 2 code (it relies on ``zip``
    returning a list); the prints pass one parenthesised value so they
    emit the same text as the original print statements.
    """
    tracker = SummaryTracker()
    with open(p) as datafile:
        train, test, dics = pickle.load(datafile)
    word2idx = dics["words2idx"]
    table2idx = dics["tables2idx"]
    label2idx = dics["labels2idx"]
    label2idxrev = {v: k for k, v in label2idx.items()}
    train = zip(*train)
    test = zip(*test)
    print("%d training examples, %d test examples" % (len(train), len(test)))
    #tup2text(train[0], word2idx, table2idx, label2idx)

    # Longest token sequence over both sets defines the matrix width.
    maxlen = 0
    for tup in train + test:
        maxlen = max(len(tup[0]), maxlen)

    numwords = max(word2idx.values()) + 2
    numlabels = max(label2idx.values()) + 2

    # get training data
    traindata = getdatamatrix(train, maxlen, 0).astype("int32")
    traingold = getdatamatrix(train, maxlen, 2).astype("int32")
    trainmask = (traindata > 0).astype("float32")

    # test data
    testdata = getdatamatrix(test, maxlen, 0).astype("int32")
    testgold = getdatamatrix(test, maxlen, 2).astype("int32")
    testmask = (testdata > 0).astype("float32")

    # Evaluate gold against itself and print the result.
    res = atiseval(testgold-1, testgold-1, label2idxrev)
    print(res)

    print(asizeof(traindata))

    # define model
    innerdim = [innerdim] * depth
    m = SimpleSeqTransDec(indim=numwords, inpembdim=wordembdim, outembdim=lablembdim, innerdim=innerdim, outdim=numlabels)

    # training
    m = m.train([traindata, shiftdata(traingold), trainmask], traingold).adagrad(lr=lr).grad_total_norm(5.0).seq_cross_entropy().l2(wreg)\
        .cross_validate(splits=5, random=True).seq_cross_entropy().seq_accuracy().validinter(validinter).takebest()\
        .train(numbats, epochs)

    # predict after training
    s = SeqTransDecSearch(m)
    testpred = s.decode(testdata)
    testpred = testpred * testmask

    evalres = atiseval(testpred-1, testgold-1, label2idxrev)
    print(evalres)
def create_snapshot(self, description='', compute_total=False):
    """
    Collect current per instance statistics and saves total amount of
    memory associated with the Python process.

    If `compute_total` is `True`, the total consumption of all objects
    known to *asizeof* is computed. The latter might be very slow if many
    objects are mapped into memory at the time the snapshot is taken.
    Therefore, `compute_total` is set to `False` by default.

    The overhead of the `ClassTracker` structure is also computed.

    Snapshots can be taken asynchronously. The function is protected with a
    lock to prevent race conditions. The lock is taken with a ``with``
    statement: the previous version acquired it inside the ``try`` block,
    so a failed acquire would have released a lock that was never held.
    """
    # TODO: It is not clear what happens when memory is allocated or
    # released while this function is executed but it will likely lead
    # to inconsistencies. Either pause all other threads or don't size
    # individual objects in asynchronous mode.
    with self.snapshot_lock:
        timestamp = _get_time()

        sizer = asizeof.Asizer()
        objs = [tobj.ref() for tobj in list(self.objects.values())]
        sizer.exclude_refs(*objs)

        # The objects need to be sized in a deterministic order. Sort the
        # objects by its creation date which should at least work for
        # non-parallel execution. The "proper" fix would be to handle
        # shared data separately.
        tracked_objects = list(self.objects.values())
        tracked_objects.sort(key=lambda x: x.birth)
        for tobj in tracked_objects:
            tobj.track_size(timestamp, sizer)

        snapshot = Snapshot()

        snapshot.timestamp = timestamp
        snapshot.tracked_total = sizer.total
        if compute_total:
            snapshot.asizeof_total = asizeof.asizeof(all=True, code=True)
        snapshot.system_total = pympler.process.ProcessMemoryInfo()
        snapshot.desc = str(description)

        # Compute overhead of all structures, use sizer to exclude tracked
        # objects(!)
        snapshot.overhead = 0
        if snapshot.tracked_total:
            snapshot.overhead = sizer.asizeof(self)
            if snapshot.asizeof_total:
                snapshot.asizeof_total -= snapshot.overhead

        self.snapshots.append(snapshot)
def test_ignore_frame(self):
    '''Test whether reference cycles are created

    Garbage collection is disabled while everything is sized; a following
    collection must then find nothing to collect. Collection is re-enabled
    in a `finally` block so that a failing assertion does not leave the gc
    disabled for the rest of the test run.
    '''
    gc.collect()
    gc.disable()
    try:
        asizeof.asizeof(all=True, code=True)
        self.assertEqual(gc.collect(), 0)
    finally:
        gc.enable()
def test_exception(self):
    '''Test sizing exceptions and their tracebacks.
    '''
    try:
        raise Exception("Test exception-sizing.")
    except Exception:
        etype, exc, etb = sys.exc_info()
        try:
            # The traceback has a size and exposes its frame/next links.
            tb_size = asizeof.asizeof(etb)
            self.assertTrue(tb_size > 0, tb_size)
            ref_names = {name for name, _ in asizeof.named_refs(etb)}
            expected_names = {'tb_frame', 'tb_next'}
            self.assertTrue(expected_names <= ref_names, ref_names)

            # Exception type and instance have a size as well.
            ex_size = asizeof.asizeof(etype, exc)
            self.assertTrue(ex_size > 0, ex_size)
        finally:
            # Drop the traceback reference explicitly.
            del etb
def get_size_in_mb(obj):
    """Return the asizeof size of an object, converted to MB.

    :param obj: Object to get memory usage of
    :return: Size reported by asizeof, divided by 1024 * 1024
    :rtype: float
    """
    bytes_per_mb = 1024.0 * 1024.0
    size_in_bytes = asizeof(obj)
    return size_in_bytes / bytes_per_mb
def get_license_index(rules=None):
    """
    Build and return a LicenseIndex from `rules`.

    When `rules` is empty or None, all rules are loaded with
    get_all_rules(). With DEBUG_PERF enabled, the asizeof sizes of the
    rules and of the resulting index are printed.
    """
    rules = rules or get_all_rules()

    if DEBUG_PERF:
        from pympler import asizeof  # @UnresolvedImport
        print('Memory size of rules:', asizeof.asizeof(rules))

    idx = LicenseIndex(rules)

    if DEBUG_PERF:
        print('Memory size of index:', asizeof.asizeof(idx))

    return idx
def write(self,data):
    """
    Wrap `data` in a type-5 PEEPpacket and write it to the transport.

    The sequence number comes from ``self.update_sequence(data)`` and is
    remembered in ``self.prev_sequence_number``; the acknowledgement is
    ``self.global_number_ack``. The checksum is calculated by a fresh
    PEEPClient before the packet is serialized.
    """
    print ("=================== Writing Data down to wire from Client ================\n")

    packet = PEEPpacket()
    checksum_helper = PEEPClient(self.loop)

    packet.Type = 5
    packet.SequenceNumber = self.update_sequence(data)
    self.prev_sequence_number = packet.SequenceNumber
    print ("SEQ No:" + str(packet.SequenceNumber))

    packet.Acknowledgement = self.global_number_ack
    print ("ACK No:" + str(packet.Acknowledgement))

    packet.Data = data
    print ("Data is", data)
    print ("Size of data", asizeof.asizeof(data))

    packet.Checksum = checksum_helper.calculateChecksum(packet)
    serialized = packet.__serialize__()
    self.transport.write(serialized)
async def send_websocket_message(self, message):
    """
    Send `message` over the websocket, chunking it when it is too large.

    A message whose asizeof size is below WEBSOCKET_MAX is sent directly
    as JSON. Otherwise it is serialized with ``json.dumps``, cut into
    pieces of at most WEBSOCKET_MAX characters stored in
    ``self.parts_to_send``, and the first piece is sent with
    ``self.send_part(0)``.

    NOTE(review): the threshold compares the in-memory size of the Python
    object, while the chunks are measured in characters of the JSON text;
    confirm that mixing both measures is intended.

    :param message: The message to be send as JSON object via the websocket
        connection.
    :type message: dict or list
    """
    if asizeof.asizeof(message) < WEBSOCKET_MAX:
        await self.send_json(message)
        return

    serialized = json.dumps(message)
    chunk_starts = range(0, len(serialized), WEBSOCKET_MAX)
    self.parts_to_send = [
        serialized[start:start + WEBSOCKET_MAX] for start in chunk_starts
    ]
    await self.send_part(0)
def test_long(self):
    '''Test int and long examples'''
    # `long` only exists on Python 2; fall back to plain ints elsewhere.
    try:
        _L5d = long(1) << 64
        _L17d = long(1) << 256
        t = '<int>/<long>'
    except NameError:
        _L5d = 1 << 64
        _L17d = 1 << 256
        t = '<int>'

    self._printf('%sasizeof(%s, align=%s, limit=%s) ... %s',
                 os.linesep, t, 0, 0, '-int')
    samples = (1024, 1000000000,
               1.0, 1.0e100, 1024, 1000000000,
               self.MAX, 1 << 32, _L5d, -_L5d, _L17d, -_L17d)
    for number in samples:
        self._printf(" asizeof(%s) is %s (%s + %s * %s)",
                     _repr(number),
                     asizeof.asizeof(number, align=0, limit=0),
                     asizeof.basicsize(number),
                     asizeof.leng(number),
                     asizeof.itemsize(number))
def test_asized(self):
    '''Test asizeof.asized(): empty input, invalid option, detail tree.
    '''
    self.assertEqual(list(asizeof.asized(detail=2)), [])
    self.assertRaises(KeyError, asizeof.asized, **{'all': True})

    sized = asizeof.asized(Foo(42), detail=2)
    self.assertEqual(sized.name, 'Foo')

    # Exactly one referent is the instance __dict__ ...
    dict_refs = [ref for ref in sized.refs if ref.name == '__dict__']
    self.assertEqual(len(dict_refs), 1)
    self.assertEqual(dict_refs[0], sized.get('__dict__'))

    # ... and inside it exactly one value entry for `data`.
    value_refs = [ref for ref in dict_refs[0].refs
                  if ref.name == '[V] data: 42']
    self.assertEqual(len(value_refs), 1, value_refs)
    i = 42
    self.assertEqual(value_refs[0].size, asizeof.asizeof(i),
                     value_refs[0].size)

    # Size multiple objects
    sizer = asizeof.Asizer()
    sized_objs = sizer.asized(Foo(3), Foo(4), detail=2)
    self.assertEqual(len(sized_objs), 2)
def memory_report(self):
    """Print a memory report of the pipeline object to screen.

    The total size reported by ``asizeof(self)`` is printed in Mb, Kb or b
    depending on its magnitude, followed by the per-stage breakdown
    returned by ``self._mem_str``.

    To get better memory estimates make sure the pympler Python package is
    installed. Without it, sys.getsizeof is used, which can extremely
    underestimate the memory size of Python objects.
    """
    print("=== Pipeline memory report ===")
    size = asizeof(self)
    if size > 500000:  # pragma: no cover
        readable = "{:.2f}Mb".format(size / 1000000)
    elif size > 1000:  # pragma: no cover
        readable = "{:.2f}Kb".format(size / 1000)
    else:
        readable = "{:.2f}b".format(size)
    print("Total pipeline size in memory: " + readable)
    print("Per-stage memory structure:")
    print(self._mem_str(total=size))
def get_collection_sizes(obj, collections: Optional[Tuple]=None, get_only_non_empty=False):
    """
    List the collection-typed attributes of `obj` with length and size.

    Every name from ``dir(obj)`` whose value is an instance of one of
    `collections` (default: list, dict, set, deque, abc.Sized) yields a
    tuple ``(attr_name, len(attr), asizeof size)``. With
    `get_only_non_empty`, attributes of length 0 are left out.
    """
    from pympler import asizeof

    wanted_types = collections or (list, dict, set, deque, abc.Sized)
    if not isinstance(wanted_types, tuple):
        wanted_types = tuple(wanted_types)

    sizes = []
    for attr_name in dir(obj):
        attr = getattr(obj, attr_name)
        if not isinstance(attr, wanted_types):
            continue
        if get_only_non_empty and not len(attr) > 0:
            continue
        sizes.append(
            (attr_name, len(attr), asizeof.asizeof(attr, detail=1)))
    return sizes
def put_data(self, key, value, ttl_duration=None):
    """
    Append a new key/value record to the database file.

    The record is written as one line ``"<key>":<json>`` at the end of the
    file, where the JSON holds the value and its time-to-live data
    (`ttl_duration` plus the creation time, or None when no TTL is given).

    Raises ``Exception`` when: the file would exceed 1GB, the key already
    exists, the key is longer than 32 characters, the value is not a dict,
    the value's asizeof size exceeds 16000, or `ttl_duration` is negative.

    A non-dict value used to be rejected with the misleading "should not
    exceed 16KB" message; it now gets its own message.

    NOTE(review): the key is written between quotes without escaping, so a
    key containing a double quote would produce a malformed line.
    """
    size_of_key = sys.getsizeof(key)
    size_of_value = asizeof.asizeof(value)

    remaining_budget = 1000000000 - (size_of_key + size_of_value)
    if os.stat(self.__database.name).st_size > remaining_budget:
        raise Exception(
            "Database size has reached the maximum size(1GB). Cannot add any more data into this database."
        )
    if self.get_data(key) is not None:
        raise Exception("Given key is already present in the database")
    if len(key) > 32:
        raise Exception("Length of key should not exceed 32 Characters")
    if not isinstance(value, dict):
        raise Exception("Value(data) should be a dict (JSON object)")
    if size_of_value > 16000:
        raise Exception("Size of value(data) should not exceed 16KB")
    if ttl_duration is not None and ttl_duration < 0:
        raise Exception(
            "Time-to-live property must be greater than or equal to 0")

    # The creation time is only recorded for records that can expire.
    created_time = None
    if ttl_duration is not None:
        d = datetime.datetime.now()
        created_time = {
            'day': d.day,
            'month': d.month,
            'year': d.year,
            'hour': d.hour,
            'minute': d.minute,
            'second': d.second,
            'microsecond': d.microsecond,
            'tzinfo': d.tzinfo
        }

    data = {
        "value": value,
        "ttl": {
            'ttl_duration': ttl_duration,
            'created_time': created_time
        }
    }
    self.__database.seek(0, io.SEEK_END)
    self.__database.write(f'"{key}"' + ':' + json.dumps(data) + '\n')
def main_process():
    """
    Small demo of memory-inspection helpers.

    Prints a fixed question/answer text, the asizeof size and detailed
    breakdown of a small list, the SummaryTracker diff after building a
    2000 x 2000 list of random floats, and finally ``sys.getsizeof`` of
    that outer list.
    """
    from sys import getsizeof

    t = ''' q: 当退出Python时,是否释放全部内存? ans:答案是No。循环引用其它对象或引用自全局命名空间的对象的模块,在Python退出时并非完全释放。 另外,也不会释放C库保留的内存部分 '''
    print(colored('mycount=', 'red'), t)

    obj = [1, 2, (3, 4), 'text']
    print(asizeof.asizeof(obj))
    print(asizeof.asized(obj, detail=1).format())

    # Allocate a large nested list between tracker creation and diff.
    tr = tracker.SummaryTracker()
    a = [[random.random() for _col in range(2000)] for _row in range(2000)]
    tr.print_diff()

    gc.collect()

    print('-'*20, getsizeof(a))
def python_numpy_unique(db, words):
    """
    Benchmark de-duplicating `words` with numpy.unique() backed by MongoDB.

    The ``python_numpy_unique`` collection is emptied first. For every
    word, all stored words are read back, the word is appended and
    ``np.unique`` applied; the word is inserted only when that made the
    unique set grow. Elapsed time, number of stored documents and the
    asizeof size of the last unique array are printed.

    `unique_words` is initialised before the loop so that an empty `words`
    no longer raises NameError in the final size report.
    """
    print('Standart python functionality (numpy.unique())')
    collection = db.python_numpy_unique
    collection.delete_many({})

    unique_words = []
    start = time.time()
    for word in words:
        # Deliberately re-read the collection each time: this is what is
        # being benchmarked.
        unique_words = [elem['word'] for elem in collection.find()]
        len_unique_words = len(unique_words)
        unique_words = np.unique(np.append(unique_words, word))
        if len(unique_words) > len_unique_words:
            collection.insert_one({'word': word})
    end = time.time()

    print('[python np.unique()] Time python:: {}'.format(end - start))
    print('[python np.unique()] Number of unique words: {}'.format(collection.count_documents({})))
    print('[python np.unique()] Size of unique words: {} Mb, {} Kb\n'.format(asizeof.asizeof(unique_words)/1024/1024, asizeof.asizeof(unique_words)/1024))
    print('________________________________________________________________\n')
def _send_report(self, report_id):
    """
    POST the JSON report stored in ``self.report_file`` to the server.

    The report is sent to ``self.broadcast_address`` together with
    `report_id` as ``run_id``. If the connection cannot be established, an
    error is logged and the process exits with status 1.
    """
    with open(self.report_file) as fh:
        report_json = json.load(fh)

    payload_size = asizeof(json.dumps(report_json))
    logger.debug("Unique payload sent with size: {}".format(payload_size))

    payload = {"run_id": report_id, "report_json": report_json}
    try:
        requests.post(self.broadcast_address, json=payload)
    except requests.exceptions.ConnectionError:
        logger.error(colored_print(
            "ERROR: Could not establish connection with server. The server"
            " may be down or there is a problem with your internet "
            "connection.", "red_bold"))
        sys.exit(1)
def check_pieces_size(self):
    """
    Log size diagnostics about ``self._pieces`` at Important verbosity.

    Reports the asizeof size of the whole piece mapping, then count and
    size of the pieces that are not done, done, before the stream position
    and more than 50000000 bytes worth of pieces beyond it, the number of
    pieces with initialized blocks and, when a stream buffer exists, the
    pieces in its ``data_ready``.
    """
    def report(text):
        Logger().write(LogVerbosity.Important, text)

    def count_and_size(pieces):
        return str(len(pieces)) + " - " + write_size(asizeof.asizeof(pieces))

    report(" _pieces size: " + write_size(asizeof.asizeof(self._pieces)))

    not_done_pieces = [p for p in self._pieces.values() if not p.done]
    done_pieces = [p for p in self._pieces.values() if p.done]
    stream_index = [
        p for p in self._pieces.values()
        if p.index < self.torrent.stream_position
    ]
    stream_index_50_mb = [
        p for p in self._pieces.values()
        if p.index > self.torrent.stream_position +
        (50000000 // self.piece_length)
    ]

    report(" pieces not done: " + count_and_size(not_done_pieces))
    report(" pieces done: " + count_and_size(done_pieces))
    report(" pieces < stream index: " + count_and_size(stream_index))
    report(" pieces > stream index + 50mb: " +
           count_and_size(stream_index_50_mb))

    with_blocks = [p for p in self._pieces.values() if len(p._blocks) > 0]
    report(" pieces with initialized blocks: " + str(len(with_blocks)))

    if self.torrent.stream_manager.buffer is not None:
        data_ready = list(self.torrent.stream_manager.buffer.data_ready)
        report(" pieces in data_ready: " + count_and_size(data_ready))
def fully_dynamic(eps, args):
    """
    Run the fully dynamic clustering algorithm for one value of `eps`.

    Points from ``data_stream(args.limit)`` are inserted one by one
    (points already present are skipped) while the structure is kept to a
    sliding window of ``args.window`` points. Returns a dict keyed by
    ``str(eps)`` with the run time, the largest asizeof size seen, the
    operation count and the largest result (``beta``).
    """
    print('Running the algorithm with eps = {} in Process #{}'.format(
        eps, os.getpid()))
    limit = args.limit
    window = args.window

    space = 0  # keep track of the space used by the data structures
    start = time()
    beta = 0
    fdc = FullyDynClus(eps, 20, window)

    for point in data_stream(limit):
        # There are some duplicate points in the dataset, for example,
        # the 71st and 86th points
        if point in fdc.points:
            continue

        fdc.insert(point)

        # Delete the least recent point in the sliding window
        if len(fdc.points) > window:
            fdc.delete(fdc.points[-window - 1])

        beta = max(beta, fdc.get_result())
        space = max(space, asizeof(fdc))

    print('Finish running the algorithm with eps = {}'.format(eps))

    stats = {
        'run_time': round(time() - start, 3),
        'space': space,
        'op_count': fdc.op_count,
        'beta': beta
    }
    return {str(eps): stats}
def get_negative_indices(chosen_prods_ids_ls, dataset_type):
    """
    Assign random negative question indices to each chosen product.

    The questions file for `dataset_type` is read in chunks only to count
    its rows. For each product id, NUM_NEGATIVE_CANDIDATES distinct row
    indices are drawn at random, and the product id is recorded under each
    of them. Returns the ``(index, [product ids])`` pairs sorted by index.

    Changes from the previous version:
    - an unknown `dataset_type` raises ValueError with a clear message; it
      used to fail later with an unbound local variable;
    - the population is passed to ``np.random.choice`` as an int, so the
      range is not rebuilt into an array on every iteration;
    - the question count is logged once, after counting.
    """
    filepaths_by_type = {
        MyUtils_flags.FLAG_TRAIN: F.QUESTIONS_FINAL_TRAIN,
        MyUtils_flags.FLAG_VALID: F.QUESTIONS_FINAL_VALID,
        MyUtils_flags.FLAG_TEST: F.QUESTIONS_FINAL_TEST,
    }
    if dataset_type not in filepaths_by_type:
        raise ValueError("Unknown dataset_type: %r" % (dataset_type,))
    questions_final_filepath = filepaths_by_type[dataset_type]

    # Count the questions without loading the whole file at once.
    num_of_questions = 0
    for input_segment in pd.read_csv(questions_final_filepath, chunksize=2**10, sep="_"):
        num_of_questions = num_of_questions + len(input_segment)
    logging.info("Number of questions in the current dataset: %s", num_of_questions)

    negative_indices_dict = {}
    for prod_id in chosen_prods_ids_ls:
        random_indices = np.random.choice(a=num_of_questions, size=NUM_NEGATIVE_CANDIDATES, replace=False, p=None)
        for rand_index in random_indices:
            negative_indices_dict.setdefault(rand_index, []).append(prod_id)

    logging.info("Size in memory of the dictionary of random indices: %s KB",
                 mem.asizeof(negative_indices_dict) // 2**10)

    # Dictionaries are not ordered by key, so return sorted (index, ids)
    # pairs instead.
    negindices_lts = sorted(negative_indices_dict.items())
    return negindices_lts
def test_100mb():
    """
    Time building and querying a GkArray over the 98mb genome collection
    and print build time, asizeof memory and query time as a table.
    """
    # Collect all file names in collection
    files = list(glob.glob('./input_files/98mb_genome/*.fastq'))

    # building Gk Array time and memory test
    gk_start = timer()
    gk_array = GkArray(files, 3)
    gk_end = timer()

    # comment this out of if script is taking too long and
    # set gk_memory = 1 instead
    # This will skip the memory measurement of the data strucutre
    # but allow the test script to run faster
    gk_memory = asizeof.asizeof(gk_array)

    # query time
    q_start = timer()
    gk_array.get_reads("TTG")
    q_end = timer()

    gk_time = gk_end - gk_start
    q_time = q_end - q_start
    gk_stats = [
        str(gk_time) + " sec",
        str(gk_memory / 1000) + " kb",
        str(q_time) + " sec",
    ]

    # Print performance stats
    titles = ['', 'Gk Array']
    names = ['Build Time', 'Memory', 'Query Time']
    rows = [titles] + list(zip(names, gk_stats))

    print("100 MB Collection Results:")
    for row_number, row in enumerate(rows):
        line = '|'.join(str(cell).ljust(30) for cell in row)
        print(line)
        if row_number == 0:
            # Underline the header row.
            print('-' * len(line))
def test_log(log_name, log_path):
    """
    Measure the memory needed to mine a DFG from `log_path` and to iterate
    its traces, and append the figures to ``Shared.results``.

    The appended row is ``[log_name, max_it_memory, net_memory,
    mp4_memory, mp_memory, sum_max_memory]`` where `mp_memory` is the
    fixed value 16384.
    """
    print("xes_certific_mem_iterab", log_name)
    importer = micropm4py.log.xes_import_traces_file_standard

    dfg = importer.imp_dfg_file_sten(log_path)
    net, im, fm = micropm4py.conversion.dfg.dfg_mining.apply(dfg)
    it = importer.get_it_from_file(log_path)
    nxt = importer.get_nxt_trace(it)

    mp_memory = 16384
    mp4_memory = asizeof(micropm4py)
    net_memory = asizeof(net) + 2*asizeof(im) + 2*asizeof(fm)

    # Track the largest (trace + iterator) footprint over all traces.
    max_it_memory = 0
    while nxt:
        current_memory = asizeof(nxt) + asizeof(it)
        max_it_memory = max(max_it_memory, current_memory)
        nxt = importer.get_nxt_trace(it)

    sum_max_memory = mp_memory + mp4_memory + net_memory + max_it_memory
    Shared.results.append([log_name, max_it_memory, net_memory,
                           mp4_memory, mp_memory, sum_max_memory])
def test_it_matches_the_expected_size(self):
    """Check MemoryCounter.empty_node_size_bytes against a measured empty node.

    The expected value is the deep size of a freshly built CallGraphNode
    minus the parts that must not be attributed to an empty node.
    """
    placeholder_frame_name = None
    node = CallGraphNode(frame_name=placeholder_frame_name, class_name=None,
                         file_path=None, line_no=None)
    node.increase_runnable_count()

    deep_node_size = asizeof.asizeof(node)

    # The empty size should not include the frame name, so it is excluded.
    # The empty tuple is always reused by Python, so it is excluded as well.
    excluded_bytes = sys.getsizeof(placeholder_frame_name) + sys.getsizeof(())

    expected_empty_size = deep_node_size - excluded_bytes
    assert MemoryCounter.empty_node_size_bytes == expected_empty_size
def python_set(db, words):
    """Count unique words using a plain Python set backed by a collection.

    The ``python_set`` collection of ``db`` is emptied, then every word that
    is not yet stored is inserted. Timing, document count and the size of the
    final in-memory set are printed.

    Args:
        db: database handle exposing a ``python_set`` collection with
            ``delete_many``, ``find``, ``insert_one`` and ``count_documents``.
        words: iterable of words to process (may be empty).

    Returns:
        The number of elements in the in-memory set after the last word.
    """
    print('Standart python functionality (set())')
    collection = db.python_set
    collection.delete_many({})

    # Bound up front so that an empty ``words`` no longer crashes the report
    # below with an unbound local.
    unique_words = set()

    start = time.time()
    for word in words:
        # NOTE(review): the stored words are re-read for every input word,
        # which is quadratic overall. This looks like a deliberate part of
        # what is being timed, so it is kept -- confirm before hoisting.
        unique_words = {elem['word'] for elem in collection.find()}
        if word not in unique_words:
            unique_words.add(word)
            collection.insert_one({'word': word})
    end = time.time()

    print('[python set()] Time python:: {}'.format(end - start))
    print('[python set()] Number of unique words: {}'.format(collection.count_documents({})))
    print('[python set()] Size of unique words: {} Mb, {} Kb\n'.format(
        asizeof.asizeof(unique_words)/1024/1024, asizeof.asizeof(unique_words)/1024))
    print('________________________________________________________________\n')

    return len(unique_words)
def run_inducing(log, input_path, args):
    """Collect repository data and write every bug-inducing variant.

    An InducingMiner is configured from ``args``, ``collect()`` is run, the
    size of its ``_cg`` attribute is logged, and ``write_bug_inducing`` is
    called once per configured variant, in the order listed below.
    """
    miner = InducingMiner(log, args.db_database, args.db_user, args.db_password,
                          args.db_hostname, args.db_port, args.db_authentication,
                          args.ssl, args.project_name, args.repository_url,
                          input_path, repo_from_db=args.input is None)
    miner.collect()
    log.info("memory for git: %s mb", asizeof.asizeof(miner._cg) / 1024 / 1024)

    # everything with label='validated_bugfix' uses commit.fixed_issue_ids
    # szz uses commit.szz_issue_ids
    #
    # Columns: name, label, inducing_strategy, java_only, affected_versions,
    #          ignore_refactorings, extra keyword arguments.
    variants = (
        # plain szz
        ('SZZ', 'adjustedszz_bugfix', 'all', False, False, False, {}),
        # best automatic szz
        ('JL+R', 'issueonly_bugfix', 'code_only', True, False, True, {}),
        # plain szz validated labels
        ('JLMIV', 'validated_bugfix', 'all', False, False, False, {}),
        # improved szz validated labels
        ('JLMIV+', 'validated_bugfix', 'code_only', True, False, False, {}),
        # improved szz validated labels, affected versions
        ('JLMIV+AV', 'validated_bugfix', 'code_only', True, True, False, {}),
        # best + AV
        ('JLMIV+RAV', 'validated_bugfix', 'code_only', True, True, True, {}),
        # improved szz validated labels, without refactorings
        ('JLMIV+R', 'validated_bugfix', 'code_only', True, False, True, {}),
        # improved szz validated labels, only validated lines
        ('JLMIVLV', 'validated_bugfix', 'code_only', True, False, False,
         {'only_validated_bugfix_lines': True}),
        ('JLIP+R', 'issuefasttext_bugfix', 'code_only', True, False, True, {}),
    )

    for name, label, strategy, java_only, affected, skip_refactorings, extra in variants:
        miner.write_bug_inducing(label=label, inducing_strategy=strategy,
                                 java_only=java_only, affected_versions=affected,
                                 ignore_refactorings=skip_refactorings, name=name,
                                 **extra)
def pickle_features_test_helper(es_size, features_original, dir_path):
    """Round-trip features through three save/load routes and verify each.

    The routes are: a file path, open file objects, and the value returned by
    ``ft.save_features`` when no destination is given. Each serialized form
    must be smaller than ``es_size``, and each deserialized result must match
    ``features_original`` according to ``assert_features``.

    Args:
        es_size: upper bound (exclusive) for every serialized size.
        features_original: the features to serialize.
        dir_path: directory in which the temporary ``test_feature`` file is
            created and removed.
    """
    filepath = os.path.join(dir_path, 'test_feature')

    # Route A: save to and load from a path.
    ft.save_features(features_original, filepath)
    features_deserializedA = ft.load_features(filepath)
    assert os.path.getsize(filepath) < es_size
    os.remove(filepath)

    # Route B: save to and load from file objects. The read handle is closed
    # before the file is removed; it used to be left open, which leaks the
    # descriptor and makes the removal fail on platforms that lock open files.
    with open(filepath, "w") as f:
        ft.save_features(features_original, f)
    with open(filepath) as f:
        features_deserializedB = ft.load_features(f)
    assert os.path.getsize(filepath) < es_size
    os.remove(filepath)

    # Route C: no destination, so the serialized form is returned directly.
    features = ft.save_features(features_original)
    features_deserializedC = ft.load_features(features)
    assert asizeof(features) < es_size

    features_deserialized_options = [features_deserializedA,
                                     features_deserializedB,
                                     features_deserializedC]
    for features_deserialized in features_deserialized_options:
        assert_features(features_original, features_deserialized)
def write(self, data):
    """Wrap ``data`` in a PEEP packet of type 5 and write it to the transport.

    The sequence number comes from ``self.update_sequence(data)`` and is
    remembered in ``self.prev_sequence_number``; the acknowledgement is taken
    from ``self.global_number_ack``. Progress is printed for debugging.
    """
    print ("================== Writing Data down to wire from Server ================\n")

    packet = PEEPpacket()
    # A separate protocol instance is created only to compute the checksum.
    checksum_helper = PEEPServerProtocol(self.loop)

    packet.Type = 5
    packet.SequenceNumber = self.update_sequence(data)
    self.prev_sequence_number = packet.SequenceNumber
    print ("SEQ No:", packet.SequenceNumber, sep="")

    packet.Acknowledgement = self.global_number_ack
    print ("ACK No:", packet.Acknowledgement, sep="")

    packet.Data = data
    # For debugging
    print("data is", data)
    print("size of data", asizeof.asizeof(data))

    # The checksum is computed last, once all other fields are populated.
    packet.Checksum = checksum_helper.calculateChecksum(packet)

    wire_payload = packet.__serialize__()
    self.transport.write(wire_payload)
def viterbiDecoder(self, code):
    """
    Function to decode data with Viterbi Algorithm

    The input is cut into consecutive words of ``self.n`` bits; each word
    extends the decoding tree by one level, after which the tree is pruned.

    :param code: numpy.array (bits to decode); its size must be a multiple
                 of ``self.n``
    :returns: list ``[decoded, tree_size]`` where ``decoded`` is a
              numpy.array with the decoded data and ``tree_size`` is the
              size of the decoding tree as reported by asizeof
    :raises ValueError: if the number of bits is not a multiple of ``self.n``
    """
    bits = npy.array(code)

    # Validate explicitly: with true division the section count was a float,
    # and some non-multiple lengths (e.g. 7 bits with n == 2) slipped past
    # numpy's own equal-division check and were cut into uneven words.
    if bits.size % self.n:
        raise ValueError(
            "code length {} is not a multiple of n={}".format(bits.size, self.n))

    viterbiTree = Node(state='A', depth=0, HammingDistance=0)
    for depth, word in enumerate(npy.split(bits, bits.size // self.n)):
        ConvolutionalCode.viterbi_iter(viterbiTree, word, depth)
        ConvolutionalCode.pruning(viterbiTree, depth)

    states = ConvolutionalCode.bestPath(viterbiTree)
    return [
        npy.hstack(ConvolutionalCode.getDecoded_code(states)),
        asizeof.asizeof(viterbiTree)
    ]
def test_asizesof(self):
    '''Test asizeof.asizesof()

    Checks that per-object sizes do not depend on argument order, agree with
    individual asizeof() calls and with an explicit Asizer, and change when
    code sizing is enabled.
    '''
    self.assertEqual(list(asizeof.asizesof()), [])
    # 'all' is expected to be rejected as an option by asizesof().
    self.assertRaises(KeyError, asizeof.asizesof, **{'all': True})

    objs = [Foo(42), ThinFoo("spam"), OldFoo(67)]
    sizes = list(asizeof.asizesof(*objs))

    # Size the same objects in reverse order, then flip the result back so it
    # lines up with ``sizes``.
    objs.reverse()
    rsizes = list(asizeof.asizesof(*objs))
    self.assertEqual(len(sizes), 3)
    rsizes.reverse()
    self.assertEqual(sizes, rsizes, (sizes, rsizes))

    # Restore the original order and compare against one-at-a-time sizing.
    objs.reverse()
    isizes = [asizeof.asizeof(obj) for obj in objs]
    self.assertEqual(sizes, isizes)

    sizer = asizeof.Asizer()
    asizer_sizes = sizer.asizesof(*objs)
    self.assertEqual(list(asizer_sizes), sizes)

    code_sizes = sizer.asizesof(*objs, **dict(code=True))
    # assertNotEqual replaces the deprecated failIfEqual alias, which no
    # longer exists in Python 3.12.
    self.assertNotEqual(list(code_sizes), sizes)
def test_generator(self):
    '''Test generator examples

    Sizes two generator objects created from the same generator function,
    with code sizing enabled. Nothing is asserted: the test passes as long
    as the asizeof calls complete without raising.
    '''
    self._printf('%sasizeof(%s, code=%s) ... %s', os.linesep, '<generator>', True, '-gen[erator]')

    # Simple counting generator used as the object under measurement.
    def gen(x):
        i = 0
        while i < x:
            yield i
            i += 1

    # Two unstarted generators that differ only in their argument.
    a = gen(5)
    b = gen(50)

    # NOTE(review): stats=1 presumably makes asizeof print a summary for each
    # call -- confirm against the asizeof documentation.
    asizeof.asizeof(a, code=True, stats=1)
    asizeof.asizeof(b, code=True, stats=1)
    # ``a`` is sized a second time, after ``b``.
    asizeof.asizeof(a, code=True, stats=1)
def safe_load(self):
    """Load data while keeping an eye on memory usage.

    Files are read one at a time, starting at ``self.file_counter``, until
    either every file has been read or the accumulated frames exceed
    ``self.max_mem``. ``self.file_counter`` is advanced so that a later call
    resumes with the next unread file.

    Returns:
        A single DataFrame concatenating everything read in this call, or
        None when there are no files left to load.
    """
    if self.file_counter >= len(self.file_paths):
        print("No more files to load!")
        return None

    # Collect the per-file frames and concatenate once at the end.
    # S.O.: https://stackoverflow.com/questions/20906474/
    frames = []
    for i in range(self.file_counter, len(self.file_paths)):
        # lines=True means "read as json-object-per-line."
        frames.append(pd.read_json(self.file_paths[i], lines=True))

        # NOTE(review): the divisor is 1e9 although the log message below
        # says GiB; the message is left untouched here.
        mem_usage = float(asizeof(frames)) / 1e9
        logging.info("Data list has size %.3f GiB", mem_usage)
        logging.info("Most recent file loaded: %s", self.file_paths[i])
        print("\rLoaded file", self.file_paths[i], end="")
        sys.stdout.flush()

        if mem_usage > self.max_mem:
            print("\nPast max capacity:", mem_usage,
                  "Leaving data collection early.")
            logging.warning(
                'Terminated data loading after '
                'reading %d files.', i + 1)
            logging.info('Files read into df: %r', self.file_paths[:i + 1])
            break
    print()

    # If the user decides they want to continue loading later
    # (when memory frees up), we want the file_counter set so that it
    # starts on the next file.
    self.file_counter = i + 1
    if self.file_counter < len(self.file_paths):
        self._next_file_path = self.file_paths[self.file_counter]
    else:
        # Everything has been read. Indexing past the end used to raise
        # IndexError here and discard the frames that were just loaded.
        self._next_file_path = None

    df = pd.concat(frames).reset_index()
    logging.info("Number of lines in raw data file: %r", len(df.index))
    logging.info("Column names from raw data file: %r", df.columns)
    logging.info("DataHelper.safe_load: df.head() = %r", df.head())
    return df
def test_merge(self):
    """Test merging of reference trees.

    Two tracked instances are captured in one snapshot; the merged entry for
    their class must account for the size of both and expose their
    attributes as merged references.
    """
    self.tracker.track_class(FooNew, name='Foo', resolution_level=2)

    f1 = FooNew()
    f1.a = list(range(1000))
    f2 = FooNew()
    f2.a = list(range(100))
    f2.b = 'This is some stupid spam.'

    self.tracker.create_snapshot('Merge test')

    # Independent size measurements to compare the merged figures against.
    sizer = Asizer()
    sz1 = sizer.asized(f1)
    sz2 = sizer.asized(f2)

    stats = self.tracker.stats
    for fp in stats.snapshots:
        if fp.desc == 'Merge test':
            stats.annotate_snapshot(fp)
            self.assertTrue(hasattr(fp, 'classes'))
            classes = fp.classes
            # Annotating a second time must leave the result unchanged.
            stats.annotate_snapshot(fp)
            self.assertEqual(fp.classes, classes)
            self.assertIn('Foo', fp.classes, fp.classes)
            self.assertIn('merged', fp.classes['Foo'])
            fm = fp.classes['Foo']['merged']
            self.assertEqual(fm.size, sz1.size + sz2.size,
                             (fm.size, str(sz1), str(sz2)))

            # Index the merged references by name, first at the top level and
            # then below the instances' __dict__.
            refs = {ref.name: ref for ref in fm.refs}
            self.assertIn('__dict__', refs, refs.keys())
            refs2 = {ref.name: ref for ref in refs['__dict__'].refs}
            self.assertIn('[V] a', refs2, refs2.keys())
            self.assertIn('[V] b', refs2, refs2.keys())
            self.assertEqual(refs2['[V] a'].size, asizeof(f1.a, f2.a))
def information(self, print_level=1):
    """Print a summary of the option settings, problem statistics and
    solution of the computation.

    Parameters
    ----------
    print_level : int (default=1)
        Level of details.

    Raises
    ------
    ValueError
        If ``print_level`` is not a non-negative integer.
    """
    self._check_is_fitted()

    level_is_valid = (isinstance(print_level, numbers.Integral)
                      and print_level >= 0)
    if not level_is_valid:
        raise ValueError("print_level must be an integer >= 0; got {}."
                         .format(print_level))

    binning_type = self.__class__.__name__.lower()

    # Optimizer: without one there is neither a solver nor a solver time.
    if self._optimizer is None:
        solver, time_solver = None, 0
    else:
        solver, time_solver = self._optimizer.solver_, self._time_solver

    # Sketch memory usage, scaled by 1e-6.
    memory_usage = asizeof.asizeof(self._bsketch) * 1e-6

    dict_user_options = self.get_params()

    print_binning_information(
        binning_type, print_level, self.name, self._status, self.solver,
        solver, self._time_total, self._time_prebinning, time_solver,
        self._time_postprocessing, self._n_prebins, self._n_refinements,
        self._bsketch.n, self._n_add, self._time_streaming_add,
        self._n_solve, self._time_streaming_solve, memory_usage,
        dict_user_options)
def depth_file_reader(depth_file):
    """
    Parse a samtools depth file into a nested dictionary with the coverage
    per position for each reference. The file object is closed before
    returning, also when parsing fails.

    Parameters
    ----------
    depth_file: textIO
        an open, iterable handle on the depth file of one sample; every
        non-blank line must hold at least three whitespace-separated fields
        (reference, position, depth)

    Returns
    -------
    depth_dic_coverage: dict
        dictionary mapping each reference to a dictionary of
        ``{position: depth}``, where position is kept as the string read
        from the file and depth is a float

    """
    # dict to store the coverage per position for each reference
    depth_dic_coverage = {}

    try:
        for line in depth_file:
            tab_split = line.split()    # split by any white space
            if not tab_split:
                # A blank line (e.g. at the end of the file) used to raise
                # IndexError below; skip it instead.
                continue
            # keep only the first three "_"-separated parts of the name
            reference = "_".join(tab_split[0].split("_")[0:3])
            position = tab_split[1]
            num_reads_align = float(tab_split[2])

            depth_dic_coverage.setdefault(reference, {})[position] = \
                num_reads_align
    finally:
        depth_file.close()

    logger.info("Finished parsing depth file.")

    logger.debug("Size of dict_cov: {} kb".format(
        asizeof(depth_dic_coverage)/1024))

    return depth_dic_coverage
def test_100mb():
    """Time and measure a hash table built from the 98mb_genome FASTQ files.

    Builds the table, measures its deep size with asizeof, runs a single
    "TTG" lookup and prints the three measurements as a small text table.
    """
    fastq_paths = list(glob.glob('./input_files/98mb_genome/*.fastq'))

    # Build phase.
    build_began = timer()
    h_table = hash_table(fastq_paths)
    build_ended = timer()

    # The deep size measurement can be slow on a large structure. If the
    # script takes too long, replace this call with the constant 1 to skip it.
    table_bytes = asizeof.asizeof(h_table)

    # Query phase.
    query_began = timer()
    h_table.find_sequence("TTG")
    query_ended = timer()

    measurements = [
        str(build_ended - build_began) + " sec",
        str(table_bytes / 1000) + " kb",
        str(query_ended - query_began) + " sec",
    ]

    # Header row first, then one (label, value) row per measurement.
    report = [['', 'Hash Table']]
    report.extend(zip(['Build Time', 'Memory', 'Query Time'], measurements))

    print ("100 MB Collection Results:")
    for row_number, row in enumerate(report):
        rendered = '|'.join(str(cell).ljust(30) for cell in row)
        print(rendered)
        if row_number == 0:
            # Underline the header with a rule of matching width.
            print('-' * len(rendered))
def test_50kb():
    """Time and measure a GkArray built from the 50kb_genome FASTQ collection.

    Builds the structure, measures its deep size with asizeof, runs a single
    "TTG" query and prints the three measurements as a small text table.
    """
    fastq_paths = list(glob.glob('./input_files/50kb_genome/*.fastq'))

    # Build phase.
    # NOTE(review): the second GkArray argument (3) is presumably the k-mer
    # length -- confirm against the GkArray constructor.
    build_began = timer()
    gk_array = GkArray(fastq_paths, 3)
    build_ended = timer()

    structure_bytes = asizeof.asizeof(gk_array)

    # Query phase.
    query_began = timer()
    gk_array.get_reads("TTG")
    query_ended = timer()

    measurements = [
        str(build_ended - build_began) + " sec",
        str(structure_bytes / 1000) + " kb",
        str(query_ended - query_began) + " sec",
    ]

    # Header row first, then one (label, value) row per measurement.
    table = [['', 'Gk Array']]
    table.extend(zip(['Build Time', 'Memory', 'Query Time'], measurements))

    print("50 kb Collection Results:")
    for row_number, row in enumerate(table):
        rendered = '|'.join(str(cell).ljust(30) for cell in row)
        print(rendered)
        if row_number == 0:
            # Underline the header with a rule of matching width.
            print('-' * len(rendered))
def test():
    """Compare index queries with truncated Monte Carlo on the hp dataset.

    Builds the index with ``readsd``, prints its type and deep size, prints
    both results for every pair (0, i), and finally inserts three edges into
    the index.
    """
    graph_path = "./datasets/hp.txt"
    sample_count = 1000
    node_count = 3133
    # NOTE(review): 0.6 and 10 are passed to readsd as its 4th and 5th
    # arguments; their meaning is not visible here -- confirm.
    c = 0.6
    t = 10

    r_index = readsd(graph_path, node_count, sample_count, c, t)
    graph = nx.read_edgelist(graph_path, create_using=nx.DiGraph(), nodetype=int)

    print(type(r_index))
    print("index size", asizeof.asizeof(r_index))

    # Results are gathered per target node as well as printed.
    readsd_rs = []
    cmc_rs = []
    for target in range(node_count):
        index_answer = r_index.queryOne(0, target)
        readsd_rs.append(index_answer)
        monte_carlo_answer = truncated_MC(graph, (0, target), R=sample_count)
        cmc_rs.append(monte_carlo_answer)
        print(target, index_answer, monte_carlo_answer)

    print("inserting...")
    for source, destination in ((3, 7), (5, 55), (6, 666)):
        r_index.insEdge(source, destination)
    print("finish...")
    return