def loadChunkList(chunkfnm, workid, worknum):
    """Load a chunk list file and return this worker's slice of it.

    Arguments:
        chunkfnm: path of the chunk list file; the companion info file
            "<chunkfnm><sKeychunklistinfo>" must exist next to it and
            holds the data-file name list.
        workid: 0-based id of this worker.
        worknum: total number of workers.

    Returns:
        (flist, retchunklist): the cleaned data-file name list and the
        sub-range of chunk entries assigned to worker `workid`.

    Raises:
        Exception: when either input file is missing.
    """
    log.message("Load chunk list from file {} ......".format(chunkfnm))
    finfo = chunkfnm + nk.sKeychunklistinfo
    if (not os.path.exists(finfo)) or (not os.path.exists(chunkfnm)):
        raise Exception("Cant find file {} or {}?".format(chunkfnm, finfo))
    # Load file info (one data-file name per line).
    with open(finfo) as f:
        flist = [cleanline(line) for line in f.readlines()]
    # Load chunk info (one chunk entry per line).
    with open(chunkfnm) as f:
        chunklist = [cleanline(line) for line in f.readlines()]
    assert len(chunklist) > worknum
    nchunkblock = int(math.ceil(len(chunklist) / worknum))
    startid = nchunkblock * workid
    stopid = nchunkblock
    if workid == worknum - 1:
        # The last worker takes whatever remains after the others.
        stopid = len(chunklist) - nchunkblock * (worknum - 1)
    # BUGFIX: the original sliced [startid : startid + stopid - 1], which
    # silently dropped the last chunk of every worker's range; a slice of
    # `stopid` elements is [startid : startid + stopid].
    retchunklist = chunklist[startid : startid + stopid]
    return flist, retchunklist
def createProcessor(self, arguments):
    """Resolve the io processor module for `arguments.name` and import it."""
    module_name = "tensorgou.io" + "." + to_lowercase(arguments.name) + "_io"
    log.message("Import module {}".format(module_name))
    return loadmodel(module_name)
def load_word_list(self, isload=True):
    """Load the word list file into a word -> id dict.

    Arguments:
        isload: when False, only count the lines (stored in
            self.listnum) and return None.

    Returns:
        dict mapping each word to its 0-based line index, or None when
        isload is False.

    Raises:
        Exception: on duplicated words (when self.checkwordlist is True)
            or when the mandatory "</s>" token is missing.
    """
    from collections import Counter
    assert (os.path.exists(self.listfnm))
    word2id = None
    log.message("Load word list file {}......".format(self.listfnm))
    with open(self.listfnm) as f:
        self.listnum = sum(1 for x in f)
        if not isload:
            return word2id
        f.seek(0, 0)  # rewind to re-read the whole file
        log.message("\tFound total {} word list".format(self.listnum))
        wordlist = f.readlines()
    for i in range(len(wordlist)):
        wordlist[i] = cleanline(wordlist[i])
    if self.checkwordlist is True:
        # Counter makes the duplicate scan O(n) instead of the original
        # O(n^2) list.count() probe per unique word.
        counts = Counter(wordlist)
        for item, num in counts.items():
            if num > 1:
                raise Exception("Found multi same word {} {} in wordlist!"
                                .format(item, num))
    word2id = dict(zip(wordlist, range(len(wordlist))))
    # `in` replaces dict.has_key(), which no longer exists on Python 3.
    if "</s>" not in word2id:
        raise Exception("Expect </s> word in file {}".format(self.listfnm))
    return word2id
def load_config_file(config_file, ignore_names):
    """ Loads and builds the model from the configuration

    Arguments:
        config_file: The configuration file
        ignore_names: A set of names that should be ignored during the
            loading.

    Returns:
        dict mapping each non-ignored "main" key to its built object.

    Raises:
        Exception: when the "main" block is missing or a key cannot be
            built into an object.
    """
    config_dicts = parsing.parse_file(config_file)
    message("Configure file is parsed.")

    # first load the configuration into a dictionary
    if "main" not in config_dicts:
        raise Exception("Configuration does not contain the main block.")

    existing_objects = dict()
    main_config = config_dicts['main']
    configuration = dict()
    for key, value in main_config.items():
        if key not in ignore_names:
            try:
                configuration[key] = build_object(value, config_dicts,
                                                  existing_objects, 0)
            except Exception as exc:
                # BUGFIX: the original discarded the underlying error,
                # hiding the real cause; keep it in the message.
                raise Exception("Can't parse key: {} ({})".format(key, exc))
    return configuration
def printparameter(self, args):
    """Dump every attribute of `args` through the message logger."""
    message("Parameter Info:")
    message("===============")
    for key, val in args.__dict__.items():
        message("{} = {}".format(key, val))
    message("===============")
def evalit(self, loops, results):
    """Accumulate evaluation stats for one batch and log one summary line.

    Arguments:
        loops: global step counter at the time of this call.
        results: list of per-step results; each item is at least
            (cost, output).

    Returns:
        False (evaluation never requests an early stop here).

    Raises:
        Exception: on malformed result items or a non-positive count.
    """
    l_cost = 0.0
    l_correctnum = 0
    l_totalnum = 0
    l_step = loops - self.steps
    self.steps = loops
    for item in results:
        if len(item) < 2:
            raise Exception("Bad result type! Expect {}, get {}".format(
                3, len(item)))
        self.cost += item[0]
    ## cal correct
    nr = len(results)
    # add by xjk
    # BUGFIX: the original leaked this append-mode file handle on every
    # call; `with` guarantees it is closed.
    with open("/search/odin/tensorflow/lstm_output/out", 'a') as out:
        if nr > 0:
            numsteps = Config().num_steps
            output = results[nr - 1][1]
            l_cost = results[nr - 1][0]
            for j in range(self.batchsize):
                for k in range(numsteps):
                    # Only count frames inside the sequence that carry a
                    # positive frame weight.
                    if k < G_Vals.g_lengths[j] and G_Vals.g_frameweight[k, j] > 0:
                        if G_Vals.g_targets[k, j] == output[k * self.batchsize + j]:
                            # add by xjk
                            out.write(
                                str(G_Vals.g_targets[k, j]) + " " +
                                str(output[k * self.batchsize + j]) + '\n')
                            l_correctnum += 1
                        l_totalnum += 1
    # BUGFIX: the original divided unconditionally and crashed with
    # ZeroDivisionError when no frame was counted.
    l_accuracy = float(l_correctnum) / float(l_totalnum) if l_totalnum > 0 else 0.0
    count = self.steps * self.batchsize
    if count <= 0:
        raise Exception("Bad l_step {}? Expect > 0".format(l_step))
    l_avgcost = self.cost / float(count)
    log.message(
        "No.{} batch: curr_lost[{:.5f}] curr_prec[{:.6f}] ave_loss[{:.6f}]"
        .format(loops + 1, l_cost, l_accuracy, l_avgcost))
    return False
def filterfiles(files):
    """Split `files` into existing and missing paths, log both lists, and
    keep only existing files of at least 10MB.

    Raises:
        Exception: when no file exists at all, or when every existing
            file is below the size threshold.
    """
    assert len(files) > 0
    badfiles = []
    goodfiles = []
    for raw in files:
        filenm = cleanline(raw)
        (goodfiles if os.path.exists(filenm) else badfiles).append(filenm)
    if badfiles:
        log.error("Can't find data files:")
        for bad in badfiles:
            log.error("\t'{}'".format(bad))
    if not goodfiles:
        raise Exception("No available data files!")
    log.message("Data files: ")
    readablefiles = []
    for idx, path in enumerate(goodfiles):
        size_mb = os.path.getsize(path)
        size_mb /= (1024 * 1024)
        if size_mb < 10:
            # Files below 10MB are deliberately skipped with a warning.
            log.warnning("File: {} {}M, too small?".format(path, size_mb))
        else:
            readablefiles.append(path)
            log.message("\tNo.{} file:{}".format(idx + 1, path))
    if not readablefiles:
        raise Exception("Found 0 records?")
    return readablefiles
def load_word_dict(self, word2id, isload=True, wedim=None):
    """Load the binary word-embedding file into a numpy matrix.

    Binary layout: int32 word count, int32 embedding dim, then for each
    word an int32 byte length, the raw word bytes, and `worddim` float32
    values.

    Arguments:
        word2id: dict mapping word -> row id; only words present here
            have their embeddings kept.
        isload: when False, only read the header (sets self.dictnum and
            self.worddim) and return None.
        wedim: optional expected embedding dim; a mismatch raises.

    Returns:
        np.float32 matrix of shape (len(word2id), worddim), or None when
        isload is False.
    """
    assert (os.path.exists(self.dictfnm))
    eb = None
    log.message("Load word dict file {}......".format(self.dictfnm))
    # BUGFIX: struct-packed binary data must be opened in binary mode;
    # text mode corrupts it on platforms with newline translation.
    with open(self.dictfnm, "rb") as f:
        self.dictnum, = struct.unpack("i", f.read(4))
        self.worddim, = struct.unpack("i", f.read(4))
        if not isload:
            return
        if wedim is not None:
            if not wedim == self.worddim:
                raise Exception("Embedding size '{}' can't match file '{}' defined size '{}' dictnum='{}'"
                                .format(wedim, self.dictfnm, self.worddim, self.dictnum))
        log.message("\tInfo: word_num = {}\tword_dim = {}"
                    .format(self.dictnum, self.worddim))
        if self.dictnum < len(word2id):
            log.warnning("\tword_dict num '{}' < word_list num '{}' ?"
                         .format(self.dictnum, len(word2id)))
        eb = np.zeros((len(word2id), self.worddim), dtype=np.float32)
        pb = click.progressbar(length=self.dictnum, label="Load word dict")
        for i in range(self.dictnum):
            word_len, = struct.unpack("i", f.read(4))
            word_str, = struct.unpack(str(word_len) + "s", f.read(word_len))
            # `in` replaces dict.has_key(), which no longer exists on
            # Python 3.
            if word_str in word2id:
                wordid = word2id[word_str]
                for j in range(self.worddim):
                    elem_value, = struct.unpack("f", f.read(4))
                    eb[wordid, j] = elem_value
            else:
                # Skip the embedding of words outside the word list.
                for j in range(self.worddim):
                    _, = struct.unpack("f", f.read(4))
            pb.update(1)
        del pb
        print("")
        return eb
def readerProc(flist, chunklist, batchsize, queue, maxepoch, quitEvent):
    """Producer loop: repeatedly load chunk data, cut it into batches and
    push them onto `queue` until `maxepoch` passes over the chunk list
    are done or `quitEvent` is set by someone else.

    Arguments:
        flist: data-file name list (paired with the chunk entries).
        chunklist: list of chunk entries; shuffled in place each epoch.
        batchsize: number of records per emitted batch.
        queue: bounded queue the batches are pushed onto.
        maxepoch: number of full passes over chunklist before stopping.
        quitEvent: shared event; set here on completion, checked for
            external shutdown requests.
    """
    nchunk = len(chunklist)
    assert nchunk > 0
    assert len(flist) > 0
    random.shuffle(chunklist)
    curepoch = 0    # completed passes over chunklist
    buffer = []     # records loaded but not yet emitted
    curptr = 0      # read position inside chunklist, advanced by loadNextChunk
    numchunk = 0    # chunks consumed in the current epoch
    while 1:
        if quitEvent.is_set():
            return
        ## load next batch
        if len(buffer) < batchsize:
            chunkdata, curptr = loadNextChunk(flist, chunklist, batchsize, curptr)
            buffer.extend(chunkdata)
            # NOTE(review): increments by 2 per loadNextChunk call —
            # presumably loadNextChunk consumes two chunk entries at a
            # time; confirm against its implementation.
            numchunk += 2
            if numchunk >= nchunk:
                # One full pass over the chunk list is done.
                curepoch += 1
                if curepoch >= maxepoch:
                    quitEvent.set()
                    return
                random.shuffle(chunklist)
                curptr = 0
                numchunk = 0
            else:
                log.message("Loader: {}/{} chunks loaded ......"
                            .format(numchunk, nchunk))
        batch, buffer = loadNextBatch(buffer, batchsize)
        ## push into queue
        # Spin with a short sleep when the queue is full, so a shutdown
        # request is still noticed while blocked on a full queue.
        while 1:
            try:
                queue.put_nowait(batch)
                break
            except lQueue.Full:
                if quitEvent.is_set():
                    return
                time.sleep(0.1)
                continue
def getmodule(self, config_dict):
    """Import and cache the graph module named by the config 'name' and
    the default 'type'.

    Returns:
        The imported module (also cached in self.inmodule), or None when
        a module was already loaded.

    Raises:
        Exception: on a missing 'name', an explicit 'type' in the config
            (unsupported), or a missing default 'type'.
    """
    if self.inmodule is not None:
        return
    # `in` replaces dict.has_key(), which no longer exists on Python 3.
    if 'name' not in config_dict:
        raise Exception("No parameter 'name' be defined?")
    ## only support default type here!
    if 'type' in config_dict:
        raise Exception("Only support default 'type' define in current version!")
    if 'type' not in self.defaults:
        raise Exception("No parameter 'type' be defined [default]?")
    name = to_lowercase(config_dict['name'])
    # Renamed from `type` to avoid shadowing the builtin.
    graphtype = to_lowercase(self.defaults['type'])
    mnm = "tensorgou.graph" + "." + name + "." + graphtype
    message("Import module {}".format(mnm))
    self.inmodule = loadmodel(mnm)
    return self.inmodule
def doCreateChunkList(arguments):
    """Build the chunk list files for the training dataset.

    Writes two files under arguments.output: the chunk-list info file
    (one data-file name per line) and the chunk-list file (one
    "<fileid>\\t<byte offset>" entry per chunk).

    Arguments:
        arguments: needs `.output` (target directory) and `.trainfnms`
            (comma-separated training file names).

    Raises:
        Exception: when no training file is defined (and whatever
            filterfiles raises for unusable files).
    """
    chunkfnm = os.path.join(arguments.output, nk.sKeychunklist)
    chunkfnminfo = chunkfnm + nk.sKeychunklistinfo
    assert not os.path.exists(chunkfnm)
    log.message("Build train dataset chunklist {} ...".format(chunkfnm))
    trainfiles = arguments.trainfnms.split(',')
    if len(trainfiles) == 0:
        raise Exception("No train data file be defined?")
    trainfilelist = filterfiles(trainfiles)
    fileid = []
    filelen = []
    for i in range(len(trainfilelist)):
        log.message("No.{} file {} ......".format(i + 1, trainfilelist[i]))
        curflen = os.path.getsize(trainfilelist[i])
        # NOTE(review): relies on Python 2 integer division for a whole
        # number of megabytes — confirm before porting to Python 3.
        curflen /= 1024 * 1024
        fileid.append(trainfilelist[i])
        # modify by xjk: chunk size reduced from 64M to 1M.
        #nchunk = int(math.ceil(curflen / 64)) # chunk size == 64M
        nchunk = int(math.ceil(curflen / 1))
        for num in range(nchunk):
            # modify by xjk: offset step reduced from 64M to 1M to match.
            #offset = 64 * num
            offset = 1 * num
            # Historical variant kept for reference (disabled): special
            # handling of the trailing partial chunk.
            """
            if num == nchunk - 1:
                rest = curflen - offset
                assert rest <= 64
                assert rest > 0
                filelen.append("{}\t{}".format(i, offset * 1024 * 1024))
            else:
            """
            filelen.append("{}\t{}".format(i, offset * 1024 * 1024))
    # Write out info: file-name list first, then the chunk entries.
    log.message("Write chunk list file info {} ...".format(chunkfnminfo))
    with open(chunkfnminfo, "w") as f:
        for item in fileid:
            f.write("%s\n" % item)
    log.message("Write chunk list file {} ...".format(chunkfnm))
    with open(chunkfnm, "w") as f:
        ## f.write("%s\n" % len(filelen))
        for item in filelen:
            f.write("%s\n" % item)
def evalit(self, loops, results):
    """Aggregate cost and accuracy over `results` and log one summary line.

    Arguments:
        loops: global step counter at the time of this call.
        results: list of per-step results; each item is at least
            (cost, correct_count).

    Returns:
        False (never requests an early stop).

    Raises:
        Exception: on malformed result items or a non-positive step count.
    """
    l_cost = 0.0
    l_correctnum = 0
    l_step = loops - self.steps
    self.steps = loops
    for item in results:
        if len(item) < 2:
            raise Exception("Bad result type! Expect {}, get {}".format(
                3, len(item)))
        l_cost += item[0]
        l_correctnum += item[1]
    count = l_step * self.batchsize
    if count <= 0:
        raise Exception("Bad l_step {}? Expect > 0".format(l_step))
    l_avgcost = l_cost / count
    # BUGFIX: on Python 2, int / int truncates, so accuracy was always
    # reported as 0; force float division (matches the other evalit).
    l_accuracy = float(l_correctnum) / count
    log.message("No.{} batch: Cost[{:.5f}] Accuracy[{:.5f}]".format(
        loops + 1, l_avgcost, l_accuracy))
    return False
def load_file(self, path):
    """Load an INI configuration file and build the argument namespace.

    Exits the process (after logging the error and traceback) when any
    step of the load chain fails.

    Returns:
        (arguments, inmodule): the populated Namespace and the module
        imported while loading.
    """
    message("Loading INI file: '{}'".format(path))
    try:
        arguments = Namespace()
        conf = load_config_file(path, self.ignored)
        self.getmodule(conf)
        self.buildparameter()
        self._check_loaded_conf(conf)
        # Validate every field against its registered condition, then
        # copy it onto the namespace.
        for key, val in conf.items():
            if key in self.conditions and not self.conditions[key](val):
                code = self.conditions[key].__code__
                raise Exception(
                    "Value of field '{}' does not satisfy "
                    "condition defined at {}:{}."
                    .format(key, code.co_filename, code.co_firstlineno))
            setattr(arguments, key, val)
        # Fill in defaults for anything the file did not set.
        for key, val in self.defaults.items():
            if key not in arguments.__dict__:
                arguments.__dict__[key] = val
        message("INI file loaded.")
    except Exception as exc:
        message("Failed to load INI file: {}".format(exc))
        traceback.print_exc()
        exit(1)
    self.printparameter(arguments)
    return arguments, self.inmodule