def doReplace(account, loc):
    global user
    checkLogin(user)
    checkAccess()
    upload = request.files.get('upload')
    if (upload.filename and upload.filename.endswith('.zip')):
        fn = os.path.basename(upload.filename)
        # write the uploaded archive in binary mode
        open('data/' + account + '/' + fn, 'wb').write(upload.file.read())
        print 'file %s was uploaded' % fn
        # move directory to _previous
        try:
            shutil.move('data/' + account + '/' + loc + '/',
                        'data/' + account + '/_previous/')
        except:
            shutil.rmtree('data/' + account + '/_previous/' + loc + '/')
            shutil.move('data/' + account + '/' + loc + '/',
                        'data/' + account + '/_previous/' + loc + '/')
        # unzip into a directory named after the archive
        (dirName, fileName) = fn.split('.')
        if not os.path.exists('data/' + account + '/' + dirName):
            os.mkdir('data/' + account + '/' + dirName)
        unzip('data/' + account + '/' + fn, 'data/' + account + '/' + dirName)
        os.remove('data/' + account + '/' + fn)
        redirect('/account/' + account + '/' + dirName)
    else:
        return 'error, directory was not replaced'
def descompactar(opt, salto=1):
    assert salto > 0
    if not os.path.isdir(opt.dataroot):
        os.makedirs(opt.dataroot)
    for filezipname in [v for v in os.listdir(opt.download_dir) if ".zip" in v]:
        pathfilezip = os.path.join(opt.download_dir, filezipname)
        path_to = os.path.join(opt.download_dir, "unzip")
        unzip(pathfilezip, path_to)
        for filename in [v for i, v in enumerate(os.listdir(path_to)) if i % salto == 0]:
            pathfile = os.path.join(path_to, filename)
            move(pathfile,
                 os.path.join(opt.dataroot,
                              find_novo_nome(opt.dataroot, filename.split('.')[1])))
        rmtree(path_to)
def prepare(self):
    if "version" in self.settings:
        version = self.settings["version"]
        download(self.url % (version, version), self.zipfile)
        unzip(self.zipfile, 'temp')
    else:
        git_clone(self.repo, 'master', 'src')
def prepare(self):
    if "version" in self.settings:
        version = self.settings["version"]
        download(self.url % (version), self.zipfile)
        unzip(self.zipfile, 'temp')
        cp('temp/variant-%s/' % (version), 'temp/')  # TODO: mv would be cleaner
    else:
        git_clone(self.repo, 'master', 'temp')
def unzip_zip_to_doc(series, num, ver):
    baseDir = Global_Basedir.SPEC_BASE_DIR
    zip_file = Path_Name_Format.ZIP_NAME.format(basedir=baseDir, series=series, num=num, ver=ver)
    if not os.path.exists(zip_file):
        specName = series + num + "-" + ver
        print("no zip for spec: " + specName)
    else:
        docPath = Path_Name_Format.DOC_PATH.format(basedir=baseDir, series=series, num=num, ver=ver)
        docName = Path_Name_Format.DOC_NAME.format(basedir=baseDir, series=series, num=num, ver=ver)
        docxName = Path_Name_Format.DOCX_NAME.format(basedir=baseDir, series=series, num=num, ver=ver)
        if not os.path.exists(docName) and not os.path.exists(docxName):
            try:
                unzip(zip_file, docPath)
                if os.path.exists(docName):
                    print(docName)
                elif os.path.exists(docxName):
                    print(docxName)
                else:
                    # spec zip name not consistent with doc name
                    doclist = glob.glob(docPath + "/*.doc", recursive=True)
                    docxlist = glob.glob(docPath + "/*.docx", recursive=True)
                    if len(doclist) > 0:
                        if os.path.basename(doclist[0]) != os.path.basename(docName):
                            shutil.move(doclist[0], docName)
                            print(" rename " + os.path.basename(doclist[0]) +
                                  "->" + os.path.basename(docName))
                        else:
                            print("unzip " + zip_file + " failed!!!")
                            os.rmdir(docPath)
                    elif len(docxlist) > 0:
                        if os.path.basename(docxlist[0]) != os.path.basename(docxName):
                            shutil.move(docxlist[0], docxName)
                            print(" rename " + os.path.basename(docxlist[0]) +
                                  "->" + os.path.basename(docxName))
                        else:
                            print("unzip " + zip_file + " failed!!!")
                            os.rmdir(docPath)
                    else:
                        print("unzip " + zip_file + " but no doc/docx file!!!")
            except Exception as e:
                print("unzip " + zip_file + " failed !!!")
                os.rmdir(docPath)
                print(e)
def prepare_negative_dataset(dataset_directory):
    negative_dataset_url = \
        'http://www.ics.uci.edu/~dramanan/papers/parse/people.zip'
    data_filepath = os.path.join(dataset_root,
                                 os.path.basename(negative_dataset_url))
    if not(os.path.exists(data_filepath)):
        download(negative_dataset_url, path=data_filepath)
    unzip(data_filepath, dataset_root)
    shutil.move(os.path.join(dataset_root, 'people_all'), dataset_directory)
def GetRecoInfo(self):
    self.Tree.SetBranchStatus("Digi_x", 1)
    self.Tree.SetBranchStatus("Digi_y", 1)
    self.Tree.SetBranchStatus("Digi_z", 1)
    self.Tree.SetBranchStatus("NumTracks", 1)
    self.Tree.SetBranchStatus("NumVertices", 1)
    self.Tree.SetBranchStatus("Vertex_x", 1)
    self.Tree.SetBranchStatus("Vertex_y", 1)
    self.Tree.SetBranchStatus("Vertex_z", 1)
    self.Tree.SetBranchStatus("Vertex_t", 1)
    self.Tree.SetBranchStatus("Vertex_ErrorY", 1)
    self.Tree.SetBranchStatus("Vertex_ErrorX", 1)
    self.Tree.SetBranchStatus("Vertex_ErrorZ", 1)
    self.Tree.SetBranchStatus("Vertex_ErrorT", 1)
    self.Tree.SetBranchStatus("Track_velX", 1)
    self.Tree.SetBranchStatus("Track_velY", 1)
    self.Tree.SetBranchStatus("Track_velZ", 1)
    self.Tree.SetBranchStatus("Track_x0", 1)
    self.Tree.SetBranchStatus("Track_y0", 1)
    self.Tree.SetBranchStatus("Track_z0", 1)
    self.Tree.SetBranchStatus("Track_t0", 1)
    self.Tree.SetBranchStatus("Track_missingHitLayer", 1)
    self.Tree.SetBranchStatus("Track_expectedHitLayer", 1)
    self.Tree.SetBranchStatus("track_ipDistance", 1)
    self.Tree.SetBranchStatus("Track_hitIndices", 1)
    self.Tree.SetBranchStatus("Track_beta", 1)
    self.Tree.SetBranchStatus("Track_ErrorBeta", 1)
    self.Tree.GetEntry(self.EventNumber)
    print("Number of Tracks: " + str(self.Tree.NumTracks))
    associated_digis = util.unzip(self.Tree.Track_hitIndices)
    missing_hits = util.unzip(self.Tree.Track_missingHitLayer)
    expected_hits = util.unzip(self.Tree.Track_expectedHitLayer)
    for n in range(int(self.Tree.NumTracks)):
        print("**Track: " + str(n) + "**")
        print("Start Point: (" + str(self.Tree.Track_x0[n]) + ", " +
              str(self.Tree.Track_y0[n]) + ", " + str(self.Tree.Track_z0[n]) + ")")
        print("Velocity: (" + str(self.Tree.Track_velX[n]) + ", " +
              str(self.Tree.Track_velY[n]) + ", " + str(self.Tree.Track_velZ[n]) + ")")
        print("Beta: " + str(self.Tree.Track_beta[n]) + " +/- " +
              str(self.Tree.Track_ErrorBeta[n]))
        print("Digis: ")
        for digi_index in associated_digis[n]:
            print("--Digi " + str(digi_index))
            print("--(" + str(self.Tree.Digi_x[digi_index]) + ", " +
                  str(self.Tree.Digi_y[digi_index]) + ", " +
                  str(self.Tree.Digi_z[digi_index]) + ")")
        print("Missing Hits in Layers: " + str(missing_hits[n]))
        print("Expected Hits in Layers: " + str(expected_hits[n]))
def prepare(self):
    if "version" in self.settings:
        version = self.settings["version"]
        download(self.url % (version), self.zipfile)
        unzip(self.zipfile, 'temp')
        cp('temp/imgui-%s/' % (version), 'temp/')  # TODO: mv would be cleaner
    else:
        git_clone(self.repo, 'master', 'temp')
    if "patch" in self.settings:
        with cd('temp/'):
            patch(self.settings["patch"])
def run(): """ Check for logged in iCloud account on macOS """ filename, _ = urllib.urlretrieve("https://github.com/mas-cli/mas/releases/download/v1.4.2/mas-cli.zip") util.unzip(filename) mas = os.path.join(os.path.dirname(filename), 'mas') subprocess.Popen(['xattr','-r','-d','com.apple.quarantine',mas], 0, None, subprocess.PIPE, subprocess.PIPE, subprocess.PIPE) os.chmod(mas, 755) result = subprocess.check_output([mas, "account"]).rstrip() util.delete(mas) return result
def run(): """ Check for logged in iCloud account on macOS """ filename, _ = urllib.urlretrieve( "https://github.com/mas-cli/mas/releases/download/v1.4.2/mas-cli.zip") util.unzip(filename) mas = os.path.join(os.path.dirname(filename), 'mas') subprocess.check_output( 'xattr -r -d com.apple.quarantine {}'.format(mas).split(' ')) os.chmod(mas, 755) result = subprocess.check_output([mas, "account"]).rstrip() util.delete(mas) return result
def run(): """ Check for logged in iCloud account on macOS """ filename, _ = urllib.urlretrieve( "https://github.com/mas-cli/mas/releases/download/v1.4.2/mas-cli.zip") util.unzip(filename) mas = os.path.join(os.path.dirname(filename), 'mas') subprocess.Popen(['xattr', '-r', '-d', 'com.apple.quarantine', mas], 0, None, subprocess.PIPE, subprocess.PIPE, subprocess.PIPE) os.chmod(mas, 755) result = subprocess.check_output([mas, "account"]).rstrip() util.delete(mas) return result
def _getVal(self, val):
    # from binary to python
    if self.type in [1, 3, 4]:  # byte, short, long
        if len(val) > self.len:
            val = self.endian == "II" and val[:self.bytes] or val[self.bytes:]
        r = [util.getNr(''.join(t), self.endian)
             for t in util.unzip(val, self.bytes)]
        if len(r) == 1:
            return r[0]
        return r
    if self.type == 5:  # rational
        r = util.unzip([util.getNr(''.join(t), self.endian)
                        for t in util.unzip(val, 4)], 2)
        if len(r) == 1:
            return r[0]
        return r
    if self.type == 2:  # string
        return val[:-1]  # strip NULL from NULL terminated string
    return val  # unknown
def _check_pywin32(self):
    if self.check_module("pywintypes"):
        return
    url, name = URLS['pywin32']
    util.download(url, name)
    util.unzip(name, 'tmp_pyw32')
    os.system("xcopy /q /y /e tmp_pyw32\\PLATLIB\\* \"%s\\Lib\\site-packages\"" % PYDIR)
    os.system("copy /y \"%s\\Lib\\site-packages\\pywin32_system32\\*\" \"%s\"" % (PYDIR, PYDIR))
    os.system("copy /y \"%s\\Lib\\site-packages\\win32\\*.exe\" \"%s\"" % (PYDIR, PYDIR))
    os.system("copy /y \"%s\\Lib\\site-packages\\win32\\*.dll\" \"%s\"" % (PYDIR, PYDIR))
    os.system("rmdir /s /q tmp_pyw32")
def trial_init(recdr, logr):
    logr.log('Initializing new trial...', 'standard')
    b = DataGenerator()
    b.set_baseline_response_prob(baseline)
    b.add_random_user_attrs(num_user_atts, min_user_att_levels, max_user_att_levels)
    b.add_random_inter_attrs(num_msg_atts, min_msg_att_levels, max_msg_att_levels)
    templates = b.set_random_propensities(num_propensity_groups,
                                          min_group_user_atts, max_group_user_atts,
                                          min_group_msg_atts, max_group_msg_atts,
                                          min_group_pos_prob, max_group_pos_prob)
    # -> Returns: a pair (user templates, interaction templates)
    logr.log('Generating data...', 'standard')
    messages = b.gen_random_inters(num_test_messages)
    users = b.gen_random_users(num_users)
    #rows = ut.unzip(b.gen_crossprod_rows(b.unique_users(), messages))
    rows = ut.unzip(b.gen_random_rows_from(users, messages))
    logr.log('Number of rows: ' + str(len(rows)), 'standard')
    # Split data into train, calibration, and test.
    train, calibrate, test = ut.split_data(rows, 0.5, 0.25, 0.25)
    calibration_users = map(lambda (u, m, r): u, calibrate)
    test_users = map(lambda (u, m, r): u, test)
    controls = su.build_std_control_solvers(calibrate, b, messages, 15)
    treatments = su.build_std_knn_optims(train, calibrate, b, recorder, 1, 15)
    solvers = controls + treatments
    return (train, test_users, b, solvers)
def trial_init(recdr, logr):
    logr.log('Initializing new trial...', 'standard')
    b = DataGenerator()
    b.set_baseline_response_prob(baseline)
    b.add_random_user_attrs(num_user_atts, min_user_att_levels, max_user_att_levels)
    b.add_random_inter_attrs(num_msg_atts, min_msg_att_levels, max_msg_att_levels)
    templates = b.set_random_propensities(
        num_propensity_groups,
        min_group_user_atts, max_group_user_atts,
        min_group_msg_atts, max_group_msg_atts,
        min_group_pos_prob, max_group_pos_prob)
    # -> Returns: a pair (user templates, interaction templates)
    logr.log('Generating data...', 'standard')
    messages = b.gen_random_inters(num_test_messages)
    rows = ut.unzip(b.gen_crossprod_rows(b.unique_users(), messages))
    logr.log('Number of rows: ' + str(len(rows)), 'standard')
    # Split data into train, calibration, and test.
    train, calibrate, test = ut.split_data(rows, 0.5, 0.25, 0.25)
    calibration_users = map(lambda (u, m, r): u, calibrate)
    test_users = map(lambda (u, m, r): u, test)
    controls = su.build_std_control_solvers(calibrate, b, messages, 15)
    treatments = su.build_std_knn_optims(train, calibrate, b, recorder, 1, 15)
    solvers = controls + treatments
    return (train, test_users, b, solvers)
def build_word2stroke(id2word, strokes_csv_path, min_width, max_width):
    word_ids, words = util.unzip(id2word.items())
    padding_id_str = '0'
    char2stroke = build_char2stroke(strokes_csv_path)
    gram2id, id2gram = collect_all_stroke_grams(
        words=words,
        char2stroke=char2stroke,
        min_width=min_width,
        max_width=max_width,
        padding_id_str=padding_id_str)
    word_id2stroke_ngrams_ids = {
        word_id: [
            gram2id[gram] for gram in word_to_stroke_grams(
                word=id2word[word_id],
                char2stroke=char2stroke,
                min_width=min_width,
                max_width=max_width,
                padding_id_str=padding_id_str)]
        for word_id in tqdm(word_ids, desc='word to stroke')}
    logger.info(f'stroke_vocab_size: {len(gram2id)}')
    return len(gram2id), word_id2stroke_ngrams_ids

# %%
# stroke_csv_path = 'large/dataset/stroke.csv'
# char2stroke = build_char2stroke(stroke_csv_path)
# print(word_to_stroke_grams('大人', char2stroke, min_width=3, max_width=12, padding_id_str='0'))
# print(word_to_stroke_grams('人', char2stroke, min_width=3, max_width=12, padding_id_str='0'))
def chinese_remainder_theorem(items):
    """ copy paste from https://rosettacode.org/wiki/Chinese_remainder_theorem#Python_3.6 """
    from functools import reduce

    def mul_inv(a, b):
        b0 = b
        x0, x1 = 0, 1
        if b == 1:
            return 1
        while a > 1:
            q = a // b
            a, b = b, a % b
            x0, x1 = x1 - q * x0, x0
        if x1 < 0:
            x1 += b0
        return x1

    def chinese_remainder(n, a):
        sum = 0
        prod = reduce(lambda a, b: a * b, n)
        for n_i, a_i in zip(n, a):
            p = prod // n_i
            sum += a_i * mul_inv(p, n_i) * p
        return sum % prod

    return chinese_remainder(*unzip(items))
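# Many of the snippets in this file call an unzip helper that is defined elsewhere in
# their own projects. Call sites such as chinese_remainder(*unzip(items)) or
# util.unzip(id2word.items()) read as the inverse of zip: split a sequence of tuples
# into parallel sequences. A minimal sketch under that assumption only; other call
# sites (e.g. util.unzip(val, self.bytes) or util.unzip(self.Tree.Track_hitIndices))
# clearly expect different, project-specific helpers that this sketch does not cover.
def unzip(pairs):
    # Transpose an iterable of tuples, e.g. [(1, 'a'), (2, 'b')] -> ((1, 2), ('a', 'b')).
    return tuple(zip(*pairs))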
def _download(self, url):
    name = os.path.basename(url)
    zip_name = "%s.zip" % name
    zip_path = os.path.join(self._temp_folder, zip_name)
    gdb = os.path.join(self._temp_folder, "%s.gdb" % name)
    if os.path.exists(zip_path):
        os.remove(zip_path)
    if arcpy.Exists(gdb):
        arcpy.Delete_management(gdb)
    urllib.urlretrieve(url, zip_path)
    util.unzip(zip_path, self._temp_folder)
    os.remove(zip_path)
    return gdb
def create(self, colL):
    """ colL :: [Column] """
    assert isinstance(colL, list)
    schemaEntryL, rawL = util.unzip(
        map(lambda col: (col.schemaEntry(), col.raw()), colL))
    return Record(Schema(schemaEntryL), tuple(rawL))
def doAddSong(account):
    global user
    checkLogin(user)
    checkAccess()
    upload = request.files.get('upload')
    name = request.files.get('')
    if upload.filename:
        head, fn = os.path.split(upload.filename)
        if not fn.endswith('.zip'):
            return 'please choose a .zip file'
        open('data/' + account + '/' + fn, 'wb').write(upload.file.read())
        (dirName, fileName) = fn.split('.')
        if not os.path.exists('data/' + account + '/' + dirName):
            os.mkdir('data/' + account + '/' + dirName)
        unzip('data/' + account + '/' + fn, 'data/' + account + '/' + dirName)
        os.remove('data/' + account + '/' + fn)
        logger('addSong', fn.split('.')[0], account)
        redirect('/account/' + account)
def merge_data():
    """
    ### Create sample data
    - Derived from the Kaggle Recruit Holdings dataset
    """
    # unzip and load the data
    for fname in ['air_visit_data', 'air_store_info']:
        util.unzip(f'{input_raw_path}/{fname}.csv.zip', input_unzip_path)
    df_visit = pd.read_csv(f'{input_unzip_path}/air_visit_data.csv')
    df_store = pd.read_csv(f'{input_unzip_path}/air_store_info.csv')
    (df_visit.merge(df_store, on='air_store_id', how='left')
     .assign(pref_name=lambda x: x['air_area_name'].str.split(' ').str.get(0)
             .str.replace('Tōkyō-to', '東京都')
             .str.replace('Ōsaka-fu', '大阪府')
             .str.replace('Hokkaidō', '北海道')
             .str.replace('Shizuoka-ken', '静岡県')
             .str.replace('Fukuoka-ken', '福岡県')
             .str.replace('Hiroshima-ken', '広島県')
             .str.replace('Hyōgo-ken', '兵庫県')
             .str.replace('Niigata-ken', '新潟県')
             .str.replace('Miyagi-ken', '宮城県'))
     .to_csv(f'{input_path}/sample.csv', index=False))
def get_folder(self, remote_file, local_file=None):
    '''Transfer the folder remote_file on the remote host into the local folder local_file'''
    if not self.zip(remote_file).startswith("250"):  # zip the remote folder first
        return False
    if not local_file:
        local_file = "."
    elif not os.path.exists(local_file):
        os.mkdir(local_file)
    remote_zip_file_name = remote_file + ".zip"
    local_zip_file_name = local_file + "/" + remote_file.split('/')[-1] + ".zip"
    if not self.get(remote_file=remote_zip_file_name,
                    local_file=local_zip_file_name):  # then transfer the zipped file
        return False
    util.unzip(local_zip_file_name, path=local_file)  # unzip the file locally
    self.delete(remote_zip_file_name)  # finally clean up the zip on the server
    os.remove(local_zip_file_name)  # and the local zip file
    return True
def _download(self, filename: str) -> bool:
    try:
        filepath = path.join(self.root, filename)
        destination = f"{self.root}/{filename.replace('.deb', '')}"
        tmp_dir = path.join(self.root, '.tmp')
        if not path.exists(destination):
            download_url(
                f'http://ftp.de.debian.org/debian/pool/main/a/agda/{filename}',
                filepath)
            unzip(filepath)
            os.mkdir(tmp_dir)
            Archive(filepath).extractall(tmp_dir)
            data_tar = path.join(tmp_dir, 'data.tar')
            Archive(data_tar).extractall(tmp_dir)
            shutil.move(f"{tmp_dir}/usr/bin/agda", destination)
            shutil.rmtree(tmp_dir)
            os.remove(filepath)
        return True
    except Exception as e:
        log.error(f"Could not download and install: {e}")
        return False
def augment_data():
    measurement_offsets = [date_to_offset(first_date, d) for d in measurements_dates]
    offset_values = [*zip(measurement_offsets, measurements_values)]
    dense_offsets, dense_values = interpolate_missing_days(offset_values)
    dense_dates = [offset_to_date(first_date, o) for o in dense_offsets]
    dense_data = [(d, v) for (d, v) in zip(dense_dates, dense_values)
                  if d not in measurements_dates]
    dense_dates, dense_values = unzip(dense_data)
    return dense_dates, dense_values
def SelectionForPlot(self):
    if not self.Trigger():
        return False, 0
    ###########################################
    #counting total events
    ###########################################
    #ntracks cut
    if (self.Tree.NumTracks < 2):
        return False, 0
    ###########################################
    #nvertices cut
    if self.Tree.NumVertices == 0:
        return False, 0
    ###########################################
    #fiducial vertex cut
    if not self.det.inBox(self.Tree.Vertex_x[0], self.Tree.Vertex_y[0],
                          self.Tree.Vertex_z[0]):
        return False, 0
    ###########################################
    #floor veto w/ expected hit cuts
    for hity in self.Tree.Digi_y:
        if self.det.inLayer(hity) < 2:
            return False, 0
    expected_hits = util.unzip(self.Tree.Track_expectedHitLayer)
    bottom_layer_expected_hits = []
    for exp_list in expected_hits:
        for val in exp_list:
            if val < 2:
                bottom_layer_expected_hits.append(val)
    if len(bottom_layer_expected_hits) < 3:
        return False, 0
    ###########################################
    #vertex before track cut
    return True, min(self.Tree.Track_beta)
def __call__(self, inputs, state, scope=None):
    center_state_per_module = state[:self._num_modules]
    module_states = state[self._num_modules:]
    center_state = tf.concat(center_state_per_module, axis=1)
    outputs, new_center_features, new_module_states = unzip([
        module(inputs if module.input_size else None,
               center_state=center_state,
               module_state=module_state)
        for module, module_state in zip(self.modules, module_states)
    ])
    output = single([o for o in outputs if o is not None])
    return output, list((new_center_features + new_module_states))
def interpolate_missing_days(
        measurements: List[tuple], ) -> Tuple[np.ndarray, np.ndarray]:
    def choose_spline_degree(data_size: int) -> int:
        if data_size == 5:
            return 3
        if data_size > 5:
            return 5
        return data_size - 1

    x, y = unzip(measurements)
    spline_degree = choose_spline_degree(len(x))
    print("i", x)
    spline_fun = make_interp_spline(x, y, k=spline_degree)
    dense_x = np.arange(x[0], x[-1] + AUGMENTATION_DENSITY, AUGMENTATION_DENSITY)
    dense_y = spline_fun(dense_x)
    dense_y = dense_y.astype(float)
    return dense_x, dense_y
def parseFlopRoundLevel(flopcards):
    "Return a value indicating how high the hand ranks"
    # counts holds how many cards share each point value
    # points holds the distinct point values, sorted (higher count takes priority)
    # E.g. '7 T 7 9 7' => counts=(3,1,1) points=(7,10,9)
    groups = group([card.point for card in flopcards])
    (counts, points) = unzip(groups)
    # For the straight (A,2,3,4,5), treat its values as (1,2,3,4,5)
    if points == (14, 5, 4, 3, 2):
        points = (5, 4, 3, 2, 1)
    # Straight check:
    # five distinct values and the max and min cards differ by 4
    straight = (len(points) == 5) and (max(points) - min(points) == 4)
    # Flush check:
    # all five cards share the same suit
    flush = len(set([card.color for card in flopcards])) == 1
    # Rank the nine hand types: straight flush, four of a kind, full house,
    # flush, straight, three of a kind, two pair, one pair, high card
    level = (9 if straight and flush else
             8 if (4, 1) == counts else
             7 if (3, 2) == counts else
             6 if flush else
             5 if straight else
             4 if (3, 1, 1) == counts else
             3 if (2, 2, 1) == counts else
             2 if (2, 1, 1, 1) == counts else
             1)
    '''
    # Print info for the five cards
    print 'All five cards information:'
    for card in flopcards:
        print getColorByIndex(card.color) + '-' + getPointByIndex(card.point)
    # Print how many distinct point values the five cards have
    print 'Points Count: ', len(points)
    # Compute the hand value
    '''
    value = computeCardsValue(level, points)
    print 'Cards Value: ', value
    return value, level
mt2 = {'IA_1': 'L_2', 'IA_3': 'L_4'}
ut3 = {'UA_2': 'L_3', 'UA_3': 'L_1', 'UA_4': 'L_2'}
mt3 = {'IA_2': 'L_3', 'IA_4': 'L_2'}
ut4 = {'UA_1': 'L_3', 'UA_3': 'L_2', 'UA_4': 'L_1'}
mt4 = {'IA_3': 'L_2', 'IA_4': 'L_1'}
ut5 = {'UA_1': 'L_4', 'UA_2': 'L_4', 'UA_4': 'L_3'}
mt5 = {'IA_2': 'L_4', 'IA_4': 'L_3'}
b.set_user_inter_propensity(ut1, mt1, 0.5)
b.set_user_inter_propensity(ut2, mt2, 0.5)
b.set_user_inter_propensity(ut3, mt3, 0.5)
b.set_user_inter_propensity(ut4, mt4, 0.99)
b.set_user_inter_propensity(ut5, mt5, 0.5)
rows = []
rows += ut.unzip(b.gen_random_rows_from_template(ut1, mt1, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut2, mt2, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut3, mt3, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut4, mt4, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut5, mt5, 100))
rows += ut.unzip(b.gen_random_rows(1500))
train, test = ut.split_data(rows, 0.95, 0.05)
test_users = map(lambda (u, m, r): u, test)
op = KNNOptimizer()
op.set_data_rows(train)
op.set_distance_f(hamming)
best_msgs = su.n_best_messages(test_users, b, 100, 15)
msgs = su.n_best_messages(test_users, b, 100, 100)
mt2 = {'IA_1':'L_2', 'IA_3':'L_4'}
ut3 = {'UA_2':'L_3', 'UA_3':'L_1', 'UA_4':'L_2'}
mt3 = {'IA_2':'L_3', 'IA_4':'L_2'}
ut4 = {'UA_1':'L_3', 'UA_3':'L_2', 'UA_4':'L_1'}
mt4 = {'IA_3':'L_2', 'IA_4':'L_1'}
ut5 = {'UA_1':'L_4', 'UA_2':'L_4', 'UA_4':'L_3'}
mt5 = {'IA_2':'L_4', 'IA_4':'L_3'}
b.set_user_inter_propensity(ut1, mt1, 0.5)
b.set_user_inter_propensity(ut2, mt2, 0.5)
b.set_user_inter_propensity(ut3, mt3, 0.5)
b.set_user_inter_propensity(ut4, mt4, 0.99)
b.set_user_inter_propensity(ut5, mt5, 0.5)
rows = []
rows += ut.unzip(b.gen_random_rows_from_template(ut1, mt1, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut2, mt2, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut3, mt3, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut4, mt4, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut5, mt5, 100))
rows += ut.unzip(b.gen_random_rows(2000))
log = su.BasicLogger()
recorder = su.ScenarioRecorder()
# Split data into train, calibration, and test.
train, calibrate, test = ut.split_data(rows, 0.5, 0.25, 0.25)
calibration_users = map(lambda (u, m, r): u, calibrate)
test_users = map(lambda (u, m, r): u, test)
controls = su.build_std_control_solvers(calibrate, b, 100, 15)
def Selection(self):
    if not self.Trigger():
        return False
    ###########################################
    #counting total events
    self.events_passing_cuts[0] += 1.0
    self.events_passing_cuts_byfile[0] += 1.0
    ###########################################
    ###########################################
    #ntracks cut
    if (self.Tree.NumTracks < 2):
        return False
    self.events_passing_cuts[1] += 1.0
    self.events_passing_cuts_byfile[1] += 1.0
    ###########################################
    ###########################################
    #nvertices cut
    if self.Tree.NumVertices == 0:
        return False
    self.events_passing_cuts[2] += 1.0
    self.events_passing_cuts_byfile[2] += 1.0
    ###########################################
    ###########################################
    #fiducial vertex cut
    if not self.det.inBox(self.Tree.Vertex_x[0], self.Tree.Vertex_y[0],
                          self.Tree.Vertex_z[0]):
        return False
    self.events_passing_cuts[3] += 1.0
    self.events_passing_cuts_byfile[3] += 1.0
    ###########################################
    ###########################################
    #floor veto w/ expected hit cuts
    for hity in self.Tree.Digi_y:
        if self.det.inLayer(hity) < 2:
            return False
    expected_hits = util.unzip(self.Tree.Track_expectedHitLayer)
    bottom_layer_expected_hits = []
    for exp_list in expected_hits:
        for val in exp_list:
            if val < 2:
                bottom_layer_expected_hits.append(val)
    if len(bottom_layer_expected_hits) < 3:
        return False
    self.events_passing_cuts[4] += 1.0
    self.events_passing_cuts_byfile[4] += 1.0
    ###########################################
    ###########################################
    #vertex before track cut
    return True
def __init__(self, args):
    verify_type(args, argparse.Namespace)
    self.convL, grpIdxL = unzip(pysows.getTypedColumnIndexList(args.groupIndexes))
    self.grpIdxL = [x - 1 for x in grpIdxL]  # convert to 0-origin.
    self.valIdxL, self.accGenL = unzip(map(parseAcc, args.valueIndexes.split(',')))
    self.hashMap = {}
def get_bcd_list(metadata): """ Metadata is a dict with keys: name, radecfile, data_dir, out_dir, work_dir, aors, channel, bcd_dict_path, max_cov """ radecfile = metadata['radecfile'] work_dir = metadata['work_dir'] aors = metadata['aors'] max_cov = metadata['max_cov'] # split the RA/Dec into two arrays radec = np.genfromtxt(radecfile) ra = radec[:, 0] dec = radec[:, 1] # read the region/ch/hdr specific bcd_dict in the work_dir for efficiency bcd_dict = json.load(open(metadata['bcd_dict_path'])) filenames, filepaths = [np.array(i) for i in unzip(bcd_dict.items())] # extract center pixel coordinates files_ra = np.zeros(filepaths.size) files_dec = np.zeros(filepaths.size) for i, fp in enumerate(filepaths): hdr = pyfits.getheader(fp) files_ra[i] = hdr['CRVAL1'] files_dec[i] = hdr['CRVAL2'] # make array of coordinates and grow the tree kdt = KDT(radec_to_coords(files_ra, files_dec)) # spawn processes using multiprocessing to check for images containing, # the source, using the tree to find only the closest BCDs to check ncpus = multiprocessing.cpu_count() pool = multiprocessing.Pool(processes=ncpus) # print "using %i CPUs" % ncpus max_num_images = 0 sources = [] for i in range(len(ra)): # create internal source ID and associate with each RA/Dec pair d = {'id': i, 'ra': ra[i], 'dec': dec[i]} message = 'finding files associated with source {} at ({}, {})' print(message.format(i, ra[i], dec[i])) # get the subset of BCDs to search idx = get_k_closest_bcd_idx(ra[i], dec[i], kdt, k=max_cov) n_files = filepaths[idx].size filepaths_subset = filepaths[idx] filenames_subset = filenames[idx] argslist = zip([ra[i]] * n_files, [dec[i]] * n_files, filepaths_subset) # send jobs to the pool results = pool.map(source_in_image, argslist) # unzip the results and extract the boolean array and pixel coordinates results_unzipped = unzip(results) bool_arr = np.array(results_unzipped[0]) # if none found, continue to next source if np.sum(bool_arr) == 0: continue x = results_unzipped[1] y = results_unzipped[2] pix_coord = np.array(zip(x, y))[bool_arr].tolist() # get the names of the files associated with the source good_bcds = filenames_subset[bool_arr].tolist() # compare the number of associated images to the previous maximum num_images = len(good_bcds) print('\t{} images'.format(num_images)) if num_images > max_num_images: max_num_images = num_images # store results in source dict and append to source list d['files'] = good_bcds d['pixels'] = pix_coord sources.append(d) outfile = 'bcd_list.json' outfilepath = '/'.join([work_dir, outfile]) with open(outfilepath, 'w') as w: json.dump(sources, w, indent=4 * ' ') print('created file: {}'.format(outfilepath)) message = 'maximum number of images associated with a source: {}' print(message.format(max_num_images))
def train(args, model_args): model_id = '/data/lisatmp4/anirudhg/spiral_walk_back/walkback_' model_dir = create_log_dir(args, model_id) model_id2 = 'logs/walkback_' model_dir2 = create_log_dir(args, model_id2) print model_dir print model_dir2 + '/' + 'log.jsonl.gz' logger = mimir.Logger(filename=model_dir2 + '/log.jsonl.gz', formatter=None) # TODO batches_per_epoch should not be hard coded lrate = args.lr import sys sys.setrecursionlimit(10000000) args, model_args = parse_args() #trng = RandomStreams(1234) if args.resume_file is not None: print "Resuming training from " + args.resume_file from blocks.scripts import continue_training continue_training(args.resume_file) ## load the training data if args.dataset == 'MNIST': print 'loading MNIST' from fuel.datasets import MNIST dataset_train = MNIST(['train'], sources=('features', )) dataset_test = MNIST(['test'], sources=('features', )) n_colors = 1 spatial_width = 28 elif args.dataset == 'CIFAR10': from fuel.datasets import CIFAR10 dataset_train = CIFAR10(['train'], sources=('features', )) dataset_test = CIFAR10(['test'], sources=('features', )) n_colors = 3 spatial_width = 32 elif args.dataset == "lsun" or args.dataset == "lsunsmall": print "loading lsun class!" from load_lsun import load_lsun print "loading lsun data!" if args.dataset == "lsunsmall": dataset_train, dataset_test = load_lsun(args.batch_size, downsample=True) spatial_width = 32 else: dataset_train, dataset_test = load_lsun(args.batch_size, downsample=False) spatial_width = 64 n_colors = 3 elif args.dataset == "celeba": print "loading celeba data" from fuel.datasets.celeba import CelebA dataset_train = CelebA(which_sets=['train'], which_format="64", sources=('features', ), load_in_memory=False) dataset_test = CelebA(which_sets=['test'], which_format="64", sources=('features', ), load_in_memory=False) spatial_width = 64 n_colors = 3 tr_scheme = SequentialScheme(examples=dataset_train.num_examples, batch_size=args.batch_size) ts_scheme = SequentialScheme(examples=dataset_test.num_examples, batch_size=args.batch_size) train_stream = DataStream.default_stream(dataset_train, iteration_scheme=tr_scheme) test_stream = DataStream.default_stream(dataset_test, iteration_scheme=ts_scheme) dataset_train = train_stream dataset_test = test_stream #epoch_it = train_stream.get_epoch_iterator() elif args.dataset == 'Spiral': print 'loading SPIRAL' train_set = Spiral(num_examples=20000, classes=1, cycles=1., noise=0.01, sources=('features', )) dataset_train = DataStream.default_stream( train_set, iteration_scheme=ShuffledScheme(train_set.num_examples, args.batch_size)) elif args.dataset == 'Circle': print 'loading Circle' train_set = Circle(num_examples=20000, classes=1, cycles=1., noise=0.0, sources=('features', )) dataset_train = DataStream.default_stream( train_set, iteration_scheme=ShuffledScheme(train_set.num_examples, args.batch_size)) iter_per_epoch = train_set.num_examples else: raise ValueError("Unknown dataset %s." % args.dataset) model_options = locals().copy() train_stream = dataset_train shp = next(train_stream.get_epoch_iterator())[0].shape print "got epoch iterator" # make the training data 0 mean and variance 1 # TODO compute mean and variance on full dataset, not minibatch Xbatch = next(train_stream.get_epoch_iterator())[0] scl = 1. 
/ np.sqrt(np.mean((Xbatch - np.mean(Xbatch))**2)) shft = -np.mean(Xbatch * scl) # scale is applied before shift #train_stream = ScaleAndShift(train_stream, scl, shft) #test_stream = ScaleAndShift(test_stream, scl, shft) print 'Building model' params = init_params(model_options) if args.reload_: print "Trying to reload parameters" if os.path.exists(args.saveto_filename): print 'Reloading Parameters' print args.saveto_filename params = load_params(args.saveto_filename, params) tparams = init_tparams(params) print tparams x, cost, start_temperature = build_model(tparams, model_options) inps = [x, start_temperature] x_Data = T.matrix('x_Data', dtype='float32') temperature = T.scalar('temperature', dtype='float32') forward_diffusion = one_step_diffusion(x_Data, model_options, tparams, temperature) #print 'Building f_cost...', #f_cost = theano.function(inps, cost) #print 'Done' print tparams grads = T.grad(cost, wrt=itemlist(tparams)) #get_grads = theano.function(inps, grads) for j in range(0, len(grads)): grads[j] = T.switch(T.isnan(grads[j]), T.zeros_like(grads[j]), grads[j]) # compile the optimizer, the actual computational graph is compiled here lr = T.scalar(name='lr') print 'Building optimizers...', optimizer = args.optimizer f_grad_shared, f_update = getattr(optimizers, optimizer)(lr, tparams, grads, inps, cost) print 'Done' print 'Buiding Sampler....' f_sample = sample(tparams, model_options) print 'Done' uidx = 0 estop = False bad_counter = 0 max_epochs = 4000 batch_index = 0 print 'Number of steps....', args.num_steps print 'Done' count_sample = 1 batch_index = 0 for eidx in xrange(max_epochs): if eidx % 20 == 0: params = unzip(tparams) save_params(params, model_dir + '/' + 'params_' + str(eidx) + '.npz') if eidx == 30: ipdb.set_trace() n_samples = 0 print 'Starting Next Epoch ', eidx for data in train_stream.get_epoch_iterator(): batch_index += 1 n_samples += len(data[0]) uidx += 1 if data[0] is None: print 'No data ' uidx -= 1 continue data_run = data[0] temperature_forward = args.temperature meta_cost = [] for meta_step in range(0, args.meta_steps): meta_cost.append(f_grad_shared(data_run, temperature_forward)) f_update(lrate) if args.meta_steps > 1: data_run, sigma, _, _ = forward_diffusion( data_run, temperature_forward) temperature_forward *= args.temperature_factor cost = sum(meta_cost) / len(meta_cost) if np.isnan(cost) or np.isinf(cost): print 'NaN detected' return 1. 
logger.log({ 'epoch': eidx, 'batch_index': batch_index, 'uidx': uidx, 'training_error': cost }) empty = [] spiral_x = [empty for i in range(args.num_steps)] spiral_corrupted = [] spiral_sampled = [] grad_forward = [] grad_back = [] x_data_time = [] x_tilt_time = [] if batch_index % 8 == 0: count_sample += 1 temperature = args.temperature * (args.temperature_factor **(args.num_steps - 1)) temperature_forward = args.temperature for num_step in range(args.num_steps): if num_step == 0: x_data_time.append(data[0]) plot_images( data[0], model_dir + '/' + 'orig_' + 'epoch_' + str(count_sample) + '_batch_' + str(batch_index)) x_data, mu_data, _, _ = forward_diffusion( data[0], temperature_forward) plot_images( x_data, model_dir + '/' + 'corrupted_' + 'epoch_' + str(count_sample) + '_batch_' + str(batch_index) + '_time_step_' + str(num_step)) x_data_time.append(x_data) temp_grad = np.concatenate( (x_data_time[-2], x_data_time[-1]), axis=1) grad_forward.append(temp_grad) x_data = np.asarray(x_data).astype('float32').reshape( args.batch_size, INPUT_SIZE) spiral_corrupted.append(x_data) mu_data = np.asarray(mu_data).astype( 'float32').reshape(args.batch_size, INPUT_SIZE) mu_data = mu_data.reshape(args.batch_size, 2) else: x_data_time.append(x_data) x_data, mu_data, _, _ = forward_diffusion( x_data, temperature_forward) plot_images( x_data, model_dir + '/' + 'corrupted_' + 'epoch_' + str(count_sample) + '_batch_' + str(batch_index) + '_time_step_' + str(num_step)) x_data = np.asarray(x_data).astype('float32').reshape( args.batch_size, INPUT_SIZE) spiral_corrupted.append(x_data) mu_data = np.asarray(mu_data).astype( 'float32').reshape(args.batch_size, INPUT_SIZE) mu_data = mu_data.reshape(args.batch_size, 2) x_data_time.append(x_data) temp_grad = np.concatenate( (x_data_time[-2], x_data_time[-1]), axis=1) grad_forward.append(temp_grad) temperature_forward = temperature_forward * args.temperature_factor mean_sampled = x_data.mean() var_sampled = x_data.var() x_temp2 = data[0].reshape(args.batch_size, 2) plot_2D( spiral_corrupted, args.num_steps, model_dir + '/' + 'corrupted_' + 'epoch_' + str(count_sample) + '_batch_' + str(batch_index)) plot_2D( x_temp2, 1, model_dir + '/' + 'orig_' + 'epoch_' + str(count_sample) + '_batch_index_' + str(batch_index)) plot_grad( grad_forward, model_dir + '/' + 'grad_forward_' + 'epoch_' + str(count_sample) + '_batch_' + str(batch_index)) for i in range(args.num_steps + args.extra_steps): x_tilt_time.append(x_data) x_data, sampled_mean = f_sample(x_data, temperature) plot_images( x_data, model_dir + '/' + 'sampled_' + 'epoch_' + str(count_sample) + '_batch_' + str(batch_index) + '_time_step_' + str(i)) x_tilt_time.append(x_data) temp_grad = np.concatenate( (x_tilt_time[-2], x_tilt_time[-1]), axis=1) grad_back.append(temp_grad) ###print 'Recons, On step number, using temperature', i, temperature x_data = np.asarray(x_data).astype('float32') x_data = x_data.reshape(args.batch_size, INPUT_SIZE) if temperature == args.temperature: temperature = temperature else: temperature /= args.temperature_factor plot_grad( grad_back, model_dir + '/' + 'grad_back_' + 'epoch_' + str(count_sample) + '_batch_' + str(batch_index)) plot_2D( x_tilt_time, args.num_steps, model_dir + '/' + 'sampled_' + 'epoch_' + str(count_sample) + '_batch_' + str(batch_index)) s = np.random.normal(mean_sampled, var_sampled, [args.batch_size, 2]) x_sampled = s temperature = args.temperature * (args.temperature_factor **(args.num_steps - 1)) x_data = np.asarray(x_sampled).astype('float32') for i in 
range(args.num_steps + args.extra_steps): x_data, sampled_mean = f_sample(x_data, temperature) spiral_sampled.append(x_data) x_data = np.asarray(x_data).astype('float32') x_data = x_data.reshape(args.batch_size, INPUT_SIZE) if temperature == args.temperature: temperature = temperature else: temperature /= args.temperature_factor plot_2D( spiral_sampled, args.num_steps, model_dir + '/' + 'inference_' + 'epoch_' + str(count_sample) + '_batch_' + str(batch_index)) ipdb.set_trace()
def Selection(self): if not self.Trigger(): return False ########################################### #counting total events self.events_passing_cuts[0] += 1.0 self.events_passing_cuts_byfile[0] += 1.0 ########################################### ########################################### #ntracks cut if (self.Tree.NumTracks < 2): return False self.events_passing_cuts[1] += 1.0 self.events_passing_cuts_byfile[1] += 1.0 ########################################### ########################################### #floor veto w/ expected hit cuts for hity in self.Tree.Digi_y: if self.det.inLayer(hity) < 2: return False expected_hits = util.unzip(self.Tree.Track_expectedHitLayer) bottom_layer_expected_hits = [] for exp_list in expected_hits: for val in exp_list: if val < 2: bottom_layer_expected_hits.append(val) if len(bottom_layer_expected_hits) < 3: return False self.events_passing_cuts[2] += 1.0 self.events_passing_cuts_byfile[2] += 1.0 ########################################### #### #### x00, y00, z00 = self.Tree.Track_x0[0], self.Tree.Track_y0[0], self.Tree.Track_z0[0] x01, y01, z01 = self.Tree.Track_x0[1], self.Tree.Track_y0[1], self.Tree.Track_z0[1] vx0, vy0, vz0 = self.Tree.Track_velX[0], self.Tree.Track_velY[0], self.Tree.Track_velZ[0] vx1, vy1, vz1 = self.Tree.Track_velX[1], self.Tree.Track_velY[1], self.Tree.Track_velZ[1] floor_y = 6002.5 delt0 = (y00 - floor_y)/vy0 delt1 = (y01 - floor_y)/vy1 expected_x0 = x00 + delt0*vx0 expected_x1 = x01 + delt1*vx1 expected_z0 = z00 + delt0*vz0 expected_z1 = z01 + delt1*vz1 #plotting the location of these hits self.floor_hit_location.Fill(expected_x0, expected_z0) self.floor_hit_location.Fill(expected_x1, expected_z1) #### #### ########################################### #nvertices cut if self.Tree.NumVertices == 0: return False self.events_passing_cuts[3] += 1.0 self.events_passing_cuts_byfile[3] += 1.0 ########################################### ########################################### #fiducial vertex cut if not self.det.inBox(self.Tree.Vertex_x[0], self.Tree.Vertex_y[0], self.Tree.Vertex_z[0]): return False self.events_passing_cuts[4] += 1.0 self.events_passing_cuts_byfile[4] += 1.0 ########################################### ########################################### #vertex before track cut vtxTrackConsistencyY = max( [ (self.Tree.Vertex_y[0] - self.Tree.Track_y0[n])/self.Tree.Track_ErrorY0[n] for n in range(int(self.Tree.NumTracks)) ] ) #vtxTrackConsistencyT = max( [ (self.Tree.Vertex_t[0] - self.Tree.Track_t0[n])/self.Tree.Track_ErrorT0[n] for n in range(int(self.Tree.NumTracks)) ] ) if vtxTrackConsistencyY > 1.0: return self.events_passing_cuts[5] += 1.0 self.events_passing_cuts_byfile[5] += 1.0 ########################################### ########################################### #missing hits in upper layers trackn = 0 vertex_first_layer = self.det.nextLayer(self.Tree.Vertex_y[0]) for layern in self.Tree.Track_missingHitLayer: if layern >= vertex_first_layer: return False self.events_passing_cuts[6] += 1.0 self.events_passing_cuts_byfile[6] += 1.0 #note the cut below isnt necessary when requiring no missing hits ########################################### ########################################### #tracks in vertex start in same layer #track_hit_yvals = [ [] for i in range(len(self.Tree.Track_x0))] #trackn = 0 #for hitn in self.Tree.Track_hitIndices: # if hitn == -1: # trackn += 1 # else: # track_hit_yvals[trackn].append(self.Tree.Digi_y[hitn]) #min_layers = [ self.det.inLayer(min(yvals_list)) for yvals_list in track_hit_yvals ] #veto 
= False #start = min_layers[0] #for minval in min_layers: # if not minval==start: # #check if there is expected hit in that layer # return False #self.events_passing_cuts[7] += 1.0 #self.events_passing_cuts_byfile[] += 1.0 ########################################### return True
def prepare(self):
    download(self.url, self.zipfile)
    unzip(self.zipfile, 'src')
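# The prepare()/doReplace()/doAddSong() snippets call an archive-style
# unzip(zip_path, destination) helper that is not shown here. A minimal sketch of what
# such a helper could look like, using the standard-library zipfile module (an
# assumption for illustration, not the actual helper from any of these projects):
import zipfile

def unzip(zip_path, destination):
    # Extract every member of the .zip archive into the destination directory.
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(destination)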
def train(dim_word=100, # word vector dimensionality ctx_dim=512, # context vector dimensionality dim=1000, # the number of LSTM units attn_type='deterministic', # [see section 4 from paper] n_layers_att=1, # number of layers used to compute the attention weights n_layers_out=1, # number of layers used to compute logit n_layers_lstm=1, # number of lstm layers n_layers_init=1, # number of layers to initialize LSTM at time 0 lstm_encoder=False, # if True, run bidirectional LSTM on input units prev2out=False, # Feed previous word into logit ctx2out=False, # Feed attention weighted ctx into logit alpha_entropy_c=0.002, # hard attn param RL_sumCost=False, # hard attn param semi_sampling_p=0.5, # hard attn param temperature=1., # hard attn param patience=10, max_epochs=5000, dispFreq=100, decay_c=0., # weight decay coeff alpha_c=0., # doubly stochastic coeff lrate=0.01, # used only for SGD selector=False, # selector (see paper) n_words=10000, # vocab size maxlen=100, # maximum length of the description optimizer='rmsprop', batch_size = 16, valid_batch_size = 2,#change from 16 saveto='model.npz', # relative path of saved model file validFreq=1000, saveFreq=1000, # save the parameters after every saveFreq updates sampleFreq=5, # generate some samples after every sampleFreq updates data_path='./data', # path to find data dataset='flickr30k', dictionary=None, # word dictionary use_dropout=False, # setting this true turns on dropout at various points use_dropout_lstm=False, # dropout on lstm gates reload_=False, save_per_epoch=False): # this saves down the model every epoch # hyperparam dict model_options = locals().copy() model_options = validate_options(model_options) # reload options if reload_ and os.path.exists(saveto): print "Reloading options" with open('%s.pkl'%saveto, 'rb') as f: model_options = pkl.load(f) print "Using the following parameters:" print model_options print 'Loading data' load_data, prepare_data = get_dataset(dataset) train, valid, test, worddict = load_data(path=data_path) # index 0 and 1 always code for the end of sentence and unknown token word_idict = dict() for kk, vv in worddict.iteritems(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' # Initialize (or reload) the parameters using 'model_options' # then build the Theano graph print 'Building model' params = init_params(model_options) if reload_ and os.path.exists(saveto): print "Reloading model" params = load_params(saveto, params) # numpy arrays -> theano shared variables tparams = init_tparams(params) # In order, we get: # 1) trng - theano random number generator # 2) use_noise - flag that turns on dropout # 3) inps - inputs for f_grad_shared # 4) cost - log likelihood for each sentence # 5) opts_out - optional outputs (e.g selector) trng, use_noise, \ inps, alphas, alphas_sample,\ cost, \ opt_outs = \ build_model(tparams, model_options) # To sample, we use beam search: 1) f_init is a function that initializes # the LSTM at time 0 [see top right of page 4], 2) f_next returns the distribution over # words and also the new "initial state/memory" see equation print 'Building sampler' f_init, f_next = build_sampler(tparams, model_options, use_noise, trng) # we want the cost without any the regularizers # define the log probability f_log_probs = theano.function(inps, -cost, profile=False, updates=opt_outs['attn_updates'] if model_options['attn_type']=='stochastic' else None, allow_input_downcast=True) # Define the cost function + Regularization cost = cost.mean() # add L2 regularization costs if decay_c 
> 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv ** 2).sum() weight_decay *= decay_c cost += weight_decay # Doubly stochastic regularization if alpha_c > 0.: alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') alpha_reg = alpha_c * ((1.-alphas.sum(0))**2).sum(0).mean() cost += alpha_reg hard_attn_updates = [] # Backprop! if model_options['attn_type'] == 'deterministic': grads = tensor.grad(cost, wrt=itemlist(tparams)) else: # shared variables for hard attention baseline_time = theano.shared(numpy.float32(0.), name='baseline_time') opt_outs['baseline_time'] = baseline_time alpha_entropy_c = theano.shared(numpy.float32(alpha_entropy_c), name='alpha_entropy_c') alpha_entropy_reg = alpha_entropy_c * (alphas*tensor.log(alphas)).mean() # [see Section 4.1: Stochastic "Hard" Attention for derivation of this learning rule] if model_options['RL_sumCost']: grads = tensor.grad(cost, wrt=itemlist(tparams), disconnected_inputs='raise', known_grads={alphas:(baseline_time-opt_outs['masked_cost'].mean(0))[None,:,None]/10.* (-alphas_sample/alphas) + alpha_entropy_c*(tensor.log(alphas) + 1)}) else: grads = tensor.grad(cost, wrt=itemlist(tparams), disconnected_inputs='raise', known_grads={alphas:opt_outs['masked_cost'][:,:,None]/10.* (alphas_sample/alphas) + alpha_entropy_c*(tensor.log(alphas) + 1)}) # [equation on bottom left of page 5] hard_attn_updates += [(baseline_time, baseline_time * 0.9 + 0.1 * opt_outs['masked_cost'].mean())] # updates from scan hard_attn_updates += opt_outs['attn_updates'] # to getthe cost after regularization or the gradients, use this # f_cost = theano.function([x, mask, ctx], cost, profile=False) # f_grad = theano.function([x, mask, ctx], grads, profile=False) # f_grad_shared computes the cost and updates adaptive learning rate variables # f_update updates the weights of the model lr = tensor.scalar(name='lr') f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost, hard_attn_updates) print 'Optimization' # [See note in section 4.3 of paper] train_iter = HomogeneousData(train, batch_size=batch_size, maxlen=maxlen) if valid: kf_valid = KFold(len(valid[0]), n_folds=len(valid[0])/valid_batch_size, shuffle=False) if test: kf_test = KFold(len(test[0]), n_folds=len(test[0])/valid_batch_size, shuffle=False) # history_errs is a bare-bones training log that holds the validation and test error history_errs = [] # reload history if reload_ and os.path.exists(saveto): history_errs = numpy.load(saveto)['history_errs'].tolist() best_p = None bad_counter = 0 if validFreq == -1: validFreq = len(train[0])/batch_size if saveFreq == -1: saveFreq = len(train[0])/batch_size if sampleFreq == -1: sampleFreq = len(train[0])/batch_size uidx = 0 estop = False for eidx in xrange(max_epochs): n_samples = 0 print 'Epoch ', eidx for caps in train_iter: n_samples += len(caps) uidx += 1 # turn on dropout use_noise.set_value(1.) 
# preprocess the caption, recording the # time spent to help detect bottlenecks pd_start = time.time() x, mask, ctx = prepare_data(caps, train[1], worddict, maxlen=maxlen, n_words=n_words) pd_duration = time.time() - pd_start if x is None: print 'Minibatch with zero sample under length ', maxlen continue # get the cost for the minibatch, and update the weights ud_start = time.time() cost = f_grad_shared(x, mask, ctx) f_update(lrate) ud_duration = time.time() - ud_start # some monitoring for each mini-batch # Numerical stability check if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return 1., 1., 1. if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'PD ', pd_duration, 'UD ', ud_duration # Checkpoint if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', if best_p is not None: params = copy.copy(best_p) else: params = unzip(tparams) numpy.savez(saveto, history_errs=history_errs, **params) pkl.dump(model_options, open('%s.pkl'%saveto, 'wb')) print 'Done' # Print a generated sample as a sanity check if numpy.mod(uidx, sampleFreq) == 0: # turn off dropout first use_noise.set_value(0.) x_s = x mask_s = mask ctx_s = ctx # generate and decode the a subset of the current training batch for jj in xrange(numpy.minimum(10, len(caps))): sample, score = gen_sample(tparams, f_init, f_next, ctx_s[jj], model_options, trng=trng, k=5, maxlen=30, stochastic=False) # Decode the sample from encoding back to words print 'Truth ',jj,': ', for vv in x_s[:,jj]: if vv == 0: break if vv in word_idict: print word_idict[vv], else: print 'UNK', print for kk, ss in enumerate([sample[0]]): print 'Sample (', kk,') ', jj, ': ', for vv in ss: if vv == 0: break if vv in word_idict: print word_idict[vv], else: print 'UNK', print # Log validation loss + checkpoint the model with the best validation log likelihood if numpy.mod(uidx, validFreq) == 0: use_noise.set_value(0.) train_err = 0 valid_err = 0 test_err = 0 if valid: valid_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, valid, kf_valid).mean() if test: test_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, test, kf_test).mean() history_errs.append([valid_err, test_err]) # the model with the best validation long likelihood is saved seperately with a different name if uidx == 0 or valid_err <= numpy.array(history_errs)[:,0].min(): best_p = unzip(tparams) print 'Saving model with best validation ll' params = copy.copy(best_p) params = unzip(tparams) numpy.savez(saveto+'_bestll', history_errs=history_errs, **params) bad_counter = 0 # abort training if perplexity has been increasing for too long if eidx > patience and len(history_errs) > patience and valid_err >= numpy.array(history_errs)[:-patience,0].min(): bad_counter += 1 if bad_counter > patience: print 'Early Stop!' estop = True break print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err print 'Seen %d samples' % n_samples if estop: break if save_per_epoch: numpy.savez(saveto + '_epoch_' + str(eidx + 1), history_errs=history_errs, **unzip(tparams)) # use the best nll parameters for final checkpoint (if they exist) if best_p is not None: zipp(best_p, tparams) use_noise.set_value(0.) 
train_err = 0 valid_err = 0 test_err = 0 if valid: valid_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, valid, kf_valid) if test: test_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, test, kf_test) print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err params = copy.copy(best_p) numpy.savez(saveto, zipped_params=best_p, train_err=train_err, valid_err=valid_err, test_err=test_err, history_errs=history_errs, **params) return train_err, valid_err, test_err
def train(args, model_args):
    #model_id = '/data/lisatmp4/lambalex/lsun_walkback/walkback_'
    model_id = '/data/lisatmp4/anirudhg/cifar_walk_back/walkback_'
    model_dir = create_log_dir(args, model_id)
    model_id2 = 'logs/walkback_'
    model_dir2 = create_log_dir(args, model_id2)
    print model_dir
    print model_dir2 + '/' + 'log.jsonl.gz'
    logger = mimir.Logger(filename=model_dir2 + '/log.jsonl.gz', formatter=None)

    # TODO batches_per_epoch should not be hard coded
    lrate = args.lr
    import sys
    sys.setrecursionlimit(10000000)
    args, model_args = parse_args()

    #trng = RandomStreams(1234)

    if args.resume_file is not None:
        print "Resuming training from " + args.resume_file
        from blocks.scripts import continue_training
        continue_training(args.resume_file)

    ## load the training data
    if args.dataset == 'MNIST':
        print 'loading MNIST'
        from fuel.datasets import MNIST
        dataset_train = MNIST(['train'], sources=('features', ))
        dataset_test = MNIST(['test'], sources=('features', ))
        n_colors = 1
        spatial_width = 28
    elif args.dataset == 'CIFAR10':
        from fuel.datasets import CIFAR10
        dataset_train = CIFAR10(['train'], sources=('features', ))
        dataset_test = CIFAR10(['test'], sources=('features', ))
        n_colors = 3
        spatial_width = 32
    elif args.dataset == "lsun" or args.dataset == "lsunsmall":
        print "loading lsun class!"
        from load_lsun import load_lsun
        print "loading lsun data!"
        if args.dataset == "lsunsmall":
            dataset_train, dataset_test = load_lsun(args.batch_size, downsample=True)
            spatial_width = 32
        else:
            dataset_train, dataset_test = load_lsun(args.batch_size, downsample=False)
            spatial_width = 64
        n_colors = 3
    elif args.dataset == "celeba":
        print "loading celeba data"
        from fuel.datasets.celeba import CelebA
        dataset_train = CelebA(which_sets=['train'], which_format="64",
                               sources=('features', ), load_in_memory=False)
        dataset_test = CelebA(which_sets=['test'], which_format="64",
                              sources=('features', ), load_in_memory=False)
        spatial_width = 64
        n_colors = 3

        tr_scheme = SequentialScheme(examples=dataset_train.num_examples,
                                     batch_size=args.batch_size)
        ts_scheme = SequentialScheme(examples=dataset_test.num_examples,
                                     batch_size=args.batch_size)
        train_stream = DataStream.default_stream(dataset_train, iteration_scheme=tr_scheme)
        test_stream = DataStream.default_stream(dataset_test, iteration_scheme=ts_scheme)
        dataset_train = train_stream
        dataset_test = test_stream
        #epoch_it = train_stream.get_epoch_iterator()
    elif args.dataset == 'Spiral':
        print 'loading SPIRAL'
        train_set = Spiral(num_examples=100000, classes=1, cycles=2., noise=0.01,
                           sources=('features', ))
        dataset_train = DataStream.default_stream(
            train_set,
            iteration_scheme=ShuffledScheme(train_set.num_examples, args.batch_size))
    else:
        raise ValueError("Unknown dataset %s." % args.dataset)

    model_options = locals().copy()

    if args.dataset != 'lsun' and args.dataset != 'celeba':
        train_stream = Flatten(
            DataStream.default_stream(
                dataset_train,
                iteration_scheme=ShuffledScheme(
                    examples=dataset_train.num_examples -
                    (dataset_train.num_examples % args.batch_size),
                    batch_size=args.batch_size)))
    else:
        train_stream = dataset_train
        test_stream = dataset_test

    print "Width", WIDTH, spatial_width

    shp = next(train_stream.get_epoch_iterator())[0].shape
    print "got epoch iterator"

    # make the training data 0 mean and variance 1
    # TODO compute mean and variance on full dataset, not minibatch
    Xbatch = next(train_stream.get_epoch_iterator())[0]
    scl = 1. / np.sqrt(np.mean((Xbatch - np.mean(Xbatch))**2))
    shft = -np.mean(Xbatch * scl)
    # scale is applied before shift
    #train_stream = ScaleAndShift(train_stream, scl, shft)
    #test_stream = ScaleAndShift(test_stream, scl, shft)

    print 'Building model'
    params = init_params(model_options)
    if args.reload_:
        print "Trying to reload parameters"
        if os.path.exists(args.saveto_filename):
            print 'Reloading Parameters'
            print args.saveto_filename
            params = load_params(args.saveto_filename, params)
    tparams = init_tparams(params)
    print tparams
    '''
    x = T.matrix('x', dtype='float32')
    temp = T.scalar('temp', dtype='float32')
    f = transition_operator(tparams, model_options, x, temp)

    for data in train_stream.get_epoch_iterator():
        print data[0]
        a = f([data[0], 1.0, 1])
        #ipdb.set_trace()
    '''
    x, cost, start_temperature = build_model(tparams, model_options)
    inps = [x, start_temperature]

    x_Data = T.matrix('x_Data', dtype='float32')
    temperature = T.scalar('temperature', dtype='float32')
    forward_diffusion = one_step_diffusion(x_Data, model_options, tparams, temperature)

    #print 'Building f_cost...',
    #f_cost = theano.function(inps, cost)
    #print 'Done'
    print tparams
    grads = T.grad(cost, wrt=itemlist(tparams))
    #get_grads = theano.function(inps, grads)
    # replace NaN gradients with zeros so a single bad batch cannot poison the update
    for j in range(0, len(grads)):
        grads[j] = T.switch(T.isnan(grads[j]), T.zeros_like(grads[j]), grads[j])

    # compile the optimizer, the actual computational graph is compiled here
    lr = T.scalar(name='lr')
    print 'Building optimizers...',
    optimizer = args.optimizer
    f_grad_shared, f_update = getattr(optimizers, optimizer)(lr, tparams, grads, inps, cost)
    print 'Done'

    for param in tparams:
        print param
        print tparams[param].get_value().shape

    print 'Building Sampler....'
    f_sample = sample(tparams, model_options)
    print 'Done'

    uidx = 0
    estop = False
    bad_counter = 0
    max_epochs = 4000
    batch_index = 1
    print 'Number of steps....'
    print args.num_steps
    print "Number of metasteps...."
    print args.meta_steps
    print 'Done'
    count_sample = 1
    for eidx in xrange(max_epochs):
        if eidx % 20 == 0:
            params = unzip(tparams)
            save_params(params, model_dir + '/' + 'params_' + str(eidx) + '.npz')
        n_samples = 0
        print 'Starting Next Epoch ', eidx
        for data in train_stream.get_epoch_iterator():
            if args.dataset == 'CIFAR10':
                if data[0].shape[0] == args.batch_size:
                    data_use = (data[0].reshape(args.batch_size, 3 * 32 * 32), )
                else:
                    continue
            t0 = time.time()
            batch_index += 1
            n_samples += len(data_use[0])
            uidx += 1
            if data_use[0] is None:
                print 'No data '
                uidx -= 1
                continue
            ud_start = time.time()
            t1 = time.time()

            data_run = data_use[0]
            temperature_forward = args.temperature
            meta_cost = []
            for meta_step in range(0, args.meta_steps):
                meta_cost.append(f_grad_shared(data_run, temperature_forward))
                f_update(lrate)
                if args.meta_steps > 1:
                    data_run, sigma, _, _ = forward_diffusion(
                        [data_run, temperature_forward, 1])
                    temperature_forward *= args.temperature_factor
            cost = sum(meta_cost) / len(meta_cost)
            ud = time.time() - ud_start

            #gradient_updates_ = get_grads(data_use[0],args.temperature)
            if np.isnan(cost) or np.isinf(cost):
                print 'NaN detected'
                return 1.

            t1 = time.time()
            #print time.time() - t1, "time to get grads"
            t1 = time.time()
            logger.log({
                'epoch': eidx,
                'batch_index': batch_index,
                'uidx': uidx,
                'training_error': cost
            })
            #'Norm_1': np.linalg.norm(gradient_updates_[0]),
            #'Norm_2': np.linalg.norm(gradient_updates_[1]),
            #'Norm_3': np.linalg.norm(gradient_updates_[2]),
            #'Norm_4': np.linalg.norm(gradient_updates_[3])})
            #print time.time() - t1, "time to log"
            #print time.time() - t0, "total time in batch"
            t5 = time.time()

            if batch_index % 20 == 0:
                print batch_index, "cost", cost

            if batch_index % 200 == 0:
                count_sample += 1
                temperature = args.temperature * (args.temperature_factor**(
                    args.num_steps * args.meta_steps - 1))
                temperature_forward = args.temperature

                # forward (corruption) trajectory, plotted at every step
                for num_step in range(args.num_steps * args.meta_steps):
                    print "Forward temperature", temperature_forward
                    if num_step == 0:
                        x_data, sampled, sampled_activation, sampled_preactivation = \
                            forward_diffusion([data_use[0], temperature_forward, 1])
                        x_data = np.asarray(x_data).astype('float32').reshape(
                            args.batch_size, INPUT_SIZE)
                        x_temp = x_data.reshape(args.batch_size, n_colors, WIDTH, WIDTH)
                        plot_images(
                            x_temp,
                            model_dir + '/' + "batch_" + str(batch_index) + '_corrupted' +
                            'epoch_' + str(count_sample) + '_time_step_' + str(num_step))
                    else:
                        x_data, sampled, sampled_activation, sampled_preactivation = \
                            forward_diffusion([x_data, temperature_forward, 1])
                        x_data = np.asarray(x_data).astype('float32').reshape(
                            args.batch_size, INPUT_SIZE)
                        x_temp = x_data.reshape(args.batch_size, n_colors, WIDTH, WIDTH)
                        plot_images(
                            x_temp,
                            model_dir + '/batch_' + str(batch_index) + '_corrupted' +
                            '_epoch_' + str(count_sample) + '_time_step_' + str(num_step))
                    temperature_forward = temperature_forward * args.temperature_factor

                x_temp2 = data_use[0].reshape(args.batch_size, n_colors, WIDTH, WIDTH)
                plot_images(
                    x_temp2,
                    model_dir + '/' + 'orig_' + 'epoch_' + str(eidx) +
                    '_batch_index_' + str(batch_index))

                # backward (reconstruction) trajectory starting from the corrupted batch
                temperature = args.temperature * (args.temperature_factor**(
                    args.num_steps * args.meta_steps - 1))
                for i in range(args.num_steps * args.meta_steps + args.extra_steps):
                    x_data, sampled, sampled_activation, sampled_preactivation = \
                        f_sample([x_data, temperature, 0])
                    print 'On backward step number, using temperature', i, temperature
                    reverse_time(
                        scl, shft, x_data,
                        model_dir + '/' + "batch_" + str(batch_index) +
                        '_samples_backward_' + 'epoch_' + str(count_sample) +
                        '_time_step_' + str(i))
                    x_data = np.asarray(x_data).astype('float32')
                    x_data = x_data.reshape(args.batch_size, INPUT_SIZE)
                    # anneal the temperature, but never below args.temperature
                    if temperature != args.temperature:
                        temperature /= args.temperature_factor

                # backward trajectory starting from pure noise
                if args.noise == "gaussian":
                    x_sampled = np.random.normal(
                        0.5, 2.0, size=(args.batch_size, INPUT_SIZE)).clip(0.0, 1.0)
                else:
                    s = np.random.binomial(1, 0.5, INPUT_SIZE)

                temperature = args.temperature * (args.temperature_factor**(
                    args.num_steps * args.meta_steps - 1))
                x_data = np.asarray(x_sampled).astype('float32')
                for i in range(args.num_steps * args.meta_steps + args.extra_steps):
                    x_data, sampled, sampled_activation, sampled_preactivation = \
                        f_sample([x_data, temperature, 0])
                    print 'On step number, using temperature', i, temperature
                    reverse_time(
                        scl, shft, x_data,
                        model_dir + '/batch_index_' + str(batch_index) + '_inference_' +
                        'epoch_' + str(count_sample) + '_step_' + str(i))
                    x_data = np.asarray(x_data).astype('float32')
                    x_data = x_data.reshape(args.batch_size, INPUT_SIZE)
                    # anneal the temperature, but never below args.temperature
                    if temperature != args.temperature:
                        temperature /= args.temperature_factor

    ipdb.set_trace()
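# The training code above computes `scl`/`shft` from a single minibatch, and its TODO
# asks for statistics over the full dataset instead. The helper below is a hypothetical,
# minimal sketch of that fuller computation; it assumes the Fuel stream yields
# `(features,)` tuples exactly as the loop above does.
import numpy as np

def compute_scale_and_shift(stream):
    """Accumulate sums over every batch so scl/shft reflect the whole dataset."""
    total, total_sq, count = 0.0, 0.0, 0
    for (batch,) in stream.get_epoch_iterator():
        batch = np.asarray(batch, dtype='float64')
        total += batch.sum()
        total_sq += (batch ** 2).sum()
        count += batch.size
    mean = total / count
    std = np.sqrt(total_sq / count - mean ** 2)
    scl = 1.0 / std      # scale is applied before shift, as in the code above
    shft = -mean * scl
    return scl, shft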
# downloading files from host: ftp.ncbi.nlm.nih.gov/pubmed/baseline-2018-sample/
abstracts = []
file_list = host.listdir(host.curdir)  # get file list in directory
for i, file_name in enumerate(file_list):
    if file_name[-3:] == ".gz":  # ensure files end with .gz extension
        print("File " + file_name)
        # location of downloaded file
        print("Downloading file to " + os.path.join(base_path, file_name))
        # download file from FTP server
        host.download(file_name, os.path.join(base_path, file_name), callback=None)
        downloaded_file_path_gz = base_path + "/" + file_name
        # unzip .gz file so the contents can be read
        util.unzip(downloaded_file_path_gz)
        downloaded_file_path_xml = downloaded_file_path_gz[:-3]  # path of the .xml file
        # create an XMLReader object (supply path to .xml file)
        file_reader = XMLReader(downloaded_file_path_xml)
        file_reader.read()  # read the .xml file
        file_reader.get(enzyme=None, enzyme_list=enzyme_list)
        # abstracts_with_enzymes = file_reader.get("TRANSCARBAMOYLASE")
        if len(file_reader.abstracts) == 0:
            print("File does not contain any enzymes from list")
        else:
            abstracts.extend(file_reader.abstracts)
        os.remove(downloaded_file_path_xml)
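# The loop above relies on util.unzip to expand each downloaded .gz archive into the
# .xml file next to it; that helper is not shown here. Below is a minimal stdlib sketch
# of an equivalent (hypothetical `unzip_gz`, assuming the archive decompresses to the
# same path minus its '.gz' suffix, which is what the caller expects).
import gzip
import shutil

def unzip_gz(path_gz):
    """Decompress 'foo.xml.gz' to 'foo.xml' using only the standard library."""
    path_out = path_gz[:-3]  # strip the trailing '.gz', matching downloaded_file_path_xml above
    with gzip.open(path_gz, 'rb') as src, open(path_out, 'wb') as dst:
        shutil.copyfileobj(src, dst)
    return path_out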
def train(dim_word=300,  # word vector dimensionality
          ctx_dim=300,  # context vector dimensionality
          semantic_dim=300,
          dim=1000,  # the number of LSTM units
          cnn_dim=4096,  # CNN feature dimension
          n_layers_att=1,  # number of layers used to compute the attention weights
          n_layers_out=1,  # number of layers used to compute logit
          n_layers_lstm=1,  # number of lstm layers
          n_layers_init=1,  # number of layers to initialize LSTM at time 0
          lstm_encoder=True,  # if True, run bidirectional LSTM on input units
          prev2out=False,  # Feed previous word into logit
          ctx2out=False,  # Feed attention weighted ctx into logit
          cutoff=10,
          patience=5,
          max_epochs=30,
          dispFreq=500,
          decay_c=0.,  # weight decay coeff
          alpha_c=0.,  # doubly stochastic coeff
          lrate=1e-4,  # used only for SGD
          selector=False,  # selector (see paper)
          maxlen=30,  # maximum length of the description
          optimizer='rmsprop',
          pretrained='',
          batch_size=256,
          saveto='model',  # relative path of saved model file
          saveFreq=1000,  # save the parameters after every saveFreq updates
          sampleFreq=100,  # generate some samples after every sampleFreq updates
          embedding='../Data/GloVe/vocab_glove.pkl',
          cnn_type='vgg',
          prefix='../Data',  # path to find data
          dataset='coco',
          criterion='Bleu_4',
          switch_test_val=False,
          use_cnninit=True,
          use_dropout=True,  # setting this true turns on dropout at various points
          use_dropout_lstm=False,  # dropout on lstm gates
          save_per_epoch=False):  # this saves down the model every epoch

    # hyperparam dict
    model_options = locals().copy()
    model_options = validate_options(model_options)

    # reload options
    if os.path.exists('%s.pkl' % saveto):
        print "Reloading options"
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print "Using the following parameters:"
    print model_options

    print 'Loading data'
    load_data, prepare_data = get_dataset(model_options['dataset'])

    # Load data from data path
    if 'switch_test_val' in model_options and model_options['switch_test_val']:
        train, valid, worddict = load_data(
            path=osp.join(model_options['prefix'], model_options['dataset']),
            options=model_options, load_train=True, load_test=True)
    else:
        train, valid, worddict = load_data(
            path=osp.join(model_options['prefix'], model_options['dataset']),
            options=model_options, load_train=True, load_val=True)

    # Automatically calculate the update frequency
    validFreq = len(train[0]) / model_options['batch_size']
    print "Validation frequency is %d" % validFreq

    word_idict = {vv: kk for kk, vv in worddict.iteritems()}
    model_options['n_words'] = len(worddict)

    # Initialize (or reload) the parameters using 'model_options'
    # then build the Theano graph
    print 'Building model'
    params = init_params(model_options)

    # Initialize it with glove
    if 'VCemb' in params:
        params['VCemb'] = read_pkl(model_options['embedding']).astype('float32')

    # If there is a same experiment, don't use pretrained weights
    if os.path.exists('%s.npz' % saveto):
        print "Reloading model"
        params = load_params('%s.npz' % saveto, params)
    elif pretrained != '':
        # Only pretrain the Language model
        params = load_params(pretrained, params, False)

    # numpy arrays -> theano shared variables
    tparams = init_tparams(params)

    # In order, we get:
    #   1) trng - theano random number generator
    #   2) use_noise - flag that turns on dropout
    #   3) inps - inputs for f_grad_shared
    #   4) cost - log likelihood for each sentence
    #   5) opts_out - optional outputs (e.g selector)
    trng, use_noise, \
        inps, alphas, \
        cost, \
        opt_outs = \
        build_model(tparams, model_options)

    # Load evaluator to calculate bleu score
    evaluator = cocoEvaluation(model_options['dataset'])

    # To sample, we use beam search: 1) f_init is a function that initializes
    # the LSTM at time 0 [see top right of page 4], 2) f_next returns the distribution over
    # words and also the new "initial state/memory" see equation
    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, use_noise, trng)

    # we want the cost without any of the regularizers
    # define the log probability
    f_log_probs = theano.function(inps, -cost, profile=False,
                                  updates=None, allow_input_downcast=True)

    # Define the cost function + Regularization
    cost = cost.mean()
    # add L2 regularization costs
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # Doubly stochastic regularization
    if alpha_c > 0.:
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = sum([alpha_c * ((1. - alpha.sum(0))**2).sum(0).mean()
                         for alpha in alphas])
        cost += alpha_reg

    # Backprop!
    grads = tensor.grad(cost, wrt=itemlist(tparams))

    # to get the cost after regularization or the gradients, use this
    # f_grad_shared computes the cost and updates adaptive learning rate variables
    # f_update updates the weights of the model
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = eval(model_options['optimizer'])(lr, tparams, grads,
                                                               inps, cost)

    print 'Optimization'

    train_iter = HomogeneousData(train, batch_size=batch_size,
                                 maxlen=model_options['maxlen'])

    # history_bleu is a bare-bones training log, reload history
    history_bleu = []
    if os.path.exists('%s.npz' % saveto):
        history_bleu = numpy.load('%s.npz' % saveto)['history_bleu'].tolist()
    start_epochs = len(history_bleu)
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0]) / batch_size

    uidx = 0
    estop = False
    for eidx in xrange(start_epochs, model_options['max_epochs']):
        n_samples = 0

        print 'Epoch ', eidx

        for caps in train_iter:
            n_samples += len(caps)
            uidx += 1
            # turn on dropout
            use_noise.set_value(1.)

            # preprocess the caption, recording the
            # time spent to help detect bottlenecks
            pd_start = time.time()
            x, mask, ctx, cnn_feats = prepare_data(caps, train[1], train[2],
                                                   worddict, model_options)
            pd_duration = time.time() - pd_start

            if x is None:
                print 'Minibatch with zero sample under length ', model_options['maxlen']
                continue

            # get the cost for the minibatch, and update the weights
            ud_start = time.time()
            cost = f_grad_shared(x, mask, ctx, cnn_feats)
            print "Epoch %d, Updates: %d, Cost is: %f" % (eidx, uidx, cost)
            f_update(model_options['lrate'])
            ud_duration = time.time() - ud_start  # some monitoring for each mini-batch

            # Numerical stability check
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, \
                    'PD ', pd_duration, 'UD ', ud_duration

            # Print a generated sample as a sanity check
            if numpy.mod(uidx, model_options['sampleFreq']) == 0:
                # turn off dropout first
                use_noise.set_value(0.)
                x_s = x
                mask_s = mask
                ctx_s = ctx
                # generate and decode a subset of the current training batch
                for jj in xrange(numpy.minimum(10, len(caps))):
                    sample, score, alphas = gen_sample(
                        f_init, f_next, ctx_s[jj], cnn_feats[jj], model_options,
                        trng=trng, maxlen=model_options['maxlen'])
                    # Decode the sample from encoding back to words
                    print 'Truth ', jj, ': ',
                    print seqs2words(x_s[:, jj], word_idict)
                    for kk, ss in enumerate([sample[0]]):
                        print 'Sample (', kk, ') ', jj, ': ',
                        print seqs2words(ss, word_idict)

            # Log validation loss + checkpoint the model with the best validation log likelihood
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)

                # Do evaluation on validation set
                imgid = collapse([elem[-1] for elem in valid[0]])
                caps = process_examples([f_init], [f_next], imgid, valid[1], valid[2],
                                        word_idict, model_options)
                folder = osp.join('../output', '%s_%s' % (saveto, 'val'))
                if not osp.exists(folder):
                    os.mkdir(folder)
                with open(osp.join(folder, 'captions_val2014_results.json'), 'w') as f:
                    json.dump(caps, f)
                eva_result = evaluator.evaluate(folder, False)
                if model_options['criterion'] == 'combine':
                    history_bleu.append(eva_result['Bleu_4'] + eva_result['CIDEr'])
                else:
                    history_bleu.append(eva_result[model_options['criterion']])

                # the model with the best validation log likelihood is saved separately
                # with a different name
                if uidx == 0 or history_bleu[-1] == max(history_bleu):
                    best_p = unzip(tparams)
                    print 'Saving model with best validation ll'
                    params = copy.copy(best_p)
                    params = unzip(tparams)
                    numpy.savez(saveto + '_bestll', history_bleu=history_bleu, **params)
                    bad_counter = 0

                # abort training if perplexity has been increasing for too long
                if len(history_bleu) > model_options['patience'] and \
                        history_bleu[-1] <= max(history_bleu[:-model_options['patience']]):
                    bad_counter += 1
                    if bad_counter > model_options['patience']:
                        print 'Early Stop!'
                        estop = True
                        break

                print ' BLEU-4 score ', history_bleu[-1]

            # Checkpoint
            if numpy.mod(uidx, model_options['saveFreq']) == 0:
                print 'Saving...',
                if best_p is not None:
                    params = copy.copy(best_p)
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_bleu=history_bleu, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

        print 'Seen %d samples' % n_samples

        if estop:
            break

        if model_options['save_per_epoch']:
            numpy.savez(saveto + '_epoch_' + str(eidx + 1),
                        history_bleu=history_bleu, **unzip(tparams))

    # use the best nll parameters for final checkpoint (if they exist)
    if best_p is not None:
        zipp(best_p, tparams)

    # best_p can be None if no validation step ever ran; fall back to the current params
    params = copy.copy(best_p) if best_p is not None else unzip(tparams)
    numpy.savez(saveto, zipped_params=best_p, history_bleu=history_bleu, **params)
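# The validation block above appends the chosen metric to history_bleu, snapshots the
# best parameters, and counts non-improving checks against `patience`. The class below
# is a hypothetical, framework-free sketch of that same early-stopping pattern; the
# names are illustrative and not taken from the code above.
class EarlyStopper(object):
    """Track a higher-is-better validation metric with patience."""

    def __init__(self, patience):
        self.patience = patience
        self.history = []
        self.bad_counter = 0
        self.best_params = None

    def update(self, score, params):
        """Record one validation score; return True when training should stop."""
        self.history.append(score)
        if score == max(self.history):
            self.best_params = params    # new best: snapshot and reset patience
            self.bad_counter = 0
        elif (len(self.history) > self.patience and
              score <= max(self.history[:-self.patience])):
            self.bad_counter += 1        # no improvement over the patience window
        return self.bad_counter > self.patience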
def set_data_rows(self, tuples): self.set_data(*ut.unzip(tuples))
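# ut.unzip above transposes a sequence of row tuples into per-column sequences before
# they are unpacked into set_data. In plain Python the same transposition is usually
# written with zip(*rows); a small illustration (ut.unzip's exact return type is not
# shown above, so this is only an approximation):
rows = [(1, 'a'), (2, 'b'), (3, 'c')]
cols = list(zip(*rows))
# cols == [(1, 2, 3), ('a', 'b', 'c')]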
def get_bcd_list(metadata):
    """
    Metadata is a dict with keys:
    name, radecfile, data_dir, out_dir, work_dir, aors, channel,
    bcd_dict_path, max_cov
    """

    radecfile = metadata["radecfile"]
    work_dir = metadata["work_dir"]
    aors = metadata["aors"]
    max_cov = metadata["max_cov"]

    # split the RA/Dec into two arrays
    radec = np.genfromtxt(radecfile)
    ra = radec[:, 0]
    dec = radec[:, 1]

    # read the region/ch/hdr specific bcd_dict in the work_dir for efficiency
    bcd_dict = json.load(open(metadata["bcd_dict_path"]))
    filenames, filepaths = [np.array(i) for i in unzip(bcd_dict.items())]

    # extract center pixel coordinates
    files_ra = np.zeros(filepaths.size)
    files_dec = np.zeros(filepaths.size)
    for i, fp in enumerate(filepaths):
        hdr = pyfits.getheader(fp)
        files_ra[i] = hdr["CRVAL1"]
        files_dec[i] = hdr["CRVAL2"]

    # make array of coordinates and grow the tree
    kdt = KDT(radec_to_coords(files_ra, files_dec))

    # spawn processes using multiprocessing to check for images containing
    # the source, using the tree to find only the closest BCDs to check
    ncpus = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(processes=ncpus)
    # print "using %i CPUs" % ncpus

    max_num_images = 0
    sources = []
    for i in range(len(ra)):
        # create internal source ID and associate with each RA/Dec pair
        d = {"id": i, "ra": ra[i], "dec": dec[i]}
        message = "finding files associated with source {} at ({}, {})"
        print(message.format(i, ra[i], dec[i]))

        # get the subset of BCDs to search
        idx = get_k_closest_bcd_idx(ra[i], dec[i], kdt, k=max_cov)
        n_files = filepaths[idx].size
        filepaths_subset = filepaths[idx]
        filenames_subset = filenames[idx]
        argslist = zip([ra[i]] * n_files, [dec[i]] * n_files, filepaths_subset)

        # send jobs to the pool
        results = pool.map(source_in_image, argslist)

        # unzip the results and extract the boolean array and pixel coordinates
        results_unzipped = unzip(results)
        bool_arr = np.array(results_unzipped[0])

        # if none found, continue to next source
        if np.sum(bool_arr) == 0:
            continue

        x = results_unzipped[1]
        y = results_unzipped[2]
        pix_coord = np.array(zip(x, y))[bool_arr].tolist()

        # get the names of the files associated with the source
        good_bcds = filenames_subset[bool_arr].tolist()

        # compare the number of associated images to the previous maximum
        num_images = len(good_bcds)
        print("\t{} images".format(num_images))
        if num_images > max_num_images:
            max_num_images = num_images

        # store results in source dict and append to source list
        d["files"] = good_bcds
        d["pixels"] = pix_coord
        sources.append(d)

    outfile = "bcd_list.json"
    outfilepath = "/".join([work_dir, outfile])
    with open(outfilepath, "w") as w:
        json.dump(sources, w, indent=4 * " ")
    print("created file: {}".format(outfilepath))
    message = "maximum number of images associated with a source: {}"
    print(message.format(max_num_images))
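# get_bcd_list narrows the candidate frames with a KD-tree over the frame centres
# (KDT, radec_to_coords, get_k_closest_bcd_idx) before running the exact per-image
# containment test. The sketch below shows one plausible version of that pruning step
# with scipy; it assumes RA/Dec are mapped to unit vectors so that chord distance
# orders neighbours correctly, which may differ from the helpers used above.
import numpy as np
from scipy.spatial import cKDTree

def radec_to_xyz(ra_deg, dec_deg):
    """Convert RA/Dec in degrees to unit vectors on the sphere."""
    ra, dec = np.radians(ra_deg), np.radians(dec_deg)
    return np.column_stack([np.cos(dec) * np.cos(ra),
                            np.cos(dec) * np.sin(ra),
                            np.sin(dec)])

def k_closest_idx(ra, dec, frame_ra, frame_dec, k=10):
    """Indices of the k frames whose centres are closest to (ra, dec)."""
    tree = cKDTree(radec_to_xyz(frame_ra, frame_dec))
    _, idx = tree.query(radec_to_xyz(np.atleast_1d(ra), np.atleast_1d(dec)), k=k)
    return np.atleast_1d(idx).ravel()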
def train(dim_word=100,  # word vector dimensionality
          ctx_dim=512,  # context vector dimensionality
          dim=1000,  # the number of LSTM units
          attn_type='stochastic',  # [see section 4 from paper]
          n_layers_att=1,  # number of layers used to compute the attention weights
          n_layers_out=1,  # number of layers used to compute logit
          n_layers_lstm=1,  # number of lstm layers
          n_layers_init=1,  # number of layers to initialize LSTM at time 0
          lstm_encoder=False,  # if True, run bidirectional LSTM on input units
          prev2out=False,  # Feed previous word into logit
          ctx2out=False,  # Feed attention weighted ctx into logit
          alpha_entropy_c=0.002,  # hard attn param
          RL_sumCost=True,  # hard attn param
          semi_sampling_p=0.5,  # hard attn param
          temperature=1.,  # hard attn param
          patience=10,
          max_epochs=5000,
          dispFreq=100,
          decay_c=0.,  # weight decay coeff
          alpha_c=0.,  # doubly stochastic coeff
          lrate=0.01,  # used only for SGD
          selector=False,  # selector (see paper)
          n_words=10000,  # vocab size
          maxlen=100,  # maximum length of the description
          optimizer='rmsprop',
          batch_size=16,
          valid_batch_size=16,
          saveto='model.npz',  # relative path of saved model file
          validFreq=1000,
          saveFreq=1000,  # save the parameters after every saveFreq updates
          sampleFreq=100,  # generate some samples after every sampleFreq updates
          data_path='./data',  # path to find data
          dataset='flickr8k',
          dictionary=None,  # word dictionary
          use_dropout=False,  # setting this true turns on dropout at various points
          use_dropout_lstm=False,  # dropout on lstm gates
          reload_=False,
          save_per_epoch=False):  # this saves down the model every epoch

    # hyperparam dict
    model_options = locals().copy()
    model_options = validate_options(model_options)

    # reload options
    if reload_ and os.path.exists(saveto):
        print "Reloading options"
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print "Using the following parameters:"
    print model_options

    print 'Loading data'
    load_data, prepare_data = get_dataset(dataset)
    train, valid, test, worddict = load_data(path=data_path)
    if dataset == 'coco':
        valid, _ = valid  # the second one contains all the validation data

    # index 0 and 1 always code for the end of sentence and unknown token
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # Initialize (or reload) the parameters using 'model_options'
    # then build the Theano graph
    print 'Building model'
    params = init_params(model_options)
    if reload_ and os.path.exists(saveto):
        print "Reloading model"
        params = load_params(saveto, params)

    # numpy arrays -> theano shared variables
    tparams = init_tparams(params)

    # In order, we get:
    #   1) trng - theano random number generator
    #   2) use_noise - flag that turns on dropout
    #   3) inps - inputs for f_grad_shared
    #   4) cost - log likelihood for each sentence
    #   5) opts_out - optional outputs (e.g selector)
    trng, use_noise, \
        inps, alphas, alphas_sample, \
        cost, \
        opt_outs = \
        build_model(tparams, model_options)

    # To sample, we use beam search: 1) f_init is a function that initializes
    # the LSTM at time 0 [see top right of page 4], 2) f_next returns the distribution over
    # words and also the new "initial state/memory" see equation
    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, use_noise, trng)

    # we want the cost without any of the regularizers
    # define the log probability
    f_log_probs = theano.function(inps, -cost, profile=False,
                                  updates=opt_outs['attn_updates']
                                  if model_options['attn_type'] == 'stochastic'
                                  else None,
                                  allow_input_downcast=True)

    # Define the cost function + Regularization
    cost = cost.mean()
    # add L2 regularization costs
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # Doubly stochastic regularization
    if alpha_c > 0.:
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * ((1. - alphas.sum(0))**2).sum(0).mean()
        cost += alpha_reg

    hard_attn_updates = []
    # Backprop!
    if model_options['attn_type'] == 'deterministic':
        grads = tensor.grad(cost, wrt=itemlist(tparams))
    else:
        # shared variables for hard attention
        baseline_time = theano.shared(numpy.float32(0.), name='baseline_time')
        opt_outs['baseline_time'] = baseline_time
        alpha_entropy_c = theano.shared(numpy.float32(alpha_entropy_c),
                                        name='alpha_entropy_c')
        alpha_entropy_reg = alpha_entropy_c * (alphas * tensor.log(alphas)).mean()
        # [see Section 4.1: Stochastic "Hard" Attention for derivation of this learning rule]
        if model_options['RL_sumCost']:
            grads = tensor.grad(
                cost, wrt=itemlist(tparams), disconnected_inputs='raise',
                known_grads={
                    alphas: (baseline_time - opt_outs['masked_cost'].mean(0))[None, :, None] / 10. *
                    (-alphas_sample / alphas) + alpha_entropy_c * (tensor.log(alphas) + 1)})
        else:
            grads = tensor.grad(
                cost, wrt=itemlist(tparams), disconnected_inputs='raise',
                known_grads={
                    alphas: opt_outs['masked_cost'][:, :, None] / 10. *
                    (alphas_sample / alphas) + alpha_entropy_c * (tensor.log(alphas) + 1)})
        # [equation on bottom left of page 5]
        hard_attn_updates += [(baseline_time,
                               baseline_time * 0.9 + 0.1 * opt_outs['masked_cost'].mean())]
        # updates from scan
        hard_attn_updates += opt_outs['attn_updates']

    # to get the cost after regularization or the gradients, use this
    # f_cost = theano.function([x, mask, ctx], cost, profile=False)
    # f_grad = theano.function([x, mask, ctx], grads, profile=False)

    # f_grad_shared computes the cost and updates adaptive learning rate variables
    # f_update updates the weights of the model
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost,
                                              hard_attn_updates)

    print 'Optimization'

    # [See note in section 4.3 of paper]
    train_iter = HomogeneousData(train, batch_size=batch_size, maxlen=maxlen)

    if valid:
        kf_valid = KFold(len(valid[0]), n_folds=len(valid[0]) / valid_batch_size,
                         shuffle=False)
    if test:
        kf_test = KFold(len(test[0]), n_folds=len(test[0]) / valid_batch_size,
                        shuffle=False)

    # history_errs is a bare-bones training log that holds the validation and test error
    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = numpy.load(saveto)['history_errs'].tolist()
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0]) / batch_size

    uidx = 0
    estop = False
    for eidx in xrange(max_epochs):
        n_samples = 0

        print 'Epoch ', eidx

        for caps in train_iter:
            n_samples += len(caps)
            uidx += 1
            # turn on dropout
            use_noise.set_value(1.)

            # preprocess the caption, recording the
            # time spent to help detect bottlenecks
            pd_start = time.time()
            x, mask, ctx = prepare_data(caps, train[1], worddict,
                                        maxlen=maxlen, n_words=n_words)
            pd_duration = time.time() - pd_start

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                continue

            # get the cost for the minibatch, and update the weights
            ud_start = time.time()
            cost = f_grad_shared(x, mask, ctx)
            f_update(lrate)
            ud_duration = time.time() - ud_start  # some monitoring for each mini-batch

            # Numerical stability check
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, \
                    'PD ', pd_duration, 'UD ', ud_duration

            # Checkpoint
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                if best_p is not None:
                    params = copy.copy(best_p)
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

            # Print a generated sample as a sanity check
            if numpy.mod(uidx, sampleFreq) == 0:
                # turn off dropout first
                use_noise.set_value(0.)
                x_s = x
                mask_s = mask
                ctx_s = ctx
                # generate and decode a subset of the current training batch
                for jj in xrange(numpy.minimum(10, len(caps))):
                    sample, score = gen_sample(tparams, f_init, f_next, ctx_s[jj],
                                               model_options, trng=trng, k=5,
                                               maxlen=30, stochastic=False)
                    # Decode the sample from encoding back to words
                    print 'Truth ', jj, ': ',
                    for vv in x_s[:, jj]:
                        if vv == 0:
                            break
                        if vv in word_idict:
                            print word_idict[vv],
                        else:
                            print 'UNK',
                    print
                    for kk, ss in enumerate([sample[0]]):
                        print 'Sample (', kk, ') ', jj, ': ',
                        for vv in ss:
                            if vv == 0:
                                break
                            if vv in word_idict:
                                print word_idict[vv],
                            else:
                                print 'UNK',
                        print

            # Log validation loss + checkpoint the model with the best validation log likelihood
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                train_err = 0
                valid_err = 0
                test_err = 0

                if valid:
                    valid_err = -pred_probs(f_log_probs, model_options, worddict,
                                            prepare_data, valid, kf_valid).mean()
                if test:
                    test_err = -pred_probs(f_log_probs, model_options, worddict,
                                           prepare_data, test, kf_test).mean()

                history_errs.append([valid_err, test_err])

                # the model with the best validation log likelihood is saved separately
                # with a different name
                if uidx == 0 or valid_err <= numpy.array(history_errs)[:, 0].min():
                    best_p = unzip(tparams)
                    print 'Saving model with best validation ll'
                    params = copy.copy(best_p)
                    params = unzip(tparams)
                    numpy.savez(saveto + '_bestll', history_errs=history_errs, **params)
                    bad_counter = 0

                # abort training if perplexity has been increasing for too long
                if eidx > patience and len(history_errs) > patience and \
                        valid_err >= numpy.array(history_errs)[:-patience, 0].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err

        print 'Seen %d samples' % n_samples

        if estop:
            break

        if save_per_epoch:
            numpy.savez(saveto + '_epoch_' + str(eidx + 1),
                        history_errs=history_errs, **unzip(tparams))

    # use the best nll parameters for final checkpoint (if they exist)
    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    train_err = 0
    valid_err = 0
    test_err = 0
    if valid:
        valid_err = -pred_probs(f_log_probs, model_options, worddict,
                                prepare_data, valid, kf_valid)
    if test:
        test_err = -pred_probs(f_log_probs, model_options, worddict,
                               prepare_data, test, kf_test)

    print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err

    # best_p can be None if no validation step ever ran; fall back to the current params
    params = copy.copy(best_p) if best_p is not None else unzip(tparams)
    numpy.savez(saveto,
                zipped_params=best_p,
                train_err=train_err,
                valid_err=valid_err,
                test_err=test_err,
                history_errs=history_errs,
                **params)

    return train_err, valid_err, test_err
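# The doubly stochastic regularizer used in the function above penalises attention
# weights whose sum over time drifts away from 1 at each location:
# alpha_reg = alpha_c * ((1. - alphas.sum(0)) ** 2).sum(0).mean(). Below is a NumPy
# sketch of the same quantity, assuming alphas has shape (timesteps, batch, locations)
# as in the Theano graph (the shape is an assumption, not stated explicitly above).
import numpy as np

def doubly_stochastic_penalty(alphas, alpha_c=1.0):
    """alphas: attention weights of shape (timesteps, batch, locations)."""
    residual = (1.0 - alphas.sum(axis=0)) ** 2     # (batch, locations)
    return alpha_c * residual.sum(axis=0).mean()   # sum over batch, mean over locations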