def perform_training_and_testing(training_stage, args, data):
    '''
    Returns
    -------
    The validation error, a quantity that we want to minimize.
    '''
    stats = None
    with rasengan.tictoc(training_stage):
        with rasengan.debug_support():
            if args.perform_training or args.perform_testing:
                with rasengan.tictoc("Circuit Compilation"):
                    ttns = get_train_test_namespace(args)
                with rasengan.tictoc("Loading Parameters"):
                    load_params_from_pklfile(ttns, args)
                pass
            rasengan.decrease_print_indent()
            print_pklfn_performance(args)
            rasengan.increase_print_indent()
            # Train.
            if args.perform_training:
                with rasengan.tictoc("Training"):
                    stats = lstm_seqlabel_training.training(args, data, ttns)
            # Test (if asked).
            if args.perform_testing:
                with rasengan.tictoc("Testing"):
                    stats = lstm_seqlabel_validation.testing(args, data, ttns)
    if stats is None:
        return 100
    else:
        best_epoch_id = stats['best_epoch_id']
        return (100 - stats['validation_result'][best_epoch_id]['f1'])
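# Every snippet in this file leans on rasengan.tictoc as a timing
# context manager. A minimal sketch of such a context manager is below;
# this is an illustrative assumption, not rasengan's actual
# implementation (which also accepts keyword arguments like `timer=`
# and `override=` that appear later in this file).
import contextlib
import time


@contextlib.contextmanager
def tictoc_sketch(msg):
    # Print a banner, time the enclosed block, and report on exit.
    print 'Started', msg
    start = time.time()
    yield
    print 'Completed', msg, 'in %0.1fs' % (time.time() - start)

# Usage:
#   with tictoc_sketch("Training"):
#       train_model()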
def main(): with rasengan.tictoc("Loading Graph"): graph = igraph.read(args.graph_fn) graph.to_undirected(mode="collapse", combine_edges=dict(weights="first")) with rasengan.tictoc("Creating Adjacent Node List"): adjacent_edge_list = graph.get_inclist() adjacent_node_list = graph.get_adjlist() # 1234 is just a random number. assert (adjacent_node_list[1234][0] in graph.es[adjacent_edge_list[1234][0]].tuple) total_vertices = float(len(adjacent_edge_list)) with rasengan.tictoc("Creating Local Node Prob List"): edge_prob_list = [] for idx, edges in enumerate(adjacent_edge_list): if idx % 1000 == 0: print idx / total_vertices * 100 weights = np.array(graph.es[edges]["weight"]) edge_prob_list.append(weights / weights.sum()) queries = read_queries(args.query_fn) for qid, query in queries.iteritems(): for start in query: weighted_random_walk(graph, start, adjacent_node_list, edge_prob_list, path_maxlength=args.path_maxlength, n_runs=args.n_runs)
def main():
    import argparse
    arg_parser = argparse.ArgumentParser(description='')
    arg_parser.add_argument('--seed', default=0, type=int, help='Default={0}')
    arg_parser.add_argument(
        '--ci_pkl_fn', default='data/dbpedia_cat_index.pkl', type=str,
        help='A map from categories to integers')
    arg_parser.add_argument(
        '--wdc_pkl_fn', default='data/wikilink_dbpedia_categories.pkl',
        type=str, help='A map from urls to their categories')
    arg_parser.add_argument(
        '--out_pkl_fn',
        default='data/wikilink_category_to_url_and_count_reverse_index.pkl',
        type=str)
    arg_parser.add_argument(
        '--out_tsv_fn', default='data/wikilink_category_to_count.tsv',
        type=str)
    arg_parser.add_argument(
        '--admissible_url_fn', default='data/dbpedia_people.list', type=str)
    args = arg_parser.parse_args()
    with rasengan.tictoc('Loading pkl'):
        ci = pkl.load(open(args.ci_pkl_fn))
        url_to_cat_cnt = pkl.load(open(args.wdc_pkl_fn))
        admissible_url = set([e.strip() for e in open(args.admissible_url_fn)])
    with rasengan.tictoc('Creating WRI'):
        wri = WikilinkReverseIndex(ci, url_to_cat_cnt, admissible_url)
    with open(args.out_pkl_fn, 'wb') as f:
        pkl.dump(wri, f)
def svd_1(a, debug=True, inplace=True):
    assert a.flags.c_contiguous
    if debug:
        print_config()
    # NOTE: scipy.linalg.blas.ssyrk(1, a, trans=1, lower=1)
    # causes an unnecessary copy, because a is c_contiguous.
    with tictoc('Computing b'):
        b = scipy.linalg.blas.ssyrk(1, a.T, trans=0, lower=1)
    if debug:
        print_config()
    with tictoc('Computing eigh'):
        [bs, bu] = scipy.linalg.eigh(
            b, turbo=True, overwrite_a=True, check_finite=True)
    with tictoc('Scale bu inplace'):
        for i in xrange(bu.shape[1] - 1, -1, -1):
            scalar = (1 / numpy.sqrt(bs[i]) if bs[i] > 1e-6 else 0)
            scipy.linalg.blas.sscal(
                scalar, bu, n=bu.shape[0], offx=i * bu.shape[0])
            if debug:
                print_config(msg='i=%d' % i)
    with tictoc('Inplace Matmul'):
        c = matrix_multiply_inplace.matmul(a, bu)
    del bu
    del bs
    if debug:
        print_config()
    return [c, i]
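# svd_1 uses the Gram-matrix trick: for a tall matrix a, the
# eigendecomposition of b = a.T.dot(a) yields the right singular
# vectors V and the squared singular values s**2, and a.dot(V) / s
# recovers the left singular vectors U. A small pure-numpy check of
# that identity, independent of the BLAS calls above (toy data):
import numpy

a = numpy.random.rand(100, 10).astype('float32')
s2, V = numpy.linalg.eigh(a.T.dot(a))      # eigenvalues in ascending order
U = a.dot(V) / numpy.sqrt(s2)              # scale column i by 1/s_i
U_ref, s_ref, _ = numpy.linalg.svd(a, full_matrices=False)
# Columns may differ in order and sign; check that each reference left
# singular vector is (up to sign) among the recovered columns.
assert numpy.allclose(abs(U.T.dot(U_ref)).max(axis=0), 1, atol=1e-2)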
def call_impl(self, cat, train_idx, test_idx):
    with rasengan.tictoc('Fitting'):  # 2.1s
        self.fit(self.smat[train_idx], train_idx=train_idx)
        self.smat = self.smat.tocsr()
    with rasengan.tictoc('Prediction'):  # 20s
        scores = self.score(self.smat)
    self.pa(cat, scores, train_idx, test_idx, scratch=self.scratch)
    self.scratch = {}
def main():
    global args
    args = populate_args()
    rnr = ExperimentRunner(datacfg=DATACONFIG,
                           ppcfg=CONFIG[args.ppcfg],
                           expcfg=EXPCONFIG[args.expcfg])
    rnr()
    with rasengan.tictoc('Saving Results'):
        rnr.save_results(fn=args.out_pkl_fn)
    with rasengan.tictoc('Reporting'):
        rnr.report()
def __init__(self, datacfg, ppcfg, expcfg):
    # Init Part 0
    self.datacfg = datacfg
    self.ppcfg = ppcfg
    self.expcfg = expcfg
    with rasengan.tictoc('Init Part 1 : The Datacfg'):
        self.cp = DbfilenameShelf(
            r'%s/%s' % (uc.get_pfx(), self.datacfg.cp_fn),
            protocol=-1, flag='r')
        self.url_list = self.cp['__URL_LIST__']
        self.TM = self.cp['__TOKEN_MAPPER__']
        # self.TM.final must be patched to work with older
        # versions of TokenMapper that are in the pickle.
        if not hasattr(self.TM, 'final'):
            self.TM.final = False
        if self.is_malignull():
            self.TM([self.expcfg.NULL_KEY])
        self.bos_idx = self.TM.finalize()
        self.pa = Aggregator(datacfg=datacfg, ppcfg=ppcfg, expcfg=expcfg,
                             url_list=self.url_list, TM=self.TM)
        self.cat_folds = pkl.load(uc.proj_open(self.datacfg.fold_fn))
        self.cat2url = uc.load_cat2url(
            uc.proj_open(self.datacfg.cat2url_fn))
        self.url_to_idx = dict((b, a) for a, b in enumerate(self.url_list))
        self.scratch = {}
        pass
    with rasengan.tictoc('Init Part 2 : The PP CFG'):
        print 'Reading', 'catpeople_pp_%d' % args.ppcfg
        self.smat = io.mmread(uc.proj_open('catpeople_pp_%d' % args.ppcfg))
        assert scipy.sparse.isspmatrix_coo(self.smat)
        if self.pp_prefix_is([UNIVEC, BIVEC, MALIGNER, DSCTOKVEC]):
            self.vectors = np.load(
                uc.proj_open('catpeople_pp_%d.vec' % args.ppcfg))
            pass
        if self.is_malignull():
            self.NULL_VEC = np.zeros((1, self.vectors.shape[1]))
    if self.exp_prefix_is([NBKERNEL, KERMACH, MALIGNER]):
        assert self.pp_prefix_is([UNIVEC, BIVEC, DSCTOKVEC])
    if self.expcfg.rm_fn_word:
        # Internally manipulates smat.
        self.remove_fn_word()
    if self.expcfg.weight_method.endswith('/df'):
        self.populate_idf()
    return
def main():
    new_dict = {}
    with rasengan.tictoc('Extracting Features'):
        for k in vertex_dict.keys():
            v = vertex_dict[k]
            new_dict[k] = Entity(v.guid, v.name, v.confidence, v.featsets,
                                 extract_feature_from_entity(v))
    with rasengan.tictoc('Pickling'):
        with open(os.path.expanduser(args.out_fn), 'wb') as f:
            pkl.dump(dict(vertex_dict=new_dict,
                          edgelist=edgelist,
                          TOTAL_FEATURES=TOTAL_FEATURES,
                          PERFECT_HASH=PERFECT_HASH),
                     f, protocol=-1)
def main():
    with open(args.graph_fn) as f:
        data = pickle.load(f)
        pass
    adjacent_node_dict = data['graph']
    edge_prob_dict = data['graph_weights']
    for vertex in edge_prob_dict:
        s = edge_prob_dict[vertex].sum()
        edge_prob_dict[vertex] = edge_prob_dict[vertex] / s
    queries = read_flatfile_query_fn(args.query_fn)
    vertex_count = len(adjacent_node_dict)
    query_visit = {}
    with rasengan.tictoc("Random Walks"):
        for qid, query in queries.iteritems():
            for start in query:
                start = int(start)
                query_visit[qid, start] = weighted_random_walk(
                    vertex_count, start, adjacent_node_dict, edge_prob_dict,
                    path_maxlength=args.path_maxlength, n_runs=args.n_runs)
    with open(args.out_fn, 'wb') as ofh:
        pickle.dump(query_visit, ofh)
    return
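# weighted_random_walk is called above but not defined in this file. A
# minimal sketch of what such a routine might look like, assuming it
# returns per-vertex visit counts accumulated over n_runs walks of at
# most path_maxlength steps (the signature matches the call site; the
# body is purely illustrative, not the project's implementation):
import numpy as np


def weighted_random_walk_sketch(vertex_count, start, adjacent_node_dict,
                                edge_prob_dict, path_maxlength=10,
                                n_runs=100):
    visit = np.zeros(vertex_count)
    for _ in xrange(n_runs):
        v = start
        for _ in xrange(path_maxlength):
            neighbors = adjacent_node_dict[v]
            if len(neighbors) == 0:
                break
            # Step to a neighbor according to the local edge
            # probabilities normalized in main() above.
            v = np.random.choice(neighbors, p=edge_prob_dict[v])
            visit[v] += 1
    return visit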
def read_queries(query_fn):
    queries = {}
    with rasengan.tictoc("Reading Queries"):
        with open(query_fn) as f:
            for row in f:
                row = row.strip().split()
                qid = row[0]
                queries[qid] = [int(e) for e in row[1:]]
    return queries
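# read_queries expects a whitespace-separated flat file with one query
# per line: a query id followed by integer vertex ids. For example, a
# hypothetical query file containing
#
#   q1 12 384 9971
#   q2 7 4510
#
# parses to {'q1': [12, 384, 9971], 'q2': [7, 4510]}.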
def entity_list_to_dscfeat_csr_mat(cfg, catpeople):
    url_list = catpeople['__URL_LIST__']
    yield_suf = cfg._name.startswith(DSCSUF)
    shape = (len(url_list),
             get_width_for_unisuf() if yield_suf else len(TM))
    with rasengan.tictoc('Loading Parses'):  # 1 min
        PARSES = pkl.load(util_catpeople.proj_open(cfg.parsefn))
    print 'Total Rows:', len(url_list)
    iterator = (get_dscfeat_from_catpeople_entity(
        catpeople[url], cfg, PARSES, yield_suf) for url in url_list)
    return csr_mat_builder(iterator, shape=shape, verbose=0)
def ls_exp_call_impl(self, fold_idx, cat, train_idx, test_idx):
    mat, features = self.create_mat_features(cat, train_idx)
    print 'Using %d Features For Category %s' % (len(features), cat)
    if len(features) == 0:
        self.pa(cat, [], [], [])
        return
    # ------------------------ #
    # Start Training / Testing #
    # ------------------------ #
    set_train_idx = set(train_idx)
    set_test_idx = set(test_idx)
    features = sorted(features)
    mat, needles_in_haystack = self.get_mat_needles_in_haystack(
        features,
        [i for i in xrange(self.smat.shape[0]) if i not in set_train_idx])
    labels = [(i in set_test_idx)
              for i in xrange(self.smat.shape[0]) if i not in set_train_idx]
    if self.expcfg.lsvc_loss == 'logloss':
        classifier = LogisticRegression(
            C=self.expcfg.lsvc_C, intercept_scaling=1000,
            penalty=self.expcfg.lsvc_penalty, dual=False, tol=1e-4,
            fit_intercept=True, verbose=0, random_state=args.seed,
            max_iter=1000)
    else:
        classifier = LinearSVC(
            C=self.expcfg.lsvc_C, intercept_scaling=1000,
            penalty=self.expcfg.lsvc_penalty, loss=self.expcfg.lsvc_loss,
            dual=False, tol=1e-4, fit_intercept=True, verbose=0,
            random_state=args.seed, max_iter=1000)
    with rasengan.tictoc('Fitting', timer='total_time'):
        classifier.fit(needles_in_haystack, labels)
    scores = classifier.decision_function(mat)
    # classifier.sparsify()
    self.pa(cat, scores, train_idx, test_idx,
            scratch=dict(coef=classifier.coef_,
                         intercept=classifier.intercept_,
                         features=features))
    return mat, classifier
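# The pattern above (fit a linear classifier, then rank every row by
# its decision_function margin) can be exercised in isolation. A
# self-contained sketch using sklearn's public API; the toy data is
# made up for illustration:
import numpy as np
from sklearn.svm import LinearSVC

np.random.seed(0)
X = np.random.rand(200, 50)
y = np.random.rand(200) > 0.5
clf = LinearSVC(C=1.0, dual=False, fit_intercept=True)
clf.fit(X, y)
# Signed distances to the separating hyperplane; larger means more
# confidently positive, so sorting by score yields a ranking.
scores = clf.decision_function(X)
ranking = np.argsort(-scores)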
def create_AT(self, arr_gen, intmdt_fn=None):
    # TODO: Make a shortcut, if the array to be generated already exists.
    try:
        I = arr_gen.I
    except AttributeError:
        I = arr_gen[0].shape[0]
    AT_arr_shape = (I, self.intermediate_dim * len(arr_gen))
    if intmdt_fn is None:
        print "Allocating array of size", AT_arr_shape
        AT_arr = numpy.empty(AT_arr_shape, dtype='float32', order='C')
    else:
        AT_arr = numpy.memmap(intmdt_fn, dtype='float32', mode='w+',
                              shape=AT_arr_shape, order='C')
    transform_f = VT.parse(self.view_transform)
    for arr_idx, _arr in enumerate(arr_gen):
        # arr = numpy.asfortranarray(transform_f(_arr))
        arr_ = transform_f(_arr)
        arr = arr_.tocsc()
        if arr is not _arr:
            del _arr
        if arr is not arr_:
            del arr_
        print >> sys.stderr, arr_idx, arr.shape, arr.max(), arr.min()
        print_config(msg='Started SVDS')
        with tictoc('Timing SVD', override='stderr'):
            [A, S, B] = sparse_svd(arr, self.intermediate_dim,
                                   method=self.svd_method)
        print_config(msg='Finished SVDS')
        if self.mean_center:
            if B is not None:
                [A, S, B] = lib_linalg.mean_center(A, S, B, arr)
            else:
                [A, S] = lib_linalg.mean_center(A, S, arr)
        A *= self.create_T(S)
        begin = self.intermediate_dim * arr_idx
        end = self.intermediate_dim * (arr_idx + 1)
        AT_arr[:, begin:end] = A
        del A, S, B
        print_config(msg='Finished processing Array: ' + str(arr_idx))
    if intmdt_fn is not None:
        AT_arr.flush()
    return AT_arr
def update_shelf():
    url_mention = DbfilenameShelf(args.in_shelf, protocol=-1)
    TM = url_mention['__TOKEN_MAPPER__']
    TM.finalize(catpeople_baseline_nb_config.MAX_TOK)
    E = url_mention['__URL_LIST__']
    n_doc = 10000
    with rasengan.tictoc('Extracting Contexts'):
        df_obj = TextualClueObject(E[:n_doc], url_mention, TM)
    df = defaultdict(int)
    for features in df_obj.features.itervalues():
        for f in features:
            df[f] += 1
    for f in df.keys():
        df[f] = df[f] / float(n_doc)
    url_mention['__DF__'] = dict(df)
    url_mention.close()
    return
def make(args, force=False, pipeline=False):
    '''
    In each training run we check whether a trained pickle file
    `saveto` for that stage already exists; if it does, we skip
    training. On exit, when `pipeline` is set, we point
    `pretrained_param_pklfile` at the pkl file that we just saved
    parameters to (or that already existed), and we restore
    `perform_training` and `saveto` to their defaults.

    Params
    ------
    args :
    force : (default False)
    pipeline : (default False)
    '''
    saveto = os.path.join(args.folder, args.pkl_name)
    with rasengan.tictoc('Making ' + saveto):
        rasengan.ensure_dir(args.folder, verbose=1, treat_as_dir=1)
        if hasattr(args, 'saveto'):
            assert args.saveto == saveto, str((args.saveto, saveto))
        else:
            args.saveto = saveto
            print 'Set args.saveto=', args.saveto
        # Check whether we need to do any training, unless forced
        # explicitly.
        pt = args.perform_training
        if not force and os.path.exists(args.saveto):
            args.perform_training = 0
        rasengan.increase_print_indent()
        #----#
        yield
        #----#
        rasengan.decrease_print_indent()
        args.perform_training = pt
        if pipeline:
            # Set the pretrained_param_pklfile field to a value after
            # saving parameters to that location.
            args.pretrained_param_pklfile = args.saveto
            # Reset args.saveto to null.
            args.saveto = None
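# Since make() contains a bare yield, it is presumably wrapped with
# contextlib.contextmanager (or driven manually as a generator) at its
# definition site. A hypothetical usage sketch under that assumption:
import contextlib

make_cm = contextlib.contextmanager(make)

# Each stage trains only if its pickle is missing; on exit the saved
# parameters become the next stage's pretrained initialization.
# with make_cm(args, pipeline=True):
#     perform_training_and_testing('stage-1', args, data)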
def entity_list_to_ngram_csr_mat(cfg, catpeople, width=None, n=0,
                                 add_governor_arc_label=False):
    assert n in [0, 1]
    url_list = catpeople['__URL_LIST__']
    shape = (len(url_list), len(TM) if width is None else width)
    PARSES = None
    if add_governor_arc_label:
        assert n == 0
        with rasengan.tictoc('Loading Parses'):  # 1 min
            PARSES = pkl.load(util_catpeople.proj_open(cfg.parsefn))
        iterator = (get_ngrams_from_catpeople_entity(
            n, catpeople[url], cfg, PARSES, yield_nsuf=True)
            for url_idx, url in enumerate(url_list))
    else:
        iterator = (get_ngrams_from_catpeople_entity(
            n, catpeople[url], cfg, None)
            for url_idx, url in enumerate(url_list))
    return csr_mat_builder(iterator, shape=shape, verbose=0)
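# csr_mat_builder (used here and in entity_list_to_dscfeat_csr_mat
# above) consumes one iterable of feature indices per row and packs
# them into a scipy CSR matrix. A minimal sketch of such a builder,
# assuming binary features and exactly shape[0] rows in the iterator;
# the project's real helper may differ:
import scipy.sparse


def csr_mat_builder_sketch(row_iterator, shape, verbose=0):
    indptr, indices, data = [0], [], []
    for feat_idxs in row_iterator:
        indices.extend(feat_idxs)
        data.extend([1] * len(feat_idxs))
        indptr.append(len(indices))
    return scipy.sparse.csr_matrix((data, indices, indptr), shape=shape)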
def main():
    import argparse
    arg_parser = argparse.ArgumentParser(description='')
    arg_parser.add_argument('--seed', default=0, type=int, help='Default={0}')
    arg_parser.add_argument(
        '--emb_pkl_fn', default='data/demonstrate_similarity_idea.emb.pkl',
        type=str)
    arg_parser.add_argument(
        '--feat_file',
        default='data/random/details/89c0c894.American_women_writers',
        type=str)
    arg_parser.add_argument('--ctag', default=None, type=int)
    arg_parser.add_argument('--mode_count', default=5, type=int)
    arg_parser.add_argument('--method', default='fast_relax', type=str,
                            choices=['brute_force', 'fast_relax',
                                     'annealed_gibbs', 'maxproduct-bp',
                                     'variational_inference',
                                     'dc_programming'])
    args = arg_parser.parse_args()
    import random
    random.seed(args.seed)
    numpy.random.seed(args.seed)
    cfg.mode_count = args.mode_count
    tags_to_remove = defaultdict(list)
    with rasengan.tictoc('Loading pkl'):
        embeddings = pkl.load(open(args.emb_pkl_fn))
    if cfg.introduce_NULL_embedding:
        embeddings[cfg.NULL_KEY] = numpy.zeros(
            next(embeddings.itervalues()).shape)
    with rasengan.debug_support():
        for mode_idx in range(cfg.mode_count):
            print 'mode_idx=', mode_idx
            entity_tags = {}
            entities = []
            for row in open(args.feat_file):
                _e, _tags = [e.strip() for e in row.strip().split('|||')]
                entities.append(_e)
                entity_tags[_e] = set([
                    t.lower()
                    for t in (e.strip().split(':')[0]
                              for e in _tags.split())
                    if t.lower() in embeddings])
            total_tags = set(
                rasengan.flatten([list(e) for e in entity_tags.values()]))
            assert all(e in embeddings for e in total_tags)
            print('For each of these people our goal is to select one word.'
                  ' That word should be as similar to other words picked for'
                  ' other entities as possible')
            problem = rasengan.OrderedDict_Indexable_By_StringKey_Or_Index()
            for (a, b) in entity_tags.items():
                b = list(b)
                print 'Entity: ', a, 'tags to remove: ', tags_to_remove[a]
                for ttr in tags_to_remove[a]:
                    tolerant_remove(b, ttr)
                if cfg.introduce_NULL_embedding and cfg.NULL_KEY not in b:
                    b.append(cfg.NULL_KEY)
                # print '%-25s' % a, '|||', ', '.join(b)
                problem[a] = DataFrame(
                    data=numpy.concatenate(
                        [(scale_to_unit(embeddings[e])
                          if cfg.scale_to_unit
                          else embeddings[e])[None, :] for e in b],
                        axis=0),
                    index=b)
            if args.ctag is None:
                initial_assignment = dict(
                    (__a, 0) for __b, __a in enumerate(entities))
            else:
                ctag = 'war'.split()[args.ctag]
                initial_assignment = dict(
                    (__e, (cfg.NULL_KEY
                           if ctag not in entity_tags[__e]
                           else ctag))
                    for __e in entities)
            print 'Initial chosen tags::', chosen_tags(problem,
                                                       initial_assignment)
            initial_objective = dp_objective_efficient_impl(
                problem, initial_assignment)
            print 'initial_objective=', initial_objective
            assert numpy.isclose(
                dp_objective_naive_impl(problem, initial_assignment),
                initial_objective)
            final_assignment = optimize_assignment(
                problem, initial_assignment, method=args.method)
            final_objective = dp_objective_efficient_impl(
                problem, final_assignment)
            for (fa_entity, fa_tag_idx) in final_assignment.iteritems():
                tags_to_remove[fa_entity].append(
                    liloc(problem[fa_entity], fa_tag_idx).name)
            print 'mode_idx=', mode_idx,
            print 'initial_objective=', initial_objective,
            print 'final_objective=', final_objective,
            print 'Final chosen tags=', chosen_tags(problem, final_assignment)
    return
print preamble, 'AUPR=%.3f' % rasengan.rank_metrics.average_precision(
    testing_output),
testing_output = [(1 if e in set_I else 0)
                  for e in range(total_persons) if e not in set_train_idx]
sto = sum([e for e in testing_output])
e0_to = [e for e in testing_output]
print 'CORRECTAUPR=%.3f' % rasengan.rank_metrics.average_precision(e0_to), \
    'CORRECTP@10=%.3f' % (rasengan.rank_metrics.precision_at_k(e0_to, 10)
                          if sto > 10 else -1), \
    'CORRECTP@100=%.3f' % (rasengan.rank_metrics.precision_at_k(e0_to, 100)
                           if sto > 100 else -1)
continue
with rasengan.tictoc('Writing graph file'):
    with open('graph_file', 'wb') as f:
        for row, col, val in zip(*scipy.sparse.find(s_features)):
            if col != predicate_idx and (dnd != 'wo_doc'
                                         or col not in docfeat_idx):
                f.write('%d\t%d\t%.4f\n' % (row, total_persons + col, val))
with rasengan.tictoc('Writing Seed File'):
    with open('seed_file', 'wb') as f:
        for node in train_idx:
            if node in set_I:
                f.write('%d\tL1\t1\n' % node)
            else:
                f.write('%d\tL0\t1\n' % node)
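# The rank_metrics calls above reduce a score-sorted 0/1 relevance list
# to scalar quality numbers. Sketches of average precision (AUPR here)
# and precision-at-k, assuming rasengan.rank_metrics follows the
# standard definitions (illustrative; the library's internals are not
# shown in this file):
def average_precision_sketch(rel):
    # rel: list of 0/1 relevance labels, already sorted by model score.
    hits, psum = 0, 0.0
    for k, r in enumerate(rel, start=1):
        if r:
            hits += 1
            psum += float(hits) / k
    return psum / max(hits, 1)


def precision_at_k_sketch(rel, k):
    return float(sum(rel[:k])) / k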
import random
import cPickle as pkl
import numpy as np
import rasengan
from rasengan import rank_metrics
import os
import ipdb as pdb
import igraph
import argparse
import itertools
arg_parser = argparse.ArgumentParser(description='')
arg_parser.add_argument('--seed', default=0, type=int)
arg_parser.add_argument('--rw_walk_num', default=10, type=int)
arg_parser.add_argument('--rw_max_step', default=3, type=int)
args = arg_parser.parse_args()
random.seed(args.seed)
np.random.seed(args.seed)
with rasengan.tictoc('Setup'):
    IDX_PKL_FN = r'../../scratch/relational_bbn2_train_test_idx.pkl'
    fn = os.path.expanduser(
        '~/data/tackbp2015bbn2/basicfeaturization_relational_bbn2.pkl')
    data = pkl.load(open(fn))
    vertex_dict = data['vertex_dict']
    edgelist = data['edgelist']
    TOTAL_FEATURES = data['TOTAL_FEATURES']
    F2I_MAP = data['PERFECT_HASH']
    I2F_MAP = dict((a, b) for (b, a) in F2I_MAP.iteritems())
    guid_list = vertex_dict.keys()
    vertices = [vertex_dict[e] for e in guid_list]
    features = [v.features for v in vertices]
    row_names = [v.name for v in vertices]


def index_row_names(idi):
def process_AT(self, AT_arr, debug=False):
    print_config(msg='Started svd_1')
    with tictoc('Performing Final SVD', override='stderr'):
        [G, i] = lib_linalg.svd_1(AT_arr, debug=debug)
    print_config(msg='Finished svd_1')
    return G
| Last-Updated: Thu Sep  1 13:42:50 2016 (-0400)
|           By: Pushpendre Rastogi
|     Update #: 10
'''
import cPickle as pkl
from wikilink_category_to_url_and_count_reverse_index import WikilinkReverseIndex
import argparse
import rasengan
arg_parser = argparse.ArgumentParser(description='')
arg_parser.add_argument(
    '--caturl_pkl',
    default='data/wikilink_category_to_url_and_count_reverse_index.pkl',
    type=str)
arg_parser.add_argument('--chosen_cat',
                        default='data/chosen_wikilink_categories', type=str)
arg_parser.add_argument('--mention_thresh', default=10, type=int)
args = arg_parser.parse_args()
chosen_cat = set(
    [e.strip().split()[0] for e in open(args.chosen_cat) if e != '\n'])
with rasengan.tictoc('loading pkl', override='stderr'):
    caturl = pkl.load(open(args.caturl_pkl))
for row in open(args.chosen_cat):
    if row == '\n':
        continue
    cat = row.strip().split()[0]
    for url, cnt in caturl[cat]:
        if cnt >= args.mention_thresh:
            print cat, url
                        default=None, type=str, help='Default={None}')
arg_parser.add_argument('--cache_fn', default=None, type=str,
                        help='Default={None}')
arg_parser.add_argument('--leaf_fn', default=None, type=str,
                        help='Default={None}')
args = arg_parser.parse_args()
# ------------------- #
# Initialize Globals. #
# ------------------- #
with rasengan.tictoc('Initializing Globals'):
    CFG = rasengan.deep_namespacer(
        yaml.load(open('relationalize_base_graph.yaml').read()))
    FOREIGN_NS, BASE_NS = pickle.load(open(args.cache_fn, 'rb'))
    ORG_TYPES = ['adept-core#OrgHeadquarter',
                 'adept-core#Organization',
                 'adept-core#OrganizationWebsite',
                 'adept-core#StartOrganization',
                 'adept-core#EndOrganization',
                 'adept-core#Membership',
                 'adept-core#Subsidiary']
    NONRELATIONAL_TYPES = ['adept-base#Date',
                           'adept-core#Crime',
                           'adept-core#GeoPoliticalEntity',
                           'adept-core#Person',
                           'adept-core#Title',
                           'adept-core#URL']
# ---------------- #
def train_transducer_lbfgs(
        train_lex, train_y, args, ttns, training_stats, batch_size=None):
    '''
    This function completes a training epoch by doing one run of LBFGS.
    `ts` abbreviates `train_stack` in the entire function.

    Params
    ------
    train_lex : A list of input strings (the strings are represented
        as np arrays).
    train_y : A list of output strings.
    batch_size : UNUSED : (default None)
    '''
    assert args.clipping_value < 0
    assert args.projection_threshold < 0
    ts_param_name = [
        str(e) for e in ttns.train_stack_config.updatable_parameters()]
    print 'The following params will be trained by lbfgs', ts_param_name
    ts_param_shape_list = [ttns.train_stack_config[name].get_value().shape
                           for name in ts_param_name]
    ts_param_shape_map = dict(zip(ts_param_name, ts_param_shape_list))
    total_param = sum(numpy.prod(shape)
                      for shape in ts_param_shape_map.values())

    def set_entries_in_ttns(param_vec):
        '''
        Set entries in ttns.train_stack_config to the corresponding
        values in param_vec.
        '''
        param_vec = param_vec.astype('float32')
        offset = 0
        for name in ts_param_name:
            shape = ts_param_shape_map[name]
            numel = numpy.prod(shape)
            ttns.train_stack_config[name].set_value(
                param_vec[offset:offset + numel].reshape(shape))
            offset += numel
            pass
        return

    def vectorize(param_list, dtype='float32'):
        param_vec = numpy.zeros((total_param,), dtype=dtype)
        offset = 0
        for idx, param in enumerate(param_list):
            shape = param.shape
            assert shape == ts_param_shape_list[idx]
            numel = numpy.prod(shape)
            param_vec[offset:offset + numel] = param.reshape(
                (numel,)).astype(dtype)
            offset += numel
            pass
        return param_vec

    def get_entries_in_ttns():
        '''
        Return the current values in ttns.train_stack_config flattened
        into a single parameter vector.
        '''
        return vectorize(
            [ttns.train_stack_config[name].get_value()
             for name in ts_param_name])

    def loss_over_corpus(param_vec):
        '''
        Compute the loss value over the entire corpus.
        '''
        set_entries_in_ttns(param_vec)
        corpus_cost = 0
        for idx in range(len(train_lex)):
            input_string = train_lex[idx]
            output_string = train_y[idx]
            corpus_cost += ttns.train_f_cost(input_string, output_string)
        return corpus_cost / len(train_lex)

    def gradient_over_corpus(param_vec):
        set_entries_in_ttns(param_vec)
        corpus_grad = numpy.zeros((total_param,), dtype='float64')
        for idx in range(len(train_lex)):
            input_string = train_lex[idx]
            output_string = train_y[idx]
            tmp_grad = ttns.train_f_grad(input_string, output_string)
            corpus_grad += vectorize(tmp_grad, 'float64')
        return corpus_grad / len(train_lex)

    with rasengan.tictoc("Training %d epoch" % training_stats['epoch_id']):
        init_param = get_entries_in_ttns()
        rasengan.warn('Skipped FD Check')
        # print 'Check grad output: Error=', scipy.optimize.check_grad(
        #     func=loss_over_corpus, grad=gradient_over_corpus,
        #     x0=init_param)
        opt_param = scipy.optimize.fmin_l_bfgs_b(
            loss_over_corpus, init_param, fprime=gradient_over_corpus,
            disp=2, maxiter=1000)[0]
        set_entries_in_ttns(opt_param)
    return
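# The flatten/unflatten pattern above (vectorize all model parameters
# into one vector so scipy's L-BFGS can optimize them jointly) can be
# demonstrated on a toy least-squares problem. A self-contained sketch
# using only scipy's public fmin_l_bfgs_b API; the shapes and data are
# made up for illustration:
import numpy
import scipy.optimize

shapes = [(3, 2), (2,)]            # two "parameters": a matrix and a bias
sizes = [numpy.prod(s) for s in shapes]
X = numpy.random.rand(50, 3)
Y = numpy.random.rand(50, 2)


def unpack(vec):
    out, offset = [], 0
    for shape, numel in zip(shapes, sizes):
        out.append(vec[offset:offset + numel].reshape(shape))
        offset += numel
    return out


def loss_and_grad(vec):
    W, b = unpack(vec)
    R = X.dot(W) + b - Y
    loss = 0.5 * (R ** 2).sum()
    # Pack the per-parameter gradients back into one flat vector.
    grad = numpy.concatenate([X.T.dot(R).ravel(), R.sum(axis=0).ravel()])
    return loss, grad


opt_param, opt_loss, info = scipy.optimize.fmin_l_bfgs_b(
    loss_and_grad, numpy.zeros(sum(sizes)))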
#!/usr/bin/env python
'''
| Filename    : wikimic_create_entity_token_set.ipkl.py
| Description : Convert a giant pickle file that can't be processed
|               conveniently with other data into a streaming pkl.
| Author      : Pushpendre Rastogi
| Created     : Fri Aug  5 11:37:23 2016 (-0400)
| Last-Updated: Fri Aug  5 11:47:29 2016 (-0400)
|           By: Pushpendre Rastogi
|     Update #: 1
'''
import argparse
from rasengan import sPickle, tictoc
arg_parser = argparse.ArgumentParser(description='')
arg_parser.add_argument('--in_fn', type=str)
arg_parser.add_argument('--out_fn', type=str)
args = arg_parser.parse_args()
with tictoc('Loading Data'):
    import cPickle as pkl
    data = pkl.load(open(args.in_fn))
with tictoc('Writing Data'):
    with open(args.out_fn, 'wb') as f:
        sPickle.s_dump(data.iteritems(), f)
|     Update #: 24
'''
import cPickle as pkl
import rasengan
import yaml
# yaml_data = yaml.load(open('data/women_writer_manual_clues.yaml'))
# tags = set(
#     rasengan.flatten([(rasengan.flatten(b[1::2])) for b in
#                       yaml_data.values()]))
tags = {}
for row in open('data/entity_descriptors_procoref~1.psv'):
    entity, _tags = [e.strip() for e in row.strip().split('|||')]
    for t in (e.strip().split(':')[0] for e in _tags.split()):
        tags[t] = None
        tags[t.lower()] = None
print len(tags)
with rasengan.tictoc('Loading MVLSA emb'):
    data = pkl.load(open(
        '/Users/pushpendrerastogi/data/embedding/mvlsa/'
        'combined_embedding_0.emb.pkl'))
tag_emb = {}
for tag in tags:
    try:
        tag_emb[tag] = data[tag]
    except KeyError:
        print tag
with open('data/demonstrate_similarity_idea.emb.pkl', 'wb') as f:
    pkl.dump(tag_emb, f, protocol=-1)
                 len(fold[1])] + self.get_fold_stats(fold))

    def fold_stats(self, add_cat=False):
        if add_cat:
            return [[cat, fold_idx] + self.get_fold_stats(fold)
                    for cat in self.record
                    for fold_idx, fold in enumerate(self.record[cat])]
        else:
            return [self.get_fold_stats(fold)
                    for cat in self.record
                    for fold in self.record[cat]]

    def __str__(self):
        fold_stats = self.fold_stats()
        return ('(AUPR %.3f %.3f) (P@10 %.3f %.3f) (P@100 %.3f %.3f) '
                '(MRR %.3f %.3f)') % tuple(
                    numpy.array(fold_stats).mean(axis=0).tolist())


if __name__ == '__main__':
    import cPickle as pkl
    for f in ['/export/b15/prastog3/'
              'catpeople_experiment.ppcfg~8.expcfg~303.pkl']:
        print '--------- FILE: ', f
        with tictoc('Loading Pkl'):
            data = pkl.load(open(f))
        # data.limit=1000
        print data
|     Update #: 28
'''
import cPickle as pkl
import os.path
import bz2
import rasengan
from collections import defaultdict
opj = os.path.join
dbpdir = os.path.expanduser('~/Downloads/dbpedia')
from util_wikiurl import simplify_wiki_url
DBPEDIA_PREF_LEN = len('<http://dbpedia.org/resource/')
CAT_PREF_LEN = len('Category:')
cat_index = {}
row_idx = 0
with rasengan.tictoc('LOADING CATEGORY INDEX FROM DBPEDIA'):  # 100s
    for row in bz2.BZ2File(opj(dbpdir, 'article_categories_en.ttl.bz2')):
        # Discard rows that start with '#' since they are comments.
        if row.startswith('#'):
            continue
        row = row.strip().split()
        cat = row[2][DBPEDIA_PREF_LEN + CAT_PREF_LEN:-1]
        if cat not in cat_index:
            cat_index[cat] = row_idx
            row_idx += 1
with open('data/dbpedia_cat_index.pkl', 'wb') as f:
    pkl.dump(cat_index, f)
with rasengan.tictoc('LOADING ARTICLE to CATEGORY MAP FROM DBPEDIA'):  # 121s
    art_cat = defaultdict(list)
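# Each data line of article_categories_en.ttl is an N-Triples row like
# the following (a representative line; the specific article is
# illustrative):
#
#   <http://dbpedia.org/resource/Alabama> <http://purl.org/dc/terms/subject> <http://dbpedia.org/resource/Category:States_of_the_United_States> .
#
# After .split(), row[2] is the object URI; slicing off the
# '<http://dbpedia.org/resource/Category:' prefix and the trailing '>'
# leaves 'States_of_the_United_States', which is what gets interned
# into cat_index.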
def main():
    with rasengan.tictoc('Yaml Loading'):
        cfg = rasengan.deep_namespacer(
            yaml.load(open('relationalize_base_graph.yaml')))
    feat_strings = []
    for k in (_ for _ in cfg.features
              if _ not in ['adept-core#ChargeIndict', 'adept-core#BeBorn']):
        v = [e for e in cfg.features[k].keys()
             if e not in ['person', 'document', 'confidence']]
        for e in v:
            feat_strings.append(k + '~' + e + '~name~')
    fn = ('/Users/pushpendrerastogi/data/'
          'tackbp2015bbn2/basicfeaturization_relational_bbn2.pkl')
    data = pkl.load(open(fn))
    vertex_dict = data['vertex_dict']
    edgelist = data['edgelist']
    TOTAL_FEATURES = data['TOTAL_FEATURES']
    F2I_MAP = data['PERFECT_HASH']
    I2F_MAP = dict((a, b) for (b, a) in F2I_MAP.iteritems())
    TOTAL_PERSONS = len(vertex_dict)
    with rasengan.tictoc('docs creation'):
        docs = set([fs['~document']
                    for v in vertex_dict.values()
                    for fs in v.featsets])
    with rasengan.tictoc('s_features creation'):
        guid_list = vertex_dict.keys()
        vertices = [vertex_dict[e] for e in guid_list]
        features = [v.features for v in vertices]
        row_names = [v.name for v in vertices]
        data = []
        row = []
        col = []
        for r, f in enumerate(features):
            for c, d in f:
                data.append(d)
                col.append(c)
                row.append(r)
        s_features = csc_matrix((data, (row, col)),
                                shape=[len(vertex_dict), TOTAL_FEATURES])
    feature_occurrence = lambda rpfx: [
        (e, v, s_features[:, v].getnnz(), s_features[:, v].sum())
        for e, v in F2I_MAP.items() if e.startswith(rpfx)]
    high_occurrence_feat = lambda rpfx: sorted(
        [e for e in feature_occurrence(rpfx) if e[2] > 1],
        key=lambda x: x[2], reverse=True)
    # adept-core#Origin~origin~name~"Israel"
    # adept-core#Resident~location~name~"United States"
    # adept-core#Die~pod~name~"Iraq"
    # adept-core#Die~pod~name~"United States"
    # adept-core#Die~pod~name~"Pakistan"
    # adept-core#Leadership~subject_org~name~"Democrats"
    # adept-core#StudentAlum~almamater~name~"Harvard University"
    # adept-core#InvestorShareholder~invested_org~name~"Chrysler"
    # adept-core#InvestorShareholder~invested_org~name~"Boston Globe"
    # adept-core#InvestorShareholder~invested_org~name~"New York Post"
    # adept-core#EmploymentMembership~employer~name~"United States"
    # adept-core#Role~role~name~"manager"
    # adept-core#Founder~founded_org~name~"Church"
    # adept-core#Founder~founded_org~name~"Solamere Capital"
    # adept-core#Founder~founded_org~name~"Tesla Motors"
    # We used the high_occurrence_feat function to find out the right
    # features to use.
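# s_features above is built COO-style: parallel (data, (row, col))
# triplet lists handed to csc_matrix, which compresses by column. A
# tiny standalone example of the same construction with toy values:
from scipy.sparse import csc_matrix

data = [1.0, 2.0, 3.0]
row = [0, 0, 2]
col = [1, 3, 3]
m = csc_matrix((data, (row, col)), shape=[3, 5])
# m.toarray()[0, 1] == 1.0 and m.toarray()[2, 3] == 3.0; column slices
# like m[:, 3] stay sparse, which is what feature_occurrence exploits.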
if LOAD_DATA:
    data_used = pkl.load(open(IDX_PKL_FN, 'rb'))
else:
    data_used = defaultdict(dict)
for feat in ['adept-core#EmploymentMembership~employer~name~"Army"',
             'adept-core#EmploymentMembership~employer~name~"White House"',
             'adept-core#Leadership~subject_org~name~"Democratic"',
             'adept-core#Leadership~subject_org~name~"Parliament"',
             'adept-core#Origin~origin~name~"American"',
             'adept-core#Origin~origin~name~"Russia"',
             'adept-core#Resident~location~name~"Chinese"',
             'adept-core#Resident~location~name~"Texas"',
             'adept-core#Role~role~name~"author"',
             'adept-core#Role~role~name~"director"',
             'adept-core#StudentAlum~almamater~name~"Harvard"',
             'adept-core#StudentAlum~almamater~name~"Stanford"']:
    feat_idx = F2I_MAP[feat]
    I = list(s_features[:, feat_idx].nonzero()[0])
    sI = set(I)
    Ic = list([_ for _ in range(s_features.shape[0]) if _ not in sI])
    random.shuffle(I)
    random.shuffle(Ic)
    # preamble = '\npfx=%s feature_rank=%d feat=%s' % (
    #     pfx, feature_rank, feat)
    for trials in range(5):
        preamble = 'feat=%s' % feat
        try:
            if len(I) < 10:
                raise DatasetTooSmall
            ds = Dataset(row_names, s_features, I, Ic, I2F_MAP,
                         perma_mask=[feat_idx],
                         test_size_by2=min(25, len(I) / 2))
            if not LOAD_DATA:
                data_used[feat][trials] = dict(
                    train=ds.get_train_set_idx(5),
                    test=ds.get_test_set_idx())
            assert feat_idx not in ds.col_idx_to_keep()
        except DatasetTooSmall:
            print preamble, '\nTest Set too big'
            continue
        if not LOAD_DATA:
            continue
        for train_size_by2 in [5]:  # (2, 5, 10, 20):
            for mask_pattern in (re.compile('~document~.*'),
                                 re.compile('XXXX')):
                print preamble, 'train_size_by2=%d' % train_size_by2, \
                    'mask_pattern.pattern=%s' % mask_pattern.pattern
                train_idx = (data_used[feat][trials]['train']
                             if LOAD_DATA else None)
                if USE_SMALL_TEST_SET:
                    test_idx = (data_used[feat][trials]['test']
                                if LOAD_DATA else None)
                else:
                    set_train_idx = set(train_idx)
                    test_idx = ([i for i in range(TOTAL_PERSONS)
                                 if i not in set_train_idx]
                                if LOAD_DATA else None)
                binary_linear_classifier_diagnostics(
                    ds, train_size_by2=train_size_by2,
                    mask_pattern=mask_pattern,
                    train_idx=train_idx, test_idx=test_idx)
# The feature runs are over.
if not LOAD_DATA:
    with open(IDX_PKL_FN, 'wb') as f:
        pkl.dump(dict(data_used), f, protocol=-1)
    return (float(x) * y) / (x + y) / 2


# --------------------------- #
# BEGIN SCRIPT FUNCTIONALITY  #
# --------------------------- #
import argparse
arg_parser = argparse.ArgumentParser(description='')
arg_parser.add_argument('--cnt_transform',
                        default='GM_SQRT_FREQ_SQRT_COUNT', type=str)
arg_parser.add_argument('--print_entity_list', default=1, type=int)
arg_parser.add_argument('--intervene_modes', default=1, type=int)
args = arg_parser.parse_args()
with tictoc('Loading emb pkl'):
    dcr2emb = pkl.load(open('data/demonstrate_similarity_idea.emb.pkl'))
    for e in dcr2emb:
        dcr2emb[e] = scale_to_unit(dcr2emb[e])
cat2mode = get_cat2mode()
CONSTANT = (lambda x, t: 1)
COUNT = (lambda x, t: x)
LOG_COUNT = (lambda x, t: math.log(1 + x))
SQRT_COUNT = (lambda x, t: math.sqrt(x))
FREQ = (lambda x, t: float(x + 1) / (t + 1))
SQ_FREQ = (lambda x, t: (float(x + 1) / (t + 1)) ** 2)
SQRT_FREQ = (lambda x, t: math.sqrt(float(x + 1) / (t + 1)))
PROD_SQRT_FREQ_SQRT_COUNT = (
    lambda x, t: SQRT_COUNT(x, t) * SQRT_FREQ(x, t))
GM_SQRT_FREQ_SQRT_COUNT = (
    lambda x, t: math.sqrt(SQRT_COUNT(x, t) * SQRT_FREQ(x, t)))
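# Each transform maps a raw count x (out of t total) to a dampened
# weight. A quick worked example with x=4, t=100:
#
#   COUNT                     -> 4
#   SQRT_COUNT                -> 2.0
#   FREQ                      -> 5/101        ~ 0.0495
#   SQRT_FREQ                 -> sqrt(5/101)  ~ 0.2225
#   PROD_SQRT_FREQ_SQRT_COUNT -> 2.0 * 0.2225 ~ 0.4450
#   GM_SQRT_FREQ_SQRT_COUNT   -> sqrt(0.4450) ~ 0.6671
#
# i.e. the default GM_SQRT_FREQ_SQRT_COUNT is the geometric mean of the
# square-root count and the square-root add-one-smoothed frequency.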
def main(args):
    with rasengan.debug_support():
        with rasengan.tictoc("Loading Data"):
            data_list = rasengan.namespacer(read_data(args.train_fn))
            val_data_list = rasengan.namespacer(read_data(args.dev_fn))
            if args.partition_dev_into_train > 0:
                lim = args.partition_dev_into_train
                data_list.extend(val_data_list[lim:])
                val_data_list = val_data_list[:lim]
            if args.partition_dev_into_test > 0:
                lim = args.partition_dev_into_test
                test_data_list = val_data_list[lim:]
                val_data_list = val_data_list[:lim]
            else:
                test_data_list = rasengan.namespacer(read_data(args.test_fn))
            # data_list = val_data_list = [(u'jason', u'eisner')]
        lst_char = get_lst_char(data_list + val_data_list + test_data_list)
        data_list = add_bos(data_list)
        val_data_list = add_bos(val_data_list)
        test_data_list = add_bos(test_data_list)
        warnings.warn('''
        NOTE: While preparing sigma, we add 1 to the index returned by
        enumerate because the transducer unit that Ryan wrote uses
        index 0 as the index for the epsilon symbol. So essentially the
        epsilon symbol and the integer 0 are reserved symbols that
        cannot appear in the vocabulary. ALSO, we need to add 1 to the
        vocsize because of that.
        ''')
        # sigma :: char -> int
        sigma = dict((b, a + 1) for (a, b) in enumerate(lst_char))
        # sigma_inv :: int -> char
        sigma_inv = dict((a + 1, b) for (a, b) in enumerate(lst_char))
        if args.limit_corpus > 0:
            data_list = data_list[:args.limit_corpus]
        train_data = numerize(data_list, sigma, args.win)
        val_data = numerize(val_data_list, sigma, args.win)
        test_data = numerize(test_data_list, sigma, args.win)
        data = rasengan.Namespace()
        #-------------------------------------------------------------#
        # Add sets that would be used by the tensorflow seq2seq       #
        # model. See~$PY/tensorflow/models/rnn/translate/translate.py #
        #-------------------------------------------------------------#
        data.train_data = data_list
        data.val_data = val_data_list
        data.test_data = test_data_list
        data.train_set = train_data
        data.dev_set = val_data
        data.test_set = test_data
        data.vocsize = len(sigma) + 1
        data.idx2label = sigma_inv
        data.label2idx = sigma
        data.train_lex = [e[0] for e in train_data]
        data.train_y = [e[1] for e in train_data]
        data.valid_lex = [e[0] for e in val_data]
        data.valid_y = util_lstm_seqlabel.convert_id_to_word(
            [e[1] for e in val_data], data.idx2label)
        data.test_lex = [e[0] for e in test_data]
        data.test_y = util_lstm_seqlabel.convert_id_to_word(
            [e[1] for e in test_data], data.idx2label)
        data.words_train = []
        data.words_valid = []
        data.words_test = []
    return data
def train_transducer_lbfgs(train_lex, train_y, args, ttns, training_stats,
                           batch_size=None):
    '''
    This function completes a training epoch by doing one run of LBFGS.
    `ts` abbreviates `train_stack` in the entire function.

    Params
    ------
    train_lex : A list of input strings (the strings are represented
        as np arrays).
    train_y : A list of output strings.
    batch_size : UNUSED : (default None)
    '''
    assert args.clipping_value < 0
    assert args.projection_threshold < 0
    ts_param_name = [
        str(e) for e in ttns.train_stack_config.updatable_parameters()
    ]
    print('The following params will be trained by lbfgs', ts_param_name)
    ts_param_shape_list = [
        ttns.train_stack_config[name].get_value().shape
        for name in ts_param_name
    ]
    ts_param_shape_map = dict(zip(ts_param_name, ts_param_shape_list))
    total_param = sum(
        numpy.prod(shape) for shape in ts_param_shape_map.values())

    def set_entries_in_ttns(param_vec):
        '''
        Set entries in ttns.train_stack_config to the corresponding
        values in param_vec.
        '''
        param_vec = param_vec.astype('float32')
        offset = 0
        for name in ts_param_name:
            shape = ts_param_shape_map[name]
            numel = numpy.prod(shape)
            ttns.train_stack_config[name].set_value(
                param_vec[offset:offset + numel].reshape(shape))
            offset += numel
            pass
        return

    def vectorize(param_list, dtype='float32'):
        param_vec = numpy.zeros((total_param, ), dtype=dtype)
        offset = 0
        for idx, param in enumerate(param_list):
            shape = param.shape
            assert shape == ts_param_shape_list[idx]
            numel = numpy.prod(shape)
            param_vec[offset:offset + numel] = param.reshape(
                (numel, )).astype(dtype)
            offset += numel
            pass
        return param_vec

    def get_entries_in_ttns():
        '''
        Return the current values in ttns.train_stack_config flattened
        into a single parameter vector.
        '''
        return vectorize([
            ttns.train_stack_config[name].get_value()
            for name in ts_param_name
        ])

    def loss_over_corpus(param_vec):
        '''
        Compute the loss value over the entire corpus.
        '''
        set_entries_in_ttns(param_vec)
        corpus_cost = 0
        for idx in range(len(train_lex)):
            input_string = train_lex[idx]
            output_string = train_y[idx]
            corpus_cost += ttns.train_f_cost(input_string, output_string)
        return corpus_cost / len(train_lex)

    def gradient_over_corpus(param_vec):
        set_entries_in_ttns(param_vec)
        corpus_grad = numpy.zeros((total_param, ), dtype='float64')
        for idx in range(len(train_lex)):
            input_string = train_lex[idx]
            output_string = train_y[idx]
            tmp_grad = ttns.train_f_grad(input_string, output_string)
            corpus_grad += vectorize(tmp_grad, 'float64')
        return corpus_grad / len(train_lex)

    with rasengan.tictoc("Training %d epoch" % training_stats['epoch_id']):
        init_param = get_entries_in_ttns()
        rasengan.warn('Skipped FD Check')
        # print 'Check grad output: Error=', scipy.optimize.check_grad(
        #     func=loss_over_corpus, grad=gradient_over_corpus,
        #     x0=init_param)
        opt_param = scipy.optimize.fmin_l_bfgs_b(loss_over_corpus,
                                                 init_param,
                                                 fprime=gradient_over_corpus,
                                                 disp=2,
                                                 maxiter=1000)[0]
        set_entries_in_ttns(opt_param)
    return
methods like ADAGrad can handle it.
8. I need to do mean normalization.
'''
if __name__ == '__main__':
    import argparse
    arg_parser = argparse.ArgumentParser(description='')
    arg_parser.add_argument('--seed', default=0, type=int)
    arg_parser.add_argument(
        '--config', type=str,
        default='Mvlsa@intermediate_dim~300@view_transform~SQROOT@mean_center~0')
    arg_parser.add_argument('--I', default=config.TREC_WEB_N_ENTITIES,
                            type=int)
    arg_parser.add_argument('--fn', default=config.TREC_WEB_HIT_LIST_NPZ,
                            type=str)
    arg_parser.add_argument('--test', default=0, type=int)
    args = arg_parser.parse_args()
    import random
    random.seed(args.seed)
    numpy.random.seed(args.seed)
    out_fn = os.path.join(config.TREC_WEB_STORAGE, args.config)
    if args.test:
        args.fn = None
    G = embed(args.config, args.I, args.fn,
              save_intmdt_fn=out_fn + '.AT_arr')
    with tictoc('Pickling G'):
        with open(out_fn, 'wb') as f:
            numpy.save(f, G, allow_pickle=False)