def order_unigrams_and_bigrams(self):
    self.fS = {}
    self.features = {}
    for e, Se in itertools.izip(self.S, self.Se_list):
        e_feat = self.create_ngrams(Se, 1)
        e_feat.update(self.create_ngrams(Se, 2))
        self.features[e] = e_feat
    self.fS = set(rasengan.flatten((
        list(v) for v in self.features.itervalues())))
    return
def catpeople_stats():
    ''' Show Statistics about the CatPeople Dataset '''
    (url_mention, TM, E, cat_folds, cat2url, performance_aggregator,
     _) = setup()
    print 'Total Number of entities', len(E)
    print 'Total Number of Categories', len(cat2url)
    print 'Total Number of URLs', len(
        set(rasengan.flatten(cat2url.itervalues())))
    print 'Total Number of mentions', sum(len(url_mention[e]) for e in E)
    return
def process_soup(page_html):
    soup = BeautifulSoup(page_html, 'html.parser')
    data = []
    cur_div = soup.body.div
    assert cur_div is not None
    while cur_div is not None:
        try:
            style = Style(cur_div['style'])
            span_style = Style(cur_div.span['style'])
            all_span_strings = rasengan.flatten(
                list([list(e.stripped_strings) for e in cur_div.children]))
            data.append((style, span_style, all_span_strings))
        except:
            # Skip siblings that are not <div> tags with the expected
            # style/span structure (e.g. bare text nodes).
            pass
        cur_div = cur_div.next_sibling
    return (Style(soup.span['style']), data)
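# A minimal usage sketch of process_soup (assumed, not from this project):
# 'exported_page.html' is a hypothetical file name, and Style is expected to
# come from the surrounding module.
with open('exported_page.html') as fh:
    top_span_style, rows = process_soup(fh.read())
for div_style, span_style, strings in rows:
    print div_style, span_style, ' '.join(strings)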
def random_perf(g, node_set, train_node_idx, label_list):
    _, test_data = get_train_test_data(g, node_set, train_node_idx)
    # -------------------------------------------------------------------- #
    # Figure out how many times a test president/vice-president really     #
    # has multiple parties/labels. Count this amongst the test nodes in    #
    # the 10 folds.                                                        #
    # -------------------------------------------------------------------- #
    # node2label = defaultdict(list)
    # for label, nodes in test_data.iteritems():
    #     for node in nodes:
    #         node2label[node].append(label)
    # print "Test Nodes With Multiple Labels", \
    #     sum((len(labels) > 2) for labels in node2label.values())
    node_rating = {}
    for node in rasengan.flatten(test_data.values()):
        node_rating[node] = {}
    return mad_rating(node_rating, label_list, test_data)
def get_cautious_update_f(updates, lr, x, y, cost):
    '''
    Sometimes the theano optimizer may lay out the updates improperly,
    so that some parameters get updated in place before their effects on
    other gradients and updates have been computed. This can happen
    during nested scans, for example. One simple strategy to overcome this
    problem is to stage the `troublemaking` variables in their own staging
    area, update the staging area, and then copy the updates over. This
    way the updates won't suffer from a race condition.

    Params
    ------
    updates : The update expressions, a list of tuples. The first element
        of each tuple is the parameter to be updated and the second one
        is the update expression.
    lr      : The learning rate.
    x       : The input sentence tv.
    y       : The gold output tv.
    cost    : The cost tv.

    Returns
    -------
    A function for updating the variables.
    '''
    print 'Using Cautious Updates'
    params = [e[0] for e in updates]
    updates = [e[1] for e in updates]
    staging_area = [theano.shared(e.get_value()) for e in params]
    update_stage_1 = theano.function(
        flatten([lr, x, y]), cost,
        updates=zip(staging_area, updates),
        name='f_update_stage_1')
    update_stage_2 = theano.function(
        [], [],
        updates=zip(params, staging_area),
        name='f_update_stage_2')

    # Instead of using the lambda notation one can define a sequential function.
    def f_update(p1, p2, p3):
        update_stage_1(p1, p2, p3)
        return update_stage_2()
    # f_update = (lambda p1, p2, p3:
    #             (lambda _: update_stage_2())(
    #                 update_stage_1(p1, p2, p3)))
    f_update.name = 'f_update'
    return f_update
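# A small, self-contained sketch (not from this repo) of the race condition the
# docstring above describes. When updates are applied in place one at a time,
# later updates can read partially-updated parameters; staging computes every
# new value from the old parameters and only then copies them over.
a, b = 1.0, 2.0
a = a + b                        # in-place: 3.0
b = a + b                        # 5.0 -- depends on the order of application
a, b = 1.0, 2.0
a_stage, b_stage = a + b, a + b  # staged: both read the old (a, b)
a, b = a_stage, b_stage          # 3.0, 3.0 -- independent of order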
def randomwalk_perf(adj, labels, train_node_idx, test_data, label_list):
    node_rating = {}
    adj_list = []
    adj = np.maximum(adj, adj.T)
    vertex_to_label = []
    min_label = min(label_list)
    for row in labels:
        vertex_to_label.append(list(row.nonzero()[0] + min_label))
    for row in adj:
        adj_list.append(list(row.nonzero()[0]))
    # --------------------------------------------------------------------------- #
    # For each query point (test vertex) create a ranked list of potential labels. #
    # --------------------------------------------------------------------------- #
    for node in rasengan.flatten(test_data.values()):
        node_rating[node] = rw_get_label_rating(
            node, adj_list, vertex_to_label, train_node_idx)
    return mad_rating(node_rating, label_list, test_data)
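# Tiny worked example (toy matrix, not from the data) of the adjacency-list
# construction above: symmetrise the matrix, then keep each row's nonzero
# column indices.
import numpy as np

toy_adj = np.array([[0, 1, 0],
                    [0, 0, 1],
                    [0, 0, 0]])
toy_adj = np.maximum(toy_adj, toy_adj.T)               # undirected graph
toy_adj_list = [list(row.nonzero()[0]) for row in toy_adj]
# toy_adj_list == [[1], [0, 2], [1]]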
def get_train_x(featset_name, predicate_name, predicate_idx, train_idx,
                train_y, use_only_positive_feat=False,
                create_conjunctive_feat=False):
    if featset_name == 's_features_backoff':
        backoff_feat_idx = backoff_feat_name_to_idx[
            get_backoff_feature_name(predicate_name)]
        train_x, feat_name = get_s_features_backoff(backoff_feat_idx)
    elif featset_name == 's_features_backoff_nodoc':
        backoff_feat_idx = backoff_feat_name_to_idx[
            get_backoff_feature_name(predicate_name)]
        bkoff_part, bkoff_names = get_s_features_backoff(backoff_feat_idx)
        nodoc_part, nodoc_names = get_s_features_nodoc(predicate_idx)
        # hstack may return a COO matrix; convert to CSR so that the row and
        # column indexing below works regardless of the input formats.
        train_x = scipy.sparse.hstack([bkoff_part, nodoc_part]).tocsr()
        feat_name = (bkoff_names + nodoc_names)
    elif featset_name == 's_features_nodoc':
        train_x, feat_name = get_s_features_nodoc(predicate_idx)
    elif featset_name == 's_features_doc':
        train_x, feat_name = get_s_features_doc(predicate_idx)
    elif featset_name == 'random':
        return None
    # print [feat_name[_] for _ in scipy.sparse.find(train_x[41935])[1]]
    # pdb.set_trace()
    assert train_x.shape[1] == len(feat_name)
    idx_to_use = ([e for e, _ in zip(train_idx, train_y) if _ == 1]
                  if use_only_positive_feat
                  else train_idx)
    non_zero_train_col = list(set(
        rasengan.flatten(scipy.sparse.find(train_x[idx_to_use])[1])))
    # Use the same ordered list for the retained columns and their names so
    # that the two stay aligned.
    train_x = train_x[:, non_zero_train_col]
    feat_name = [feat_name[i] for i in non_zero_train_col]
    # print [feat_name[_] for _ in scipy.sparse.find(train_x[41935])[1]]
    # pdb.set_trace()
    assert train_x.shape[1] == len(feat_name)
    if not create_conjunctive_feat:
        return train_x, feat_name
    train_x_2, feat_name_2 = make_conjunctive_feat(train_x, feat_name)
    # print [feat_name[_] for _ in scipy.sparse.find(train_x[41935])[1]]
    # pdb.set_trace()
    return train_x_2, feat_name_2
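# A hedged sketch (toy matrix, not from this project) of the column-restriction
# step above: keep only the feature columns that are nonzero on the training
# rows, so that features never seen in training do not enter the model.
import scipy.sparse

toy = scipy.sparse.csr_matrix([[0, 3, 0, 1],
                               [0, 0, 0, 2],
                               [5, 0, 0, 0]])
train_rows = [0, 1]
nonzero_cols = sorted(set(scipy.sparse.find(toy[train_rows])[1]))
# nonzero_cols == [1, 3]; column 0 only fires on the held-out row 2.
restricted = toy[:, nonzero_cols]      # shape (3, 2)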
def get_cautious_update_f(updates, lr, x, y, cost):
    '''
    Sometimes the theano optimizer may lay out the updates improperly,
    so that some parameters get updated in place before their effects on
    other gradients and updates have been computed. This can happen
    during nested scans, for example. One simple strategy to overcome this
    problem is to stage the `troublemaking` variables in their own staging
    area, update the staging area, and then copy the updates over. This
    way the updates won't suffer from a race condition.

    Params
    ------
    updates : The update expressions, a list of tuples. The first element
        of each tuple is the parameter to be updated and the second one
        is the update expression.
    lr      : The learning rate.
    x       : The input sentence tv.
    y       : The gold output tv.
    cost    : The cost tv.

    Returns
    -------
    A function for updating the variables.
    '''
    print('Using Cautious Updates')
    params = [e[0] for e in updates]
    updates = [e[1] for e in updates]
    staging_area = [theano.shared(e.get_value()) for e in params]
    update_stage_1 = theano.function(flatten([lr, x, y]), cost,
                                     updates=zip(staging_area, updates),
                                     name='f_update_stage_1')
    update_stage_2 = theano.function([], [],
                                     updates=zip(params, staging_area),
                                     name='f_update_stage_2')

    # Instead of using the lambda notation one can define a sequential function.
    def f_update(p1, p2, p3):
        update_stage_1(p1, p2, p3)
        return update_stage_2()
    # f_update = (lambda p1, p2, p3:
    #             (lambda _: update_stage_2())(
    #                 update_stage_1(p1, p2, p3)))
    f_update.name = 'f_update'
    return f_update
def create_ngrams(self, Se, n, predicate=None, verbose=False):
    # NOTE: We are implementing the binarized MNB model, where we don't
    # multiply count features that occur in a single
    # document/utterance/sentence.
    assert self.binarized_mnb
    features = {}
    max_tok = self.max_tok
    total_iter = 0
    if predicate is None:
        predicate = lambda x: True
    for mention in Se:
        tokens = (rasengan.flatten(mention[0])
                  if self.entire
                  else mention[0][mention[1]])
        if n == 1:
            for t in tokens:
                total_iter += 1
                if predicate(t):
                    features[t] = None
        elif n == 2:
            # This is the BOS token idx
            pt = self.BOS
            for ct in tokens:
                total_iter += 1
                bigram_feature = max_tok * (1 + pt) + ct
                pt = ct
                if predicate(bigram_feature):
                    features[bigram_feature] = None
        elif n == 12:
            pt = self.BOS
            for ct in tokens:
                total_iter += 1
                bigram_feature = max_tok * (1 + pt) + ct
                pt = ct
                if predicate(bigram_feature):
                    features[bigram_feature] = None
                if predicate(ct):
                    features[ct] = None
        else:
            raise NotImplementedError
    if verbose:
        print 'Total_iter', total_iter
    return features
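# Worked example (assumed values, not from the dataset) of the bigram id
# encoding used in create_ngrams. Assuming token ids lie in [0, max_tok),
# the id  max_tok * (1 + pt) + ct  is at least max_tok, so bigram ids do not
# collide with unigram ids, and the pair can be recovered with divmod.
max_tok, pt, ct = 1000, 17, 42
bigram_feature = max_tok * (1 + pt) + ct       # 18042
decoded_pt, decoded_ct = divmod(bigram_feature, max_tok)
decoded_pt -= 1                                # back to (17, 42)
assert (decoded_pt, decoded_ct) == (pt, ct)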
def compile_update_fn(x, y, lr, cost, updates, stack_config, grads):
    '''
    Compile the inputs and outputs to produce an update function.

    Params
    ------
    x :
    y :
    lr :
    cost :
    updates :
    '''
    if y is not None:
        f_cost = theano.function(flatten([x, y]), cost, name='f_cost')
        f_grad = theano.function(flatten([x, y]), grads, name='f_grad')
    else:
        f_intermediate = theano.function(inputs=flatten([x]),
                                         outputs=stack_config.stack_ns.pred_y,
                                         name='f_intermediate')

        def f_cost(str1, str2):
            '''
            The stack_config['endpoint'] contains a func method that has
            three inputs. It takes in:
            1. stringA
            2. stringB
            3. a tensor.

            Params
            ------
            *args : The args right now are designed to be just a tuple of
                two integer sequences. args[0] is the left integer
                sequence, representing a string.

            Returns
            -------
            '''
            assert str1.ndim == 2
            cols = (str1.shape[1] - 1)
            assert cols % 2 == 0
            assert str2.ndim == 1
            intermediate_tensor = f_intermediate(str1).astype('float64')
            return stack_config['endpoint'].func(
                str1[1:, int(cols / 2)], str2, intermediate_tensor)

        f_grad_intermediate = theano.function(inputs=flatten([x, cost]),
                                              outputs=grads,
                                              name='f_grad_intermediate')

        def f_grad(str1, str2):
            assert str1.ndim == 2
            cols = (str1.shape[1] - 1)
            assert cols % 2 == 0
            assert str2.ndim == 1
            intermediate_tensor = f_intermediate(str1).astype('float64')
            intermediate_grad = numpy.array(
                stack_config['endpoint'].grad(
                    str1[1:, int(cols / 2)], str2, intermediate_tensor),
                dtype='float32')
            return f_grad_intermediate(str1, intermediate_grad)

    if stack_config['cautious_update']:
        f_update = get_cautious_update_f(updates, lr, x, y, cost)
    else:
        on_unused_input = ('ignore' if updates == [] else 'raise')
        if y is not None:
            f_update = theano.function(
                # Input is the learning rate, and a supervised example.
                flatten([lr, x, y]),
                # Output is the cost/loss that has to be minimized.
                cost,
                updates=updates,
                name='f_update',
                on_unused_input=on_unused_input)
        else:
            f_update_intermediate = theano.function(
                flatten([lr, x, cost]), [],
                updates=updates,
                name='f_update_intermediate',
                on_unused_input=on_unused_input)

            def f_update(lr, str1, str2):
                '''
                f_update in this case receives a tuple of

                Params
                ------
                *args :

                Returns
                -------
                '''
                assert str1.ndim == 2
                cols = (str1.shape[1] - 1)
                assert cols % 2 == 0
                assert str2.ndim == 1
                intermediate_tensor = f_intermediate(str1).astype('float64')
                intermediate_grad = numpy.array(
                    stack_config['endpoint'].grad(
                        str1[1:, int(cols / 2)], str2, intermediate_tensor),
                    dtype='float32')
                f_update_intermediate(lr, str1, intermediate_grad)
                return (intermediate_tensor, intermediate_grad)

    if y is not None:
        f_classify = theano.function(
            inputs=flatten([stack_config.stack_ns.absolute_input_tv]),
            outputs=stack_config.stack_ns.pred_y,
            name='f_classify')
        f_update.individual_updates = {}
        for (_, u) in updates:
            f_update.individual_updates[u.wrt_name] = theano.function(
                flatten([lr, x, y]), u, name='f_update')
    else:
        def f_classify(in_str):
            assert in_str.ndim == 2
            cols = (in_str.shape[1] - 1)
            assert cols % 2 == 0
            return stack_config['endpoint'].decode(
                in_str[1:, int(cols / 2)],
                f_intermediate(in_str).astype('float64'))

    nsf = Namespace()
    nsf.f_cost = f_cost
    nsf.f_update = f_update
    nsf.f_classify = f_classify
    nsf.f_grad = f_grad
    return nsf
def main():
    import argparse
    arg_parser = argparse.ArgumentParser(description='')
    arg_parser.add_argument('--seed', default=0, type=int, help='Default={0}')
    arg_parser.add_argument('--emb_pkl_fn',
                            default='data/demonstrate_similarity_idea.emb.pkl',
                            type=str)
    arg_parser.add_argument(
        '--feat_file',
        default='data/random/details/89c0c894.American_women_writers',
        type=str)
    arg_parser.add_argument('--ctag', default=None, type=int)
    arg_parser.add_argument('--mode_count', default=5, type=int)
    arg_parser.add_argument('--method', default='fast_relax', type=str,
                            choices=['brute_force', 'fast_relax',
                                     'annealed_gibbs', 'maxproduct-bp',
                                     'variational_inference',
                                     'dc_programming'])
    args = arg_parser.parse_args()
    import random
    random.seed(args.seed)
    numpy.random.seed(args.seed)
    cfg.mode_count = args.mode_count
    tags_to_remove = defaultdict(list)
    with rasengan.tictoc('Loading pkl'):
        embeddings = pkl.load(open(args.emb_pkl_fn))
    if cfg.introduce_NULL_embedding:
        embeddings[cfg.NULL_KEY] = numpy.zeros(
            next(embeddings.itervalues()).shape)
    with rasengan.debug_support():
        for mode_idx in range(cfg.mode_count):
            print 'mode_idx=', mode_idx
            entity_tags = {}
            entities = []
            for row in open(args.feat_file):
                _e, _tags = [e.strip() for e in row.strip().split('|||')]
                entities.append(_e)
                entity_tags[_e] = set([
                    t.lower()
                    for t in (e.strip().split(':')[0] for e in _tags.split())
                    if t.lower() in embeddings])
            total_tags = set(
                rasengan.flatten([list(e) for e in entity_tags.values()]))
            assert all(e in embeddings for e in total_tags)
            print ('For each of these people our goal is to select one word.'
                   ' That word should be as similar to other words picked for'
                   ' other entities as possible')
            problem = rasengan.OrderedDict_Indexable_By_StringKey_Or_Index()
            for (a, b) in entity_tags.items():
                b = list(b)
                print 'Entity: ', a, 'tags to remove: ', tags_to_remove[a]
                for ttr in tags_to_remove[a]:
                    tolerant_remove(b, ttr)
                if cfg.introduce_NULL_embedding and cfg.NULL_KEY not in b:
                    b.append(cfg.NULL_KEY)
                # print '%-25s' % a, '|||', ', '.join(b)
                problem[a] = DataFrame(
                    data=numpy.concatenate(
                        [(scale_to_unit(embeddings[e])
                          if cfg.scale_to_unit
                          else embeddings[e])[None, :] for e in b],
                        axis=0),
                    index=b)
            if args.ctag is None:
                initial_assignment = dict(
                    (__a, 0) for __b, __a in enumerate(entities))
            else:
                ctag = 'war'.split()[args.ctag]
                initial_assignment = dict(
                    (__e, (cfg.NULL_KEY
                           if ctag not in entity_tags[__e]
                           else ctag))
                    for __e in entities)
            print 'Initial chosen tags::', chosen_tags(problem, initial_assignment)
            initial_objective = dp_objective_efficient_impl(
                problem, initial_assignment)
            print 'initial_objective=', initial_objective
            assert numpy.isclose(
                dp_objective_naive_impl(problem, initial_assignment),
                initial_objective)
            final_assignment = optimize_assignment(
                problem, initial_assignment, method=args.method)
            final_objective = dp_objective_efficient_impl(
                problem, final_assignment)
            for (fa_entity, fa_tag_idx) in final_assignment.iteritems():
                tags_to_remove[fa_entity].append(
                    liloc(problem[fa_entity], fa_tag_idx).name)
            print 'mode_idx=', mode_idx,
            print 'initial_objective=', initial_objective,
            print 'final_objective=', final_objective,
            print 'Final chosen tags=', chosen_tags(problem, final_assignment)
    return
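# Toy illustration (made-up line, and without the embedding-vocabulary filter
# used above) of the feat_file format parsed in main(): the entity and its
# tags are separated by '|||', and each tag is 'word:count'.
row = 'Jane_Austen ||| novelist:3 writer:5 regency:1'
_e, _tags = [e.strip() for e in row.strip().split('|||')]
tags = set(t.split(':')[0].lower() for t in _tags.split())
# _e == 'Jane_Austen', tags == {'novelist', 'writer', 'regency'}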
    description='Do a basic featurization of the BBN2 data.')
arg_parser.add_argument('--seed', default=0, type=int, help='Default={0}')
arg_parser.add_argument('--in_fn',
                        default="~/data/tackbp2015bbn2/relational_bbn2.pkl",
                        type=str)
arg_parser.add_argument(
    '--out_fn',
    default="~/data/tackbp2015bbn2/basicfeaturization_relational_bbn2.pkl",
    type=str)
args = arg_parser.parse_args()
vertex_dict, edgelist = pkl.load(open(os.path.expanduser(args.in_fn)))
assert all(e.confidence == '"1.0"' for e in vertex_dict.itervalues())
# The total features
potential_feat = list(
    set(
        rasengan.flatten(
            [e.keys()
             for _ in vertex_dict.itervalues()
             for e in _.featsets])))
STRING_FEAT = [e for e in potential_feat if e.endswith('~name')]
other_feat = ['~crime', '~time', '~document']
'''
print [e for e in potential_feat if e.endswith('~name')]
print [e for e in potential_feat if e.endswith('~type')]
print [e for e in potential_feat if e.endswith('~confidence')]
print [e for e in potential_feat if not e.endswith('~name') and not e.endswith('~type')
labels = s_features[:, predicate_idx]
I = list(scipy.sparse.find(labels)[0])
set_I = set(I)
for featset_name in ['s_features_nodoc', 's_features_doc', ]:
    for trials in range(5):
        train_idx = IDX_DATA[predicate_name][trials]['train']
        preamble = 'predicate_name=%s trials=%d featset_name=%s ' % (
            predicate_name, trials, featset_name)
        g = get_igraph(predicate_idx, featset_name)
        counter = Counter()
        for origin in train_idx:
            counter.update(
                (e
                 for e in rasengan.flatten(
                     g.random_walk(origin, args.rw_max_step)
                     for walk_idx in range(args.rw_walk_num))
                 if e < total_persons))
        # pr = g.personalized_pagerank(damping=0.85, reset_vertices=[0])
        # print len(pr)
        # A large number of vertices are completely left untouched
        # by the random walks, because of their low tendency to discover
        # new things. Imagine a star like graph.
        # Most of the time you'd reach a spoke, and then turn back to
        # the hub. The rate of going from one hub to the other is too
        # low. So
        _testing_output = [[(1 if _i in set_I else 0), _i]
                           for _i in itertools.chain(
                               (i for i, _ in sorted(
                                   counter.items(),
                                   key=lambda x: x[1],
                                   reverse=True)
def compile_update_fn(x, y, lr, cost, updates, stack_config, grads):
    '''
    Compile the inputs and outputs to produce an update function.

    Params
    ------
    x :
    y :
    lr :
    cost :
    updates :
    '''
    if y is not None:
        f_cost = theano.function(flatten([x, y]), cost, name='f_cost')
        f_grad = theano.function(flatten([x, y]), grads, name='f_grad')
    else:
        f_intermediate = theano.function(
            inputs=flatten([x]),
            outputs=stack_config.stack_ns.pred_y,
            name='f_intermediate')

        def f_cost(str1, str2):
            '''
            The stack_config['endpoint'] contains a func method that has
            three inputs. It takes in:
            1. stringA
            2. stringB
            3. a tensor.

            Params
            ------
            *args : The args right now are designed to be just a tuple of
                two integer sequences. args[0] is the left integer
                sequence, representing a string.

            Returns
            -------
            '''
            assert str1.ndim == 2
            cols = (str1.shape[1] - 1)
            assert cols % 2 == 0
            assert str2.ndim == 1
            intermediate_tensor = f_intermediate(str1).astype('float64')
            return stack_config['endpoint'].func(
                str1[1:, cols / 2], str2, intermediate_tensor)

        f_grad_intermediate = theano.function(
            inputs=flatten([x, cost]),
            outputs=grads,
            name='f_grad_intermediate')

        def f_grad(str1, str2):
            assert str1.ndim == 2
            cols = (str1.shape[1] - 1)
            assert cols % 2 == 0
            assert str2.ndim == 1
            intermediate_tensor = f_intermediate(str1).astype('float64')
            intermediate_grad = numpy.array(
                stack_config['endpoint'].grad(
                    str1[1:, cols / 2], str2, intermediate_tensor),
                dtype='float32')
            return f_grad_intermediate(str1, intermediate_grad)

    if stack_config['cautious_update']:
        f_update = get_cautious_update_f(updates, lr, x, y, cost)
    else:
        on_unused_input = ('ignore' if updates == [] else 'raise')
        if y is not None:
            f_update = theano.function(
                # Input is the learning rate, and a supervised example.
                flatten([lr, x, y]),
                # Output is the cost/loss that has to be minimized.
                cost,
                updates=updates,
                name='f_update',
                on_unused_input=on_unused_input)
        else:
            f_update_intermediate = theano.function(
                flatten([lr, x, cost]), [],
                updates=updates,
                name='f_update_intermediate',
                on_unused_input=on_unused_input)

            def f_update(lr, str1, str2):
                '''
                f_update in this case receives a tuple of

                Params
                ------
                *args :

                Returns
                -------
                '''
                assert str1.ndim == 2
                cols = (str1.shape[1] - 1)
                assert cols % 2 == 0
                assert str2.ndim == 1
                intermediate_tensor = f_intermediate(str1).astype('float64')
                intermediate_grad = numpy.array(
                    stack_config['endpoint'].grad(
                        str1[1:, cols / 2], str2, intermediate_tensor),
                    dtype='float32')
                f_update_intermediate(lr, str1, intermediate_grad)
                return (intermediate_tensor, intermediate_grad)

    if y is not None:
        f_classify = theano.function(
            inputs=flatten([stack_config.stack_ns.absolute_input_tv]),
            outputs=stack_config.stack_ns.pred_y,
            name='f_classify')
        f_update.individual_updates = {}
        for (_, u) in updates:
            f_update.individual_updates[u.wrt_name] = theano.function(
                flatten([lr, x, y]), u, name='f_update')
    else:
        def f_classify(in_str):
            assert in_str.ndim == 2
            cols = (in_str.shape[1] - 1)
            assert cols % 2 == 0
            return stack_config['endpoint'].decode(
                in_str[1:, cols / 2],
                f_intermediate(in_str).astype('float64'))

    nsf = Namespace()
    nsf.f_cost = f_cost
    nsf.f_update = f_update
    nsf.f_classify = f_classify
    nsf.f_grad = f_grad
    return nsf
# rasengan.warn('I am deleting Resident_location_to_person')
# del data_dict['Resident_location_to_person']
idx_to_name = dict((idx, entity_to_name[e])
                   for e, idx in entity_to_int_map.iteritems())
int_to_entity_map = dict((a, b)
                         for (b, a) in entity_to_int_map.iteritems())
dv = data_dict.values()
edge_to_label_map = {}
for label, edges in data_dict.iteritems():
    for v1, v2 in edges:
        edge_to_label_map[(v1, v2)] = label
        edge_to_label_map[(v2, v1)] = label + '_inv'
all_nodes = set(rasengan.flatten(dv))
all_edges = reduce(lambda x, y: x + y, dv)
set_all_edges = set(tuple(e) for e in all_edges)
# print '\n'.join([str(e)
#                  for e
#                  in sorted(list(set([(idx_to_name[e[1]],
#                                       g.degree(e[1])) for e in
#                                      data_dict['Role_person_to_role']
#                                      if g.degree(e[1]) == 2])),
#                            key=lambda x: x[1], reverse=True)])


def shortest_path(g, v1, v2):
    d = {}
    v = None
    pv = None