def load_matrix(self, filename):
    # Load a saved matrix from `filename` and store it on the model.
    # NOTE: the method name shadows the module-level `load_matrix`
    # helper only in the class namespace; the call below still
    # resolves to the module-level function.
    self.matrix = load_matrix(filename)
def train(self, corpus, token_type='sentences', stoplist=None, n_columns=None, env_matrix=None, placeholder=None, right_permutation=None, left_permutation=None, lmbda=7): if env_matrix == None: env_model = BeagleEnvironment() env_model.train(corpus, token_type, stoplist, n_columns) else: env_model = BeagleEnvironment(env_matrix) __shape = env_model.matrix.shape order_fn.env_matrix = env_model.matrix del env_model del env_matrix temp_dir = tempfile.mkdtemp() order_fn.temp_dir = temp_dir order_fn.lmbda = lmbda if not placeholder: placeholder = np.random.random(__shape[1]) placeholder *= 2 placeholder -= 1 placeholder /= np.sum(placeholder**2)**(1./2) order_fn.placeholder = placeholder print 'Placeholder:', order_fn.placeholder print 'Norm of placeholder', np.sum(order_fn.placeholder**2)**(1./2) if not right_permutation or not left_permutation: permutations = RandomPermutations(__shape[1], 2) if right_permutation: order_fn.right_permutation = right_permutation else: order_fn.right_permutation = permutations.permutations[0] if left_permutation: order_fn.left_permutation = left_permutation else: order_fn.left_permutation = permutations.permutations[1] print 'Right permutation', order_fn.right_permutation(np.arange(__shape[1])) print 'Left permutation', order_fn.left_permutation(np.arange(__shape[1])) sentences = corpus.view_tokens(token_type) # number of sentences in a chunk of sentences n = 500 sent_lists = np.split(np.asarray(sentences, dtype=np.object_), np.arange(n, len(sentences), n)) ind_sent_lists = list(enumerate(sent_lists)) # Map p = mp.Pool() results = p.map(order_fn, ind_sent_lists, 1) p.close() del order_fn.env_matrix # Reduce self.matrix = np.zeros(__shape, dtype=np.float32) for result in results: print 'Reducing', result summand = load_matrix(result) for i,row in summand.iteritems(): self.matrix[i,:] += row # self.matrix += summand # Clean up print 'Deleting temporary directory\n'\ ' ', temp_dir shutil.rmtree(temp_dir)
def train(self, corpus, token_type='sentences', stoplist=list(), n_columns=None, env_matrix=None): if env_matrix == None: env_model = BeagleEnvironment() env_model.train(corpus, token_type, stoplist, n_columns) else: env_model = BeagleEnvironment(env_matrix) #Apply stoplist to environment matrix env_model.filter_rows(stoplist) __shape = env_model.matrix.shape context_fn.env_matrix = env_model.matrix del env_model del env_matrix temp_dir = tempfile.mkdtemp() context_fn.temp_dir = temp_dir sentences = corpus.view_tokens(token_type) # number of sentences in a chunk of sentences n = 500 sent_lists = np.split(np.asarray(sentences, dtype=np.object_), np.arange(n, len(sentences), n)) ind_sent_lists = list(enumerate(sent_lists)) # Map p = mp.Pool() results = p.map(context_fn, ind_sent_lists, 1) p.close() del context_fn.env_matrix # Reduce self.matrix = np.zeros(__shape, dtype=np.float32) for result in results: print 'Reducing', result summand = load_matrix(result) # self.matrix += summand for i,row in summand.iteritems(): self.matrix[i,:] += row # Clean up print 'Deleting temporary directory\n'\ ' ', temp_dir shutil.rmtree(temp_dir)