def filter_and_count(self, filepath_in, filepath_out):
    print("Filtering numbers ...")
    import glob

    # directory mode: gather every .txt file and dump into one output file
    if os.path.isdir(filepath_in):
        files = glob.glob(os.path.join(filepath_in, '*.txt'))
    else:
        files = [filepath_in]

    output = open(filepath_out, 'w', encoding='utf-8')
    for fpath in files:
        with codecs.open(fpath, 'r', encoding='utf-8') as file:
            s = file.read()
        sent = s.strip().split()
        sent_filtered = []
        for token in sent:
            if is_numeral(token):
                # normalize, e.g. '-32.000' -> '-32'; the round-trip through
                # to_numeral also rejects '-haha'-like tokens
                number = str(to_numeral(token))
                self.nc[number] = self.nc.get(number, 0) + 1
                sent_filtered.append(number)
            else:
                self.wc[token] = self.wc.get(token, 0) + 1
                sent_filtered.append(token)
        output.write(' '.join(sent_filtered) + '\n')
    output.close()
    print("filtering corpus done")
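Both versions of filter_and_count lean on is_numeral and to_numeral helpers defined elsewhere in the repository. A minimal sketch of the behaviour the calls above imply (the names match the code, but these bodies are illustrative, not the repository's actual definitions):

# Illustrative sketch only: the real helpers are defined elsewhere in the
# repository and may handle more formats.
def to_numeral(token):
    """Parse a token into a number, or return None if it is not one."""
    try:
        value = float(token.replace(',', ''))  # tolerate '1,000'-style grouping
    except ValueError:
        return None
    # '-32.000' should normalize to -32, so collapse integral floats to int
    return int(value) if value.is_integer() else value

def is_numeral(token):
    """A token counts as a numeral iff it parses cleanly."""
    return to_numeral(token) is not None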
def get_item(self, iword, owords):
    """
    Build the training item for one (center word, context words) pair.
    :param iword: center token
    :param owords: context tokens within the window
    :return: item, structured as described below
    """
    item = [None, [], 0, None, [0] * 2 * self.window, []]
    # [
    #   iword index,
    #   [list of oword indices],
    #   0 or 1: whether iword is a numeral,
    #   None if iword is a token, its float value if iword is a numeral,
    #   [one-hot indicator of which owords are numerals],
    #   [list of numeral values among the owords],
    # ]
    #
    # For example, if 'She' is the center word and the window size is 2:
    #   oh , (She) is 1.67 m
    #   -> [12, [99, 4, 5, 0], 0, None, [0, 0, 0, 1], [1.67]]
    if is_numeral(iword):
        item[0] = self.word2idx[self.unk]
        item[2] = 1
        item[3] = to_numeral(iword)
    else:
        item[0] = self.word2idx[iword]
    for j in range(len(owords)):
        flag, oword = to_numeral_if_possible(owords[j])
        if flag:
            # numeral context word: index it as <unk>, mark its slot,
            # and keep the numeric value separately
            item[1].append(self.word2idx[self.unk])
            item[4][j] = 1
            item[5].append(oword)
        else:
            item[1].append(self.word2idx[oword])
    return item
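get_item also assumes a to_numeral_if_possible helper. Judging from how its return value is unpacked, it plausibly behaves like this sketch (an assumption, not the repository's actual definition):

# Illustrative sketch of the helper assumed above.
def to_numeral_if_possible(token):
    """Return (True, numeric value) for a numeral, (False, token) otherwise."""
    value = to_numeral(token)
    if value is None:
        return False, token
    return True, value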
if __name__ == '__main__':
    # load numeral counts (nc)
    nc = pickle.load(
        open(
            '../data/wikipedia/preprocess0.05Bnotable/NumeralAsNumeral/nc.dat',
            'rb'))
    gmm_save_dir = 'gmm'
    if not os.path.exists(gmm_save_dir):
        os.makedirs(gmm_save_dir)
    random.seed(100)

    # unfold the numeral counts into individual samples and shuffle them
    data = []
    for k, v in nc.items():
        if to_numeral(k) is None:
            print('invalid numeral {}'.format(k))
        else:
            data += [[to_numeral(k)]] * v
    print('total number of different numerals: ', len(nc))
    print('total number of numeral samples: ', len(data))
    random.shuffle(data)
    data = np.array(data).reshape(-1, 1)

    prototypes = pickle.load(
        open(
            '../data/wikipedia/preprocess0.05Bnotable/NumeralAsNumeral/som/prototypes-50-0.6-1.0.dat',
            'rb'))
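To make the unfolding step concrete: nc maps each numeral string to its corpus frequency, and each numeral is repeated count-many times as a one-dimensional sample. A toy illustration with made-up counts:

# Toy illustration of the unfolding step above; counts are made up.
nc_example = {'1.5': 3, '2': 1}
unfolded = []
for k, v in nc_example.items():
    unfolded += [[float(k)]] * v
# unfolded == [[1.5], [1.5], [1.5], [2.0]]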
idx2word = pickle.load(
    open(os.path.join(args.preprocess_dir, 'idx2word.dat'), 'rb'))
wc = pickle.load(open(os.path.join(args.preprocess_dir, 'wc.dat'), 'rb'))
nc = pickle.load(open(os.path.join(args.preprocess_dir, 'nc.dat'), 'rb'))

# drop numerals that do not survive the float32 round-trip;
# caution: this must be float32 because later computation is done in float32
for k, v in nc.copy().items():
    f = np.float32(k)
    if np.isnan(f) or np.isinf(f):
        nc.pop(k)
        print(f)

numeral2idx = {
    to_numeral(numeral): idx
    for idx, numeral in enumerate(list(nc.keys()))
}

# normalized word frequencies and word2vec-style subsampling weights
wf = np.array([wc[word] for word in idx2word])
w_sum = wf.sum()
wf = wf / w_sum
ws = 1 - np.sqrt(args.ss_t / wf)
ws = np.clip(ws, 0, 1)
vocab_size = len(idx2word)
token_weights = wf if args.weights else None

# normalized numeral frequencies
nf = np.array(list(nc.values()))
n_sum = nf.sum()
nf = nf / n_sum
numerals = np.array(list(nc.keys()))
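The ws computation follows the word2vec subsampling heuristic: a word with normalized frequency f is discarded with probability max(0, 1 - sqrt(t / f)). A quick numeric check with made-up frequencies and threshold:

import numpy as np

ss_t = 1e-5                               # assumed subsample threshold
freqs = np.array([1e-2, 1e-5, 1e-7])      # made-up normalized word frequencies
discard_p = np.clip(1 - np.sqrt(ss_t / freqs), 0, 1)
print(discard_p)  # -> [0.9684, 0.0, 0.0]
# very frequent words are dropped most of the time; words at or below
# the threshold are always kept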
def filter_and_count(self, filepath_in, filepath_out):
    print("Filtering numbers ...")
    import re
    import glob
    step = 0
    # regex matching every token that may contain a number, including
    # fractions like '3/4' and comma-grouped digits like '1,000.5'
    RE_NUM = re.compile(
        r"(((-?\d+(,\d{3})*(\.\d+)?)\/(-?\d+(,\d{3})*(\.\d+)?))|(-?\d+(,\d{3})*(\.\d+)?))",
        re.UNICODE)

    # directory mode: gather every .txt file and dump into one output file
    if os.path.isdir(filepath_in):
        files = glob.glob(os.path.join(filepath_in, '*.txt'))
    else:
        files = [filepath_in]

    output = open(filepath_out, 'w', encoding='utf-8')
    for fpath in files:
        with codecs.open(fpath, 'r', encoding='utf-8') as file:
            for line in file:
                step += 1
                if not step % 1000:
                    print("\n working on {}kth line in file {}".format(
                        step // 1000, fpath))
                line = line.strip()
                if not line:
                    continue
                sent = line.split()
                sent_filtered = []
                for token in sent:
                    # words and numerals are treated differently
                    res = RE_NUM.findall(token)
                    if res:
                        target = number_handler(token)
                        # do not let the counts record the empty string;
                        # such tokens are dropped entirely
                        if target == '':
                            continue
                        if isinstance(target, list):
                            # 'u-32'    -> ['u', '-', '32']
                            # '1997/07' -> ['1997', '/', '7']
                            for i in target:
                                ww = str(to_numeral(i)) if is_numeral(i) else i
                                self.wc[ww] = self.wc.get(ww, 0) + 1  # counted in wc, not nc
                                sent_filtered.append(ww)
                        elif is_numeral(target):
                            # normalize, e.g. '-32.000' -> '-32'; the round-trip
                            # also rejects '-haha'-like tokens
                            number = str(to_numeral(target))
                            self.wc[number] = self.wc.get(number, 0) + 1  # counted in wc, not nc
                            sent_filtered.append(number)
                    else:
                        self.wc[token] = self.wc.get(token, 0) + 1
                        sent_filtered.append(token)
                output.write(' '.join(sent_filtered) + '\n')
    output.close()
    print("filtering and counting done")
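number_handler is defined elsewhere in the repository. From the comments above, it splits mixed tokens such as 'u-32' into ['u', '-', '32'] and '1997/07' into its parts, returns a plain string for a clean numeral, and '' when nothing usable remains. A rough, assumption-labeled sketch of that contract; the real handler covers cases this one does not (e.g. keeping a leading minus sign attached to a bare numeral):

# Rough sketch of the contract implied by the comments above;
# the repository's actual number_handler may differ.
import re

def number_handler(token):
    """Split a digit-bearing token into word / separator / number parts.
    Returns '' when nothing usable remains, a single string for a clean
    numeral, or a list of parts ('u-32' -> ['u', '-', '32'])."""
    parts = [p for p in re.split(r'(\d+(?:,\d{3})*(?:\.\d+)?|[-/])', token) if p]
    if not parts:
        return ''
    return parts[0] if len(parts) == 1 else parts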
def train_gmm(self,
              components=20,
              iters=100,
              gmm_init_mode='rd',
              gmm_type='soft',
              prototype_path=None,
              log_space=False):
    # init modes: 'rd' = random, 'fp' = from SOM prototypes, 'km' = k-means
    assert gmm_init_mode in ['rd', 'fp', 'km']
    assert gmm_type in ['soft', 'hard']

    nc = pickle.load(open(os.path.join(self.save_dir, 'nc.dat'), 'rb'))
    # a fixed seed could be set here for reproducibility
    # random.seed(100)

    # unfold the numeral counts into individual samples
    data = []
    for k, v in nc.items():
        if to_numeral(k) is None:
            print('invalid numeral {}'.format(k))
        else:
            data += [[to_numeral(k)]] * v
    print('total number of different numerals: ', len(nc))
    print('total number of numeral samples: ', len(data))

    # shuffle and subsample to keep memory usage bounded
    random.shuffle(data)
    if len(data) > 2000000:
        data = data[:2000000]
    if log_space:
        data = [weighted_log(x[0]) for x in data]
    print('subsampled to {}'.format(len(data)))
    data = np.array(data).reshape(-1, 1)

    # build the mixture with the requested initialization
    if gmm_init_mode == 'km':
        if gmm_type == 'soft':
            gmm = GaussianMixture(components,
                                  max_iter=iters,
                                  n_init=1,
                                  verbose=10,
                                  init_params='kmeans')
        else:
            gmm = HardEMGaussianMixture(components,
                                        max_iter=iters,
                                        n_init=1,
                                        verbose=10,
                                        init_params='kmeans')
    else:
        if gmm_init_mode == 'rd':
            # randomly select data points as initial means
            prototypes = np.random.choice(data.reshape(-1), components)
        else:
            # initialize from pre-trained SOM prototypes
            assert prototype_path is not None
            if log_space:
                path = os.path.join(self.save_dir, 'som_log')
            else:
                path = os.path.join(self.save_dir, 'som')
            path = os.path.join(path, prototype_path)
            prototypes = pickle.load(open(path, 'rb'))
            assert len(prototypes) == components

        # assign every sample to its nearest prototype, then derive
        # per-cluster means, spreads, and weights for the initialization
        mus = prototypes
        min_sigma = 1e-6
        diff = np.abs(data.reshape(len(data)) - mus[:, np.newaxis])
        amin = np.argmin(diff, axis=0)
        K = len(prototypes)
        clusters = [[0] for i in range(K)]  # seed with 0 so no cluster is empty
        for i in range(len(data)):
            clusters[amin[i]].append(data[i])
        means = np.array([np.mean(i) for i in clusters]).reshape(-1, 1)
        covs = np.array([
            np.std(i) if len(i) > 1 else min_sigma for i in clusters
        ]).reshape(-1, 1, 1)
        precision = np.linalg.inv(covs)  # sklearn expects precisions, not covariances
        weights = np.array([len(c) for c in clusters])
        weights = weights / np.sum(weights)

        if gmm_type == 'soft':
            gmm = GaussianMixture(components,
                                  max_iter=iters,
                                  n_init=1,
                                  verbose=10,
                                  means_init=means,
                                  precisions_init=precision,
                                  weights_init=weights)
        else:
            gmm = HardEMGaussianMixture(components,
                                        max_iter=iters,
                                        n_init=1,
                                        verbose=10,
                                        means_init=means,
                                        precisions_init=precision,
                                        weights_init=weights)

    gmm.fit(data)

    if log_space:
        gmm_save_dir = os.path.join(self.save_dir, 'gmm_log')
    else:
        gmm_save_dir = os.path.join(self.save_dir, 'gmm')
    if not os.path.exists(gmm_save_dir):
        os.makedirs(gmm_save_dir)

    def single_variable_gaussian(x, mu, sigma):
        return 1. / (np.sqrt(2. * np.pi) * sigma) * np.exp(-np.power(
            (x - mu) / sigma, 2.) / 2)

    # plotting helper (call manually if component/mixture figures are wanted)
    def draw(gmm, X):
        print(gmm.means_)
        print(gmm.covariances_)
        print(gmm.weights_)
        X.sort()
        sum_y = np.zeros_like(X)
        plt.figure(0)
        plt.title('components')
        for i in range(len(gmm.means_)):
            y = single_variable_gaussian(X, gmm.means_[i][0],
                                         gmm.covariances_[i][0])
            y[y > 1] = 0  # clip spikes so the plot stays readable
            sum_y += y * gmm.weights_[i]
            plt.plot(X, y)
        plt.savefig(
            os.path.join(gmm_save_dir,
                         'components-{}.png'.format(components)))
        plt.figure(1)
        plt.title('mixtures')
        plt.plot(X, sum_y, 'g-')
        plt.savefig(
            os.path.join(gmm_save_dir, 'mixture-{}.png'.format(components)))

    pickle.dump(
        gmm,
        open(
            os.path.join(
                gmm_save_dir,
                'gmm-{}-{}-{}.dat'.format(components, gmm_init_mode,
                                          gmm_type)), 'wb'))
    print('means: {}\nsigma: {}\nweights: {}'.format(gmm.means_,
                                                     gmm.covariances_,
                                                     gmm.weights_))

    # posterior over components for every distinct numeral
    if log_space:
        data_points = np.array([
            weighted_log(x)
            for x in np.array(list(nc.keys()), dtype=np.float32)
        ]).reshape(-1, 1)
    else:
        data_points = np.array(list(nc.keys()),
                               dtype=np.float32).reshape(-1, 1)
    posterior = gmm.predict_proba(data_points)
    path = os.path.join(
        gmm_save_dir,
        'gmm_posterior-{}-{}-{}.dat'.format(components, gmm_init_mode,
                                            gmm_type))
    pickle.dump(posterior, open(path, 'wb'))
    print('...Saving trained GMM to {}'.format(path))
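weighted_log is referenced in both train_gmm and train_som but is not shown in this excerpt. A common choice for mapping signed numerals into log space, offered here only as an assumption about its behaviour:

import numpy as np

# Assumption: a sign-preserving log transform. The repository's actual
# weighted_log may be defined differently.
def weighted_log(x):
    return np.sign(x) * np.log1p(np.abs(x))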
def train_som(self,
              prototypes=10,
              sigma=0.03,
              lr=0.3,
              iters=10000,
              log_space=False):
    """
    Train a simple SOM on the numeral counts and save its neuron weights
    as prototypes.
    :param prototypes: number of SOM neurons
    :param sigma: neighborhood sigma of the SOM
    :param lr: learning rate of the SOM
    :param iters: number of training iterations
    :param log_space: train on weighted-log-transformed numerals
    :return: None
    """
    nc = pickle.load(open(os.path.join(self.save_dir, 'nc.dat'), 'rb'))

    # unfold the numeral counts into individual samples and shuffle them
    data = []
    for k, v in nc.items():
        if to_numeral(k) is None:
            print('invalid numeral {}'.format(k))
        else:
            data += [[to_numeral(k)]] * v
    print('total number of different numerals: ', len(nc))
    print('total number of numeral samples: ', len(data))
    random.shuffle(data)
    if log_space:
        data = [[weighted_log(x[0])] for x in data]

    som = SOM(prototypes, 1, 1, sigma=sigma, learning_rate=lr,
              random_seed=random_seed)
    print("Training SOMs...")
    # som.random_weights_init(data)
    som.train_random(data, iters)  # trains the SOM for `iters` random samples
    print("...Ready!")
    # win_map = som.win_map(data)

    self.prototypes = som.get_weights().reshape(prototypes)  # ndarray

    if log_space:
        som_save_dir = os.path.join(self.save_dir, 'som_log')
    else:
        som_save_dir = os.path.join(self.save_dir, 'som')
    if not os.path.exists(som_save_dir):
        os.makedirs(som_save_dir)
    print('prototypes: \n{}'.format(self.prototypes))
    pickle.dump(
        self.prototypes,
        open(
            os.path.join(
                som_save_dir,
                'prototypes-{}-{}-{}.dat'.format(prototypes, sigma, lr)),
            'wb'))
    print('...Saving prototypes')
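Putting the two training methods together, a hypothetical invocation might look like the following. The owning class name and its constructor are assumptions inferred from the self.save_dir references above; the file names match the ones loaded earlier in this excerpt:

# Hypothetical usage; class name and constructor are assumptions.
pre = Preprocess(
    save_dir='../data/wikipedia/preprocess0.05Bnotable/NumeralAsNumeral')
pre.train_som(prototypes=50, sigma=0.6, lr=1.0)  # saves prototypes-50-0.6-1.0.dat
pre.train_gmm(components=50,
              iters=100,
              gmm_init_mode='fp',  # initialize means from the SOM prototypes
              gmm_type='soft',
              prototype_path='prototypes-50-0.6-1.0.dat')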