def RLDeconvolution(self, RLiterations, PSF,
                    threads=multiprocessing.cpu_count(), PSF_pad=0):
    '''Richardson-Lucy deconvolution of the spectrum image.

    Input:
        RLiterations: number of iterations to perform
        PSF: point spread function (an EELS spectrum object)
    Optional arguments:
        threads: number of CPUs to use while deconvolving (default: all of them)
        PSF_pad: value to pad the PSF with, or None to not pad the PSF
    '''
    PSF_sym = PSF.SymmetrizeAroundZLP()
    if PSF_pad is not None:
        data_length = np.size(self.SpectrumRange)
        PSF_length = np.size(PSF_sym.intensity)
        # Pad so the PSF's zero-loss peak lines up with the centre of the data range
        pad_length = (data_length // 2 - (1 + data_length) % 2
                      - (PSF_length - (PSF_length % 2)) // 2)
        if PSF_sym.ZLP < data_length / 2:
            PSF_sym = PSF.PadSpectrum(pad_length, pad_value=PSF_pad,
                                      pad_side='left').SymmetrizeAroundZLP()
        elif PSF_sym.ZLP > data_length / 2:
            PSF_sym = PSF_sym.PadSpectrum(pad_length, pad_value=PSF_pad,
                                          pad_side='right')
    print('Beginning deconvolution...')
    # Deconvolve every spectrum in the image in parallel with the same PSF
    loopyP = partial(loopy, iterations=RLiterations,
                     PSF=PSF_sym.Normalize().intensity)
    x_deconv = np.array(handythread.parallel_map(loopyP, abs(self.Normalize()),
                                                 threads=threads))
    # Alternative without taking the absolute value:
    # x_deconv = np.array(handythread.parallel_map(loopyP, self.Normalize(),
    #                                              threads=threads))
    x_deconv = np.ma.array(x_deconv, mask=self.data.mask)
    print('Done %s iterations!' % RLiterations)
    return EELSSpectrumImage(x_deconv, self.dispersion)
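# Illustrative sketch only: the `loopy` helper used above is defined elsewhere in this
# module. The function below shows the standard per-spectrum Richardson-Lucy update that
# `loopy` is assumed to perform; its name, the flat starting estimate, and the small
# epsilon guard are assumptions for illustration, not the module's actual code.
def _rl_update_sketch(spectrum, iterations, PSF):
    # Start from a flat, normalized estimate of the underlying signal
    estimate = np.ones(np.size(spectrum)) / np.size(spectrum)
    PSF_mirror = PSF[::-1]
    for _ in range(iterations):
        # Blur the current estimate with the PSF and compare it to the measured data
        reblurred = np.convolve(estimate, PSF, mode='same')
        ratio = spectrum / (reblurred + 1e-12)
        # Correct the estimate by the ratio blurred with the flipped PSF
        estimate = estimate * np.convolve(ratio, PSF_mirror, mode='same')
    return estimate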
def RLDeconvolution_Adaptive(self, RLiterations, PSF,
                             threads=multiprocessing.cpu_count(), PSF_pad=0):
    '''As RLDeconvolution, but pairs each spectrum in the image with the
    corresponding PSF from a PSF spectrum image.

    Input:
        RLiterations: number of iterations to perform
        PSF: point spread function (an EELS spectrum image object)
    Optional arguments:
        threads: number of CPUs to use while deconvolving (default: all of them)
        PSF_pad: value to pad the PSF with, or None to not pad the PSF
    '''
    PSF_sym = PSF.SymmetrizeAroundZLP()
    data_sym = self.SymmetrizeAroundZLP()
    if PSF_pad is not None:
        data_length = np.size(self.SpectrumRange)  # TODO: replace with self.size[2]
        PSF_length = np.shape(PSF_sym.data)[2]
        pad_length = int(data_length / 2 - (1 + data_length) % 2 - PSF_length // 2)
        if PSF_sym.ZLP < data_length / 2:
            PSF_sym = PSF.PadSpectrum(pad_length, pad_value=PSF_pad,
                                      pad_side='left').SymmetrizeAroundZLP()
        elif PSF_sym.ZLP > data_length / 2:
            PSF_sym = PSF_sym.PadSpectrum(pad_length, pad_value=PSF_pad,
                                          pad_side='right')
        if data_sym.ZLP < data_length / 2:
            data_sym = self.PadSpectrum(pad_length, pad_value=PSF_pad + 1,
                                        pad_side='left').SymmetrizeAroundZLP()
        elif data_sym.ZLP > data_length / 2:
            data_sym = self.PadSpectrum(pad_length, pad_value=PSF_pad + 1,
                                        pad_side='right')
    print('Beginning deconvolution...')
    loopyP_adapt = partial(loopy_adapt, iterations=RLiterations)
    # Stack each spectrum with its own PSF along a new trailing axis so the
    # parallel map receives (spectrum, PSF) pairs
    deconvolution_arrays = np.append(
        np.expand_dims(data_sym.Normalize(), axis=-1),
        np.expand_dims(PSF_sym.Normalize(), axis=-1),
        axis=-1)
    x_deconv = np.array(handythread.parallel_map(loopyP_adapt,
                                                 deconvolution_arrays,
                                                 threads=threads))
    x_deconv = np.ma.array(x_deconv, mask=self.data.mask)
    print('Done %s iterations!' % RLiterations)
    return EELSSpectrumImage(x_deconv, dispersion=self.dispersion)
def litekmeans(X, k, max_iter=50):
    '''Batched k-means on the rows of X, returning (label, center, obj).'''
    X = X.T
    n = X.shape[1]
    ndim = X.shape[0]
    last = 0
    label = np.random.randint(k, size=(n, ))
    iteration = 0
    batchsize = 100000
    nbatches = int(np.ceil(n / batchsize))
    center = np.zeros((ndim, k), dtype=np.float32)
    while np.any(label != last):
        start = time.time()
        iteration += 1
        print('iteration: {0}'.format(iteration))
        # E = one-hot cluster assignments (n x k); E.sum(0) = per-cluster counts
        E = scipy.sparse.coo_matrix(
            (np.ones((n, ), dtype=np.int64), (np.arange(n), label)),
            shape=(n, k), dtype=np.float64).tocsr()
        print('max of E.sum(0): %s' % (E.sum(0).max(), ))
        print('max of (1.0/E.sum(0)): %s' % ((1.0 / E.sum(0)).max(), ))
        print('min of E.sum(0): %s' % (E.sum(0).min(), ))
        print('min of (1.0/E.sum(0)): %s' % ((1.0 / E.sum(0)).min(), ))
        print('np.all(1.0/E.sum(0) == np.inf): %r' %
              (np.all(1.0 / E.sum(0) == np.inf), ))
        # New centers: per-cluster sums divided by counts (epsilon avoids divide-by-zero)
        center = X * E * scipy.sparse.spdiags(1.0 / (E.sum(0) + 0.0000000001),
                                              0, k, k)
        c2 = 0.5 * np.sum(center**2, 0).T[:, None]
        last = label
        label = np.zeros((n, ), dtype=np.int64)

        def get_labels(batchidx):
            # Assign each point of this sub-batch to its nearest center
            return np.argmax(
                np.dot(center.T,
                       X[:, j * batchsize + batchidx * 1000:
                         min(n, j * batchsize + (batchidx + 1) * 1000)]) - c2,
                axis=0)

        for j in range(nbatches):
            print('processing batch {0:d} / {1:d}'.format(j + 1, nbatches))
            tmp = handythread.parallel_map(get_labels,
                                           range(int(np.ceil(batchsize / 1000))),
                                           threads=8)
            label[j * batchsize:min(n, int((j + 1) * batchsize))] = np.concatenate(tmp)
        if iteration >= max_iter:
            break
        print('iteration took {0:d} seconds'.format(int(time.time() - start)))

    # Compute the k-means objective: sum of squared distances to the nearest center
    obj = 0
    Xsq = 0.5 * np.sum(X**2, 0)
    batchsize = 10000
    nbatches = int(np.ceil(n / batchsize))
    csq = 0.5 * np.sum(center**2, 0)

    # TODO: do this stuff in parallel as well (takes longer than expected)
    def compute_sqd(batchidx):
        tempX = X[:, j * batchsize + batchidx * 100:
                  min(n, j * batchsize + (batchidx + 1) * 100)]
        temp = np.dot(-center.T, tempX) + csq[:, None]
        tmp = Xsq[j * batchsize + batchidx * 100:
                  min(n, j * batchsize + (batchidx + 1) * 100)] + temp
        temp_mindist = np.min(tmp, axis=0)
        return np.sum(temp_mindist)

    for j in range(nbatches):
        tmp = handythread.parallel_map(compute_sqd,
                                       range(int(np.ceil(batchsize / 100))),
                                       threads=8)
        obj += np.sum(tmp)
    print('obj: %r' % (obj, ))
    center = center.T
    return (label, center, obj)
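# Why get_labels above subtracts c2: for squared Euclidean distance,
#   argmin_j ||x - c_j||^2 = argmax_j (c_j^T x - 0.5 * ||c_j||^2),
# because ||x||^2 is the same for every candidate center j. The small self-contained
# check below verifies that identity with plain numpy; the array shapes mirror
# litekmeans' internal (ndim, n) layout, and all names here are illustrative only.
def _nearest_center_identity_check():
    rng = np.random.RandomState(0)
    pts = rng.rand(5, 200)   # ndim x n points
    ctr = rng.rand(5, 8)     # ndim x k centers
    half_c2 = 0.5 * np.sum(ctr**2, 0)[:, None]
    fast = np.argmax(np.dot(ctr.T, pts) - half_c2, axis=0)
    brute = np.argmin(((pts[:, None, :] - ctr[:, :, None])**2).sum(axis=0), axis=0)
    assert np.array_equal(fast, brute)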
def test_parallel_map(self):
    l = range(100)
    r = handythread.parallel_map(lambda x: x**2, l)
    for i in range(len(l)):
        self.assertEqual(l[i]**2, r[i])
def run(doc_topics_filename, topic_keys_filename, state_filename, max_dict,
        meta_filename, solr_url, output_dir, date_format):
    if not os.path.exists(doc_topics_filename):
        raise FileNotFoundError(doc_topics_filename)
    if not os.path.exists(topic_keys_filename):
        raise FileNotFoundError(topic_keys_filename)
    if not os.path.exists(state_filename):
        raise FileNotFoundError(state_filename)
    if meta_filename is not None and not os.path.exists(meta_filename):
        raise FileNotFoundError(meta_filename)

    start = time()

    # read the mallet output
    print("Reading {}...".format(doc_topics_filename), end='', flush=True)
    doc_topics, num_topics = read_doc_topics(doc_topics_filename,
                                             return_num_topics=True)
    print("done, {:,} topics, {:,} documents".format(num_topics, len(doc_topics)))

    print("Reading {}...".format(topic_keys_filename), end='', flush=True)
    topic_keys, num_topwords = read_topic_keys(topic_keys_filename,
                                               return_num_topwords=True)
    print("done, {:,} top words per topic".format(num_topwords))

    print("Reading {}...".format(state_filename), end='', flush=True)
    state = read_state(state_filename)
    print("done")

    if meta_filename is not None:
        print("Reading metadata from {}...".format(meta_filename), end='', flush=True)
        doc_meta = read_meta(meta_filename, date_format)
        print("done, {:,} entries found".format(len(doc_meta)))
    else:
        print("Retrieving metadata from {}...".format(solr_url), end='', flush=True)
        doc_meta = retrieve_meta(doc_topics, solr_url, date_format)
        print("done, {:,} entries retrieved".format(len(doc_meta)))

    topicids = range(num_topics)

    # check if pruning is desired
    if max_dict > 0:
        print("Pruning tokens, keeping only the top {:,} tokens by frequency..."
              .format(max_dict), end='', flush=True)
        state = prune_state(state, max_dict)
        print("done")
        print("Re-indexing tokens...", end='', flush=True)
        state['typeindex'] = pd.factorize(state['typeindex'])[0]
        print("done")

    print("Processing state data...", end='', flush=True)
    # extract mapping of token id to token
    tokenid_map = state[['typeindex', 'type']].drop_duplicates()
    tokenid_map.columns = ['tokenid', 'token']
    # extract mapping of token -> document -> topic
    state = state[['doc', 'typeindex', 'topic']]
    state.columns = ['docid', 'tokenid', 'topic']
    # compute (token, topic) -> count mapping
    token_topic_count = state[['tokenid', 'topic']].groupby(
        ['tokenid', 'topic'], sort=False).size().reset_index()
    token_topic_count.columns = ['tokenid', 'topic', 'count']
    num_tokens = len(tokenid_map)
    doc_topics_complete = pd.merge(doc_meta, doc_topics, on='source')
    doc_topics_complete = doc_topics_complete.rename(
        columns={'id': 'docid', 'id_x': 'volid', 'id_y': 'docid'}, copy=False)
    docid_publishdate = doc_topics_complete[['docid', 'year']]
    print("done, {:,} tokens".format(num_tokens))

    # Create aggregate state object
    print("Calculating aggregate stats...", end='', flush=True)
    agg = state.groupby(['docid', 'tokenid', 'topic']).size().reset_index()
    agg.columns = ['docid', 'tokenid', 'topic', 'count']
    agg_token = pd.merge(agg, tokenid_map, on='tokenid')
    full_state = pd.merge(agg_token, docid_publishdate, on='docid')
    full_state.drop('docid', axis=1, inplace=True)
    corpus_token_counts_by_year = full_state[['year', 'count']].groupby(
        'year').sum().reset_index()
    topic_keywords = pd.melt(topic_keys, id_vars='id',
                             value_vars=['key.' + str(i)
                                         for i in range(num_topwords)],
                             value_name='token')
    topic_keywords = topic_keywords.rename(columns={'id': 'topic'}, copy=False)
    topic_keywords.drop('variable', axis=1, inplace=True)
    topic_token_counts_by_year = \
        pd.merge(full_state[['topic', 'year', 'token', 'count']], topic_keywords,
                 on=['topic', 'token']).groupby(
                     ['topic', 'year', 'token']).sum().reset_index()
    print("done")

    print("Creating topics...", end='', flush=True)

    def create_topic(topicid):
        # Build a token-count vector for this topic
        token_count = token_topic_count[token_topic_count['topic'] == topicid]
        vector = np.repeat(np.float64(0), num_tokens)
        for tid, _, cnt in token_count.itertuples(index=False):
            vector[tid] = np.float64(cnt)
        return Topic(vector)

    topics = list(parallel_map(create_topic, topicids))
    print("done")

    # calculate trend
    print("Calculating topic trend...", end='', flush=True)
    state_trend = pd.merge(state[['docid', 'topic']], docid_publishdate,
                           on='docid')[['topic', 'year']].groupby(
                               ['topic', 'year']).size().reset_index()
    state_trend.columns = ['topic', 'year', 'count']

    def slope(topic):
        # Linear-regression slope of the topic's yearly token counts
        state_small = state_trend[(state_trend['topic'] == topic)
                                  & (state_trend['year'] != -1)]
        dates = state_small['year'].values
        values = state_small['count'].values
        lm = stats.linregress(dates, values)
        return lm[0]

    topic_keys['trend'] = list(parallel_map(slope, topicids))
    print("done")

    # Insert dists
    print("Calculating topic distance from center...", end='', flush=True)
    topic_keys['dist'] = list(parallel_map(lambda x: x.length, topics))
    print("done")

    print("Calculating topic means...", end='', flush=True)
    topic_keys['mean'] = doc_topics.loc[:, 'topic.0':].mean().values
    print("done")

    # Create Distance Matrix
    dist = pd.DataFrame(data=0, dtype='float64', index=topicids, columns=topicids)
    print("Calculating inter-topic distances...", end='', flush=True)

    def calc_dist(pair):
        x, y = pair
        d = topics[x].distance(topics[y])
        dist.iloc[x, y] = d
        dist.iloc[y, x] = d

    parallel_for(calc_dist, itertools.combinations(topicids, 2))
    print("done")

    # write results
    print("Writing out results...", end='', flush=True)
    doc_topics.to_csv(os.path.join(output_dir, 'documents.csv'),
                      index=False, encoding='utf-8')
    topic_keys.to_csv(os.path.join(output_dir, 'topics.csv'),
                      index=False, encoding='utf-8')
    tokenid_map.to_csv(os.path.join(output_dir, 'tokens.csv'),
                       index=False, encoding='utf-8')
    agg.to_csv(os.path.join(output_dir, 'state.csv'),
               index=False, encoding='utf-8')
    dist.to_csv(os.path.join(output_dir, 'distance.csv'), encoding='utf-8')
    corpus_token_counts_by_year.to_csv(
        os.path.join(output_dir, 'counts_by_year.csv'),
        index=False, encoding='utf-8')
    topic_token_counts_by_year.to_csv(
        os.path.join(output_dir, 'counts_by_topic_year.csv'),
        index=False, encoding='utf-8')
    if solr_url is not None:
        doc_meta.drop('year', axis=1, inplace=True)
        doc_meta.to_csv(os.path.join(output_dir, 'docmeta.csv'),
                        index=False, encoding='utf-8')
    print("done")

    elapsed = int(time() - start)
    print("All done. Time elapsed: {}".format(timedelta(seconds=elapsed)))
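# Hypothetical invocation sketch: the paths and parameter values below are placeholders
# chosen for illustration, not values shipped with this project. It post-processes a
# MALLET run whose outputs live in ./mallet_out, reads document metadata from a local
# CSV instead of Solr, and writes the CSV outputs to ./output.
if __name__ == '__main__':
    run(doc_topics_filename='mallet_out/doc_topics.txt',
        topic_keys_filename='mallet_out/topic_keys.txt',
        state_filename='mallet_out/state.txt',
        max_dict=50000,              # keep only the 50,000 most frequent tokens
        meta_filename='meta.csv',    # set to None to retrieve metadata from Solr
        solr_url=None,
        output_dir='output',
        date_format='%Y-%m-%d')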