def GroupByParallelProcess(tweetsDF, cores, groupMethod):
    """ Group by and aggregate on time via a parallel process """
    tweetsDF.label_date = tweetsDF.label_date.astype(int)
    tweetsDF = tweetsDF.set_index("label_date")

    # Parallelizing using Pool.map()
    df_split = GetListOfSplitDFs(tweetsDF, cores)
    # create the multiprocessing pool
    pool = Pool(cores)

    # process the DataFrame by mapping the function to each df across the pool
    logging.info("Starting the grouping and aggregating process.")
    if groupMethod == "weighted-average":
        df_out = pool.map(PerformGroupbyAndAggregate, df_split)
    elif groupMethod == "sum":
        df_out = pool.map(PerformSum, df_split)
    elif groupMethod == "mean":
        df_out = pool.map(PerformMean, df_split)
    else:
        logging.error("Choose a valid group-by method.")
        return None

    # close down the pool and join
    pool.close()
    pool.join()
    pool.clear()
    logging.info("Ended the grouping and aggregating process.")
    return df_out
def q5_plot_chromatic_num_bounds_by_prob(n, prange, pstep, k=None,
                                         clique_finder=greedy_find_clique_number, multi=False):
    """Plots a graph of number of colours against edge probability,
    for each of the various lower/upper bounds of chromatic number

    multi: True/False/int
        multiprocessing - yes/no/num processes (default 4 if True)
    """
    probs = np.arange(prange[0], prange[1], pstep)
    graphs = [[get_random_graph(n, p, k) for _ in range(10)] for p in probs]
    mean_bounds = []
    pool = Pool(multi if type(multi) is int else 4)
    # graph_generator = pool.imap(multiprocessing_chrom_bounds_func, graphs) if multi else map(f, graphs)
    f = lambda graphs_list: list(map(get_chromatic_number_bounds, graphs_list))
    graph_generator = pool.imap(f, graphs) if multi else map(f, graphs)
    for bounds in tqdm.tqdm(graph_generator, total=len(graphs)):
        mean_bounds.append(np.mean(bounds, axis=0))
    pool.close()
    pool.join()
    mean_bounds = np.array(mean_bounds)
    plt.figure()
    for i, label in zip(range(mean_bounds.shape[1]),
                        ['lb_comp', 'lb_clique', 'ub_clique', 'ub_greedy_rand', 'ub_greedy_msd']):
        plt.plot(probs, mean_bounds[:, i], label=label)
    plt.legend()
    return probs, mean_bounds
def preprocess(self):
    # Check if orderline should be extracted
    extract_orderline = conf['extract_orderline']

    pool = Pool()
    start = time.time()

    # Run in parallel
    if extract_orderline:
        res_orderline = pool.apipe(self.create_orderline, return_dataframe=False)
    res_warehouse = pool.apipe(self.create_warehouse)
    res_district = pool.apipe(self.create_district)
    res_order = pool.apipe(self.create_order)
    res_customer = pool.apipe(self.create_customer)
    res_stock = pool.apipe(self.create_stock)

    # Consolidate result
    pool.close()
    pool.join()
    list_of_processed_files = [res_warehouse.get(), res_district.get(), res_order.get(),
                               res_customer.get(), res_stock.get()]
    if extract_orderline:
        list_of_processed_files.append(res_orderline.get())

    end = time.time()
    self.debug("Preprocessing of csv file took {}s".format(end - start))
    return list_of_processed_files
def perplexity(lang="eng"):
    """
    Finds the statistical perplexity of the language model
    in the Google Books N-Gram dataset.
    """
    pool = ProcessingPool(4)
    unigram_counter, mgram_counter, ngram_counter = pool.map(get_ngram_counter, [1, 2, 3], [lang] * 3)
    pool.close()
    pool.join()

    total_words = np.sum(np.array(list(unigram_counter.values())))
    print("total_words = ", total_words)

    ngram_conditionals = get_ngram_conditionals(ngram_counter, mgram_counter)

    probs = np.power(np.array(list(ngram_conditionals.values()), dtype=np.float64),
                     -np.array(list(ngram_counter.values()), dtype=np.float64) / total_words)
    print("probs shape = ", probs.shape)

    PP = np.prod(probs, dtype=np.float64)
    return PP
def multi_Non_Tweep_friends(self, handle):
    min_position, links = self.get_tweets(handle)
    print("Scraping last 100 days of activity")
    while (True):
        min_position1, links1 = self.get_tweets(handle, min_position)
        links = links + links1
        if (min_position1 == None):
            break
        min_position = min_position1
    people_list = []
    link = [x for x in links if handle in x]
    link = self.duplicates(link)
    p = Pool(10)  # Pool tells how many at a time
    with Pool(10) as p:
        records = list(tqdm(p.imap(self.get_people, link), total=len(link)))
    p.terminate()
    p.join()
    p.close()
    people_list = [item for sublist in records for item in sublist]
    people_list = self.duplicates(people_list)
    people_list = [x for x in people_list if x != handle]
    return (people_list)
def multi_word_cut(self, sentences):
    print('Multiprocessing Word cut ')
    if self.language == 'ch':
        jieba.initialize()  # initialize first, or it will initialize in each process
        jieba.disable_parallel()

        def func(line):
            line = [i.strip() for i in jieba.cut(line, cut_all=False)]
            return [i for i in line
                    if ((not i.isdigit()) and (i not in self.stop_words))]
    else:
        def func(line):
            return [i.lower() for i in line.split(" ")
                    if ((not i.isdigit()) and
                        (i not in self.stop_words) and
                        (len(i) > 1))]

    pool = Pool(nodes=5)
    t0 = time.time()
    word_cut = pool.map(func, sentences)
    pool.close()
    pool.join()
    pool.clear()
    print('MultiProcess time {:.0f}'.format(time.time() - t0))
    return word_cut
def parallelize_dataframe(df, func, n_cores=16):
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df
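# A minimal usage sketch for the split/map/concat helper above. `add_length` and the
# sample frame are hypothetical, not part of the original code; the assumption is that
# Pool is a pathos ProcessingPool, which serializes the mapped function with dill.
import pandas as pd

def add_length(chunk):
    # per-chunk transform: add a column with the length of the text field
    chunk = chunk.copy()
    chunk["text_len"] = chunk["text"].str.len()
    return chunk

sample = pd.DataFrame({"text": ["a", "bb", "ccc", "dddd"] * 1000})
result = parallelize_dataframe(sample, add_length, n_cores=4)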
def findbonds(self): """Calculates interactions between and/or within monomers""" if self.help: print( "Calculates interactions between and/or within monomers\n" f'\n\033[1mUsage: minnie findbonds \n' f' -cn, --complexName <string> \n ' f' Project ID of your complex\n\n' f' -p, --pdbs [<.pdb>/<path>] (singleframe.pdb) \n' f' Give single *.pdb or give folder path \n\n' f' -i [<hbonds>/<ionic>/<hydrophobic>/<ring_stacking>/<all>] (hbonds) \n' f' Calculates which types of interactions \n\n' f' -d <float> (2.5) \n' f' Cut-off to define a hydrogen bond\n\n' f' -intra, --includeIntra [<"True">/<"False">] ("False") \n' f' What do you want to analyze, all or only inter-monomer contacts? \033[0m \n\n\n\n' f'\n\033[1mUsage example:\033[0m\n\n' " Single frame - minnie findbonds -cn sox4 -p sox4/02_frames/md_0.pdb -i hbonds -s False \n" " Multiple frames - minnie findbonds -cn sox4 -p sox4/02_frames/* -i hbonds \n" " Multiple frames - minnie findbonds -cn sox4 -p sox4/02_frames/* -i all \n" ) elif not self.pdbs: print(f'where is pdb??') elif not self.complexName: print(f'Please specify complex name(s)') elif (self.systematic) == "True": pdb_list = self.pdbs if (self.intType == "all"): for intType in ["hbonds", "ionic", "hydrophobic", "ring_stacking"]: pool = Pool(pathos.multiprocessing.cpu_count() - 2) pool.map(analysis.comb_int, pdb_list, len(pdb_list) * [str(self.complexName)], len(pdb_list) * [str(intType)], len(pdb_list) * [str(self.includeIntra)], len(pdb_list) * [str(self.hbond_distance)]) #pool.close() else: pool = pathos.multiprocessing.ProcessingPool( pathos.multiprocessing.cpu_count() - 2) pool.map(analysis.comb_int, pdb_list, len(pdb_list) * [str(self.complexName)], len(pdb_list) * [str(self.intType)], len(pdb_list) * [str(self.includeIntra)], len(pdb_list) * [str(self.hbond_distance)]) pool.close() analysis.combine_interfacea_results(self.complexName) elif (self.systematic) == "False": if (self.intType == "all"): for intType in ["hbonds", "ionic", "hydrophobic", "ring_stacking"]: analysis.comb_int(self.pdbs[0], self.complexName, intType, self.includeIntra, self.hbond_distance) else: analysis.comb_int(self.pdbs[0], self.complexName, self.intType, self.includeIntra, self.hbond_distance) analysis.combine_interfacea_results(self.complexName)
def test_multiprocess():
    x_list = [1, 2, 3, 4, 5, 6, 7]
    y_list = ['1', '2', '3', '4', '5', '6', '7']
    epoch = 8
    pool = Pool(epoch)
    res = pool.amap(test_task, x_list, y_list)
    pool.pipe(test_task, '22', '222')
    pool.close()
    pool.join()
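# A minimal sketch of the test_task worker assumed above (hypothetical; the original is
# not shown): it only needs to accept one element of x_list and one of y_list. Note that
# amap returns an async handle, so res.get() is what actually collects the results.
def test_task(x, y):
    return "{}-{}".format(x, y)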
class ConsensusMHSampler(MHSampler):
    def __init__(self, log_f, log_g, g_sample, x0, iterations, shards=1):
        super(ConsensusMHSampler, self).__init__(log_f, log_g, g_sample, x0, iterations)
        self.shards = shards
        assert len(self.log_distribution_fn) == self.shards

        self.log_fn_dict = {}  # for pickling purposes
        for i in range(self.shards):
            self.log_fn_dict[i] = self.log_distribution_fn[i]

        self.pool = Pool(nodes=self.shards)

    def sample(self):
        map_results = self.pool.map(self.map_sample, range(self.shards))
        self.pool.close()
        self.pool.join()
        self.pool.terminate()
        self.pool.restart()

        self.saved_states = self.reduce_sample(map_results)

    def map_sample(self, index):
        np.random.seed(1)
        cur_state = self.start_state
        sample_results = [cur_state]
        prob, count = 0, 0
        for i in range(self.iterations):
            if i % 5000 == 0:
                print("iteration {}".format(i))
            candidate_state = self.get_transition_sample(cur_state)
            acceptance = self.calculate_acceptance_ratio(candidate_state, self.log_fn_dict[index])
            prob += acceptance
            count += 1
            new_state = self.transition_step(cur_state, candidate_state, acceptance)
            sample_results.append(new_state)
            cur_state = new_state

        sample_results = np.array(sample_results)
        print("INDEX {}: Avg acceptance prob is {}".format(index, prob / count))
        return (sample_results, 1.0 / (1e-8 + self.get_sample_variance(sample_results)))

    def get_sample_variance(self, data):
        return np.linalg.norm(np.var(np.array(data), axis=0))

    def reduce_sample(self, results):
        '''results is a list of (sample_array, weight) tuples'''
        sample_results = 0
        total_weight = 0
        for sample, weight in results:
            sample_results += weight * sample
            total_weight += weight
        return sample_results / total_weight
def parallelize_dataframe(df: pd.DataFrame, func, n_cores=4) -> pd.DataFrame:
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    # have to include this to prevent leakage and allow multiple parallel function calls
    pool.terminate()
    pool.restart()
    return df
def goo():
    pool = Pool(4)

    # def f(x):
    #     return foo(100 + x)

    stuff = list(tqdm.tqdm(pool.imap(foo, range(20)), total=20))
    print(stuff)
    print('aaa')
    pool.close()
    pool.join()
    print('bbb')
def parallelize(data, func, num_of_processes=8):
    '''Function for parallelizing any function on a dataframe.
    Stolen from Stack Overflow, user Tom Raz:
    https://stackoverflow.com/questions/26784164/pandas-multiprocessing-apply'''
    data_split = np.array_split(data, num_of_processes)
    pool = Pool(num_of_processes)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data
def get_jsd_gmm(gmmfit, savefile=None, multiprocessing=False, n_pool=10): n_idx = len(gmmfit) idx = np.arange(n_idx) labels = gmmfit['label'] i_list, i_label = [], [] j_list, j_label = [], [] for i in range(n_idx): i_list = i_list + list(np.repeat(int(idx[i]), n_idx - i - 1)) i_label = i_label + list(np.repeat(labels[i], n_idx - i - 1)) if i < n_idx: j_list = j_list + list(idx[i + 1::]) j_label = j_label + list(labels[i + 1::]) n_pairs = len(i_list) gc_pairs = table.Table() gc_pairs['i'] = i_label gc_pairs['j'] = j_label gc_pairs['jsd'] = np.zeros(len(gc_pairs), dtype='float64') n_gmm = len(gmmfit['weights'][0]) gmm_i = mixture.GaussianMixture(n_components=n_gmm) gmm_j = mixture.GaussianMixture(n_components=n_gmm) def wrapper(idx): i, j = i_list[idx], j_list[idx] gmm_i.means_ = gmmfit['means'][i] gmm_i.weights_ = gmmfit['weights'][i] gmm_i.covariances_ = gmmfit['covars'][i] gmm_i.precisions_ = gmmfit['prec'][i] gmm_i.precisions_cholesky_ = gmmfit['prec_chol'][i] gmm_j.means_ = gmmfit['means'][j] gmm_j.weights_ = gmmfit['weights'][j] gmm_j.covariances_ = gmmfit['covars'][j] gmm_j.precisions_ = gmmfit['prec'][j] gmm_j.precisions_cholesky_ = gmmfit['prec_chol'][j] jsd = gmm_jsd(gmm_i, gmm_j) gc_pairs['jsd'][idx] = jsd return jsd if multiprocessing: pool = Pool(n_pool) jsd = pool.map(wrapper, range(n_pairs)) pool.close() gc_pairs['jsd'] = jsd else: for idx in range(n_pairs): wrapper(idx) if savefile is not None: io.ascii.write(gc_pairs, savefile) return gc_pairs
def parallelize_dataframe(df, func, num_partitions=num_cores, num_cores=num_cores):
    df_split = np.array_split(df, num_partitions, axis=0)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    pool.clear()
    return df
def Pool(cpus=cpu_count()) -> ProcessingPool:
    """Context manager for pathos ProcessingPool"""
    # Creates a pool with processes
    p = ProcessingPool(cpus)
    yield p
    # Need to clear due to:
    # https://github.com/uqfoundation/pathos/issues/111
    p.close()
    p.join()
    p.clear()
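# A usage sketch for the context-manager Pool above, assuming the generator is wrapped
# with contextlib.contextmanager (a bare generator function is not a context manager by
# itself); `square` is a hypothetical worker function.
def square(x):
    return x * x

with Pool(2) as p:
    squares = p.map(square, range(10))
# on exit the pool is closed, joined and cleared (see pathos issue #111 above)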
def get_top_n(self, query, corpus, n=5):
    temp_corpus = [" ".join(ele) for ele in corpus]
    pool = Pathos_Pool(cpu_count())
    corpus_embeddings = self._encode_sentences(temp_corpus)
    query_embeddings = self._encode_sentences(query)
    scores = self._calc_similarity(query_embeddings, corpus_embeddings)
    pool.close()
    top_results = torch.topk(scores, n)
    return [{"idx": i, "document": corpus[i]} for i in top_results[1].numpy().tolist()]
def Pool(threads: int, multiplier: int, name: str):
    """Context manager for pathos ProcessingPool"""
    # Creates a pool with threads else cpu_count * multiplier
    p = ProcessingPool(threads if threads else cpu_count() * multiplier)
    logging.debug(f"Created {name} pool")
    yield p
    # Need to clear due to:
    # https://github.com/uqfoundation/pathos/issues/111
    p.close()
    p.join()
    p.clear()
def finally_ip():
    (ipList, portList) = get_ip_list(url, headers)
    # Running pool = ThreadPool(2) sometimes raises "module '__main__' has no attribute '__spec__'";
    # not sure how to fix it. Tried setting __spec__ = None, which did not help.
    pool = ThreadPool(4)
    start_time = time.time()
    results = pool.map(test_ip, ipList, portList)
    pool.close()
    pool.join()
    end_time = time.time()
    print("Parallel time elapsed: " + str(end_time - start_time))
    return results
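# A minimal sketch of the test_ip worker assumed above (hypothetical; the original is not
# shown). It only needs to accept one ip and one port from the zipped lists and return
# something pool.map can collect, e.g. whether the address accepts a connection.
import socket

def test_ip(ip, port):
    try:
        with socket.create_connection((ip, int(port)), timeout=3):
            return (ip, port, True)
    except OSError:
        return (ip, port, False)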
def compute_scores(args): results_file = args.results_file scores_file = args.score_file num_captions = args.num_captions is_exp = args.exp generated_image_tokens = get_generated_tokens(res_file=results_file, num=num_captions) gt_image_tokens = get_gt_tokens() print('number of test images: %d, all images: %d') % ( len(generated_image_tokens), len(gt_image_tokens)) all_image_ids = generated_image_tokens.keys() def f(image_id_thread): image_ids = image_id_thread[0] thread_num = image_id_thread[1] scores = {} for image_id in image_ids: res_tokens = generated_image_tokens[str(image_id)] gt_tokens = gt_image_tokens[str(image_id)] wmd_score = word_mover_distance(res_tokens, gt_tokens, wvmodel=wvmodel, is_exp=is_exp) scores[image_id] = wmd_score print('Thread: %d, Image ID: %s, WMD score: %.5f') % ( thread_num, image_id, wmd_score) return scores num_images = len(all_image_ids) num_workers = 20 num_per_split = num_images // num_workers images_split = [] for i in range(num_workers): if i == (num_workers - 1): images_split.append([all_image_ids[(i * num_per_split):], i]) else: images_split.append([ all_image_ids[(i * num_per_split):((i + 1) * num_per_split)], i ]) pool = Pool(num_workers) all_scores = pool.map(f, images_split) pool.close() pool.join() scores = {} for s in all_scores: scores.update(s) with open(scores_file, 'w') as f: json.dump(scores, f) total_score = 0 for key in scores.keys(): total_score += scores[key] print('WMD score: %.5f') % (total_score / len(scores))
def get_top_n(self, query, corpus, n=5):
    # scores = np.zeros(self.corpus_size)
    temp_corpus = [" ".join(ele) for ele in corpus]
    pool = Pathos_Pool(cpu_count())
    scores = pool.map(self._calc_distance, [query] * self.corpus_size, temp_corpus)
    pool.close()
    scores = np.array(scores)
    # for i, sent in enumerate(tqdm(corpus)):
    #     scores[i] = self._calc_distance(query, sent)
    top_n = np.argsort(scores)[::-1][:n]
    return [{"idx": i, "document": corpus[i]} for i in top_n]
def parallelise_initsync(argv, ssp_params, process_control_id, logger): # Pivot the collection of source_system_profile records into # three separate lists to enable us to call pool.map on each record (source_schemas, tables, target_schemas, query_conditions) = map(list, zip(*ssp_params)) source_conn_detail = dbuser.get_dbuser_properties(argv.sourceuser) target_conn_detail = dbuser.get_dbuser_properties(argv.targetuser) logger.info("Processing tables with {} dedicated worker processes".format( argv.numprocesses)) pool = Pool(nodes=argv.numprocesses) argvs = [argv] * len(tables) source_conn_details = [source_conn_detail] * len(tables) target_conn_details = [target_conn_detail] * len(tables) pcids = [process_control_id] * len(tables) queues = [manager.Queue()] * len(tables) logger.debug("Starting a new process for each table in: {tables}".format( tables=tables)) # Execute initsync for each schema/table combination in parallel pool.map(initsync_table, argvs, source_conn_details, target_conn_details, source_schemas, tables, target_schemas, pcids, query_conditions, queues, chunksize=1) # Ensure tables are processed in sequence # and workers are fully utilised pool.close() logger.debug("parallelise_initsync: Pool joining") pool.join() logger.debug("parallelise_initsync: Pool joined") all_table_results = {} for q in queues: size = q.qsize() message = q.get() logger.debug("Message queue size = {s}, message = {m}".format( s=size, m=message)) all_table_results.update(message) logger.debug("all_table_results = {r}".format(r=all_table_results)) return all_table_results
def parallel_apply(self, df, func):
    # add try statement re function not returning a DataFrame
    if self.preprocessing_checks(df, func):
        # split DataFrame into a list of smaller DataFrames
        self.df_split = np.array_split(df, self.partitions, axis=0)
        # create the multiprocessing pool
        pool = Pool(self.cores)
        # process the DataFrame by mapping function to each df across the pool
        df = pd.concat(pool.map(func, self.df_split), axis=0).copy()
        # close down the pool and join
        pool.close()
        pool.join()
        pool.clear()
        return df
def get_gt_tokens( coco_file='../data/files/dataset_coco.json', coco_tokens_file='../data/files/coco_tokens_Google_news.json'): if os.path.exists(coco_tokens_file): with open(coco_tokens_file, 'r') as f: dataset = json.load(f) return dataset print 'Processing ground-truth data...' with open(coco_file, 'r') as f: dataset = json.load(f) def f(images): image_tokens = {} # images = dataset['images'] for image in images: sentence = image['sentences'] image_id = str(image['cocoid']) tokens = [] for s in sentence: tokens.extend(s['tokens']) filter_token = [] for token in tokens: if (token not in stop_words) and (token in vocab): filter_token.append(token) # tokens = [token for token in tokens if token not in stop_words and token in vocab] image_tokens[image_id] = filter_token return image_tokens all_images = dataset['images'] num_images = len(all_images) num_workers = 30 num_per_split = num_images // num_workers images_split = [] for i in range(num_workers): if i == (num_workers - 1): images_split.append(all_images[(i * num_per_split):]) else: images_split.append( all_images[(i * num_per_split):((i + 1) * num_per_split)]) pool = Pool(num_workers) all_images_tokens = pool.map(f, images_split) pool.close() pool.join() all_token_dict = {} for d in all_images_tokens: all_token_dict.update(d) with open(coco_tokens_file, 'w') as f: json.dump(all_token_dict, f) return all_token_dict
def makeRadial():
    rad, angle = d["radial"]["rad"], d["radial"]["angle"]
    args = np.linspace(angle, angle + np.pi, frameCount)

    pool = Pool(4)

    while True:
        subIm = JuliaTools.subImage(c=rad * np.exp(1j * angle),
                                    r=r, n=10, p=p, iters=iters,
                                    split=split, save=False, aura=False)
        isBlackList = pool.map(subIm, coords)
        if not all(isBlackList):
            break
        else:
            rad *= 0.975

    # Circular arc c follows in complex plane
    cPath = rad * np.exp(1j * args)

    for frame in xrange(frameCount):
        subIm = JuliaTools.subImage(c=cPath[frame], r=r, n=n, p=p,
                                    iters=iters, split=split)
        isBlackList = pool.map(subIm, coords)
        allBlack = all(isBlackList)
        if not allBlack:
            JuliaTools.makeFrame(frame, n, split, coords)

    pool.close()

    JuliaTools.prepareForFFmpeg(frameCount=frameCount, loop=True)

    with open("tweet.txt", "w") as out:
        out.write("Images generated using constants"
                  " on a circular arc of radius {:03.2f}.".format(rad))

    stop = timeit.default_timer()
    print stop - start
def apply_by_multiprocessing(df, func, **kwargs):
    """
    Parallel execution function for the DataFrame
    :param df: Input DataFrame
    :param func: function to apply to each chunk
    :param kwargs: additional arguments for df.apply(), such as axis et al.
    :return: Output DataFrame
    """
    workers = kwargs.pop('workers')
    pool = Pool(processes=workers)
    result = pool.map(_apply_df, [(d, func, i, kwargs)
                                  for i, d in enumerate(np.array_split(df, workers))])
    pool.close()
    result = sorted(result, key=lambda x: x[0])
    return pd.concat([i[1] for i in result])
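# A minimal sketch of the _apply_df helper assumed above (hypothetical; the original is
# not shown). It applies func to one chunk and returns (chunk_index, result) so the
# parent can restore the original chunk order before concatenating.
def _apply_df(args):
    # args is the (chunk, func, chunk_index, kwargs) tuple built by apply_by_multiprocessing
    d, func, i, kwargs = args
    return i, d.apply(func, **kwargs)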
def make_query(self, size=1):
    ## quit if nr_unlabeled_samples = 1
    if self.dataset.len_unlabeled() == 1:
        return self.dataset.get_unlabeled_entries()[0].astype(int)

    ## Set the possible labels
    self.possible_labels = list(set(self.dataset.get_labeled_entries()[1]))

    ## Train the model
    self.model.train(self.dataset)

    ## Get probabilities
    X_ids, X = self.dataset.get_unlabeled_entries()
    pred = self.model.predict_proba(X)  # pred.shape = (n_unlabeled, nr_of_labels)

    ## Setup pool for cpu parallelisation
    p = Pool(cpu_count(), maxtasksperchild=1000)

    ## nr of unlabeled samples -> len(X)
    ## Get uncertainty after adding every sample with every label
    total = np.asarray(
        p.map(self._eer, X_ids, len(X) * [self.dataset], len(X) * [self.depth]))
    # total.shape = (n_unlabeled, nr_of_labels)

    ## Close the Pool again
    p.close()
    p.join()
    p.clear()

    ## Get the total uncertainty of one sample after adding a label weighted by the labels probability
    total = np.inner(pred, total).diagonal()  # total.shape = (n_unlabeled,)

    ## Zip it
    total = zipit(X_ids, total)

    ## Sort it
    results = sort_by_2nd(total, 'min')
    return results[:size, 0].astype(int)
def start(self, text_data_dir, res_dir, nprocs=8): ''' entry function text_data_dir: folder of raw data text_res_dir: folder of output verbose: int. Information is printed every N records nprocs: number of cores in parallel ''' p = PathosPool(nprocs) filepathsvec, filenamesvec, respaths = list(), list(), list() for dirpath, _, filenames in os.walk(text_data_dir): for filename in filenames: if (("gz" in filename) and ('md5' not in filename) and ('copy' not in filename)): filepath = os.path.join(dirpath, filename) print(filepath) res_name = filename.split(".")[0] + ".csv.gz" respath = os.path.join(res_dir, res_name) #if os.path.exists(respath): # pass #else: if True: filepathsvec.append(filepath) filenamesvec.append(filename) respaths.append(respath) #p.apply_async(process_data, args = (filepath,filename, # respath, True, # [title_stop_path, # affil_stop_path, # mesh_stop_path])) self.affildicts = p.amap( partial(self.process_data, stop_paths=[ self.title_stop_path, self.affil_stop_path, self.mesh_stop_path ], rm_stopwords=True, affiliation_correction=True, select_journals=self.select_journals), filepathsvec, filenamesvec, respaths) p.close() p.join() # Having an issue joining print("joined") p.clear() # Delete the pool
def makePower():
    global c
    pMin, pMax = d["power"]["pMin"], d["power"]["pMax"]
    pPath = np.linspace(pMin, pMax, frameCount)

    pool = Pool(4)

    # Get interesting c
    while True:
        subIm = JuliaTools.subImage(c=c, n=10, iters=iters / 2,
                                    r=r, p=pMin, split=split,
                                    save=False, aura=False)
        isBlackList = pool.map(subIm, coords)
        if not all(isBlackList):
            break
        else:
            c *= 0.975

    for frame in xrange(frameCount):
        subIm = JuliaTools.subImage(c=c, r=r, n=n,
                                    p=pPath[frame], iters=iters / 2,
                                    split=split)
        isBlackList = pool.map(subIm, coords)
        allBlack = all(isBlackList)
        if not allBlack:
            JuliaTools.makeFrame(frame, n, split, coords)

    pool.close()

    JuliaTools.prepareForFFmpeg(frameCount=frameCount, loop=True)

    with open("tweet.txt", "w") as out:
        out.write("woooooooooooooooooooo")

    stop = timeit.default_timer()
    print stop - start
def _mp_improve(self, container, scenario_builder):
    """Improves b/2 best solutions from the container and updates
    the score table with the generated solutions
    """
    container.sort()
    pool = Pool(processes=self._proc_count)
    logging.info("Starting processes")
    start = datetime.now()
    best = []
    builders = []
    for i in range(self._b // 2):
        best.append(container.get(i))
        builders.append(scenario_builder)
    try:
        result = pool.map(self._improve, best, builders)
        pool.close()
        pool.join()
    except MemoryError as e:
        send_email("I crashed again, please help!")
        import pudb
        pudb.set_trace()
        print(e)
    logging.info("Processes finished - %s" % (datetime.now() - start))

    # How infuriating was that?!
    # pathos was being smart and was caching pool so this is needed
    # to prevent from erroring out
    pool.restart()

    start = datetime.now()
    logging.info("mp_improve second loop")
    for entry in result:
        index = container.index(entry['individual'])
        best = entry['improvements'].get(0)
        if best.get_utility() < entry['individual'].get_utility():
            container.replace(best, index)
        for improvement in entry['improvements'].get_all():
            self._update_score_table(improvement)
    logging.info("mp_improve second loop - %s" % (datetime.now() - start))

    logging.info("Improved %d solutions" % container.get_changes())
    container.reset_changes()
    return container
class analyze(setup.setup): def __init__(self,args,logging_level=logging.INFO): super(analyze, self ).__init__(args,logging_level) # set up processing pool and run all analyses specified in args def run(self): if self.args.jumpdists: n_bins=100. bin_width = 1/n_bins bins = np.arange(0,1+bin_width,1/n_bins) if self.args.file: user,vals = self.artist_jump_distributions(self.args.file,bins=bins,self_jumps=False) with open(self.args.resultdir+user,'w') as fout: fout.write(','.join(vals.astype(str))+'\n') else: raise('not implemented!') self.pool = Pool(self.args.n) self.rootLogger.info("Pool started") self.rootLogger.info("Starting jump distance analysis") func_partial = partial(self.artist_jump_distributions,bins=bins,self_jumps=False) with open(self.args.resultdir+'jumpdists','w') as fout: for user,vals in self.pool.imap(func_partial,self.listen_files): fout.write(user+'\t'+','.join(vals.astype(str))+'\n') self.pool.close() self.rootLogger.info("Pool closed") if self.args.blockdists: #self.rootLogger.info("Starting block distance analysis") self.mean_block_distances(self.args.file) if self.args.diversity_dists: bins = np.arange(0,1.01,.01) self.diversity_distributions(self.args.file,bins=bins) if self.args.clustering: self.clustering(self.args.file) if self.args.values: self.patch_values(self.args.file) if self.args.exp: self.explore_exploit(self.args.file) if self.args.patch_len_dists: self.patch_len_dists(self.args.file) # calculate distribution (using histogram with specified bins) # of sequential artist-to-artist distances def artist_jump_distributions(self,fi,bins,self_jumps=False): user = fi.split('/')[-1][:-4] df = pd.read_pickle(fi) if self_jumps: vals = np.histogram(df['dist'].dropna(),bins=bins)[0] else: vals = np.histogram(df['dist'][df['dist']>0],bins=bins)[0] self.rootLogger.info('artist jump distances done for user {} ({})'.format(user,fi)) return user,vals # calculate distribution (using histogram with specified bins) # of patch diversity for each user # awk 'FNR==1' * > diversity_dists_zeros # awk 'FNR==2' * > diversity_dists_nozeros def diversity_distributions(self,fi,bins): if 'patches' not in fi: raise('WRONG DATATYPE') user = fi.split('/')[-1].split('_')[0] df = pd.read_pickle(fi).dropna(subset=['diversity']) zeros = np.histogram(df[df['n']>=5]['diversity'],bins=bins)[0] nozeros = np.histogram(df[(df['n']>=5)&(df['diversity']>0)]['diversity'],bins=bins)[0] zeros = zeros/float(zeros.sum()) nozeros = nozeros/float(nozeros.sum()) with open(self.args.resultdir+user,'w') as fout: fout.write(user+'\t'+'zeros'+'\t'+','.join(zeros.astype(str))+'\n') fout.write(user+'\t'+'nozeros'+'\t'+','.join(nozeros.astype(str))+'\n') self.rootLogger.info('diversity distributions done for user {} ({})'.format(user,fi)) def mean_block_distances(self,fi,n=100): def cos_nan(arr1,arr2): if np.any(np.isnan(arr1)) or np.any(np.isnan(arr2)): return np.nan else: return cosine(arr1,arr2) user = fi.split('/')[-1].split('_')[0] df = pd.read_pickle(fi) blocks = df[df['n']>=5].dropna() result = [] for i in xrange(len(blocks)-n): first = blocks['centroid'].iloc[i] result.append(np.array(blocks['centroid'][i+1:i+n+1].apply(lambda val: cos_nan(val,first)))) result = np.nanmean(np.vstack(result),0) with open(self.args.resultdir+user,'w') as fout: fout.write('\t'.join([user,'patch',','.join(result.astype(str))])+'\n') self.rootLogger.info('Block distances for user {} processed successfully ({})'.format(user,fi)) # now shuffled # idx = np.array(blocks.index) # np.random.shuffle(idx) # blocks = blocks.reindex(idx) 
# result_random = [] # for i in xrange(len(blocks)-n): # first = blocks['centroid'].iloc[i] # result_random.append(np.array(blocks['centroid'][i+1:i+n+1].apply(lambda val: cos_nan(val,first)))) # result_random = np.nanmean(np.vstack(result_random),0) # with open(self.args.resultdir+user,'w') as fout: # fout.write('\t'.join([user,'patch',','.join(result.astype(str))])+'\n') # fout.write('\t'.join([user,'patch_random',','.join(result_random.astype(str))])+'\n') # self.rootLogger.info('Block distances for user {} processed successfully ({})'.format(user,fi)) def clustering(self,fi): df = pd.read_pickle(fi) user = fi.split('/')[-1].split('_')[0] mask = (df['centroid'].apply(lambda arr: ~np.any(np.isnan(arr))).values)&(df['n']>=5)&(df['diversity']<=0.2) clust_data = df[mask].reset_index() arr = np.vstack(clust_data['centroid']) Z = linkage(arr, 'complete') clusters = fcluster(Z,t=0.2,criterion='distance') assignments = np.repeat(np.nan,len(df)) assignments[np.where(mask)] = clusters df['patch_clust'] = assignments df.to_pickle('{}{}.pkl'.format(self.args.resultdir,user)) self.rootLogger.info('Patch clusters for user {} processed successfully ({})'.format(user,fi)) def patch_len_dists(self,fi): df = pd.read_pickle(fi) user = fi.split('/')[-1][:-4] explore = df[np.isnan(df['patch_clust'])] result_explore = explore['n'].value_counts() df['explore'] = np.isnan(df['patch_clust']).astype(int) df['explore-idx'] = df['explore'].cumsum() result_exploit = df.groupby('explore-idx').apply(lambda df: df.dropna()['n'].sum()).value_counts() result_explore = result_explore.reindex(xrange(1,max(result_explore.index)+1),fill_value=0.).values result_exploit = result_exploit.reindex(xrange(1,max(result_exploit.index)+1),fill_value=0.).values result_explore = sparse.csr_matrix(result_explore) result_exploit = sparse.csr_matrix(result_exploit) with open(self.args.resultdir+user,'w') as fout: fout.write(user+'\t'+'explore'+'\t'+':'.join([','.join(a.astype(str)) for a in result_explore.data,result_explore.indices,result_explore.indptr])+'\n') fout.write(user+'\t'+'exploit'+'\t'+':'.join([','.join(a.astype(str)) for a in result_exploit.data,result_exploit.indices,result_exploit.indptr])+'\n') self.rootLogger.info('User {} processed successfully ({})'.format(user,fi)) def explore_exploit(self,fi): user = fi.split('/')[-1][:-4] df_patches_raw = pd.read_pickle(fi) # add time in next bout df_patches_raw['next_n'] = df_patches_raw['n'].shift(-1) # add patch values # listensPerPatch = df_patches_raw.groupby('patch_clust')['n'].sum() # overall_prop = listensPerPatch/float(df_patches_raw['n'].sum()) # overall_prop.name = 'final_value' # df_patches_raw = df_patches_raw.join(overall_prop,on='patch_clust') """ # time in next exploit patch as function of exploration time result = df_patches_raw[np.isnan(df_patches_raw['patch_clust'])].groupby('n')['next_n'].mean() fout.write(user+'\t'+'next-exploit-vs-explore'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n') """ # total time exploiting as a function of time exploring df_patches_raw['explore'] = np.isnan(df_patches_raw['patch_clust']).astype(int) df_patches_raw['explore-idx'] = df_patches_raw['explore'].cumsum() # combine all exploit listens #grp_explore = df_patches_raw.groupby('explore-idx').apply(lambda df: pd.DataFrame({'n':[df['n'].iloc[0]],'n-exploit':[df['n'][1:].sum()]})) # only last exploit bout grp_explore = df_patches_raw.groupby('explore-idx').apply(lambda df: pd.DataFrame({'n':[df['n'].iloc[0]],'n-exploit':[df['n'].iloc[-1]]})) #result = 
grp_explore.groupby('n')['n-exploit'].mean() #fout.write(user+'\t'+'total-exploit-vs-explore'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n') """ # exploration time as a function of exploitation time grp_exploit = grp_explore.copy() grp_exploit['n-explore'] = grp_exploit['n'].shift(-1) result = grp_exploit.groupby('n-exploit')['n-explore'].mean() fout.write(user+'\t'+'explore-vs-exploit'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n') """ # prob exploit given explore time - already done # explore_only = df_patches_raw[np.isnan(df_patches_raw['patch_clust'])] # result = explore_only['n'][:-1].value_counts() # arr = result.reindex(xrange(1,max(result.index)+1),fill_value=0.).values # final_result = arr/(np.cumsum(arr[::-1])[::-1]) # final_result = sparse.csr_matrix(final_result) # with open(self.args.resultdir+user+'_exploit','w') as fout: # fout.write(user+'\t'+':'.join([','.join(a.astype(str)) for a in final_result.data,final_result.indices,final_result.indptr])+'\n') # prob explore given exploit time result = grp_explore['n-exploit'][grp_explore['n-exploit']>0].value_counts() arr = result.reindex(xrange(1,max(result.index)+1),fill_value=0.).values final_result = arr/np.cumsum(arr[::-1])[::-1] final_result = sparse.csr_matrix(final_result) with open(self.args.resultdir+user+'_explore','w') as fout: fout.write(user+'\t'+':'.join([','.join(a.astype(str)) for a in final_result.data,final_result.indices,final_result.indptr])+'\n') #fout.write(user+'\t'+'prob-explore-given-exploit'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n') """ # patch value as a function of exploration time df_patches_raw['final_value_next'] = df_patches_raw['final_value'].shift(-1) result = df_patches_raw[np.isnan(df_patches_raw['patch_clust'])].groupby('n')['final_value_next'].mean() fout.write(user+'\t'+'exploit-value-vs-explore'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n') """ self.rootLogger.info('User {} processed successfully ({})'.format(user,fi))
def genseq(idx):
    first = np.where(np.random.multinomial(1, pvals=pops) == 1)[0][0]
    last = first
    last_ts = datetime.now()
    result = {'artist_idx': [first], 'ts': [last_ts]}
    for i in xrange(seq_length - 1):
        next_listen = draw(last)
        last = next_listen
        gap_bin = 120 * np.where(np.random.multinomial(1, pvals=td) == 1)[0][0]
        gap = np.random.randint(gap_bin, gap_bin + 120)
        result['artist_idx'].append(next_listen)
        new_ts = last_ts + timedelta(0, gap)
        result['ts'].append(new_ts)
        last_ts = new_ts
    df = pd.DataFrame(result)
    df['block'] = ((df['artist_idx'].shift(1) != df['artist_idx']).astype(int).cumsum()) - 1
    df.to_pickle(str(idx) + '.pkl')
    logging.info('idx {} complete'.format(idx))

pool = Pool(cpu_count())
indices = range(n)
pool.map(genseq, indices)
pool.close()
def integrate_model(self, n_realizations, int_length = None, noise_type = 'white', sigma = 1., n_workers = 3, diagnostics = True): """ Integrate trained model. noise_type: -- white - classic white noise, spatial correlation by cov. matrix of last level residuals -- cond - find n_samples closest to the current space in subset of n_pcs and use their cov. matrix -- seasonal - seasonal dependence of the residuals, fit n_harm harmonics of annual cycle, could also be used with cond. except 'white', one can choose more settings like ['seasonal', 'cond'] """ if self.verbose: print("preparing to integrate model...") pcs = self.input_pcs.copy() pcs = pcs.T # time x dim pcmax = np.amax(pcs, axis = 0) pcmin = np.amin(pcs, axis = 0) self.varpc = np.var(pcs, axis = 0, ddof = 1) self.int_length = pcs.shape[0] if int_length is None else int_length self.diagnostics = diagnostics if self.harmonic_pred in ['all', 'first']: if self.verbose: print("...using harmonic predictors (with annual frequency)...") self.xsin = np.sin(2*np.pi*np.arange(self.int_length) / 12.) self.xcos = np.cos(2*np.pi*np.arange(self.int_length) / 12.) if self.verbose: print("...preparing noise forcing...") self.sigma = sigma if isinstance(noise_type, basestring): if noise_type not in ['white', 'cond', 'seasonal']: raise Exception("Unknown noise type to be used as forcing. Use 'white', 'cond', or 'seasonal'.") elif isinstance(noise_type, list): noise_type = frozenset(noise_type) if not noise_type.issubset(set(['white', 'cond', 'seasonal'])): raise Exception("Unknown noise type to be used as forcing. Use 'white', 'cond', or 'seasonal'.") self.last_level_res = self.residuals[max(self.residuals.keys())] self.noise_type = noise_type if noise_type == 'white': if self.verbose: print("...using spatially correlated white noise...") Q = np.cov(self.last_level_res, rowvar = 0) self.rr = np.linalg.cholesky(Q).T if 'seasonal' in noise_type: n_harmonics = 5 if self.verbose: print("...fitting %d harmonics to estimate seasonal modulation of last level's residual..." % n_harmonics) if self.delay_model: resid_delayed = self.last_level_res[-(self.last_level_res.shape[0]//12)*12:].copy() rr_last = np.reshape(resid_delayed, (12, self.last_level_res.shape[0]//12, self.last_level_res.shape[1]), order = 'F') else: rr_last = np.reshape(self.last_level_res, (12, self.last_level_res.shape[0]//12, self.last_level_res.shape[1]), order = 'F') rr_last_std = np.nanstd(rr_last, axis = 1, ddof = 1) predictors = np.zeros((12, 2*n_harmonics + 1)) for nh in range(n_harmonics): predictors[:, 2*nh] = np.cos(2*np.pi*(nh+1)*np.arange(12) / 12) predictors[:, 2*nh+1] = np.sin(2*np.pi*(nh+1)*np.arange(12) / 12) predictors[:, -1] = np.ones((12,)) bamp = np.zeros((predictors.shape[1], pcs.shape[1])) for k in range(bamp.shape[1]): bamp[:, k] = np.linalg.lstsq(predictors, rr_last_std[:, k])[0] rr_last_std_ts = np.dot(predictors, bamp) self.rr_last_std_ts = np.repeat(rr_last_std_ts, repeats = self.last_level_res.shape[0]//12, axis = 0) if self.delay_model: resid_delayed /= self.rr_last_std_ts Q = np.cov(resid_delayed, rowvar = 0) else: self.last_level_res /= self.rr_last_std_ts Q = np.cov(self.last_level_res, rowvar = 0) self.rr = np.linalg.cholesky(Q).T if diagnostics: if self.verbose: print("...running diagnostics for the data...") # ACF, kernel density, integral corr. 
timescale for data self.max_lag = 50 lag_cors = np.zeros((2*self.max_lag + 1, pcs.shape[1])) kernel_densities = np.zeros((100, pcs.shape[1], 2)) for k in range(pcs.shape[1]): lag_cors[:, k] = cross_correlation(pcs[:, k], pcs[:, k], max_lag = self.max_lag) kernel_densities[:, k, 0], kernel_densities[:, k, 1] = kdensity_estimate(pcs[:, k], kernel = 'epanechnikov') integral_corr_timescale = np.sum(np.abs(lag_cors), axis = 0) # init for integrations lag_cors_int = np.zeros([n_realizations] + list(lag_cors.shape)) kernel_densities_int = np.zeros([n_realizations] + list(kernel_densities.shape)) stat_moments_int = np.zeros((4, n_realizations, pcs.shape[1])) # mean, variance, skewness, kurtosis int_corr_scale_int = np.zeros((n_realizations, pcs.shape[1])) self.diagpc = np.diag(np.std(pcs, axis = 0, ddof = 1)) self.maxpc = np.amax(np.abs(pcs)) self.diagres = {} self.maxres = {} for l in self.residuals.keys(): self.diagres[l] = np.diag(np.std(self.residuals[l], axis = 0, ddof = 1)) self.maxres[l] = np.amax(np.abs(self.residuals[l])) self.pcs = pcs if n_workers > 1: # from multiprocessing import Pool from pathos.multiprocessing import ProcessingPool pool = ProcessingPool(n_workers) map_func = pool.amap if self.verbose: print("...running integration of %d realizations using %d workers..." % (n_realizations, n_workers)) else: map_func = map if self.verbose: print("...running integration of %d realizations single threaded..." % n_realizations) rnds = [] for n in range(n_realizations): r = {} for l in self.fit_mat.keys(): if l == 0: if self.delay_model: r[l] = np.dot(self.diagpc, np.random.normal(0, sigma, (pcs.shape[1], self.delay))) else: r[l] = np.dot(np.random.normal(0, sigma, (pcs.shape[1],)), self.diagpc) else: if self.delay_model: r[l] = np.dot(self.diagres[l-1], np.random.normal(0, sigma, (pcs.shape[1], self.delay))) else: r[l] = np.dot(np.random.normal(0, sigma, (pcs.shape[1],)), self.diagres[l-1]) rnds.append(r) args = [[i, rnd, noise_type] for i, rnd in zip(range(n_realizations), rnds)] results = map_func(self._process_integration, args) del args if n_workers > 1: pool.close() self.integration_results = np.zeros((n_realizations, pcs.shape[1], self.int_length)) self.num_exploding = np.zeros((n_realizations,)) if n_workers > 1: results = results.get() if self.diagnostics: # x, num_exploding, xm, xv, xs, xk, lc, kden, ict for i, x, num_expl, xm, xv, xs, xk, lc, kden, ict in results: self.integration_results[i, ...] = x.T self.num_exploding[i] = num_expl stat_moments_int[0, i, :] = xm stat_moments_int[1, i, :] = xv stat_moments_int[2, i, :] = xs stat_moments_int[3, i, :] = xk lag_cors_int[i, ...] = lc kernel_densities_int[i, ...] = kden int_corr_scale_int[i, ...] = ict else: for i, x, num_expl in results: self.integration_results[i, ...] = x.T self.num_exploding[i] = num_expl if self.verbose: print("...integration done, now saving results...") if self.verbose: print("...results saved to structure.") print("there was %d expolding integration chunks in %d realizations." % (np.sum(self.num_exploding), n_realizations)) if self.diagnostics: if self.verbose: print("plotting diagnostics...") import matplotlib.pyplot as plt # plot all diagnostic stuff ## mean, variance, skewness, kurtosis, integral corr. 
time scale t**s = ['MEAN', 'VARIANCE', 'SKEWNESS', 'KURTOSIS', 'INTEGRAL CORRELATION TIME SCALE'] plot = [np.mean(pcs, axis = 0), np.var(pcs, axis = 0, ddof = 1), sts.skew(pcs, axis = 0), sts.kurtosis(pcs, axis = 0), integral_corr_timescale] xplot = np.arange(1, pcs.shape[1]+1) for i, tit, p in zip(range(5), t**s, plot): plt.figure() plt.title(tit, size = 20) plt.plot(xplot, p, linewidth = 3, color = '#3E3436') if i < 4: plt.plot(xplot, np.percentile(stat_moments_int[i, :, :], q = 2.5, axis = 0), '--', linewidth = 2.5, color = '#EA3E36') plt.plot(xplot, np.percentile(stat_moments_int[i, :, :], q = 97.5, axis = 0), '--', linewidth = 2.5, color = '#EA3E36') else: plt.plot(xplot, np.percentile(int_corr_scale_int, q = 2.5, axis = 0), '--', linewidth = 2.5, color = '#EA3E36') plt.plot(xplot, np.percentile(int_corr_scale_int, q = 97.5, axis = 0), '--', linewidth = 2.5, color = '#EA3E36') plt.xlabel("# PC", size = 15) plt.xlim([xplot[0], xplot[-1]]) plt.show() plt.close() ## lagged correlations, PDF - plot first 9 PCs (or less if input number of pcs is < 9) t**s = ['AUTOCORRELATION', 'PDF'] plot = [[lag_cors, lag_cors_int], [kernel_densities, kernel_densities_int]] xlabs = ['LAG', ''] for i, tit, p, xlab in zip(range(2), t**s, plot, xlabs): plt.figure() plt.suptitle(tit, size = 25) no_plts = 9 if self.no_input_ts > 9 else self.no_input_ts for sub in range(0,no_plts): plt.subplot(3, 3, sub+1) if i == 0: xplt = np.arange(0, self.max_lag+1) plt.plot(xplt, p[0][p[0].shape[0]//2:, sub], linewidth = 3, color = '#3E3436') plt.plot(xplt, np.percentile(p[1][:, p[0].shape[0]//2:, sub], q = 2.5, axis = 0), '--', linewidth = 2.5, color = '#EA3E36') plt.plot(xplt, np.percentile(p[1][:, p[0].shape[0]//2:, sub], q = 97.5, axis = 0), '--', linewidth = 2.5, color = '#EA3E36') plt.xlim([xplt[0], xplt[-1]]) else: plt.plot(p[0][:, sub, 0], p[0][:, sub, 1], linewidth = 3, color = '#3E3436') plt.plot(p[1][0, :, sub, 0], np.percentile(p[1][:, :, sub, 1], q = 2.5, axis = 0), '--', linewidth = 2.5, color = '#EA3E36') plt.plot(p[1][0, :, sub, 0], np.percentile(p[1][:, :, sub, 1], q = 97.5, axis = 0), '--', linewidth = 2.5, color = '#EA3E36') plt.xlim([p[0][0, sub, 0], p[0][-1, sub, 0]]) plt.xlabel(xlab, size = 15) plt.title("PC %d" % (int(sub)+1), size = 20) # plt.tight_layout() plt.show() plt.close()
class setup(object): # init just takes in command line arguments and sets up logging def __init__(self, args, logging_level=logging.INFO): self.args = args # logger setup now = datetime.datetime.now() log_filename = now.strftime("setup_%Y%m%d_%H%M%S.log") logFormatter = logging.Formatter("%(asctime)s\t[%(levelname)s]\t%(message)s") self.rootLogger = logging.getLogger() # fileHandler = logging.FileHandler(log_filename) # fileHandler.setFormatter(logFormatter) # self.rootLogger.addHandler(fileHandler) consoleHandler = logging.StreamHandler() consoleHandler.setFormatter(logFormatter) self.rootLogger.addHandler(consoleHandler) self.rootLogger.setLevel(logging_level) # self.rootLogger.info("Input arguments: "+str(args)) if self.args.feature_path: features = np.load(self.args.feature_path) self.n_features = features.shape[1] self.features = {i: features[i] for i in xrange(len(features))} @staticmethod def userFromFile(fi): return fi.split("/")[-1].split("_")[-1][:-4] # set up processing pool and run all analyses specified in args def run(self): if self.args.preprocess: # self.rootLogger.info("Starting preprocessing") self.preprocess() # self.rootLogger.info("Preprocessing complete") if self.args.patch_basis is not None: # self.rootLogger.info("Starting patch summaries") self.summarize_patches() # self.rootLogger.info("Patch summaries complete") if self.args.blockdists: # self.rootLogger.info("Starting block distance analysis") self.mean_block_distances(self.args.file) if self.args.blockgaps: # self.rootLogger.info("Starting block distance analysis") self.blockgaps(self.args.file) if self.args.scrobblegaps: # self.rootLogger.info("Starting block distance analysis") self.scrobble_gaps(self.args.file) if self.args.ee_artists: self.ee_artists(self.args.file) if self.args.ee_artists_2: self.ee_artists_2(self.args.file) if self.args.ee_artists_dists: self.ee_artists_dists(self.args.file) if self.args.block_len_dists: self.block_len_dists(self.args.file) # Calls preprocessing code to load raw text files and convert to dataframes, adding features, disances, etc. 
def preprocess(self): self.artist_idx_feature_map = {} for line in open(self.args.suppdir + "artist_idx_feature_map"): k, v = line.strip().split("\t") self.artist_idx_feature_map[float(k)] = int(v) if self.args.file: result = self.processor( fi=self.args.file, output_dir=self.args.pickledir, is_sorted=True, features=self.features, dist=self.args.distance_metric, session_threshold=self.args.session_thresh, dist_threshold=self.args.dist_thresh, min_patch_length=self.args.min_patch_length, artist_idx_feature_map=self.artist_idx_feature_map, ) # if self.args.patch_len_dist: # user,vals_simple,vals_shuffle = result # with open(self.args.resultdir+user,'a') as fout: # if vals_simple is not None: # fout.write('\t'.join([user,'simple',str(self.args.dist_thresh)])+'\t'+','.join(vals_simple.astype(str))+'\n') # fout.write('\t'.join([user,'shuffle',str(self.args.dist_thresh),str(self.args.min_patch_length)])+'\t'+','.join(vals_shuffle.astype(str))+'\n') else: if args.rawtext: if self.args.skip_complete: done = set( [ self.userFromFile(fi) for fi in glob(self.args.pickledir + "*.pkl") if "_patches_" not in fi and fi.startswith(self.args.prefix_output) ] ) else: done = set() files = [fi for fi in glob(self.args.datadir + "*.txt") if self.userFromFile(fi) not in done] else: if self.args.skip_complete: done = set( [ self.userFromFile(fi) for fi in glob(self.args.pickledir + "*.pkl") if "_patches_" not in fi and fi.startswith(self.args.prefix_output) ] ) else: done = set() files = [ fi for fi in glob(self.args.pickledir + "*.pkl") if "_patches_" not in fi and fi.startswith(self.args.prefix_input) and self.userFromFile(fi) not in done ] self.n_files = len(files) self.rootLogger.debug(files) func_partial = partial( self.processor, output_dir=self.args.pickledir, is_sorted=True, features=self.features, dist=self.args.distance_metric, session_threshold=self.args.session_thresh, dist_threshold=self.args.dist_thresh, min_patch_length=self.args.min_patch_length, artist_idx_feature_map=self.artist_idx_feature_map, ) self.pool = Pool(self.args.n) self.rootLogger.info("Pool started") self.pool.map(func_partial, files) self.pool.close() self.rootLogger.info("Pool closed") # Jensen Shannon Distance (Sqrt of Jensen Shannon Divergence) @staticmethod def JSD(P, Q): if np.all(np.isnan(P)) or np.all(np.isnan(Q)): return np.nan _P = P / norm(P, ord=1) _Q = Q / norm(Q, ord=1) _M = 0.5 * (_P + _Q) return np.sqrt(np.clip(0.5 * (entropy(_P, _M) + entropy(_Q, _M)), 0, 1)) # Calculate distance between any two feature arrays def calc_dist(self, idx_1, idx_2, metric="cosine"): features1 = self.get_features(idx_1) features2 = self.get_features(idx_2) if np.any(np.isnan(features1)) or np.any(np.isnan(features2)): return np.nan if np.all(features1 == features2): return 0.0 if metric == "JSD": return self.JSD(features1, features2) elif metric == "cosine": return cosine(features1, features2) elif metric == "euclidean": return euclidean(features1, features2) # "s -> (s0,s1), (s1,s2), (s2, s3), ..." 
@staticmethod def pairwise(iterable): a, b = tee(iterable) next(b, None) return izip(a, b) # segment patch, generating both simple and shuffle-based indices def patch_segmenter(self, df, metric, min_length, dist_thresh): l = df["artist_idx"] indices = list(np.array([len(list(v)) for g, v in groupby(l)][:-1]).cumsum()) new_indices = [] for b in indices: dist = self.calc_dist(df.iloc[b]["artist_idx"], df.iloc[b - 1]["artist_idx"], metric=metric) if (np.isnan(dist)) or (dist >= dist_thresh): new_indices.append(b) if new_indices: last_patch = False final_indices = [] for i, (a, b) in enumerate(self.pairwise([0] + new_indices + [len(df)])): if b - a >= min_length: if a > 0: final_indices.append(a) last_patch = True else: if last_patch: final_indices.append(a) last_patch = False return final_indices, new_indices return new_indices, new_indices # retrieve features from feature matrix, given an artist idx. Return array of np.nans if artist idx is null def get_features(self, idx): return self.features.get(idx, np.repeat(np.nan, self.n_features)) # if np.isnan(idx): # return np.repeat(np.nan,self.features.shape[1]) # else: # return self.features[int(idx)] # Core preprocessing code. Can take in raw text files, or pickle files (in which case feature/dist values are updated appropriately) def processor( self, fi, output_dir, is_sorted=True, features=None, dist="cosine", session_threshold=None, dist_threshold=0.2, min_patch_length=5, artist_idx_feature_map=None, ): # get user_id from filename user = self.userFromFile(fi) self.rootLogger.debug("processor called (user {})".format(user)) if fi.endswith(".txt"): if output_dir is None: raise ("output path must be specified!") if artist_idx_feature_map is None: raise ("artist_idx_feature_map_path must be provided!") df = pd.read_table(fi, header=None, names=["artist_id", "ts"], parse_dates=["ts"]) if not is_sorted: df = df.sort_values(by="ts") df["td"] = df["ts"] - df.shift(1)["ts"] df["td"] = df["td"].astype(int) / 10 ** 9 df["artist_idx"] = df["artist_id"].apply(lambda x: artist_idx_feature_map.get(x)) n = float(len(df)) n_null = df["artist_idx"].isnull().sum() notnull = n - n_null propnull = n_null / n if notnull < 1000 or (propnull >= 0.05): self.rootLogger.info( "User {} SKIPPED ({} non null, {:.1f}% null) ({})".format(user, notnull, 100 * propnull, fi) ) return None self.rootLogger.debug("DF loaded (user {})".format(user)) elif fi.endswith(".pkl"): df = pd.read_pickle(fi) # get features and calculate distances if features is not None: # df['features'] = df['artist_idx'].apply(lambda idx: self.get_features(idx)) # df['features_shift'] = df['features'].shift(1) df["prev"] = df["artist_idx"].shift(1) df["dist"] = df.apply(lambda row: self.calc_dist(row["artist_idx"], row["prev"], metric=dist), axis=1) self.rootLogger.debug("features and dists done (user {})".format(user)) if session_threshold == 0: df["session"] = 0 elif (session_threshold is not None) and (session_threshold > 0): if "td" not in df.columns: df["td"] = df["ts"] - df.shift(1)["ts"] df["td"] = df["td"].astype(int) / 10 ** 9 session_idx = 0 session_indices = [] for val in df["td"] >= session_threshold: if val: session_idx += 1 session_indices.append(session_idx) df["session"] = session_indices self.rootLogger.debug("session indices done (user {})".format(user)) if (min_patch_length is not None) and (dist_threshold is not None): self.rootLogger.debug("starting patch segmentation for user {})".format(user)) indices_shuffle = np.zeros(len(df), dtype=int) indices_simple = np.zeros(len(df), 
dtype=int) offset_shuffle = 0 idx_shuffle = 0 offset_simple = 0 idx_simple = 0 ### NEED TO REWORK THIS BIT TO LOSE SOME REDUNDANCY for session in df.groupby("session"): result_shuffle, result_simple = self.patch_segmenter( session[1], metric=dist, min_length=min_patch_length, dist_thresh=dist_threshold ) # if session[0]==0: # print result_shuffle,result_simple # sys.exit() n = len(session[1]) if len(result_shuffle) == 0: indices_shuffle[offset_shuffle : offset_shuffle + n] = idx_shuffle idx_shuffle += 1 else: indices_shuffle[offset_shuffle : offset_shuffle + result_shuffle[0]] = idx_shuffle idx_shuffle += 1 for v, w in self.pairwise(result_shuffle): indices_shuffle[offset_shuffle + v : offset_shuffle + w] = idx_shuffle idx_shuffle += 1 indices_shuffle[ offset_shuffle + result_shuffle[-1] : offset_shuffle + result_shuffle[-1] + n ] = idx_shuffle idx_shuffle += 1 offset_shuffle += n if len(result_simple) == 0: indices_simple[offset_simple : offset_simple + n] = idx_simple idx_simple += 1 else: indices_simple[offset_simple : offset_simple + result_simple[0]] = idx_simple idx_simple += 1 for v, w in self.pairwise(result_simple): indices_simple[offset_simple + v : offset_simple + w] = idx_simple idx_simple += 1 indices_simple[ offset_simple + result_simple[-1] : offset_simple + result_simple[-1] + n ] = idx_simple idx_simple += 1 offset_simple += n if result_shuffle: indices_shuffle[offset_shuffle + result_shuffle[-1] :] = idx_shuffle else: indices_shuffle[offset_shuffle:] = idx_shuffle if result_simple: indices_simple[offset_simple + result_simple[-1] :] = idx_simple else: indices_simple[offset_simple:] = idx_simple df["patch_idx_shuffle"] = indices_shuffle df["patch_idx_simple"] = indices_simple self.rootLogger.debug("patch indices done (user {})".format(user)) # add artist block info ### https://stackoverflow.com/questions/14358567/finding-consecutive-segments-in-a-pandas-data-frame ### -1 for zero-based indexing df["block"] = ((df["artist_idx"].shift(1) != df["artist_idx"]).astype(int).cumsum()) - 1 self.rootLogger.debug("artist blocks done (user {})".format(user)) cols = ["ts", "artist_idx", "dist", "session", "patch_idx_shuffle", "patch_idx_simple", "block"] df = df[list(set(df.columns).intersection(cols))] if self.args.save: df.to_pickle("{}{}.pkl".format(output_dir, user)) if self.args.patch_len_dist: self.patch_length_distributions(user, df, bins=np.arange(0, 1001, 1), method=self.args.patch_len_dist) # return user,vals_simple,vals_shuffle self.rootLogger.info("User {} processed successfully ({})".format(user, fi)) return None # calculate patch summary measures (mean feature array, diversity, etc.). 
Applied to each patch def patch_measures(self, df, agg_stats=True, metric="cosine"): first = df.iloc[0] n = len(df) start = first["ts"] if agg_stats: # artists = df['artist_idx'].values if (n == 1) or (len(df["artist_idx"].unique()) == 1): diversity = 0.0 centroid = first["features"] else: features = np.array([f for f in df["features"]]) # I expect to see RuntimeWarnings in this block with warnings.catch_warnings(): warnings.simplefilter("ignore", category=RuntimeWarning) centroid = np.nanmean(features, axis=0) diversity = np.nanmean(pdist(features, metric=metric)) # return pd.Series({'diversity':diversity,'centroid':centroid,'start_ts':start,'n':n,'artists':artists}) return pd.Series({"diversity": diversity, "centroid": centroid, "start_ts": start, "n": n}) # generate patch summary for each user, and save resulting pickle def patch_summary(self, fi, basis, metric): user = self.userFromFile(fi) df = pd.read_pickle(fi) df["features"] = df["artist_idx"].apply(lambda idx: self.get_features(idx)) if basis == "block": agg_stats = False elif basis in ("patch_idx_shuffle", "patch_idx_simple"): agg_stats = True else: raise ("Invalid patch basis") result = df.groupby(basis).apply(self.patch_measures, agg_stats, metric) # result['start_idx'] = result['n'].cumsum().shift(1).fillna(0).astype(int) result.reset_index(drop=True).to_pickle("{}{}_patches_{}.pkl".format(self.args.resultdir, user, basis)) self.rootLogger.info("Patches processed for user {} successfully ({})".format(user, fi)) # run patch summaries for all users def summarize_patches(self): if self.args.file: self.patch_summary(fi=self.args.file, basis=self.args.patch_basis, metric=self.args.distance_metric) else: if self.args.skip_complete: done = set( [ self.userFromFile(fi) for fi in glob(self.args.pickledir + "*.pkl") if "_patches_" in fi and fi.startswith(self.args.prefix_output) ] ) else: done = set() files = [ fi for fi in glob(self.args.pickledir + "*.pkl") if "_patches_" not in fi and fi.startswith(self.args.prefix_input) and self.userFromFile(fi) not in done ] func_partial = partial(self.patch_summary, basis=self.args.patch_basis, metric=self.args.distance_metric) self.rootLogger.info("Pool started") self.pool.map(func_partial, files) self.pool.close() self.rootLogger.info("Pool closed") def patch_length_distributions(self, user, df, bins, method): n_listens = float(len(df)) if self.args.min_patch_length == 2: vc_simple = df["patch_idx_simple"].value_counts().values counts_simple = np.clip(vc_simple, 0, 1000) vals_simple = np.histogram(counts_simple, bins=bins)[0] listens_simple = np.array([i * c for i, c in enumerate(vals_simple)]) listens_simple[-1] = vc_simple[vc_simple >= 1000].sum() listens_simple = listens_simple / n_listens vc_block = df["block"].value_counts().values counts_block = np.clip(vc_block, 0, 1000) vals_block = np.histogram(counts_block, bins=bins)[0] listens_block = np.array([i * c for i, c in enumerate(vals_block)]) listens_block[-1] = vc_block[vc_block >= 1000].sum() listens_block = listens_block / n_listens with open(self.args.resultdir + user, "a") as fout: fout.write( "\t".join([user, "block", "patches", str(self.args.dist_thresh), str(self.args.min_patch_length)]) + "\t" + ",".join(vals_block.astype(str)) + "\n" ) fout.write( "\t".join([user, "block", "listens", str(self.args.dist_thresh), str(self.args.min_patch_length)]) + "\t" + ",".join(listens_block.astype(str)) + "\n" ) fout.write( "\t".join([user, "simple", "patches", str(self.args.dist_thresh), str(self.args.min_patch_length)]) + "\t" + 
",".join(vals_simple.astype(str)) + "\n" ) fout.write( "\t".join([user, "simple", "listens", str(self.args.dist_thresh), str(self.args.min_patch_length)]) + "\t" + ",".join(listens_simple.astype(str)) + "\n" ) vc_shuffle = df["patch_idx_shuffle"].value_counts().values counts_shuffle = np.clip(vc_shuffle, 0, 1000) vals_shuffle = np.histogram(counts_shuffle, bins=bins)[0] listens_shuffle = np.array([i * c for i, c in enumerate(vals_shuffle)]) listens_shuffle[-1] = vc_shuffle[vc_shuffle >= 1000].sum() listens_shuffle = listens_shuffle / n_listens with open(self.args.resultdir + user, "a") as fout: fout.write( "\t".join([user, "shuffle", "patches", str(self.args.dist_thresh), str(self.args.min_patch_length)]) + "\t" + ",".join(vals_shuffle.astype(str)) + "\n" ) fout.write( "\t".join([user, "shuffle", "listens", str(self.args.dist_thresh), str(self.args.min_patch_length)]) + "\t" + ",".join(listens_shuffle.astype(str)) + "\n" ) def mean_block_distances(self, fi, n=100, shuffle=False): def hash_handler(a, frst): if frst > a: frst, a = a, frst if frst not in dhash: dhash[frst] = {} result = self.calc_dist(frst, a) dhash[frst][a] = result else: result = dhash[frst].get(a) if result is None: result = self.calc_dist(frst, a) dhash[frst][a] = result return result def cos_nan(arr1, arr2): if np.any(np.isnan(arr1)) or np.any(np.isnan(arr2)): return np.nan else: return cosine(arr1, arr2) user = fi.split("/")[-1][:-4] df = pd.read_pickle(fi) if os.path.exists(self.args.resultdir + user): levels = {"scrobble": False, "block": False, "D": True, "W": False, "M": False} else: levels = {"scrobble": True, "block": True, "D": True, "W": True, "M": True} if levels["scrobble"]: result = [] dhash = {} if shuffle: blocks = df.copy() idx = np.array(blocks.index) np.random.shuffle(idx) blocks = blocks.reindex(idx) blocks = df.copy() for i in xrange(len(blocks) - n): first = blocks["artist_idx"].iloc[i] result.append(np.array(df["artist_idx"][i + 1 : i + n + 1].apply(lambda val: hash_handler(val, first)))) result = np.nanmean(np.vstack(result), 0) with open(self.args.resultdir + user, "a") as fout: fout.write("\t".join([user, "scrobble", ",".join(result.astype(str))]) + "\n") if levels["block"]: result = [] blocks = df[["artist_idx", "block"]].groupby("block").first() if shuffle: idx = np.array(blocks.index) np.random.shuffle(idx) blocks = blocks.reindex(idx) for i in xrange(len(blocks) - n): first = blocks["artist_idx"].iloc[i] result.append( np.array(blocks["artist_idx"][i + 1 : i + 101].apply(lambda val: hash_handler(val, first))) ) result = np.nanmean(np.vstack(result), 0) with open(self.args.resultdir + user, "a") as fout: fout.write("\t".join([user, "block", ",".join(result.astype(str))]) + "\n") df["features"] = df["artist_idx"].apply(lambda idx: self.get_features(idx)) df = df.set_index("ts")["features"] for res, n in (("D", 365), ("W", 52), ("M", 12)): if levels[res]: result = [] blocks = df.resample(res).aggregate( lambda ser: np.nanmean(np.vstack(ser.values), axis=0) if len(ser) > 0 else np.repeat(np.nan, self.n_features) ) if shuffle: idx = np.array(blocks.index) np.random.shuffle(idx) blocks = blocks.reindex(idx) for i in xrange(len(blocks) - n): first = blocks.iloc[i] result.append(np.array(blocks[i + 1 : i + n + 1].apply(lambda val: cos_nan(val, first)))) result = np.nanmean(np.vstack(result), 0) with open(self.args.resultdir + user, "a") as fout: fout.write("\t".join([user, res, ",".join(result.astype(str))]) + "\n") self.rootLogger.info("Block distances for user {} processed successfully 
({})".format(user, fi)) def blockgaps(self, fi): user = self.userFromFile(fi) result = [] df = pd.read_pickle(fi)[["ts", "artist_idx", "block"]].groupby("block").first() bins = np.arange(0, 31, 1) day = np.timedelta64(1, "D") for artist in df["artist_idx"].dropna().unique(): current = df[df["artist_idx"] == artist]["ts"] td = ((current - current.shift(1)).dropna()) / day vals = np.histogram(td, bins=bins)[0] result.append(vals / float(vals.sum())) result = np.nanmean(np.vstack(result), 0) with open(self.args.resultdir + user, "w") as fout: fout.write("\t".join([user, ",".join(result.astype(str))]) + "\n") self.rootLogger.info("Gap times for user {} processed successfully ({})".format(user, fi)) def scrobble_gaps(self, fi): user = self.userFromFile(fi) result = [] df = pd.read_pickle(fi)["ts"] bins = np.arange(0, 60 * 60 * 24 * 30, 120) td = (df - df.shift(1)).dropna().apply(lambda x: x.total_seconds()) vals = np.histogram(td, bins=bins)[0] result = vals / float(vals.sum()) with open(self.args.resultdir + user, "w") as fout: fout.write("\t".join([user, ",".join(result.astype(str))]) + "\n") self.rootLogger.info("Gap times for user {} processed successfully ({})".format(user, fi)) def ee_artists(self, fi): user = self.userFromFile(fi) blocks = pd.read_pickle(fi)["block"] result = blocks.value_counts().value_counts() arr = result.reindex(xrange(1, max(result.index) + 1), fill_value=0.0).values final_result = arr / (np.cumsum(arr[::-1])[::-1]) final_result = sparse.csr_matrix(final_result) with open(self.args.resultdir + user, "w") as fout: fout.write( user + "\t" + ":".join( [",".join(a.astype(str)) for a in final_result.data, final_result.indices, final_result.indptr] ) + "\n" ) self.rootLogger.info("User {} processed successfully ({})".format(user, fi)) def ee_artists_2(self, fi): user = self.userFromFile(fi) blocks = pd.read_pickle(fi)["block"] cnts = pd.DataFrame({"n": blocks.value_counts().sort_index()}) cnts["last-n"] = cnts["n"].shift(1) cnts["switch"] = cnts.apply( lambda row: 1 if ((row["last-n"] == 1) and (row["n"] > 1)) or ((row["last-n"] > 1) and (row["n"] == 1)) else 0, axis=1, ) cnts["exp-idx"] = cnts["switch"].cumsum() result = cnts.groupby("exp-idx").apply( lambda grp: pd.Series({"n": len(grp), "exploit": 0}) if grp["n"].iloc[0] == 1 else pd.Series({"n": grp["n"].sum(), "exploit": 1}) )[:-1] # result = cnts.groupby('exp-idx').apply(lambda grp: pd.Series({'n':len(grp),'exploit':0}) if grp['n'].iloc[0]==1 else pd.Series({'n':grp['n']iloc[-1],'exploit':1}))[:-1] arr_exploit = result[result["exploit"] == 1]["n"].value_counts() arr_exploit = arr_exploit.reindex(xrange(1, max(arr_exploit.index) + 1), fill_value=0.0).values arr_explore = result[result["exploit"] == 0]["n"].value_counts() arr_explore = arr_explore.reindex(xrange(1, max(arr_explore.index) + 1), fill_value=0.0).values final_result_exploit = arr_exploit / (np.cumsum(arr_exploit[::-1])[::-1]) final_result_exploit = sparse.csr_matrix(final_result_exploit) final_result_explore = arr_explore / (np.cumsum(arr_explore[::-1])[::-1]) final_result_explore = sparse.csr_matrix(final_result_explore) with open(self.args.resultdir + user, "w") as fout: fout.write( user + "\t" + "explore" + "\t" + ":".join( [ ",".join(a.astype(str)) for a in final_result_explore.data, final_result_explore.indices, final_result_explore.indptr ] ) + "\n" ) fout.write( user + "\t" + "exploit" + "\t" + ":".join( [ ",".join(a.astype(str)) for a in final_result_exploit.data, final_result_exploit.indices, final_result_exploit.indptr ] ) + "\n" ) 
self.rootLogger.info("User {} processed successfully ({})".format(user, fi)) def ee_artists_dists(self, fi): user = self.userFromFile(fi) blocks = pd.read_pickle(fi)["block"] cnts = pd.DataFrame({"n": blocks.value_counts().sort_index()}) cnts["last-n"] = cnts["n"].shift(1) cnts["switch"] = cnts.apply( lambda row: 1 if ((row["last-n"] == 1) and (row["n"] > 1)) or ((row["last-n"] > 1) and (row["n"] == 1)) else 0, axis=1, ) cnts["exp-idx"] = cnts["switch"].cumsum() result = cnts.groupby("exp-idx").apply( lambda grp: pd.Series({"n": len(grp), "exploit": 0}) if grp["n"].iloc[0] == 1 else pd.Series({"n": grp["n"].sum(), "exploit": 1}) )[:-1] # result = cnts.groupby('exp-idx').apply(lambda grp: pd.Series({'n':len(grp),'exploit':0}) if grp['n'].iloc[0]==1 else pd.Series({'n':grp['n']iloc[-1],'exploit':1}))[:-1] arr_exploit = result[result["exploit"] == 1]["n"].value_counts() arr_exploit = sparse.csr_matrix( arr_exploit.reindex(xrange(1, max(arr_exploit.index) + 1), fill_value=0.0).values ) arr_explore = result[result["exploit"] == 0]["n"].value_counts() arr_explore = sparse.csr_matrix( arr_explore.reindex(xrange(1, max(arr_explore.index) + 1), fill_value=0.0).values ) with open(self.args.resultdir + user, "w") as fout: fout.write( user + "\t" + "explore" + "\t" + ":".join([",".join(a.astype(str)) for a in arr_explore.data, arr_explore.indices, arr_explore.indptr]) + "\n" ) fout.write( user + "\t" + "exploit" + "\t" + ":".join([",".join(a.astype(str)) for a in arr_exploit.data, arr_exploit.indices, arr_exploit.indptr]) + "\n" ) self.rootLogger.info("User {} processed successfully ({})".format(user, fi)) def block_len_dists(self, fi): user = self.userFromFile(fi) blocks = pd.read_pickle(fi)["block"] result = blocks.value_counts().value_counts() arr = result.reindex(xrange(1, max(result.index) + 1), fill_value=0.0).values final_result = sparse.csr_matrix(arr) with open(self.args.resultdir + user, "w") as fout: fout.write( user + "\t" + ":".join( [",".join(a.astype(str)) for a in final_result.data, final_result.indices, final_result.indptr] ) + "\n" ) self.rootLogger.info("User {} processed successfully ({})".format(user, fi))