from itertools import product

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.model_selection import ParameterGrid
from sklearn.utils import check_array
from tqdm import tqdm

# check_random_states, fit_bet and extract_yields_stats are module-local
# helpers assumed to be defined elsewhere in the package.


def apply_backtesting(bettor, param_grid, risk_factors, X, scores, odds, cv,
                      random_state, n_runs, n_jobs):
    """Apply backtesting to evaluate bettor."""

    # Check random states
    random_states = check_random_states(random_state, n_runs)

    # Check arrays
    X = check_array(X, dtype=None, force_all_finite=False)
    normalized_scores = []
    for score in scores:
        normalized_scores.append(check_array(score, dtype=None, ensure_2d=False))
    odds = check_array(odds, dtype=None)

    # Extract parameters
    parameters = ParameterGrid(param_grid)

    # Run backtesting
    data = Parallel(n_jobs=n_jobs)(
        delayed(fit_bet)(bettor, params, risk_factors, random_state, X,
                         normalized_scores, odds, train_indices, test_indices)
        for params, random_state, (train_indices, test_indices)
        in tqdm(list(product(parameters, random_states, cv.split(X))), desc='Tasks'))

    # Combine data
    data = pd.concat(data, ignore_index=True)
    data = (data.groupby(['parameters', 'risk_factor', 'experiment'])
                .apply(lambda df: np.concatenate(df.yields.values))
                .reset_index())
    data[['coverage', 'mean_yield', 'std_yield']] = pd.DataFrame(
        data[0].apply(lambda yields: extract_yields_stats(yields)).values.tolist())

    # Calculate results
    results = (data.drop(columns=['experiment', 0])
                   .groupby(['parameters', 'risk_factor'])
                   .mean()
                   .reset_index())
    results['std_mean_yield'] = data.groupby(['parameters', 'risk_factor'])['mean_yield'].std().values
    results = results.sort_values('mean_yield', ascending=False).reset_index(drop=True)
    return results
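# Hypothetical usage sketch (not from the original snippet): my_bettor,
# home_scores, away_scores, X and odds below are placeholder objects assumed
# to match the apply_backtesting signature above; TimeSeriesSplit is the real
# scikit-learn cross-validator.
from sklearn.model_selection import TimeSeriesSplit

results = apply_backtesting(
    bettor=my_bettor,                       # assumed bettor object
    param_grid={'alpha': [0.1, 0.5, 1.0]},  # example hyper-parameter grid
    risk_factors=[0.0, 0.1, 0.2],           # example risk factors
    X=X,                                    # feature matrix
    scores=[home_scores, away_scores],      # example per-outcome scores
    odds=odds,                              # bookmaker odds matrix
    cv=TimeSeriesSplit(n_splits=5),         # time-ordered cross-validation
    random_state=0,
    n_runs=10,
    n_jobs=-1,
)
print(results.head())  # configurations sorted by mean_yield, best first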
def find_TADs(self, data, gammalist=range(10, 110, 10), segmentation='potts',
              minlen=3, drop_gamma=False, n_jobs='auto'):
    '''
    Finds TADs in data with a list of gammas. Returns a pandas DataFrame with
    columns 'Start', 'End' and 'Gamma'. Use genome_intervals_to_chr on the
    returned object to get coordinates in bed-style format instead of
    coordinates of the concatenated genome.
    If *drop_gamma*, drops the 'Gamma' column (useful when using 1 gamma).
    '''
    raise DeprecationWarning('Will be deprecated or rewritten to use '
                             'lavaburst: github.com/nezar-compbio/lavaburst')
    if n_jobs == 'auto':
        # Empirical values on my computer; with >8 Gb memory try increasing n_jobs
        if segmentation == 'potts':
            n_jobs = 3
        elif segmentation == 'armatus':
            n_jobs = 6
    if not np.isfinite(data).all():
        print('Non-finite values in data, substituting them with zeroes')
        data[~np.isfinite(data)] = 0
    Wcomm, Wnull, pass_mask, length = _precalculate_TADs_in_array(data)
    f = _calculate_TADs
    if n_jobs >= 1:
        from joblib import Parallel, delayed
        domains = Parallel(n_jobs=n_jobs, max_nbytes=1e6)(
            delayed(f)(Wcomm, Wnull, pass_mask, length, g, segmentation)
            for g in gammalist)
    elif not n_jobs:
        domains = []
        for g in gammalist:
            domains_g = f(Wcomm, Wnull, pass_mask, length, g, segmentation)
            domains.append(domains_g)
    domains = pd.concat(domains, ignore_index=True)
    domains = domains.query('End - Start >= ' + str(minlen)).copy()
    domains = domains.sort_values(by=['Gamma', 'Start', 'End'])
    domains.reset_index(drop=True, inplace=True)
    domains[['Start', 'End']] = domains[['Start', 'End']].astype(int)
    domains[['Start', 'End']] *= self.resolution
    domains = domains[['Start', 'End', 'Score', 'Gamma']]
    if drop_gamma:
        domains.drop('Gamma', axis=1, inplace=True)
    domains = self.genome_intervals_to_chr(domains).reset_index(drop=True)
    return domains
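# Hypothetical usage sketch: hic_data is an assumed object exposing
# find_TADs, resolution and genome_intervals_to_chr; heatmap is an assumed
# contact-matrix array. Note that the method above raises DeprecationWarning
# unconditionally, so that raise would have to be removed or caught before
# this call could succeed.
domains = hic_data.find_TADs(heatmap,
                             gammalist=range(10, 110, 10),
                             segmentation='armatus',
                             minlen=3,
                             n_jobs=6)
# one row per detected domain, with bed-style Start/End coordinates and one
# block of rows per gamma value
print(domains.head())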
    # (tail of the time_calc helper defined above, assumed to return the
    # elapsed time between a spray and a trap observation)
    return time_since_spray


# build spray-distance and spray-time binaries in parallel, one row per trap
num_cores = multiprocessing.cpu_count()
inputs = testtrapsweather.index
distance_binary = Parallel(n_jobs=num_cores)(delayed(distance_calc)(i) for i in inputs)
time_binary = Parallel(n_jobs=num_cores)(delayed(time_calc)(i) for i in inputs)
print('make binaries... done')

distance_binary = pd.DataFrame(distance_binary)
time_binary = pd.DataFrame(time_binary)
time_binary.reset_index(inplace=True)
time_binary.drop('index', axis=1, inplace=True)

# if the observation took place before the spray, zero out the time,
# else keep the elapsed time between spray and observation
print('negating sprays after traps...')
for col in time_binary.columns:
    time_binary[col] = time_binary[col].map(lambda x: 0 if x < 0 else x)
print('done')

# https://chrisalbon.com/python/data_wrangling/pandas_rename_multiple_columns/
time_binary.columns = distance_binary.columns

time_binary_backup = time_binary.copy()
distance_binary_backup = distance_binary.copy()
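# Equivalent, vectorised alternative to the per-column map loop above (a
# sketch, not part of the original script): DataFrame.clip zeroes out all
# negative elapsed times, i.e. sprays that happened after the observation.
time_binary = time_binary.clip(lower=0)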
import glob

import pandas as pd
from joblib import Parallel, delayed
from nltk import tokenize
from textblob import TextBlob
from tqdm import tqdm

# convo2df is a project-local helper (assumed defined elsewhere) that parses
# one exported conversation file into a DataFrame.


def token_text(text):
    """Tokenize and clean up text."""
    # split into words using NLTK's word tokenizer
    words = tokenize.word_tokenize(text)
    # remove punctuation signs and convert to lowercase
    words = [w.lower() for w in words if w.isalpha()]
    return words


# get conversation data
convo_date = Parallel(n_jobs=12)(delayed(convo2df)(p)
                                 for p in tqdm(glob.glob('../data/*frompdf.txt')))
convo_date = pd.concat(convo_date, ignore_index=False)
# authorship is not relevant here
convo_date = convo_date.drop('author', axis=1)

convo_txt = Parallel(n_jobs=12)(delayed(convo2df)(p)
                                for p in tqdm(glob.glob('../data/*txt'))
                                if 'pdf' not in p)
convo_txt = pd.concat(convo_txt, ignore_index=False)

convo_df = pd.merge(convo_txt, convo_date, on='text')
# drop duplicates that originate from merging info from the pdf and old dfs
convo_df = convo_df.drop_duplicates(['text', 'datetime', 'author'])

# tokenize texts
convo_df['words'] = convo_df.text.apply(token_text)
convo_df[['polarity', 'subject']] = convo_df.words.apply(
    lambda x: pd.Series(TextBlob(' '.join(x)).sentiment))

convo_df.to_csv('../output/convo_01022020.csv', index=False)
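# Why the two-column assignment above works: TextBlob's .sentiment is a
# namedtuple (polarity, subjectivity), so wrapping it in pd.Series yields two
# values per row that pandas aligns with the ['polarity', 'subject'] columns.
from textblob import TextBlob

blob = TextBlob('joblib makes parallel loops pleasantly simple')
print(blob.sentiment)               # Sentiment(polarity=..., subjectivity=...)
print(blob.sentiment.polarity)      # float in [-1, 1]
print(blob.sentiment.subjectivity)  # float in [0, 1]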