def test_mention_context_batch_sampler():
    mentions_by_page = {0: [40, 50, 60, 70, 80, 90],
                        1: [100],
                        2: [0, 10, 20, 30]}
    cursor = get_mock_cursor(mentions_by_page)
    batch_size = 5
    page_id_order = [2, 0, 1]
    mentions_in_page_order = _.mapcat(page_id_order,
                                      lambda page_id: mentions_by_page[page_id])
    batch_sampler = MentionContextBatchSampler(cursor, page_id_order, batch_size)
    batches_seen = []
    indexes_seen = []
    for batch_num, batch_indexes in enumerate(batch_sampler):
        assert _.is_empty(_.intersection(batch_indexes, indexes_seen))
        indexes_seen.extend(batch_indexes)
        if batch_num == 0:
            assert len(set(batch_indexes) - {0, 10, 20, 30}) == 1
            assert any([mention in set(batch_indexes) - {0, 10, 20, 30}
                        for mention in mentions_by_page[0]])
            assert len(batch_indexes) == batch_size
        elif batch_num == 1:
            assert _.is_empty(set(batch_indexes) - set(mentions_by_page[0]))
            assert len(batch_indexes) == batch_size
        elif batch_num == 2:
            assert batch_indexes == [100]
            assert len(batch_indexes) == 1
        batches_seen.append(batch_num)
    assert _.is_empty(_.difference(mentions_in_page_order, indexes_seen))
    assert batches_seen == [0, 1, 2]

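# A minimal sketch of the get_mock_cursor helper the test above depends on; the
# real fixture is not shown in the source. This hypothetical version fakes a DB
# cursor whose execute/fetchall answer "which mention ids belong to this page?"
# from the in-memory mentions_by_page dict. The query shape and the row format
# are assumptions, not the project's actual API.
from unittest.mock import MagicMock

def get_mock_cursor(mentions_by_page):
    cursor = MagicMock()
    state = {'page_id': None}

    def execute(query, args=None):
        # remember which page id the sampler asked about
        state['page_id'] = args[0] if args else None

    def fetchall():
        return [{'id': mention_id}
                for mention_id in mentions_by_page.get(state['page_id'], [])]

    cursor.execute.side_effect = execute
    cursor.fetchall.side_effect = fetchall
    return cursor
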
def plot_experiment(experiment_spec, experiment_df):
    '''
    Plot the variable specs vs fitness vector of an experiment, where each point is a trial.
    ref colors: https://plot.ly/python/heatmaps-contours-and-2dhistograms-tutorial/#plotlys-predefined-color-scales
    '''
    y_cols = ['fitness'] + FITNESS_COLS
    x_cols = ps.difference(experiment_df.columns.tolist(), y_cols)
    fig = viz.tools.make_subplots(rows=len(y_cols), cols=len(x_cols), shared_xaxes=True, shared_yaxes=True)
    fitness_sr = experiment_df['fitness']
    min_fitness = fitness_sr.values.min()
    max_fitness = fitness_sr.values.max()
    for row_idx, y in enumerate(y_cols):
        for col_idx, x in enumerate(x_cols):
            x_sr = experiment_df[x]
            guard_cat_x = x_sr.astype(str) if x_sr.dtype == 'object' else x_sr
            trace = viz.go.Scatter(
                y=experiment_df[y], yaxis=f'y{row_idx+1}',
                x=guard_cat_x, xaxis=f'x{col_idx+1}',
                showlegend=False, mode='markers',
                marker={
                    'symbol': 'circle-open-dot', 'color': experiment_df['fitness'], 'opacity': 0.5,
                    # dump the first third of the colorscale, which is too bright
                    'cmin': min_fitness - 0.50 * (max_fitness - min_fitness), 'cmax': max_fitness,
                    'colorscale': 'YIGnBu', 'reversescale': True
                },
            )
            fig.append_trace(trace, row_idx + 1, col_idx + 1)
            fig.layout[f'xaxis{col_idx+1}'].update(title='<br>'.join(ps.chunk(x, 20)), zerolinewidth=1, categoryarray=sorted(guard_cat_x.unique()))
        fig.layout[f'yaxis{row_idx+1}'].update(title=y, rangemode='tozero')
    fig.layout.update(title=f'experiment graph: {experiment_spec["name"]}', width=max(600, len(x_cols) * 300), height=700)
    viz.plot(fig)
    return fig

def retro_analyze_trials(predir):
    '''Retro analyze all trials'''
    logger.info('Running retro_analyze_trials')
    session_spec_paths = glob(f'{predir}/*_s*_spec.json')
    # remove session spec paths
    trial_spec_paths = ps.difference(glob(f'{predir}/*_t*_spec.json'), session_spec_paths)
    util.parallelize(_retro_analyze_trial, [(p,) for p in trial_spec_paths], num_cpus=util.NUM_CPUS)

def retro_analyze_trials(predir):
    '''Retro analyze all trials'''
    logger.info('Running retro_analyze_trials')
    session_spec_paths = glob(f'{predir}/*_s*_spec.json')
    # remove session spec paths
    trial_spec_paths = ps.difference(glob(f'{predir}/*_t*_spec.json'), session_spec_paths)
    for p in trial_spec_paths:
        _retro_analyze_trial(p)

def plot_experiment(experiment_spec, experiment_df, metrics_cols):
    '''
    Plot the metrics vs. specs parameters of an experiment, where each point is a trial.
    ref colors: https://plot.ly/python/heatmaps-contours-and-2dhistograms-tutorial/#plotlys-predefined-color-scales
    '''
    y_cols = metrics_cols
    x_cols = ps.difference(experiment_df.columns.tolist(), y_cols + ['trial'])
    fig = subplots.make_subplots(rows=len(y_cols), cols=len(x_cols), shared_xaxes=True, shared_yaxes=True, print_grid=False)
    strength_sr = experiment_df['strength']
    min_strength, max_strength = strength_sr.min(), strength_sr.max()
    for row_idx, y in enumerate(y_cols):
        for col_idx, x in enumerate(x_cols):
            x_sr = experiment_df[x]
            guard_cat_x = x_sr.astype(str) if x_sr.dtype == 'object' else x_sr
            trace = go.Scatter(
                y=experiment_df[y], yaxis=f'y{row_idx+1}',
                x=guard_cat_x, xaxis=f'x{col_idx+1}',
                showlegend=False, mode='markers',
                marker={
                    'symbol': 'circle-open-dot', 'color': strength_sr, 'opacity': 0.5,
                    # dump first portion of colorscale that is too bright
                    'cmin': min_strength - 0.5 * (max_strength - min_strength), 'cmax': max_strength,
                    'colorscale': 'YlGnBu', 'reversescale': False
                },
            )
            fig.add_trace(trace, row_idx + 1, col_idx + 1)
            fig.update_xaxes(title_text='<br>'.join(ps.chunk(x, 20)), zerolinewidth=1, categoryarray=sorted(guard_cat_x.unique()), row=len(y_cols), col=col_idx + 1)
        fig.update_yaxes(title_text=y, rangemode='tozero', row=row_idx + 1, col=1)
    fig.layout.update(title=f'experiment graph: {experiment_spec["name"]}', width=100 + 300 * len(x_cols), height=200 + 300 * len(y_cols))
    plot(fig)
    graph_prepath = experiment_spec['meta']['graph_prepath']
    save_image(fig, f'{graph_prepath}_experiment_graph.png')
    # save important graphs in prepath directly
    prepath = experiment_spec['meta']['prepath']
    save_image(fig, f'{prepath}_experiment_graph.png')
    return fig

def get_not_registered_imeis(cls, imeis):
    """Method to check and return IMEIs which are not already registered."""
    try:
        normalized_imeis = cls.bulk_normalize(imeis)
        query = ApprovedImeis.query.filter(ApprovedImeis.imei.in_(normalized_imeis)). \
            filter(ApprovedImeis.status == 'whitelist')
        data = query.all()
        data = list(map(lambda x: x.imei, data))
        return pydash.difference(normalized_imeis, data)
    except Exception as e:
        raise e

def calc_experiment_df(trial_data_dict, info_prepath=None):
    '''Collect all trial data (metrics and config) from trials into a dataframe'''
    experiment_df = pd.DataFrame(trial_data_dict).transpose()
    cols = METRICS_COLS
    config_cols = sorted(ps.difference(experiment_df.columns.tolist(), cols))
    sorted_cols = config_cols + cols
    experiment_df = experiment_df.reindex(sorted_cols, axis=1)
    experiment_df.sort_values(by=['strength'], ascending=False, inplace=True)
    if info_prepath is not None:
        util.write(experiment_df, f'{info_prepath}_experiment_df.csv')
        # also save important metrics in prepath directly (with the "info/" segment stripped)
        util.write(experiment_df, f'{info_prepath.replace("info/", "")}_experiment_df.csv')
    return experiment_df

def retro_analyze_experiment(predir):
    '''Retro analyze an experiment'''
    logger.info('Running retro_analyze_experiment')
    trial_spec_paths = glob(f'{predir}/*_t*_spec.json')
    # remove trial and session spec paths
    experiment_spec_paths = ps.difference(glob(f'{predir}/*_spec.json'), trial_spec_paths)
    experiment_spec_path = experiment_spec_paths[0]
    spec = util.read(experiment_spec_path)
    info_prepath = spec['meta']['info_prepath']
    if not os.path.exists(f'{info_prepath}_trial_data_dict.json'):
        return  # only run analysis if the experiment had been run
    trial_data_dict = util.read(f'{info_prepath}_trial_data_dict.json')
    analysis.analyze_experiment(spec, trial_data_dict)

def retro_analyze_experiment(predir):
    '''Retro analyze an experiment'''
    logger.info('Running retro_analyze_experiment')
    if ps.is_empty(glob(f'{predir}/info/*_trial_data_dict.json')):
        logger.info('Skipping retro_analyze_experiment since no experiment was run.')
        return  # only run analysis if the experiment had been run
    trial_spec_paths = glob(f'{predir}/*_t*_spec.json')
    # remove trial and session spec paths
    experiment_spec_paths = ps.difference(glob(f'{predir}/*_spec.json'), trial_spec_paths)
    experiment_spec_path = experiment_spec_paths[0]
    spec = util.read(experiment_spec_path)
    info_prepath = spec['meta'].get('info_prepath')
    trial_data_dict = util.read(f'{info_prepath}_trial_data_dict.json')
    analysis.analyze_experiment(spec, trial_data_dict)

def analyze_experiment(experiment):
    '''
    Gather experiment trial_data_dict as experiment_df, plot.
    Search module must return best_spec and experiment_data with format {trial_index: exp_trial_data},
    where trial_data = {**var_spec, **fitness_vec, fitness}.
    This is then made into experiment_df.
    @returns {DataFrame} experiment_df Of var_specs, fitness_vec, fitness for all trials.
    '''
    logger.info('Analyzing experiment')
    experiment_df = pd.DataFrame(experiment.trial_data_dict).transpose()
    cols = FITNESS_COLS + ['fitness']
    config_cols = sorted(ps.difference(experiment_df.columns.tolist(), cols))
    sorted_cols = config_cols + cols
    experiment_df = experiment_df.reindex(sorted_cols, axis=1)
    experiment_df.sort_values(by=['fitness'], ascending=False, inplace=True)
    logger.info(f'Experiment data:\n{experiment_df}')
    experiment_fig = plot_experiment(experiment.spec, experiment_df)
    save_experiment_data(experiment.spec, experiment.info_space, experiment_df, experiment_fig)
    return experiment_df

def test_difference(case, expected):
    assert _.difference(*case) == expected

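# A minimal sketch of how the parametrized test above might be driven; the
# actual case list is not shown in the source, so these cases are illustrative.
# pydash.difference keeps the elements of the first list that appear in none of
# the later lists, preserving order.
import pydash as _
import pytest

@pytest.mark.parametrize('case,expected', [
    (([1, 2, 3], [1, 2]), [3]),
    (([1, 2, 3], [1], [2]), [3]),
    (([1, 2, 3], []), [1, 2, 3]),
])
def test_difference_example(case, expected):
    assert _.difference(*case) == expected
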
with open('files/gene lists/dbVarCNGsBenignPCGenes.txt', 'r') as dbVarCNGsBenignPCGenes:
    benignGain = dbVarCNGsBenignPCGenes.read().splitlines()
with open('files/gene lists/dbVarCNLsBenignPCGenes.txt', 'r') as dbVarCNLsBenignPCGenes:
    benignLoss = dbVarCNLsBenignPCGenes.read().splitlines()
with open('files/gene lists/dbVarCNGPeaksPathogenicPCGenes.txt', 'r') as dbVarCNGPeaksPathogenicPCGenes:
    pathoGain = dbVarCNGPeaksPathogenicPCGenes.read().splitlines()
with open('files/gene lists/dbVarCNLPeaksPathogenicPCGenes.txt', 'r') as dbVarCNLPeaksPathogenicPCGenes:
    pathoLoss = dbVarCNLPeaksPathogenicPCGenes.read().splitlines()

# Venn-style partitions: each key names the lists a gene must belong to
# (BG=benignGain, BL=benignLoss, PG=pathoGain, PL=pathoLoss); intersection
# picks the shared genes, difference then excludes the remaining lists.
lists['BGBLPGPL'] = pydash.intersection(benignGain, benignLoss, pathoGain, pathoLoss)
_BGBLPG = pydash.intersection(benignGain, benignLoss, pathoGain)
lists['BGBLPG'] = pydash.difference(_BGBLPG, pathoLoss)
_BGBLPL = pydash.intersection(benignGain, benignLoss, pathoLoss)
lists['BGBLPL'] = pydash.difference(_BGBLPL, pathoGain)
_BGPGPL = pydash.intersection(benignGain, pathoGain, pathoLoss)
lists['BGPGPL'] = pydash.difference(_BGPGPL, benignLoss)
_BLPGPL = pydash.intersection(benignLoss, pathoGain, pathoLoss)
lists['BLPGPL'] = pydash.difference(_BLPGPL, benignGain)
_BGBL = pydash.intersection(benignGain, benignLoss)
lists['BGBL'] = pydash.difference(_BGBL, pathoGain, pathoLoss)
_BGPG = pydash.intersection(benignGain, pathoGain)
lists['BGPG'] = pydash.difference(_BGPG, benignLoss, pathoLoss)

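# For reference, a tiny standalone demo of the pydash set operations used above
# (library behavior, not part of the original script): both accept any number
# of lists and preserve the order of the first list.
import pydash

assert pydash.intersection([1, 2, 3], [2, 3], [3, 4]) == [3]
assert pydash.difference([1, 2, 3], [1], [2]) == [3]
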
def enum_envs():
    '''Enumerate all the env names of the latest version'''
    envs = [es.id for es in gym.envs.registration.registry.all()]
    envs += INCLUDE_ENVS
    envs = ps.difference(envs, EXCLUDE_ENVS)
    return envs

async def command_clear(message, client, *args, **kwargs):
    if message.content.startswith('!clear'):
        logger.info('Run command !clear')
        # If no arguments were found for the command, clear the entire channel
        command_args = kwargs.pop('command_args', [])
        if not command_args:
            # Get all the messages in the channel
            messages = client.logs_from(message.channel)
            messages = [m async for m in messages]
            # Try bulk deleting the messages in chunks of 100
            for chunk in pydash.chunk(messages, 100):
                await client.delete_messages(chunk)
                messages = pydash.difference(messages, chunk)
            # Individually delete each message which could not be deleted via bulk
            for message_ in messages:
                await client.delete_message(message_)
            return True
        # If arguments were found for the command, do different things
        else:
            command_args = list(command_args)
            # If there is more than one argument, do nothing
            if len(command_args) > 1:
                return True
            # Save the first argument to a variable for readability
            clear_amount = command_args[0]
            # If the first argument is an integer, only clear that many messages
            if type(clear_amount) is int:
                # TODO: Support retrieving more than 100 messages
                # If clear_amount is greater than 100, send an apology
                if clear_amount > 100:
                    await client.send_message(
                        message.channel,
                        'I can only clear 100 messages at a time.'
                    )
                    return True
                # Get the messages to be deleted
                messages = client.logs_from(message.channel, limit=clear_amount)
                messages = [m async for m in messages]
                # Try bulk deleting the messages in chunks of 100
                for chunk in pydash.chunk(messages, 100):
                    await client.delete_messages(chunk)
                    messages = pydash.difference(messages, chunk)
                # Individually delete each message which could not be deleted via bulk
                for message_ in messages:
                    await client.delete_message(message_)
                return True
    return False

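# For reference (library behavior, not part of the original bot): pydash.chunk,
# used above to respect the 100-message bulk-delete limit, splits a list into
# fixed-size pieces.
import pydash

assert pydash.chunk([1, 2, 3, 4, 5], 2) == [[1, 2], [3, 4], [5]]
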
def get_recon_managed_regions():
    # we only want to mail on regions more than ninety business days post launch
    region_metadata = get_region_metadata()
    rms_managed_regions = get_regions_within_ninety_business_days_post_launch(region_metadata)
    return pydash.difference(get_airport_codes(region_metadata),
                             get_airport_codes(rms_managed_regions))