import collections

import funcy


def make_ngram_map(word_list, n):
    """Creates a dict of n-gram tuples to suffix counters.

    The word_list is broken into partitions of length n+1. The first n
    words are the n-gram, and the last word is the suffix.

    Parameters:
        word_list : list[str]
            The list of words from which to get n-grams.
        n : int
            Length of the n-grams (number of words).

    Returns:
        Dict[Tuple[str], Counter[str]]
            A mapping of n-gram tuples to a collection of subsequent words.
    """
    ngram_map = {}
    for part in funcy.partition(n + 1, 1, word_list):
        ngram = tuple(part[:-1])
        suffix = part[-1]
        suffix_map = ngram_map.get(ngram, collections.Counter())
        suffix_map[suffix] += 1  # TODO: Don't like the mutation here
        ngram_map[ngram] = suffix_map
    return ngram_map
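# Minimal usage sketch for make_ngram_map above. The corpus and the expected
# output in the comments are invented for illustration, not taken from the
# original project.
words = 'the cat sat on the cat sat on the mat'.split()
ngram_map = make_ngram_map(words, 2)
# ngram_map[('the', 'cat')] is a Counter over the words that follow the
# bigram ('the', 'cat'); for this corpus it is Counter({'sat': 2}).
print(ngram_map[('the', 'cat')])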
def _get_latest_ballet_version_string():
    """Get the latest version of ballet according to pip

    Parses the result of `pip search ballet`. Looks for something named
    `ballet` to be installed. If something appears to be named `ballet`
    but is not installed, then obviously it is not correct because
    otherwise how would this code be running? :)

    Returns:
        Union[str, None]: latest version of ballet or None if something
            went wrong
    """
    # $ pip search ballet
    # something-else-that-has-ballet-in-the-name (1.1.1)  - some description
    # ballet (x.y.z)  - some description
    #   INSTALLED: x.y.z
    #   LATEST:    u.v.w
    # something-else-that-has-ballet-in-the-name (1.1.1)  - some description
    output = _query_pip_search_ballet()
    for triple in funcy.partition(3, 1, output):
        match = _extract_latest_from_search_triple(triple)
        if match:
            return match
    return None
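# Illustrative sketch of the sliding-window scan above. The sample `pip
# search` output lines are made up; the point is only that
# funcy.partition(3, 1, ...) yields overlapping triples of consecutive lines,
# so the line naming `ballet`, its INSTALLED line, and its LATEST line all
# land in one window that the helper can match against.
import funcy

output = [
    'other-package (1.1.1)  - some description',
    'ballet (0.6.0)         - some description',
    '  INSTALLED: 0.5.0',
    '  LATEST:    0.6.0',
]
for triple in funcy.partition(3, 1, output):
    print(triple)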
def branches(self):
    """Return the branches for this cond expression.

    This returns all couples of expressions (condition, return value),
    i.e. all result expressions except the fallback one (the
    constructor's last argument).

    :rtype: list[(AbstractExpression, AbstractExpression)]
    """
    return funcy.partition(2, self.args)
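# Minimal sketch of the pairing done by branches above: a flat argument list
# (cond1, val1, cond2, val2, fallback) is grouped into 2-tuples, and the
# trailing fallback is dropped because funcy.partition skips an incomplete
# tail. The string arguments are placeholders, not real expressions.
import funcy

args = ['cond1', 'val1', 'cond2', 'val2', 'fallback']
for cond, val in funcy.partition(2, args):
    print(cond, val)
# -> cond1 val1
#    cond2 val2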
import funcy


def parse_dataset(filename):
    X = []
    y = []
    with open(filename) as dataset_file:
        for line in dataset_file:
            tokens = line.split()
            is_spam = int(tokens[1] == "spam")
            word_counts = {}
            for token, count in funcy.partition(2, tokens[2:]):
                word_counts[token] = int(count)
            X.append(word_counts)
            y.append(is_spam)
    return X, y
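# Hedged usage example for parse_dataset above. The sample lines are invented;
# only the "<id> <spam|ham> <word> <count> <word> <count> ..." layout is
# implied by the parsing code (the leading id token is ignored).
import tempfile

with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as f:
    f.write('0001 spam viagra 3 offer 1\n')
    f.write('0002 ham meeting 2\n')

X, y = parse_dataset(f.name)
print(X)  # [{'viagra': 3, 'offer': 1}, {'meeting': 2}]
print(y)  # [1, 0]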
async def s3_fetch_blocks_and_ops_in_blocks(s3_url, s3_client, block_nums):
    # NOTE: the original body referenced undefined `url`, `client`, and
    # `request_json`; here they are assumed to map to the function's own
    # parameters, with the batch request built as in
    # fetch_blocks_and_ops_in_blocks below.
    request_data = ','.join(
        f'{{"id":{block_num},"jsonrpc":"2.0","method":"get_block","params":[{block_num}]}},'
        f'{{"id":{block_num},"jsonrpc":"2.0","method":"get_ops_in_block","params":[{block_num},false]}}'
        for block_num in block_nums)
    request_json = f'[{request_data}]'.encode()
    response = 'n/a'
    while True:
        try:
            response = await s3_client.post(s3_url, data=request_json)
            jsonrpc_response = await response.json()
            response_pairs = funcy.partition(2, jsonrpc_response)
            results = []
            for get_block, get_ops in response_pairs:
                assert get_block['id'] == get_ops['id']
                results.append(
                    (get_block['id'], get_block['result'], get_ops['result']))
            assert len(results) == len(block_nums)
            return results
        except Exception as e:
            logger.exception('error fetching ops in block', e=e, response=response)
async def fetch_blocks_and_ops_in_blocks(url, client, block_nums):
    request_data = ','.join(
        f'{{"id":{block_num},"jsonrpc":"2.0","method":"get_block","params":[{block_num}]}},'
        f'{{"id":{block_num},"jsonrpc":"2.0","method":"get_ops_in_block","params":[{block_num},false]}}'
        for block_num in block_nums)
    request_json = f'[{request_data}]'.encode()
    response = 'n/a'
    while True:
        try:
            response = await client.post(url, data=request_json)
            jsonrpc_response = await response.json()
            response_pairs = funcy.partition(2, jsonrpc_response)
            results = []
            for get_block, get_ops in response_pairs:
                assert get_block['id'] == get_ops['id']
                results.append(
                    (get_block['id'], get_block['result'], get_ops['result']))
            assert len(results) == len(block_nums)
            return results
        except Exception as e:
            logger.exception('error fetching ops in block', e=e, response=response)
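# Sketch of why partition(2, ...) works above: the batch request interleaves
# one get_block call and one get_ops_in_block call per block number, so a
# response that preserves request order (assumed here) comes back as
# [block_1, ops_1, block_2, ops_2, ...]. The payload below is a fabricated
# stand-in, not real server output.
import funcy

jsonrpc_response = [
    {'id': 1, 'result': {'block': '...'}},
    {'id': 1, 'result': {'ops': '...'}},
    {'id': 2, 'result': {'block': '...'}},
    {'id': 2, 'result': {'ops': '...'}},
]
for get_block, get_ops in funcy.partition(2, jsonrpc_response):
    assert get_block['id'] == get_ops['id']
    print(get_block['id'], get_block['result'], get_ops['result'])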
def context_words2features(self, mode, output):
    logger.info('Building Training Data, Labels from Contexts')
    word_contexts = self._train_data.contexts
    n_context_words = self._train_data._n_context_words
    n_features = len(self._train_data.vocabulary)
    window_size = self._train_data._window_size

    if mode == 'normal':
        z = zarr.open(output, 'w')
        z.create_dataset('train_data', shape=(n_context_words, ),
                         chunks=(256, ), dtype='i')
        z.create_dataset('labels', shape=(n_context_words, ),
                         chunks=(256, ), dtype='i')
        counter = 0
        for word, contexts in word_contexts.items():
            logger.info('Processing word {word}, contexts {size}'.format(
                word=word, size=len(contexts)))
            train_data = []
            labels = []
            word_idx = self._train_data[word]
            # One (central word, context word) training pair per context word.
            for context in contexts:
                for context_word in context:
                    train_data.append(word_idx)
                    labels.append(self._train_data[context_word])
            n_words = len(contexts) * window_size * 2
            z['train_data'][counter:counter + n_words] = numpy.array(
                train_data, dtype=numpy.int32)
            z['labels'][counter:counter + n_words] = numpy.array(
                labels, dtype=numpy.int32)
            counter += n_words

    elif mode == 'bayes':
        # A context spans window_size words on each side of the central word.
        window_size *= 2
        z = zarr.open(output, 'w')
        z.create_dataset('train_data_central', shape=(n_context_words, ),
                         chunks=(256, ), dtype='i')
        z.create_dataset('labels', shape=(n_context_words, ),
                         chunks=(256, ), dtype='i')
        z.create_dataset('train_data_contexts',
                         shape=(n_context_words, window_size),
                         chunks=(256, ), dtype='i')
        counter = 0
        for word, contexts in word_contexts.items():
            logger.info('Processing word {word}, contexts {size}'.format(
                word=word, size=len(contexts)))
            word_idx = self._train_data[word]
            train_data_central = []
            context_idx = []
            for context in contexts:
                for context_word in context:
                    train_data_central.append(word_idx)
                    context_idx.append(self._train_data[context_word])
            n_words = len(contexts) * window_size
            z['train_data_central'][counter:counter + n_words] = numpy.array(
                train_data_central, dtype=numpy.int32)
            z['labels'][counter:counter + n_words] = numpy.array(
                context_idx, dtype=numpy.int32)
            # Group the flat context indices into one row per window, then
            # repeat each row window_size times so it lines up with every
            # (central word, label) pair from that window.
            context_idx = funcy.partition(window_size, context_idx)
            context_idx = [
                context for context in context_idx
                for _ in range(window_size)
            ]
            z['train_data_contexts'][counter:counter + n_words, :] = numpy.array(
                context_idx, dtype=numpy.int32)
            counter += n_words

    elif mode == 'embed':
        n_sentences = 0
        max_length = 0
        for sentence in self._train_data.sentences:
            n_sentences += 1
            if len(sentence) > max_length:
                max_length = len(sentence)
        z = zarr.open(output, 'w')
        z.create_dataset('lang_data', shape=(n_sentences, max_length),
                         chunks=(256, ), dtype='i')
        counter = 0
        for sentence in self._train_data.sentences:
            idx = []
            for word in sentence:
                try:
                    idx.append(self._train_data[word])
                except KeyError:
                    # Out-of-vocabulary words map to the UNK index.
                    idx.append(self._train_data['UNK'])
            # Right-pad every sentence to the longest sentence length.
            pad_size = max_length - len(idx)
            pad = [self._train_data['PAD'] for _ in range(pad_size)]
            idx += pad
            z['lang_data'][counter, :] = idx
            counter += 1
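# Small sketch of the 'bayes' branch reshaping above: a flat list of context
# word indices is grouped into one row per training window, and each row is
# then repeated window_size times so that every (central word, label) pair
# lines up with its full context row. The index values are made up.
import funcy

window_size = 2                     # already doubled, i.e. 2 * half-window
context_idx = [11, 12, 21, 22]      # two windows of two context words each
rows = funcy.partition(window_size, context_idx)
rows = [row for row in rows for _ in range(window_size)]
print(rows)  # [[11, 12], [11, 12], [21, 22], [21, 22]]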
def make_figure_x():
    baselines_df = _load_baselines_df()
    problems = list(baselines_df.index)
    best_pipelines = [_get_best_pipeline(problem) for problem in problems]
    mlz_pipelines_df = pd.DataFrame.from_records(
        [
            pipeline.to_dict()
            for pipeline in best_pipelines
            if pipeline is not None
        ]
    )
    mlz_pipelines_df['problem'] = mlz_pipelines_df['dataset'].str.replace(
        '_dataset_TRAIN', '')
    mlz_pipelines_df = mlz_pipelines_df.set_index('problem')
    _add_tscores(mlz_pipelines_df)

    combined_df = baselines_df.join(
        mlz_pipelines_df, lsuffix='_ll', rsuffix='_mlz')

    data = (
        combined_df[['t-score_ll', 't-score_mlz']]
        .dropna()
        .rename(columns={'t-score_ll': 'baseline', 't-score_mlz': 'ML Bazaar'})
        .sort_values('baseline')
        .stack()
        .to_frame('score')
        .reset_index()
        .rename(columns={'level_1': 'system'})
    )

    # specifically abbreviate 'uu3_world_development_indicators'
    mask = data['problem'] == 'uu3_world_development_indicators'
    data.loc[mask, 'problem'] = 'uu3_wdi'

    with sns.plotting_context('paper'):
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.barplot(x='problem', y='score', hue='system', data=data, ax=ax)
        ax.set_yticks([0.0, 0.5, 1.0])
        ax.set_xlabel('')
        plt.xticks(rotation=90)
        sns.despine(left=True, bottom=True)
        plt.tight_layout()
        ax.get_legend().remove()

        # color patches
        for (_, b2) in fy.partition(
            2, 2, sorted(ax.patches, key=lambda o: o.get_x())
        ):
            b2.set_hatch('////')

        _savefig(fig, 'figure6', figdir=OUTPUT_DIR)

    fn = OUTPUT_DIR.joinpath('figurex.csv')
    data.to_csv(fn)

    # Compute performance vs human baseline (Section 5.3)
    result = (
        combined_df
        [['t-score_ll', 't-score_mlz']]
        .dropna()
        .apply(np.diff, axis=1)
        .agg(['mean', 'std'])
    )
    fn = OUTPUT_DIR.joinpath('performance_vs_baseline.csv')
    result.to_csv(fn)
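# Minimal sketch of the hatching trick above, assuming the same grouped
# barplot layout (two bars per problem, as produced by seaborn's hue
# grouping): sorting the bar patches by x position and taking non-overlapping
# pairs lets the second bar of each pair, i.e. the 'ML Bazaar' bar, receive a
# hatch. The toy DataFrame is invented for illustration.
import funcy as fy
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

toy = pd.DataFrame({
    'problem': ['p1', 'p1', 'p2', 'p2'],
    'system': ['baseline', 'ML Bazaar'] * 2,
    'score': [0.4, 0.6, 0.5, 0.7],
})
fig, ax = plt.subplots()
sns.barplot(x='problem', y='score', hue='system', data=toy, ax=ax)
for _, b2 in fy.partition(2, 2, sorted(ax.patches, key=lambda o: o.get_x())):
    b2.set_hatch('////')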