Example #1
import collections

import funcy


def make_ngram_map(word_list, n):
    """Creates a dict of n-gram tuples to suffix counters.

    The word_list is broken into overlapping windows of length n+1. The first
    n words of each window form the n-gram, and the last word is the suffix.

    Parameters:
        word_list : list[str]
            The list of words from which to get n-grams.
        n: int
            Length of the n-grams (number of words).
    
    Returns:
        Dict[Tuple[str, ...], Counter[str]]
            A mapping of n-gram tuples to a Counter of the words that follow them.
    """
    ngram_map = {}
    
    for part in funcy.partition(n + 1, 1, word_list):
        ngram = tuple(part[0:-1])
        suffix = part[-1]
        suffix_map = ngram_map.get(ngram, collections.Counter())
        suffix_map[suffix] += 1  # TODO: Don't like the mutation here
        ngram_map[ngram] = suffix_map

    return ngram_map
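A minimal usage sketch, continuing from the definition above; the sample sentence is illustrative only:

words = "the quick brown fox jumps over the lazy dog".split()
ngram_map = make_ngram_map(words, 2)

# Each key is an n-gram tuple; each value counts the words seen after it.
print(ngram_map[("the", "quick")])  # Counter({'brown': 1})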
Example #2
import funcy


def _get_latest_ballet_version_string():
    """Get the latest version of ballet according to pip

    Parses the result of `pip search ballet`, looking for a package named
    `ballet` that is marked as installed. A package that merely has `ballet`
    in its name but is not installed cannot be the right one, because
    otherwise this code would not be running. :)

    Returns:
        Union[str, None]: latest version of ballet or None if something went
            wrong
    """

    # $ pip search ballet
    # something-else-that-has-ballet-in-the-name (1.1.1)  - some description
    # ballet (x.y.z)  - some description
    #   INSTALLED: x.y.z
    #   LATEST:    u.v.w
    # something-else-that-has-ballet-in-the-name (1.1.1)  - some description

    output = _query_pip_search_ballet()
    for triple in funcy.partition(3, 1, output):
        match = _extract_latest_from_search_triple(triple)
        if match:
            return match

    return None
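The step of 1 turns `funcy.partition` into a sliding window over the output lines, so the three-line `ballet` block is matched wherever it appears. A small illustration with made-up output lines:

import funcy

lines = [
    'ballet-like-package (1.1.1)  - some description',
    'ballet (1.2.3)  - some description',
    '  INSTALLED: 1.2.3',
    '  LATEST:    1.3.0',
]
for triple in funcy.partition(3, 1, lines):
    print(triple)
# Two overlapping windows are produced; the second one contains the
# `ballet (...)`, INSTALLED and LATEST lines together.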
Example #3
    def branches(self):
        """
        Return the branches for this cond expression.

        This returns all (condition, return value) expression pairs, i.e. all
        result expressions except the fallback one (the constructor's last
        argument).

        :rtype: list[(AbstractExpression, AbstractExpression)]
        """
        return funcy.partition(2, self.args)
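`funcy.partition(2, seq)` groups a flat sequence into consecutive pairs and silently drops a non-fitting tail, which is exactly how the trailing fallback expression gets excluded here. With placeholder strings instead of AbstractExpression objects:

import funcy

args = ['cond1', 'val1', 'cond2', 'val2', 'fallback']
print(funcy.partition(2, args))
# [['cond1', 'val1'], ['cond2', 'val2']]  (the odd 'fallback' tail is dropped)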
Example #4
import funcy


def parse_dataset(filename):
    X = []
    y = []
    with open(filename) as dataset_file:
        for line in dataset_file:
            tokens = line.split()
            is_spam = int(tokens[1] == "spam")
            # Tokens after the label alternate word, count: pair them up.
            word_counts = {}
            for token, count in funcy.partition(2, tokens[2:]):
                word_counts[token] = int(count)
            X.append(word_counts)
            y.append(is_spam)
    return X, y
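Each input line is expected to look like `<id> <label> <word> <count> <word> <count> ...`, as inferred from the parsing above. The pairing step on its own, with a made-up line:

import funcy

tokens = '0001 spam viagra 3 offer 1'.split()
print(int(tokens[1] == 'spam'))              # 1
print(dict(funcy.partition(2, tokens[2:])))  # {'viagra': '3', 'offer': '1'}
# parse_dataset additionally converts the counts to int.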
Example #5
async def s3_fetch_blocks_and_ops_in_blocks(s3_url, s3_client, block_nums):
    # Assumption: the batched JSON-RPC request is built the same way as in
    # fetch_blocks_and_ops_in_blocks (next example).
    request_data = ','.join(
        f'{{"id":{block_num},"jsonrpc":"2.0","method":"get_block","params":[{block_num}]}},{{"id":{block_num},"jsonrpc":"2.0","method":"get_ops_in_block","params":[{block_num},false]}}'
        for block_num in block_nums)
    request_json = f'[{request_data}]'.encode()

    response = 'n/a'
    while True:
        try:
            response = await s3_client.post(s3_url, data=request_json)
            jsonrpc_response = await response.json()
            # Responses come back interleaved: (get_block, get_ops_in_block) per block.
            response_pairs = funcy.partition(2, jsonrpc_response)
            results = []
            for get_block, get_ops in response_pairs:
                assert get_block['id'] == get_ops['id']
                results.append((get_block['id'], get_block['result'], get_ops['result']))
            assert len(results) == len(block_nums)
            return results
        except Exception as e:
            logger.exception('error fetching ops in block',
                             e=e, response=response)
Example #6
import funcy


async def fetch_blocks_and_ops_in_blocks(url, client, block_nums):
    request_data = ','.join(
        f'{{"id":{block_num},"jsonrpc":"2.0","method":"get_block","params":[{block_num}]}},{{"id":{block_num},"jsonrpc":"2.0","method":"get_ops_in_block","params":[{block_num},false]}}'
        for block_num in block_nums)
    request_json = f'[{request_data}]'.encode()
    response = 'n/a'
    while True:
        try:
            response = await client.post(url, data=request_json)
            jsonrpc_response = await response.json()
            # Responses come back interleaved: (get_block, get_ops_in_block) per block.
            response_pairs = funcy.partition(2, jsonrpc_response)
            results = []
            for get_block, get_ops in response_pairs:
                assert get_block['id'] == get_ops['id']
                results.append((get_block['id'], get_block['result'], get_ops['result']))
            assert len(results) == len(block_nums)
            return results
        except Exception as e:
            logger.exception('error fetching ops in block',
                             e=e, response=response)
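A possible way to call it, assuming an aiohttp-style client and a JSON-RPC endpoint that supports get_block and get_ops_in_block; both the URL and the choice of aiohttp are assumptions, not part of the original snippet:

import asyncio

import aiohttp


async def main():
    async with aiohttp.ClientSession() as client:
        results = await fetch_blocks_and_ops_in_blocks(
            'https://rpc.example.com', client, block_nums=[1, 2, 3])
        for block_id, block, ops in results:
            print(block_id, len(ops))


asyncio.run(main())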
Example #7
    def context_words2features(self, mode, output):
        """Build training data and labels from word contexts and write them
        to a zarr store at `output`.

        `mode` selects the layout: 'normal' (center-word/context-word index
        pairs), 'bayes' (additionally stores each full context window), or
        'embed' (padded sentences of word indices).
        """

        logger.info('Building Training Data, Labels from Contexts')

        word_contexts = self._train_data.contexts
        n_context_words = self._train_data._n_context_words
        n_features = len(self._train_data.vocabulary)
        window_size = self._train_data._window_size

        if mode == 'normal':

            z = zarr.open(output, 'w')
            z.create_dataset('train_data',
                             shape=(n_context_words, ),
                             chunks=(256, ),
                             dtype='i')
            z.create_dataset('labels',
                             shape=(n_context_words, ),
                             chunks=(256, ),
                             dtype='i')

            counter = 0

            for word, contexts in word_contexts.items():

                logger.info('Processing word {word}, contexts {size}'.format(
                    word=word, size=len(contexts)))

                train_data = []
                labels = []
                word_idx = self._train_data[word]

                for context in contexts:
                    for context_word in context:

                        train_data.append(word_idx)
                        labels.append(self._train_data[context_word])

                z['train_data'][counter:counter +
                                len(contexts) * window_size * 2] = numpy.array(
                                    train_data, dtype=numpy.int32)
                z['labels'][counter:counter +
                            len(contexts) * window_size * 2] = numpy.array(
                                labels, dtype=numpy.int32)

                counter += len(contexts) * window_size * 2

        elif mode == 'bayes':

            window_size *= 2

            z = zarr.open(output, 'w')
            z.create_dataset('train_data_central',
                             shape=(n_context_words, ),
                             chunks=(256, ),
                             dtype='i')
            z.create_dataset('labels',
                             shape=(n_context_words, ),
                             chunks=(256, ),
                             dtype='i')
            z.create_dataset('train_data_contexts',
                             shape=(n_context_words, window_size),
                             chunks=(256, ),
                             dtype='i')

            counter = 0

            for word, contexts in word_contexts.items():

                logger.info('Processing word {word}, contexts {size}'.format(
                    word=word, size=len(contexts)))

                word_idx = self._train_data[word]
                train_data_central = []
                context_idx = []

                for context in contexts:
                    for context_word in context:
                        train_data_central.append(word_idx)
                        context_idx.append(self._train_data[context_word])

                n_words = len(contexts) * window_size

                z['train_data_central'][counter:counter +
                                        n_words] = numpy.array(
                                            train_data_central,
                                            dtype=numpy.int32)
                z['labels'][counter:counter + n_words] = numpy.array(
                    context_idx, dtype=numpy.int32)
                context_idx = funcy.partition(window_size, context_idx)
                context_idx = [
                    context for context in context_idx
                    for _ in range(window_size)
                ]
                z['train_data_contexts'][counter:counter +
                                         n_words, :] = numpy.array(
                                             context_idx, dtype=numpy.int32)

                counter += n_words

        elif mode == 'embed':

            n_sentences = 0
            max_length = 0

            for sentence in self._train_data.sentences:
                n_sentences += 1
                if len(sentence) > max_length:
                    max_length = len(sentence)

            z = zarr.open(output, 'w')
            z.create_dataset('lang_data',
                             shape=(n_sentences, max_length),
                             chunks=(256, ),
                             dtype='i')

            counter = 0
            for sentence in self._train_data.sentences:
                idx = []
                for word in sentence:
                    try:
                        idx.append(self._train_data[word])
                    except KeyError:
                        # Assumed: vocabulary lookup raises KeyError for
                        # out-of-vocabulary words, which fall back to UNK.
                        idx.append(self._train_data['UNK'])
                pad_size = max_length - len(idx)
                pad = [self._train_data['PAD'] for _ in range(pad_size)]
                idx += pad
                z['lang_data'][counter, :] = idx
                counter += 1
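In 'bayes' mode, `funcy.partition(window_size, context_idx)` regroups the flat list of context-word indices into per-center-word windows, and each window is then repeated once per (center, context) pair so the rows line up with `train_data_central`. A standalone illustration with made-up indices and a (doubled) window size of 2:

import funcy

window_size = 2
context_idx = [3, 7, 5, 9]  # context words for two center-word positions

windows = funcy.partition(window_size, context_idx)  # [[3, 7], [5, 9]]
rows = [window for window in windows for _ in range(window_size)]
print(rows)  # [[3, 7], [3, 7], [5, 9], [5, 9]]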
Example #8
import funcy as fy
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt


def make_figure_x():
    baselines_df = _load_baselines_df()

    problems = list(baselines_df.index)
    best_pipelines = [_get_best_pipeline(problem) for problem in problems]
    mlz_pipelines_df = pd.DataFrame.from_records(
        [
            pipeline.to_dict()
            for pipeline in best_pipelines
            if pipeline is not None
        ]
    )
    mlz_pipelines_df['problem'] = mlz_pipelines_df['dataset'].str.replace(
        '_dataset_TRAIN', '')
    mlz_pipelines_df = mlz_pipelines_df.set_index('problem')
    _add_tscores(mlz_pipelines_df)

    combined_df = baselines_df.join(
        mlz_pipelines_df, lsuffix='_ll', rsuffix='_mlz')

    data = (
        combined_df[['t-score_ll', 't-score_mlz']]
        .dropna()
        .rename(columns={'t-score_ll': 'baseline',
                         't-score_mlz': 'ML Bazaar'})
        .sort_values('baseline')
        .stack()
        .to_frame('score')
        .reset_index()
        .rename(columns={'level_1': 'system'})
    )

    # specifically abbreviate 'uu3_world_development_indicators'
    mask = data['problem'] == 'uu3_world_development_indicators'
    data.loc[mask, 'problem'] = 'uu3_wdi'

    with sns.plotting_context('paper'):
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.barplot(x='problem', y='score', hue='system', data=data, ax=ax)

        ax.set_yticks([0.0, 0.5, 1.0])
        ax.set_xlabel('')
        plt.xticks(rotation=90)

        sns.despine(left=True, bottom=True)
        plt.tight_layout()
        ax.get_legend().remove()

        # hatch every second bar to visually distinguish the two systems
        for (_, b2) in fy.partition(
                2, 2, sorted(ax.patches, key=lambda o: o.get_x())
        ):
            b2.set_hatch('////')

        _savefig(fig, 'figure6', figdir=OUTPUT_DIR)

    fn = OUTPUT_DIR.joinpath('figurex.csv')
    data.to_csv(fn)

    # Compute performance vs human baseline (Section 5.3)
    result = (
        combined_df
        [['t-score_ll', 't-score_mlz']]
        .dropna()
        .apply(np.diff, axis=1)
        .agg(['mean', 'std'])
    )

    fn = OUTPUT_DIR.joinpath('performance_vs_baseline.csv')
    result.to_csv(fn)
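The explicit step of 2 makes `fy.partition(2, 2, ...)` walk the bars in non-overlapping pairs, so exactly the second bar of each pair is hatched. The same trick on a toy bar chart; the figure and filename below are illustrative only:

import funcy as fy
from matplotlib import pyplot as plt

fig, ax = plt.subplots()
ax.bar([0, 1, 2, 3], [1, 2, 3, 4])

# Pair the bars left to right and hatch the second one of each pair.
for _, second in fy.partition(2, 2, sorted(ax.patches, key=lambda o: o.get_x())):
    second.set_hatch('////')

fig.savefig('hatched_bars.png')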