Example #1
def test_concat():
    a = db.from_sequence([1, 2, 3])
    b = db.from_sequence([4, 5, 6])
    c = db.concat([a, b])
    assert list(c) == [1, 2, 3, 4, 5, 6]

    assert c.name == db.concat([a, b]).name
    assert b.concat().name != a.concat().name
    assert b.concat().name == b.concat().name

    b = db.from_sequence([1, 2, 3]).map(lambda x: x * [1, 2, 3])
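    # Bag.concat() with no arguments flattens the nested lists inside a single bag,
    # in contrast to db.concat([a, b]) above, which joins separate bags.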
    assert list(b.concat()) == [1, 2, 3] * sum([1, 2, 3])
Example #2
def test_concat():
    a = db.from_sequence([1, 2, 3])
    b = db.from_sequence([4, 5, 6])
    c = db.concat([a, b])
    assert list(c) == [1, 2, 3, 4, 5, 6]

    b = db.from_sequence([1, 2, 3]).map(lambda x: x * [1, 2, 3])
    assert list(b.concat()) == [1, 2, 3] * sum([1, 2, 3])
Example #3
def get_IDF(words_by_document):
    unique_words = []
    for words_for_single_document in words_by_document:
        unique_words.append(words_for_single_document.distinct())
    large_bag = db.concat(unique_words)
    frequencies = large_bag.frequencies()
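    # Inverse document frequency: log((number of documents + 1) / document_frequency),
    # rounded to 10 decimal places.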
    idf = frequencies.map(
        lambda x: (x[0], round(math.log((len(words_by_document) + 1) / x[1]), 10)))
    return idf
Example #4
def _ddfs_to_bag(data, cube):
    if not isinstance(data, dict):
        data = {cube.seed_dataset: data}

    ktk_cube_dataset_ids = sorted(data.keys())
    bags = []
    for ktk_cube_dataset_id in ktk_cube_dataset_ids:
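        # Turn each dataset's delayed partitions into a bag and hand the dataset id to _convert_write_bag.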
        bags.append(
            db.from_delayed(data[ktk_cube_dataset_id].to_delayed()).map_partitions(
                _convert_write_bag, ktk_cube_dataset_id=ktk_cube_dataset_id
            )
        )

    return (db.concat(bags), ktk_cube_dataset_ids)
Example #5
def build_vocab(data_file, output_dir, size=50000, lang='en'):
    """
    Builds vocab of <size> from <data_file> and stores it in <output_dir>.
    """
    b = db.read_text(data_file).str.strip().str.lower()
    if lang == 'fr':
        b = b.str.replace(u'\u2019', u"'")
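    # Tokenize each line, flatten the tokens into a single bag, count frequencies,
    # and keep the (size - 4) most frequent tokens (pluck(0) drops the counts),
    # presumably leaving room for the special tokens in _START_VOCAB.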
    b = b.map(lambda s: _TOKENIZER.findall(s)).concat().frequencies().topk(
        size - 4, lambda x: x[1]).pluck(0)
    a = db.from_sequence(_START_VOCAB)
    c = db.concat([a, b]).repartition(1)
    save_path = '%s/vocab%d.%s' % (output_dir, size, lang)
    with ProgressBar():
        c.to_textfiles([save_path])
    return save_path
Example #6
def test_concat():
    a = db.from_sequence([1, 2, 3])
    b = db.from_sequence([4, 5, 6])
    c = db.concat([a, b])

    assert list(c) == [1, 2, 3, 4, 5, 6]
Example #7
import dask.bag as db
from pyspark import SparkContext

if __name__ == '__main__':
    collection1 = [n for n in range(0, 6)]
    collection2 = [n for n in range(4, 10)]

    sc = SparkContext()
    rdd1 = sc.parallelize(collection1, 3)
    rdd2 = sc.parallelize(collection2, 3)
    res = rdd1.union(rdd2).collect()

    print(res)

    rdd1 = db.from_sequence(collection1, npartitions=3)
    rdd2 = db.from_sequence(collection2, npartitions=3)
    res = db.concat([rdd1, rdd2]).compute()
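    # db.concat plays the same role for bags as RDD.union does for the RDDs above.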

    print(res)
Example #8
File: dsk.py Project: chmp/flowly
def _apply__flowly__tz__apply_concat(bag, transform, rules):
    return db.concat([apply(func, bag, rules=rules) for func in transform.funcs])
Example #9
    if config.HasField("covid_json_dir"):
        document_pipeline.perform_document_independent_tasks(
            config=config,
            documents=document_pipeline.get_covid_documents(config),
            ckpt_prefix="covid",
            semrep_work_dir=semrep_work_dir,
        )

    # At this point, we are going to recover text sources using the checkpoint
    # module

    ##############################################################################

    parsed_sentences = dbag.concat([
        checkpoint.checkpoint(name, verbose=False)
        for name in checkpoint.get_checkpoints_like("*parsed_sentences")
    ])

    # Perform n-gram mining, introduces a new field "ngrams"
    ngram_sentences = ngram_util.get_frequent_ngrams(
        analyzed_sentences=parsed_sentences,
        max_ngram_length=config.phrases.max_ngram_length,
        min_ngram_support=config.phrases.min_ngram_support,
        min_ngram_support_per_partition=\
            config.phrases.min_ngram_support_per_partition,
        ngram_sample_rate=config.phrases.ngram_sample_rate,
    )
    ckpt("ngram_sentences")

    ngram_edges = graph_util.record_to_bipartite_edges(
        records=ngram_sentences,
Example #10
File: bag.py Project: filmor/rsds
from dask.distributed import Client
from utils import timer
import dask
import json
import dask.bag as db
import os

client = Client("tcp://localhost:8786")

if not os.path.exists("data"):
    os.makedirs('data', exist_ok=True)  # Create data/ directory
    b = dask.datasets.make_people()  # Make records of people
    b.map(json.dumps).to_textfiles(
        'data/*.json')  # Encode as JSON, write to disk

b = db.read_text('data/*.json').map(json.loads)
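# Concatenate 400 copies of the same bag to build a larger workload for the timed section below.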
b = db.concat([b for _ in range(400)])

with timer("Map/filt/count"):
    m = b.map(lambda record: record['occupation'])
    filt = m.filter(lambda record: len(record) > 6)
    res = filt.map(lambda record: len(record))  # length of each occupation that passed the filter
    res = res.count().compute()
Example #11
File: dsk.py Project: chmp/flowly
def _apply__flowly__tz__apply_map_concat(bag, transform, rules):
    return db.concat([
        bag.map_partitions(_apply_map_concat_impl, funcs=list(funcs), _flowly_id=flowly_id)
        # TODO: fix chunk_size
        for flowly_id, funcs in zip(id_sequence, partition_all(10, transform.funcs))
    ])
Example #12
def test_concat_after_map():
    a = db.from_sequence([1, 2])
    b = db.from_sequence([4, 5])
    result = db.concat([a.map(inc), b])
    assert list(result) == [2, 3, 4, 5]
Example #13
def _get_pairs(
    wikipedia_path=None,
    books_path=None,
    common_crawl_path=None,
    wikipedia_lang='en',
    target_seq_length=128,
    short_seq_prob=0.1,
    blocksize=None,
    num_blocks=None,
    duplicate_factor=5,
    sample_ratio=0.9,
    seed=12345,
    tokenizer=None,
    masking=False,
    masked_lm_ratio=0.15,
):
    vocab_words = tuple(tokenizer.vocab.keys())

    def _to_partition_pairs(partition_documents):
        partition_documents = tuple(partition_documents)
        partition_pairs = []
        for _ in range(duplicate_factor):
            for document_index in range(len(partition_documents)):
                partition_pairs.extend(
                    create_pairs_from_document(
                        partition_documents,
                        document_index,
                        max_seq_length=target_seq_length,
                        short_seq_prob=short_seq_prob,
                        masking=masking,
                        masked_lm_ratio=masked_lm_ratio,
                        vocab_words=vocab_words,
                    ))
        random.shuffle(partition_pairs)
        return partition_pairs

    if num_blocks is not None:
        if blocksize is not None:
            raise ValueError(
                'Only one of num_blocks or blocksize needs to be set!')
        blocksize = estimate_block_size(
            (wikipedia_path, books_path, common_crawl_path),
            num_blocks,
        )

    bags = []
    if wikipedia_path is not None:
        bags.append(
            read_wikipedia(
                wikipedia_path,
                lang=wikipedia_lang,
                blocksize=blocksize,
                sample_ratio=sample_ratio,
                sample_seed=seed,
            ))
    if books_path is not None:
        bags.append(
            read_books(
                books_path,
                blocksize=blocksize,
                sample_ratio=sample_ratio,
                sample_seed=seed,
            ))
    if common_crawl_path is not None:
        bags.append(
            read_common_crawl(
                common_crawl_path,
                blocksize=blocksize,
                sample_ratio=sample_ratio,
                sample_seed=seed,
            ))
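    # Combine whichever corpus bags were configured (Wikipedia, books, Common Crawl) into one bag.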
    bag_texts = db.concat(bags)
    bag_texts = _shuffle_bag_texts(bag_texts)
    bag_documents = _get_documents(bag_texts, tokenizer)
    return bag_documents.map_partitions(_to_partition_pairs)
Example #14
def test_concat():
    a = db.from_sequence([1, 2, 3])
    b = db.from_sequence([4, 5, 6])
    c = db.concat([a, b])
    assert list(c) == [1, 2, 3, 4, 5, 6]
    assert c.name == db.concat([a, b]).name
Example #15
    @classmethod
    def concat(cls, hashbags):
        return hashbags[0].new(db.concat([hb.bag for hb in hashbags]))
Пример #20
0
def test_concat_after_map():
    a = db.from_sequence([1, 2])
    b = db.from_sequence([4, 5])
    result = db.concat([a.map(inc), b])
    assert list(result) == [2, 3, 4, 5]
Example #16
            # Replace bag with result of ckpt, typically with save / load
            globals()[name] = dask_checkpoint.checkpoint(
                bag, name=name, checkpoint_dir=checkpoint_dir, **ckpt_kwargs)
        if config.HasField(
                "stop_after_ckpt") and config.stop_after_ckpt == name:
            print("Stopping early.")
            exit(0)

    ##############################################################################
    # BEGIN PIPELINE                                                             #
    ##############################################################################

    documents = get_medline_documents(config, download_shared)
    if config.HasField("covid_json_dir"):
        documents = dbag.concat([
            documents,
            get_covid_documents(config),
        ])
    ckpt("documents")

    # Split documents into sentences, filter out too-long and too-short sentences.
    sentences = documents.map_partitions(
        text_util.split_sentences,
        # --
        min_sentence_len=config.parser.min_sentence_len,
        max_sentence_len=config.parser.max_sentence_len,
    )
    ckpt("sentences")

    # Add POS tagging, lemmas, entities, and additional data to each sentence
    sentences_with_lemmas = sentences.map_partitions(
        text_util.analyze_sentences,