Example #1
def estimate(predictions_rgb, predictions_pcloud, weights=(0.6, 0.4)):
    """Return ids of object that appears more times in each matcher.

    That is, returns the id of the item with higher frequency in
    the rgb predictions, the point cloud predictions and
    the item with higer frequency in a weighted sum of rgb and pcloud preds.

    Args:
        predictions_rgb: list of predictions of the RGB matcher
        predictions_pcloud: list of predictions of the point cloud matcher
        weights: tuple to indicate relative weights of the matchers.
            Default=(0.6, 0.4)

    Return:
        tuple with (id_of_weighted_sum, id_rgb, id_pcloud)

    Example:

        >>> estimate([1, 1, 1, 2, 2], [1, 0, 2, 2, 1])
        (1, 1, 1)
        >>> estimate([1, 1, 1, 2, 2], [1, 0, 2, 2, 2])
        (2, 1, 2)
        >>> estimate([1, 1, 1, 2, 2], [1, 0, 0, 0, 2])
        (1, 1, 0)
        >>> estimate([1, 1, 1, 2, 2], [2, 2, 2, 2, 2], weights=(1,0))
        (1, 1, 2)
    """
    w_rgb, w_pcloud = weights
    freqs_rgb = pd.Series(frequencies(predictions_rgb))
    freqs_pcloud = pd.Series(frequencies(predictions_pcloud))
    freqs = pd.Series.add(w_rgb * freqs_rgb,
                          w_pcloud * freqs_pcloud, fill_value=0)
    return (freqs.nlargest(1).index[0],
            freqs_rgb.nlargest(1).index[0],
            freqs_pcloud.nlargest(1).index[0])
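
The frequencies call above is toolz.frequencies, which returns a plain dict of counts; that is why it can be wrapped directly in a pd.Series and ranked with nlargest. A minimal standalone sketch of that step (toolz and pandas assumed installed, prediction ids made up):

from toolz import frequencies
import pandas as pd

preds = [1, 1, 1, 2, 2]
counts = frequencies(preds)          # {1: 3, 2: 2} -- plain dict of id -> count
series = pd.Series(counts)           # index = object id, value = frequency
print(series.nlargest(1).index[0])   # 1, the most frequent id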
Example #2
def test_min_max():
    loop = IOLoop.current()
    cluster = yield LocalCluster(
        0,
        scheduler_port=0,
        silence_logs=False,
        processes=False,
        dashboard_address=None,
        loop=loop,
        asynchronous=True,
    )
    yield cluster._start()
    try:
        adapt = Adaptive(
            cluster.scheduler,
            cluster,
            minimum=1,
            maximum=2,
            interval="20 ms",
            wait_count=10,
        )
        c = yield Client(cluster, asynchronous=True, loop=loop)

        start = time()
        while not cluster.scheduler.workers:
            yield gen.sleep(0.01)
            assert time() < start + 1

        yield gen.sleep(0.2)
        assert len(cluster.scheduler.workers) == 1
        assert frequencies(pluck(1, adapt.log)) == {"up": 1}

        futures = c.map(slowinc, range(100), delay=0.1)

        start = time()
        while len(cluster.scheduler.workers) < 2:
            yield gen.sleep(0.01)
            assert time() < start + 1

        assert len(cluster.scheduler.workers) == 2
        yield gen.sleep(0.5)
        assert len(cluster.scheduler.workers) == 2
        assert len(cluster.workers) == 2
        assert frequencies(pluck(1, adapt.log)) == {"up": 2}

        del futures

        start = time()
        while len(cluster.scheduler.workers) != 1:
            yield gen.sleep(0.01)
            assert time() < start + 2
        assert frequencies(pluck(1, adapt.log)) == {"up": 2, "down": 1}
    finally:
        yield c.close()
        yield cluster.close()
Example #3
    def _make_formula_dict(self):
        """From the list of atoms, form a dictionary where the keys are the
        element symbols and the values are the number of atoms of that
        element.
        """

        return frequencies(self.atoms)
Example #4
def merge(*exprs, **kwargs):
    if len(exprs) + len(kwargs) == 1:
        # we only have one object so don't need to construct a merge
        if exprs:
            # we only have a positional argument, return it unchanged
            return exprs[0]
        if kwargs:
            # we only have a single keyword argument, label it and return it
            [(k, v)] = kwargs.items()
            return v.label(k)

    # label all the kwargs and sort in key order
    exprs = tuple(
        concatv(
            (_wrap(expr, '_%s' % n) for n, expr in enumerate(exprs)),
            (label(_wrap(v, k), k)
             for k, v in sorted(kwargs.items(), key=first)),
        ))

    if all(ndim(expr) == 0 for expr in exprs):
        raise TypeError('cannot merge all scalar expressions')

    result = Merge(
        exprs,
        varargsexpr(exprs),
        maxshape(map(shape, exprs)),
    )

    if not isdistinct(result.fields):
        raise ValueError(
            "Repeated columns found: " + ', '.join(
                k for k, v in frequencies(result.fields).items() if v > 1), )

    return result
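
The duplicate check above is a common frequencies idiom: count the field names and keep the ones seen more than once. A hedged standalone sketch (made-up field names):

from toolz import frequencies

fields = ['id', 'name', 'id']
dupes = [k for k, v in frequencies(fields).items() if v > 1]
print(dupes)   # ['id']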
Example #5
def test_TaskStreamPlugin(c, s, *workers):
    es = TaskStreamPlugin(s)
    assert not es.buffer

    futures = c.map(div, [1] * 10, range(10))
    total = c.submit(sum, futures[1:])
    yield wait(total)

    assert len(es.buffer) == 11

    workers = dict()

    rects = es.rectangles(0, 10, workers)
    assert workers
    assert all(n == 'div' for n in rects['name'])
    assert all(d > 0 for d in rects['duration'])
    counts = frequencies(rects['color'])
    assert counts['black'] == 1
    assert set(counts.values()) == {9, 1}
    assert len(set(rects['y'])) == 3

    rects = es.rectangles(2, 5, workers)
    assert all(len(L) == 3 for L in rects.values())

    starts = sorted(rects['start'])
    rects = es.rectangles(2, 5, workers=workers,
                          start_boundary=(starts[0] + starts[1]) / 2000)
    assert set(rects['start']).issubset(set(starts[1:]))
Example #6
def test_avoid_churn():
    """ We want to avoid creating and deleting workers frequently

    Instead we want to wait a few beats before removing a worker in case the
    user is taking a brief pause between work
    """
    cluster = yield LocalCluster(
        0,
        asynchronous=True,
        processes=False,
        scheduler_port=0,
        silence_logs=False,
        dashboard_address=None,
    )
    client = yield Client(cluster, asynchronous=True)
    try:
        adapt = Adaptive(cluster.scheduler, cluster, interval="20 ms", wait_count=5)

        for i in range(10):
            yield client.submit(slowinc, i, delay=0.040)
            yield gen.sleep(0.040)

        assert frequencies(pluck(1, adapt.log)) == {"up": 1}
    finally:
        yield client.close()
        yield cluster.close()
Example #7
def test_TaskStreamPlugin(c, s, *workers):
    es = TaskStreamPlugin(s)
    assert not es.buffer

    futures = c.map(div, [1] * 10, range(10))
    total = c.submit(sum, futures[1:])
    yield _wait(total)

    assert len(es.buffer) == 11

    workers = dict()

    rects = es.rectangles(0, 10, workers)
    assert all(n == 'div' for n in rects['name'])
    assert all(d > 0 for d in rects['duration'])
    counts = frequencies(rects['color'])
    assert counts['black'] == 1
    assert set(counts.values()) == {9, 1}
    assert len(set(rects['y'])) == 3

    rects = es.rectangles(2, 5, workers)
    assert all(len(L) == 3 for L in rects.values())

    starts = sorted(rects['start'])
    rects = es.rectangles(2,
                          5,
                          workers=workers,
                          start_boundary=(starts[0] + starts[1]) / 2000)
    assert set(rects['start']).issubset(set(starts[1:]))
Example #8
    def filter_columns(self, data, headers):
        """
        Drop columns that meet drop criteria, unless they have been
        explicitly selected.
        """
        drop = set(self.drop)
        select_patterns = [
            re.compile(pattern, re.I) for pattern in self.select
        ]
        select = len(select_patterns) > 0
        headers_out = []
        columns_out = []
        for header, column in zip(headers, zip(*data)):
            if select:
                for pattern in select_patterns:
                    if pattern.search(header):
                        headers_out.append(header)
                        columns_out.append(column)
            else:
                freqs = frequencies(column)
                if not set(freqs.keys()).issubset(drop):
                    headers_out.append(header)
                    columns_out.append(column)
        rows_out = list(zip(*columns_out))
        return rows_out, headers_out
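
The drop criterion above keeps a column only if it contains at least one value outside the drop set. A small illustration of that check (hypothetical column values and drop set):

from toolz import frequencies

column = ('', '', 'n/a')
drop = {'', 'n/a'}
freqs = frequencies(column)              # {'': 2, 'n/a': 1}
print(set(freqs.keys()).issubset(drop))  # True -> every value is droppable, so the column is dropped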
Example #9
def run(filename, vocab='default', vocab_ans='default'):

    #Read the data
    questions, answers = readFile(filename)

    #Create DataFrame from the lists
    df1, df3 = seperateImages(questions)

    df1 = pd.DataFrame(df1, columns=['question'])
    df2 = pd.DataFrame(answers, columns=['answer'])
    df3 = pd.DataFrame(df3, columns=['images'])

    #Concatenate dataframes along axis=1 (side by side) to construct the final version of the dataframe
    frames = [df1, df2, df3]
    data = pd.concat(frames, axis=1)

    #Check the first few elements for correctness
    data.head()

    #Count frequencies
    freqs_que = frequencies(' '.join(data['question']).split(' '))
    freqs_ans = frequencies(' '.join(data['answer']).split(' '))

    #Initialize the vocabulary - For test data, the training vocabulary is used. That's why the 'default' keyword is used.
    if vocab == 'default':
        vocabulary_que = createVocabulary(freqs_que)
    else:
        vocabulary_que = vocab

    if vocab_ans == 'default':
        vocabulary_ans = createVocabularyAnswers(freqs_ans)
    else:
        vocabulary_ans = vocab_ans

    #Encode question into integer vectors
    encoded_questions = encodeQuestions(data['question'], vocabulary_que)
    #Encode answers into one-hot vectors
    encoded_answers = encodeAnswers(data['answer'], vocabulary_ans)

    #Pad the questions into uniform length
    padded_questions = sequencePad(encoded_questions)
    #padded_answers = sequencePad(encoded_answers,MAXLEN = 2)

    return [
        df3, padded_questions, encoded_answers, vocabulary_que, vocabulary_ans
    ]
Example #10
def _check_dsk(dsk):
    """ Check that graph is well named and non-overlapping """
    if not isinstance(dsk, HighLevelGraph):
        return

    assert all(isinstance(k, (tuple, str)) for k in dsk.layers)
    freqs = frequencies(concat(dsk.dicts.values()))
    non_one = {k: v for k, v in freqs.items() if v != 1}
    assert not non_one, non_one
Example #11
def _check_dsk(dsk):
    """ Check that graph is well named and non-overlapping """
    if not isinstance(dsk, HighLevelGraph):
        return

    assert all(isinstance(k, (tuple, str)) for k in dsk.layers)
    freqs = frequencies(concat(dsk.dicts.values()))
    non_one = {k: v for k, v in freqs.items() if v != 1}
    assert not non_one, non_one
Example #12
def test_min_max():
    loop = IOLoop.current()
    cluster = yield LocalCluster(0, scheduler_port=0, silence_logs=False,
                                 processes=False, diagnostics_port=None,
                                 loop=loop, asynchronous=True)
    yield cluster._start()
    try:
        adapt = Adaptive(cluster.scheduler, cluster, minimum=1, maximum=2,
                         interval='20 ms', wait_count=10)
        c = yield Client(cluster, asynchronous=True, loop=loop)

        start = time()
        while not cluster.scheduler.workers:
            yield gen.sleep(0.01)
            assert time() < start + 1

        yield gen.sleep(0.2)
        assert len(cluster.scheduler.workers) == 1
        assert frequencies(pluck(1, adapt.log)) == {'up': 1}

        futures = c.map(slowinc, range(100), delay=0.1)

        start = time()
        while len(cluster.scheduler.workers) < 2:
            yield gen.sleep(0.01)
            assert time() < start + 1

        assert len(cluster.scheduler.workers) == 2
        yield gen.sleep(0.5)
        assert len(cluster.scheduler.workers) == 2
        assert len(cluster.workers) == 2
        assert frequencies(pluck(1, adapt.log)) == {'up': 2}

        del futures

        start = time()
        while len(cluster.scheduler.workers) != 1:
            yield gen.sleep(0.01)
            assert time() < start + 2
        assert frequencies(pluck(1, adapt.log)) == {'up': 2, 'down': 1}
    finally:
        yield c.close()
        yield cluster.close()
Example #13
def classify(filenames):
    langs = []
    for filename in filenames:
      with open(filename, 'rb') as f:
        langs.append(np.loadtxt(f, delimiter=',', skiprows=1))

    gmms = []
    for l in langs:
      g = GMM(n_components = 4, covariance_type='full')
      g.fit(l)
      gmms.append(g)

    all_data = np.row_stack(langs)
    for i in xrange(len(gmms)):
      g = gmms[i]
      l = langs[i]
      pred = [x > math.log(0.5) for x in g.score(all_data)]
      expected = [np.any(np.equal(l,x).all(1)) for x in all_data]
      return t.frequencies(expected), t.frequencies(zip(expected,pred))
Example #14
def test_adapt_quickly():
    """ We want to avoid creating and deleting workers frequently

    Instead we want to wait a few beats before removing a worker in case the
    user is taking a brief pause between work
    """
    cluster = yield LocalCluster(
        0,
        asynchronous=True,
        processes=False,
        scheduler_port=0,
        silence_logs=False,
        dashboard_address=None,
    )
    client = yield Client(cluster, asynchronous=True)
    adapt = Adaptive(cluster.scheduler,
                     cluster,
                     interval=20,
                     wait_count=5,
                     maximum=10)
    try:
        future = client.submit(slowinc, 1, delay=0.100)
        yield wait(future)
        assert len(adapt.log) == 1

        # Scale up when there is plenty of available work
        futures = client.map(slowinc, range(1000), delay=0.100)
        while frequencies(pluck(1, adapt.log)) == {"up": 1}:
            yield gen.sleep(0.01)
        assert len(adapt.log) == 2
        assert "up" in adapt.log[-1]
        d = [x for x in adapt.log[-1] if isinstance(x, dict)][0]
        assert 2 < d["n"] <= adapt.maximum

        while len(cluster.scheduler.workers) < adapt.maximum:
            yield gen.sleep(0.01)

        del futures

        while len(cluster.scheduler.workers) > 1:
            yield gen.sleep(0.01)

        # Don't scale up for large sequential computations
        x = yield client.scatter(1)
        for i in range(100):
            x = client.submit(slowinc, x)

        yield gen.sleep(0.1)
        assert len(cluster.scheduler.workers) == 1
    finally:
        yield client.close()
        yield cluster.close()
Example #15
    def find_corners(self) -> List[int]:
        edges = {}
        edges_dict_list = [{
            min(edge, edge[::-1]): tile.id
            for edge in tile.edges
        } for tile in self.tiles]
        edges = merge_with(lambda x: x, *edges_dict_list)
        freqs = frequencies(
            concat((value for value in edges.values() if len(value) == 2)))
        corners = [id for id, count in freqs.items() if count == 2]
        if len(corners) != 4:
            raise ValueError("Wrong number of corners!")
        return corners
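
merge_with is the key helper here: it merges the per-tile edge dicts and hands every group of values that share an edge key to the given function, so edges shared by exactly two tiles can be found. A minimal hedged sketch of merge_with alone (toy dicts, not real tile edges):

from toolz import merge_with

d1 = {'edge_a': 1, 'edge_b': 1}
d2 = {'edge_a': 2, 'edge_c': 2}
print(merge_with(lambda ids: ids, d1, d2))
# {'edge_a': [1, 2], 'edge_b': [1], 'edge_c': [2]}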
Example #16
def merge(*tables):
    # Get common sub expression
    child = common_subexpression(*tables)
    if not child:
        raise ValueError("No common sub expression found for input tables")

    result = Merge(child, tables)

    if not isdistinct(result.columns):
        raise ValueError("Repeated columns found: " + ', '.join(k for k, v in
            frequencies(result.columns).items() if v > 1))

    return result
Example #17
def merge(*exprs):
    # Get common sub expression
    try:
        child = common_subexpression(*exprs)
    except:
        raise ValueError("No common sub expression found for input expressions")

    result = Merge(child, exprs)

    if not isdistinct(result.fields):
        raise ValueError("Repeated columns found: " + ', '.join(k for k, v in
            frequencies(result.fields).items() if v > 1))

    return result
Example #18
File: index.py Project: vunb/eva
    def build(self, section, texts, **kwargs):
        sec = self.sections[section]
        frequency = kwargs.pop('frequency', 0)
        if frequency > 0:
            freq_dict = frequencies([y for x in texts for y in x])
            texts = [[y for y in x if freq_dict[y] > frequency] for x in texts]
        sec.dictionary = corpora.Dictionary(texts)
        sec.corpus = [sec.dictionary.doc2bow(text) for text in texts]

        sec.tfidf = models.TfidfModel(sec.corpus)
        sec.lsi = models.LsiModel(sec.tfidf[sec.corpus],
                                  id2word=sec.dictionary,
                                  num_topics=kwargs.pop('num_topics', 250))
        sec.index = similarities.MatrixSimilarity(sec.lsi[sec.corpus])
Example #19
    def __init__(self, vqa, vqaRes, n=2):
        VQAEval.__init__(self, vqa, vqaRes, n)

        print "Initialize class normalized evaluation..."
        # calculates answer frequencies over the current answers (train, val,
        # etc.)
        quesIds = [x for x in self.params['question_id']]
        gts = {}
        for quesId in quesIds:
            gts[quesId] = self.vqa.qa[quesId]

        # consider frequencies for all answers
        all_answers = [x['answer'] for y in gts for x in gts[y]['answers']]
        self.answer2freq = frequencies(all_answers)
        print "Class normalized evaluation initialized!"
Example #20
    def __init__(self, vqa, vqaRes, n=2):
        VQAEval.__init__(self, vqa, vqaRes, n)

        print "Initialize class normalized evaluation..."
        # calculates answer frequencies over the current answers (train, val,
        # etc.)
        quesIds = [x for x in self.params['question_id']]
        gts = {}
        for quesId in quesIds:
            gts[quesId] = self.vqa.qa[quesId]

        # consider frequencies for all answers
        all_answers = [x['answer'] for y in gts for x in gts[y]['answers']]
        self.answer2freq = frequencies(all_answers)
        print "Class normalized evaluation initialized!"
Example #21
def merge(*exprs, **kwargs):
    # Get common sub expression
    exprs = exprs + tuple(label(v, k) for k, v in kwargs.items())
    try:
        child = common_subexpression(*exprs)
    except:
        raise ValueError(
            "No common sub expression found for input expressions")

    result = Merge(child, exprs)

    if not isdistinct(result.fields):
        raise ValueError("Repeated columns found: " + ', '.join(
            k for k, v in frequencies(result.fields).items() if v > 1))

    return result
Example #22
def test_worker_breaks_and_returns(c, s, a):
    future = c.submit(slowinc, 1, delay=0.1)
    for i in range(10):
        future = c.submit(slowinc, future, delay=0.1)

    yield _wait(future)

    a.batched_stream.comm.close()

    yield gen.sleep(0.1)
    start = time()
    yield _wait(future)
    end = time()

    assert end - start < 1

    assert frequencies(s.task_state.values()) == {'memory': 1, 'released': 10}
Example #23
def test_worker_breaks_and_returns(c, s, a):
    future = c.submit(slowinc, 1, delay=0.1)
    for i in range(10):
        future = c.submit(slowinc, future, delay=0.1)

    yield wait(future)

    a.batched_stream.comm.close()

    yield gen.sleep(0.1)
    start = time()
    yield wait(future, timeout=10)
    end = time()

    assert end - start < 1

    states = frequencies(ts.state for ts in s.tasks.values())
    assert states == {'memory': 1, 'released': 10}
Example #24
def merge(*exprs, **kwargs):
    if len(exprs) + len(kwargs) == 1:
        if exprs:
            return exprs[0]
        if kwargs:
            [(k, v)] = kwargs.items()
            return v.label(k)
    # Get common sub expression
    exprs += tuple(label(v, k) for k, v in sorted(kwargs.items(), key=first))
    child = common_subexpression(*exprs)
    result = Merge(child, exprs)

    if not isdistinct(result.fields):
        raise ValueError(
            "Repeated columns found: " + ', '.join(
                k for k, v in frequencies(result.fields).items() if v > 1), )

    return result
Example #25
def test_worker_breaks_and_returns(c, s, a):
    future = c.submit(slowinc, 1, delay=0.1)
    for i in range(10):
        future = c.submit(slowinc, future, delay=0.1)

    yield wait(future)

    yield a.batched_stream.comm.close()

    yield gen.sleep(0.1)
    start = time()
    yield wait(future, timeout=10)
    end = time()

    assert end - start < 1

    states = frequencies(ts.state for ts in s.tasks.values())
    assert states == {"memory": 1, "released": 10}
Example #26
def test_adapt_quickly():
    """ We want to avoid creating and deleting workers frequently

    Instead we want to wait a few beats before removing a worker in case the
    user is taking a brief pause between work
    """
    cluster = yield LocalCluster(0, asynchronous=True, processes=False,
                                 scheduler_port=0, silence_logs=False,
                                 diagnostics_port=None)
    client = yield Client(cluster, asynchronous=True)
    adapt = Adaptive(cluster.scheduler, cluster, interval=20, wait_count=5,
                     maximum=10)
    try:
        future = client.submit(slowinc, 1, delay=0.100)
        yield wait(future)
        assert len(adapt.log) == 1

        # Scale up when there is plenty of available work
        futures = client.map(slowinc, range(1000), delay=0.100)
        while frequencies(pluck(1, adapt.log)) == {'up': 1}:
            yield gen.sleep(0.01)
        assert len(adapt.log) == 2
        assert 'up' in adapt.log[-1]
        d = [x for x in adapt.log[-1] if isinstance(x, dict)][0]
        assert 2 < d['n'] <= adapt.maximum

        while len(cluster.scheduler.workers) < adapt.maximum:
            yield gen.sleep(0.01)

        del futures

        while len(cluster.scheduler.workers) > 1:
            yield gen.sleep(0.01)

        # Don't scale up for large sequential computations
        x = yield client.scatter(1)
        for i in range(100):
            x = client.submit(slowinc, x)

        yield gen.sleep(0.1)
        assert len(cluster.scheduler.workers) == 1
    finally:
        yield client.close()
        yield cluster.close()
Example #27
def decide_worker(dependencies, stacks, who_has, restrictions, key):
    """ Decide which worker should take task

    >>> dependencies = {'c': {'b'}, 'b': {'a'}}
    >>> stacks = {('alice', 8000): ['z'], ('bob', 8000): []}
    >>> who_has = {'a': {('alice', 8000)}}
    >>> restrictions = {}

    We choose the worker that has the data on which 'b' depends (alice has 'a')

    >>> decide_worker(dependencies, stacks, who_has, restrictions, 'b')
    ('alice', 8000)

    If both Alice and Bob have dependencies then we choose the less-busy worker

    >>> who_has = {'a': {('alice', 8000), ('bob', 8000)}}
    >>> decide_worker(dependencies, stacks, who_has, restrictions, 'b')
    ('bob', 8000)

    Optionally provide restrictions of where jobs are allowed to occur

    >>> restrictions = {'b': {'alice', 'charlie'}}
    >>> decide_worker(dependencies, stacks, who_has, restrictions, 'b')
    ('alice', 8000)
    """
    deps = dependencies[key]
    workers = frequencies(w for dep in deps
                            for w in who_has[dep])
    if not workers:
        workers = stacks
    if key in restrictions:
        r = restrictions[key]
        workers = {w for w in workers if w[0] in r}  # TODO: nonlinear
        if not workers:
            workers = {w for w in stacks if w[0] in r}
            if not workers:
                raise ValueError("Task has no valid workers", key, r)
    if not workers:
        raise ValueError("No workers found")

    worker = min(workers, key=lambda w: len(stacks[w]))
    return worker
Example #28
def merge(*exprs, **kwargs):
    if len(exprs) + len(kwargs) == 1:
        if exprs:
            return exprs[0]
        if kwargs:
            [(k, v)] = kwargs.items()
            return v.label(k)
    # Get common sub expression
    exprs += tuple(label(v, k) for k, v in sorted(kwargs.items(), key=first))
    child = common_subexpression(*exprs)
    result = Merge(child, exprs)

    if not isdistinct(result.fields):
        raise ValueError(
            "Repeated columns found: " + ', '.join(
                k for k, v in frequencies(result.fields).items() if v > 1
            ),
        )

    return result
Example #29
def merge(*exprs, **kwargs):
    if len(exprs) + len(kwargs) == 1:
        if exprs:
            return exprs[0]
        if kwargs:
            [(k, v)] = kwargs.items()
            return v.label(k)
    # Get common sub expression
    exprs = exprs + tuple(label(v, k) for k, v in kwargs.items())
    try:
        child = common_subexpression(*exprs)
    except:
        raise ValueError("No common sub expression found for input expressions")

    result = Merge(child, exprs)

    if not isdistinct(result.fields):
        raise ValueError("Repeated columns found: " + ', '.join(k for k, v in
            frequencies(result.fields).items() if v > 1))

    return result
Example #30
def test_avoid_churn():
    """ We want to avoid creating and deleting workers frequently

    Instead we want to wait a few beats before removing a worker in case the
    user is taking a brief pause between work
    """
    cluster = yield LocalCluster(0, asynchronous=True, processes=False,
                                 scheduler_port=0, silence_logs=False,
                                 diagnostics_port=None)
    client = yield Client(cluster, asynchronous=True)
    try:
        adapt = Adaptive(cluster.scheduler, cluster, interval='20 ms', wait_count=5)

        for i in range(10):
            yield client.submit(slowinc, i, delay=0.040)
            yield gen.sleep(0.040)

        assert frequencies(pluck(1, adapt.log)) == {'up': 1}
    finally:
        yield client.close()
        yield cluster.close()
Example #31
def merge(*exprs, **kwargs):
    if len(exprs) + len(kwargs) == 1:
        # we only have one object so don't need to construct a merge
        if exprs:
        # we only have a positional argument, return it unchanged
            return exprs[0]
        if kwargs:
            # we only have a single keyword argument, label it and return it
            [(k, v)] = kwargs.items()
            return v.label(k)

    # label all the kwargs and sort in key order
    exprs = tuple(concatv(
        (_wrap(expr, '_%s' % n) for n, expr in enumerate(exprs)),
        (
            label(_wrap(v, k), k)
            for k, v in sorted(kwargs.items(), key=first)
        ),
    ))

    if all(ndim(expr) == 0 for expr in exprs):
        raise TypeError('cannot merge all scalar expressions')

    result = Merge(
        exprs,
        varargsexpr(exprs),
        maxshape(map(shape, exprs)),
    )

    if not isdistinct(result.fields):
        raise ValueError(
            "Repeated columns found: " + ', '.join(
                k for k, v in frequencies(result.fields).items() if v > 1
            ),
        )

    return result
Example #32
#!/usr/bin/env python

import os
import toolz
import json

toolz.frequencies


def messages_from_file(fname):
    with file(fname) as ff:
        return json.loads(ff.read())

def gen():
    for fname in os.listdir("../resources"):
        if not fname.startswith("I3Live"):
            continue
        for m in messages_from_file("../resources/" + fname):
            yield m["service"]


print toolz.frequencies(gen())

print "OK"
Example #33
import sys
import itertools
import toolz

from gensim.models import word2vec

data_file = sys.argv[1]

sentences = [
    s for s in word2vec.LineSentence(data_file)
    if toolz.count(toolz.unique(s)) >= 2
]

cmb = toolz.frequencies(
    toolz.mapcat(lambda s: itertools.combinations(sorted(toolz.unique(s)), 2),
                 sentences))

for (k1, k2), v in sorted(cmb.items(), key=lambda x: -x[1]):
    print(f"item1 = {k1}, item2 = {k2}, freq = {v}")
Example #34
    "whence", "whenever", "where", "whereafter", "whereas", "whereby",
    "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
    "who", "whoever", "whole", "whom", "whose", "why", "will", "with",
    "within", "without", "would", "yet", "you", "your", "yours", "yourself",
    "yourselves"
]

d = tz.pipe(
    D, c.map(lambda x: x.strip()), c.map(lambda x: x.lower()),
    c.map(lambda x: x.translate(str.maketrans('', '', string.punctuation))),
    c.map(lambda x: re.sub('[0-9]+', '', x)), c.map(lambda x: x.split()),
    c.map(lambda x: [word for word in x if word not in stops]), list)

d_sub = d[:500]

tf = {id: tz.frequencies(doc) for id, doc in enumerate(d_sub)}
df = pd.DataFrame(tf).fillna(0)
words = df.index

ds = df.values.T
ds = ds.astype(int)


def DataTrans(x):
    """Turn the data into the desired structure"""

    N_d = np.sum(x)
    V = len(x)

    row = 0
Example #35
""" Example Toolz

    From `toolz` pypi page
    https://pypi.python.org/pypi/toolz """

# %%
from toolz import compose, frequencies, partial
from toolz.curried import map


def stem(word):
    """ Stem word to primitive form """
    return word.lower().rstrip(",.!:;'-\"").lstrip("'\"")

wordcount = compose(frequencies, map(stem), str.split)

sentence = "This cat jumped over this other cat!"
wordcount(sentence)
# {'this': 2, 'cat': 2, 'jumped': 1, 'over': 1, 'other': 1}

# %%
print("sentance: {}".format(sentence))
print("split: {}".format(str.split(sentence)))
print("stem: {}".format(stem(sentence)))
print("frequencies: {}".format(frequencies(sentence)))
Example #36
def analyze_text(texts, cleaner):
    return word_ratio(
        toolz.frequencies(
            filter(word_is_desired,
                   itertools.chain(*map(cleaner.clean_text, texts)))))
Example #37
def _judgeMultiSubmit(judgeScores):
    # Test this
    judges = [x.judge for x in judgeScores]
    fs = toolz.frequencies(judges)
    toomany = toolz.valfilter(lambda num: num > 1, fs)
    return [judge.username for judge in toomany.keys()]
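
A quick sketch of the frequencies + valfilter pattern used above (hypothetical judge names standing in for judgeScores):

from toolz import frequencies, valfilter

judges = ['alice', 'bob', 'alice']
fs = frequencies(judges)                      # {'alice': 2, 'bob': 1}
toomany = valfilter(lambda num: num > 1, fs)
print(list(toomany.keys()))                   # ['alice'] -- judges with more than one submission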
Example #38
def count_ingredient_occurances(foods: List[FoodEntry],
                                ingredients: List[str]) -> int:
    freqs = frequencies(concat(map(lambda x: x.ingredients, foods)))
    return sum((count for ingredient, count in freqs.items()
                if ingredient in ingredients))
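
concat flattens the per-food ingredient lists into one stream before counting. A tiny hedged sketch of that flatten-then-count step (made-up ingredient lists):

from toolz import concat, frequencies

ingredient_lists = [['wheat', 'soy'], ['soy']]
print(frequencies(concat(ingredient_lists)))   # {'wheat': 1, 'soy': 2}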
Example #39
def test_frequencies():
    frequencies(big_data)
Example #40
def test_frequencies_small():
    for i in range(1000):
        frequencies(small_data)
Example #41
sentences = list(word2vec.LineSentence(data_file))

dic = Dictionary(sentences)

corpus = [dic.doc2bow(s) for s in sentences if len(s) >= 2]

lda = LdaModel(corpus = corpus, id2word = dic, num_topics = topic_num, alpha = alpha, random_state = 1)

doc_topics = [lda[c] for c in corpus]

avg_doc_topics = mean([len(t) for t in doc_topics])

print(f"topics num of doc = {avg_doc_topics}")

topic_freq = frequencies(concat([[x[0] for x in t] for t in doc_topics]))

wb = Workbook()

sh1 = wb.active
sh1.title = 'topics'

sh1.append(['topic', 'freq', 'item', 'prob'])

for i in range(topic_num):
  for t in lda.get_topic_terms(i):
    item = dic[t[0]]

    sh1.append([i, topic_freq[i], item, t[1]])

sh1.auto_filter.ref = f"A1:D{sh1.max_row}"
Example #42
def Estimate_Frequency(Set):
    from toolz import frequencies
    wordcount_x = frequencies(' '.join(Set).split(' '))

    # Kraino is a framework that helps in fast prototyping Visual Turing Test models.
    # This function takes wordcounts and returns word2index - a mapping from words to indices -
    # and index2word - a mapping from indices to words - while building the vocabulary.

    from kraino.utils.input_output_space import build_vocabulary
    word2index_x, index2word_x = build_vocabulary(
        this_wordcount=wordcount_x,
        truncate_to_most_frequent=0)
    return word2index_x
Example #43
def stem(word):
    """ Stem word to primitive form """
    return word.lower().rstrip(",.!:;'-\"").lstrip("'\"")

wordcount = compose(frequencies, map(stem), str.split)

sentence = "This cat jumped over this other cat!"
wordcount(sentence)
# {'this': 2, 'cat': 2, 'jumped': 1, 'over': 1, 'other': 1}

# %%
print("sentance: {}".format(sentence))
print("split: {}".format(str.split(sentence)))
print("stem: {}".format(stem(sentence)))
print("frequencies: {}".format(frequencies(sentence)))

#%%
#
# Curry.
# 

from toolz.functoolz import curry 

@curry
def add(x, y, echo=False):
    if echo:
        print(f"x = {x}")
        print(f"y = {y}")

    return x + y
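
The curried add above can take its arguments in stages; a small self-contained usage sketch of toolz curry (separate toy function, echo left out):

from toolz.functoolz import curry

@curry
def add(x, y):
    return x + y

print(add(1)(2))   # 3 -- supply arguments one at a time
print(add(1, 2))   # 3 -- or all at once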
Example #44
def test_frequencies_small():
    for i in range(1000):
        frequencies(small_data)
Example #45
dp = data_provider.select['daquar-triples']
train_text_representation = dp['text'](train_or_test='train')

n_elements = 10
#print('== Questions:')
#print_list(train_text_representation['x'][:n_elements])
#print('== Answers:')
#print_list(train_text_representation['y'][:n_elements])
#print('== Image Names:')
#print_list(train_text_representation['img_name'][:n_elements])

from toolz import frequencies
train_raw_x = train_text_representation['x']
# we start from building the frequencies table
wordcount_x = frequencies(' '.join(train_raw_x).split(' '))
# print the most and least frequent words
n_show = 5
#print(sorted(wordcount_x.items(), key=lambda x: x[1], reverse=True)[:n_show])
#print(sorted(wordcount_x.items(), key=lambda x: x[1])[:n_show])

# Kraino is a framework that helps in fast prototyping Visual Turing Test models
from kraino.utils.input_output_space import build_vocabulary

# This function takes wordcounts and returns word2index - mapping from words into indices,
# and index2word - mapping from indices to words.
word2index_x, index2word_x = build_vocabulary(
    this_wordcount=wordcount_x,
    truncate_to_most_frequent=0)

#print (word2index_x)
Example #46
for i in xrange(len(langs)):
  l = langs[i]
  expected = [sys.argv[1+i] if np.any(np.equal(l,x).all(1)) else '' for x in all_data]
  labels.append(expected)

labels = [[l for l in list(label) if len(l) > 0] for label in zip(*labels)]
mlb = MultiLabelBinarizer()
indicators = mlb.fit_transform(labels)

train, test, y_train, y_test = train_test_split(all_data, indicators)

# clf = OneVsRestClassifier(GMM(n_components = 4, covariance_type='full'))
# clf = OneVsRestClassifier(svm.SVC())
clf = OneVsRestClassifier(SGDClassifier())
clf.fit(train, y_train)

train_pred =  clf.predict(train)
test_pred = clf.predict(test)

for i in xrange(train_pred.shape[1]):
  print mlb.classes_[i]
  print "Train data"
  print t.frequencies(y_train[:,i]), t.frequencies(zip(y_train[:,i],train_pred[:,i]))
  print metrics.classification_report(y_train[:,i], train_pred[:,i])
  print "Test data"
  print t.frequencies(y_test[:,i]), t.frequencies(zip(y_test[:,i],test_pred[:,i]))
  print metrics.classification_report(y_test[:,i], test_pred[:,i])
  print ""

#test_set = ["features/de-test-10000.features", "features/fr-test-10000.features"]
Example #47
def analyze_poems(poems, cleaner):
    return word_ratio(
        toolz.frequencies(
            filter(word_is_desired,
                itertools.chain(*map(cleaner.clean_poem, poems)))))
Example #48
def decide_worker(dependencies, stacks, who_has, restrictions, nbytes, key):
    """ Decide which worker should take task

    >>> dependencies = {'c': {'b'}, 'b': {'a'}}
    >>> stacks = {('alice', 8000): ['z'], ('bob', 8000): []}
    >>> who_has = {'a': {('alice', 8000)}}
    >>> nbytes = {'a': 100}
    >>> restrictions = {}

    We choose the worker that has the data on which 'b' depends (alice has 'a')

    >>> decide_worker(dependencies, stacks, who_has, restrictions, nbytes, 'b')
    ('alice', 8000)

    If both Alice and Bob have dependencies then we choose the less-busy worker

    >>> who_has = {'a': {('alice', 8000), ('bob', 8000)}}
    >>> decide_worker(dependencies, stacks, who_has, restrictions, nbytes, 'b')
    ('bob', 8000)

    Optionally provide restrictions of where jobs are allowed to occur

    >>> restrictions = {'b': {'alice', 'charlie'}}
    >>> decide_worker(dependencies, stacks, who_has, restrictions, nbytes, 'b')
    ('alice', 8000)

    If the task requires data communication, then we choose to minimize the
    number of bytes sent between workers. This takes precedence over worker
    occupancy.

    >>> dependencies = {'c': {'a', 'b'}}
    >>> who_has = {'a': {('alice', 8000)}, 'b': {('bob', 8000)}}
    >>> nbytes = {'a': 1, 'b': 1000}
    >>> stacks = {('alice', 8000): [], ('bob', 8000): []}

    >>> decide_worker(dependencies, stacks, who_has, {}, nbytes, 'c')
    ('bob', 8000)
    """
    deps = dependencies[key]
    workers = frequencies(w for dep in deps
                            for w in who_has[dep])
    if not workers:
        workers = stacks
    if key in restrictions:
        r = restrictions[key]
        workers = {w for w in workers if w[0] in r}  # TODO: nonlinear
        if not workers:
            workers = {w for w in stacks if w[0] in r}
            if not workers:
                raise ValueError("Task has no valid workers", key, r)
    if not workers or not stacks:
        raise ValueError("No workers found")

    commbytes = {w: sum(nbytes[k] for k in dependencies[key]
                                   if w not in who_has[k])
                 for w in workers}

    minbytes = min(commbytes.values())

    workers = {w for w, nb in commbytes.items() if nb == minbytes}
    worker = min(workers, key=lambda w: len(stacks[w]))
    return worker
Example #49
def daquar_qa_triples(
        path=None, 
        train_or_test='train', 
        keep_top_qa_pairs=0,
        **kwargs):
    """
    DAQUAR question answer pairs.

    In:
        path - path to DAQUAR root folder, if None then default path is chosen
            by default None
        train_or_test - switch between train and test set;
            value belongs to \{'train', 'val', 'test'\} 
            by default 'train'
        keep_top_qa_pairs - filter out question-answer pairs to the
            keep_top_qa_pairs if positive; by default 0

    Out:
        x - textual questions
        y - textual answers
        img_name - names of the images
        img_ind - image indices that correspond to x
        question_id - empty list as it is unused in DAQUAR
        end_of_question - end of question token
        end_of_answer - end of answer token
        answer_words_delimiter - delimiter for multiple word answers
    """
    if path is None:
        curr_dir = os.path.dirname(os.path.realpath(__file__))
        path = os.path.join(curr_dir, '..', '..', 'data', 'daquar')

    if train_or_test == 'val':
        # we don't have a well established split
        train_or_test = 'train'

    xy_list = file2list(
            os.path.join(path,'qa.894.raw.'+train_or_test+'.format_triple'))

    # create a dictionary of allowed qa pairs
    all_answers = xy_list[1::3]
    freq = frequencies(all_answers)
    if keep_top_qa_pairs <= 0:
        most_frequent_answers = sorted(
                freq.items(), key=lambda x:x[1], reverse=True)
    else:
        most_frequent_answers = sorted(
                freq.items(), key=lambda x:x[1], reverse=True)[:keep_top_qa_pairs]
    allowed_answers_dict = dict(most_frequent_answers)
    #

    x_list = []
    y_list = []
    img_name_list = []
    img_ind_list = []
    for x, y, image_name in zip(xy_list[::3], xy_list[1::3], xy_list[2::3]):
        if y in allowed_answers_dict:
            x_list.append(x)
            y_list.append(y)
            img_name_list.append(image_name)
            img_num = re.search('(?<=image)[0-9]+', image_name).group(0)
            img_ind_list.append(int(img_num)-1)

    return {'x':x_list, 
            'y':y_list, 
            'img_name':img_name_list, 
            'img_ind': img_ind_list, 
            'question_id': [],
            'end_of_question':'?', 
            'end_of_answer':'',
            'answer_words_delimiter':','}
Example #50
def vqa_general(path=None, train_or_test='train', dataset_type='mscoco', 
        task_type='OpenEnded', annotation_year='2014', question_year='2015', 
        image_name_template='COCO_2014_{0:0=12}', answer_mode='single_random',
        keep_top_qa_pairs=0):
    """
    VT-Vision-Lab VQA question answer pairs. It is a general interface.
    In:
        path - path to VQA root folder, if None then default path is chosen;
            by default None
        train_or_test - switch between train and test set;
            value belongs to \{'train', 'val', 'test', 'test_dev'\} 
            by default 'train'
        dataset_type - type of dataset, e.g. 'mscoco'
        task_type - type of the task, e.g. 'OpenEnded'
        annotation_year - annotation year
        question_year - question year
        image_name_template - template for giving names to images
        answer_mode - possible answer modes:
            'single_random' - single answer, randomly chosen
            'single_confident' - single answer, randomly chosen among the confident;
                if there is no confident then randomly chosen (the same as single)
            'single_frequent' - the most frequent answer
            'all' - with one question all answers
            'all_repeat' - all answers by repeating the same question
            'all_repeat_confidentonly' - all answers that are confident (repeats the same question)
        keep_top_qa_pairs - filter out question-answer pairs to the
            keep_top_qa_pairs if positive; by default 0

    Out:
        x - textual questions
        y - textual answers
        img_name - names of the images
        img_ind - image indices that correspond to x
        question_id - list of question indices
        end_of_question - end of question token
        end_of_answer - end of answer token
        answer_words_delimiter - delimiter for multiple word answers
        anno_path - constructed path to annotations
        questions_path - constructed path to questions
    """

    def preprocess_question(q):
        q_tmp = q.strip().lower().encode('utf8')
        if q_tmp[-1] == '?' and q_tmp[-2] != ' ':
            # separate word token from the question mark
            q_tmp = q_tmp[:-1] + ' ?'
        # remove question mark
        if q_tmp[-1] == '?': q_tmp = q_tmp[:-1]
        return q_tmp
    #

    assert answer_mode in ['single_random', 'single_confident', 'single_frequent', 'all', 'all_repeat', 'all_repeat_confidentonly']
    assert task_type in ['OpenEnded', 'MultipleChoice'], \
            'The task is either ''OpenEnded'' or ''MultipleChoice'''
    assert dataset_type in ['mscoco', 'abstract_v002'], \
            'The type of dataset is either ''mscoco'' or ''abstract_v002'''

    vqa_dict = vqa_get_object(
            path=path, 
            train_or_test=train_or_test, 
            dataset_type=dataset_type, 
            task_type=task_type, 
            annotation_year=annotation_year, 
            question_year=question_year)
    vqa = vqa_dict['vqa_object']

    # questions can be filtered, e.g. by the question type
    ann_ids = vqa.getQuesIds()     
    anns = vqa.loadQA(ann_ids)
   
    # process annotations
    question_id_list = []
    image_name_list = []
    image_id_list = []
    x_list = []
    y_list = []

    # return only questions if there are no annotations
    if anns == []:
        for ques in vqa.questions['questions']:
            question = preprocess_question(ques['question'])
            x_list.append(question)
            question_id_list.append(ques['question_id'])
            image_id = ques['image_id']
            image_name = image_name_template.format(image_id)
            image_name_list.append(image_name)
            image_id_list.append(image_id)

    # create a dictionary of allowed qa pairs
    all_answers = [x['answer'] for anno in anns for x in anno['answers']]
    freq = frequencies(all_answers)
    if keep_top_qa_pairs <= 0:
        most_frequent_answers = sorted(
                freq.items(), key=lambda x:x[1], reverse=True)
    else:
        most_frequent_answers = sorted(
                freq.items(), key=lambda x:x[1], reverse=True)[:keep_top_qa_pairs]
    allowed_answers_dict = dict(most_frequent_answers)
    #

    for anno in anns:
        image_id = anno['image_id']
        image_name = image_name_template.format(image_id)
        question_id = anno['question_id']
        question = preprocess_question(vqa.qqa[question_id]['question'])
        assert image_id == vqa.qqa[question_id]['image_id'], \
                'image id of the question and answer are different'
        # randomizing the answers list
        randomized_answers = copy.deepcopy(anno['answers'])
        np.random.shuffle(randomized_answers)
        randomized_allowed_answers_list = \
                [x for x in randomized_answers if x['answer'] in allowed_answers_dict]
        if randomized_allowed_answers_list == []:
            continue
        #
        if answer_mode == 'single_random':
            answer = randomized_allowed_answers_list[0]['answer']
        elif answer_mode == 'single_confident':
            # if there is no confident answer, take a random one
            confidence_list = [x['answer_confidence'] \
                    for x in randomized_allowed_answers_list]
            yes_list = [j for j,x in enumerate(confidence_list) if x == 'yes'] 
            if yes_list == []:
                answer = randomized_allowed_answers_list[0]['answer']
            else:
                answer = randomized_allowed_answers_list[yes_list[0]]['answer']
        elif answer_mode == 'single_frequent':
            tmp = frequencies([x['answer'] for x in randomized_allowed_answers_list])
            answer = sorted(tmp.items(), key=lambda x: x[1], reverse=True)[0][0]
        elif answer_mode == 'all':
            raise NotImplementedError()
        elif answer_mode == 'all_repeat':
            answer_list_all_mode = []
            for answer in randomized_allowed_answers_list:
                answer_list_all_mode.append(answer['answer'].encode('utf8'))
        elif answer_mode == 'all_repeat_confidentonly':
            # like repeat but consider only confident answers
            confidence_list = [x['answer_confidence'] \
                    for x in randomized_allowed_answers_list]
            yes_list = [j for j,x in enumerate(confidence_list) if x == 'yes'] 
            if yes_list == []:
                # we keep only confident qa pairs
                continue
            answer_list_all_mode = []
            for answer_no, answer in enumerate(randomized_allowed_answers_list):
                if answer_no in yes_list:
                    answer_list_all_mode.append(answer['answer'].encode('utf8'))
        else:
            raise NotImplementedError()

        if 'single' in answer_mode:
            answer = answer.encode('utf8')
            x_list.append(question)
            y_list.append(answer)
            image_name_list.append(image_name)
            image_id_list.append(image_id)
            question_id_list.append(question_id)
        elif 'all' in answer_mode:
            num_answers_all_mode = len(answer_list_all_mode)
            x_list.extend([question]*num_answers_all_mode)
            image_name_list.extend([image_name]*num_answers_all_mode)
            image_id_list.extend([image_id]*num_answers_all_mode)
            question_id_list.extend([question_id]*num_answers_all_mode)
            y_list.extend(answer_list_all_mode)
        else:
            raise NotImplementedError()

    return {'x':x_list, 'y':y_list, 
            'img_name':image_name_list, 
            'img_ind': image_id_list, 
            'question_id': question_id_list,
            'end_of_question':'?', 
            'end_of_answer':'',
            'answer_words_delimiter':' ',
            'vqa_object':vqa,
            'questions_path':vqa_dict['questions_path'],
            'anno_path':vqa_dict['anno_path']}
Example #51
def parse(line):
    parts = line.split()
    x, y = int(parts[0].rstrip(',')), int(parts[1])
    cmap[(x, y)] = 'A'
    return x, y


points = list(map(parse, sys.stdin))

# find corners
min_x = min(points, key=lambda x: x[0])[0]
min_y = min(points, key=lambda x: x[1])[1]
max_x = max(points, key=lambda x: x[0])[0]
max_y = max(points, key=lambda x: x[1])[1]
grid = {}
for y in range(min_y, max_y + 1):
    for x in range(min_x, max_x + 1):
        c = closest(x, y, points)
        if c:
            grid[(x, y)] = c
            print(cmap[c], end='')
        else:
            print('.', end='')
    print()
# print(grid)
bad_points = keyfilter(
    lambda x: x[0] in [max_x, min_x] or x[1] in [max_y, min_y], grid).values()
grid = valfilter(lambda x: x not in bad_points, grid)
# print(grid)
print(max(frequencies(grid.values()).values()))
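
The last line counts how many grid cells each surviving point owns and takes the largest such area. A toy version of that final step (made-up cell-to-point mapping):

from toolz import frequencies

grid = {(0, 0): 'A', (0, 1): 'A', (1, 0): 'B'}
print(max(frequencies(grid.values()).values()))   # 2 -- point 'A' owns the most cells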
Example #52
def blockwise(func, out_ind, *args, **kwargs):
    """ Tensor operation: Generalized inner and outer products

    A broad class of blocked algorithms and patterns can be specified with a
    concise multi-index notation.  The ``blockwise`` function applies an in-memory
    function across multiple blocks of multiple inputs in a variety of ways.
    Many dask.array operations are special cases of blockwise including
    elementwise, broadcasting, reductions, tensordot, and transpose.

    Parameters
    ----------
    func : callable
        Function to apply to individual tuples of blocks
    out_ind : iterable
        Block pattern of the output, something like 'ijk' or (1, 2, 3)
    *args : sequence of Array, index pairs
        Sequence like (x, 'ij', y, 'jk', z, 'i')
    **kwargs : dict
        Extra keyword arguments to pass to function
    dtype : np.dtype
        Datatype of resulting array.
    concatenate : bool, keyword only
        If true concatenate arrays along dummy indices, else provide lists
    adjust_chunks : dict
        Dictionary mapping index to function to be applied to chunk sizes
    new_axes : dict, keyword only
        New indexes and their dimension lengths

    Examples
    --------
    2D embarrassingly parallel operation from two arrays, x, and y.

    >>> z = blockwise(operator.add, 'ij', x, 'ij', y, 'ij', dtype='f8')  # z = x + y  # doctest: +SKIP

    Outer product multiplying x by y, two 1-d vectors

    >>> z = blockwise(operator.mul, 'ij', x, 'i', y, 'j', dtype='f8')  # doctest: +SKIP

    z = x.T

    >>> z = blockwise(np.transpose, 'ji', x, 'ij', dtype=x.dtype)  # doctest: +SKIP

    The transpose case above is illustrative because it does the same transposition
    both on each in-memory block by calling ``np.transpose`` and on the order
    of the blocks themselves, by switching the order of the index ``ij -> ji``.

    We can compose these same patterns with more variables and more complex
    in-memory functions

    z = X + Y.T

    >>> z = blockwise(lambda x, y: x + y.T, 'ij', x, 'ij', y, 'ji', dtype='f8')  # doctest: +SKIP

    Any index, like ``i`` missing from the output index is interpreted as a
    contraction (note that this differs from Einstein convention; repeated
    indices do not imply contraction.)  In the case of a contraction the passed
    function should expect an iterable of blocks on any array that holds that
    index.  To receive arrays concatenated along contracted dimensions instead
    pass ``concatenate=True``.

    Inner product multiplying x by y, two 1-d vectors

    >>> def sequence_dot(x_blocks, y_blocks):
    ...     result = 0
    ...     for x, y in zip(x_blocks, y_blocks):
    ...         result += x.dot(y)
    ...     return result

    >>> z = blockwise(sequence_dot, '', x, 'i', y, 'i', dtype='f8')  # doctest: +SKIP

    Add new single-chunk dimensions with the ``new_axes=`` keyword, including
    the length of the new dimension.  New dimensions will always be in a single
    chunk.

    >>> def f(x):
    ...     return x[:, None] * np.ones((1, 5))

    >>> z = blockwise(f, 'az', x, 'a', new_axes={'z': 5}, dtype=x.dtype)  # doctest: +SKIP

    New dimensions can also be multi-chunk by specifying a tuple of chunk
    sizes.  This has limited utility as is (because the chunks are all the
    same), but the resulting graph can be modified to achieve more useful
    results (see ``da.map_blocks``).

    >>> z = blockwise(f, 'az', x, 'a', new_axes={'z': (5, 5)}, dtype=x.dtype)  # doctest: +SKIP

    If the applied function changes the size of each chunk you can specify this
    with a ``adjust_chunks={...}`` dictionary holding a function for each index
    that modifies the dimension size in that index.

    >>> def double(x):
    ...     return np.concatenate([x, x])

    >>> y = blockwise(double, 'ij', x, 'ij',
    ...               adjust_chunks={'i': lambda n: 2 * n}, dtype=x.dtype)  # doctest: +SKIP

    Include literals by indexing with None

    >>> y = blockwise(add, 'ij', x, 'ij', 1234, None, dtype=x.dtype)  # doctest: +SKIP
    """
    out = kwargs.pop('name', None)      # May be None at this point
    token = kwargs.pop('token', None)
    dtype = kwargs.pop('dtype', None)
    adjust_chunks = kwargs.pop('adjust_chunks', None)
    new_axes = kwargs.pop('new_axes', {})
    align_arrays = kwargs.pop('align_arrays', True)

    # Input Validation
    if len(set(out_ind)) != len(out_ind):
        raise ValueError("Repeated elements not allowed in output index",
                         [k for k, v in toolz.frequencies(out_ind).items() if v > 1])
    new = (set(out_ind)
           - {a for arg in args[1::2] if arg is not None for a in arg}
           - set(new_axes or ()))
    if new:
        raise ValueError("Unknown dimension", new)

    from .core import Array, unify_chunks, normalize_arg

    if dtype is None:
        raise ValueError("Must specify dtype of output array")

    if align_arrays:
        chunkss, arrays = unify_chunks(*args)
    else:
        arginds = [(a, i) for (a, i) in toolz.partition(2, args) if i is not None]
        if arginds:
            arg, ind = max(arginds, key=lambda ai: len(ai[1]))
            chunkss = dict(zip(ind, arg.chunks))
        else:
            chunkss = {}
        arrays = args[::2]

    for k, v in new_axes.items():
        if not isinstance(v, tuple):
            v = (v,)
        chunkss[k] = v
    arginds = list(zip(arrays, args[1::2]))

    for arg, ind in arginds:
        if hasattr(arg, 'ndim') and hasattr(ind, '__len__') and arg.ndim != len(ind):
            raise ValueError("Index string %s does not match array dimension %d"
                             % (ind, arg.ndim))

    numblocks = {a.name: a.numblocks for a, ind in arginds if ind is not None}

    dependencies = []
    arrays = []

    # Normalize arguments
    argindsstr = []
    for a, ind in arginds:
        if ind is None:
            a = normalize_arg(a)
            a, collections = unpack_collections(a)
            dependencies.extend(collections)
        else:
            arrays.append(a)
            a = a.name
        argindsstr.extend((a, ind))

    # Normalize keyword arguments
    kwargs2 = {}
    for k, v in kwargs.items():
        v = normalize_arg(v)
        v, collections = unpack_collections(v)
        dependencies.extend(collections)
        kwargs2[k] = v

    # Finish up the name
    if not out:
        out = '%s-%s' % (token or utils.funcname(func).strip('_'),
                         base.tokenize(func, out_ind, argindsstr, dtype, **kwargs))

    graph = core_blockwise(func, out, out_ind, *argindsstr, numblocks=numblocks,
                           dependencies=dependencies, new_axes=new_axes, **kwargs2)
    graph = HighLevelGraph.from_collections(out, graph,
                                            dependencies=arrays + dependencies)

    chunks = [chunkss[i] for i in out_ind]
    if adjust_chunks:
        for i, ind in enumerate(out_ind):
            if ind in adjust_chunks:
                if callable(adjust_chunks[ind]):
                    chunks[i] = tuple(map(adjust_chunks[ind], chunks[i]))
                elif isinstance(adjust_chunks[ind], numbers.Integral):
                    chunks[i] = tuple(adjust_chunks[ind] for _ in chunks[i])
                elif isinstance(adjust_chunks[ind], (tuple, list)):
                    chunks[i] = tuple(adjust_chunks[ind])
                else:
                    raise NotImplementedError(
                        "adjust_chunks values must be callable, int, or tuple")
    chunks = tuple(chunks)

    return Array(graph, out, chunks, dtype=dtype)
Example #53
corpus = [dic.doc2bow(s) for s in sentences]

lda = LdaModel(corpus=corpus,
               id2word=dic,
               num_topics=topic_num,
               alpha=alpha,
               random_state=1)

doc_topics = [lda[c] for c in corpus]

avg_doc_topics = mean([len(t) for t in doc_topics])

print(f"topics num of doc = {avg_doc_topics}")

topic_freq = frequencies([t[0] for dt in doc_topics for t in dt])

print('----------')

for i in range(topic_num):
    items = [(dic[t[0]], t[1]) for t in lda.get_topic_terms(i, topn=5)]
    freq = topic_freq[i] if i in topic_freq else 0

    print(f"topic_id = {i}, freq = {freq}, items = {items}")

print('----------')

for i in range(len(corpus)):
    dts = lda.get_document_topics(corpus[i], per_word_topics=True)

    for dt in dts[2]:
Example #54
def test_frequencies():
    frequencies(big_data)