Exemplo n.º 1
0
def sum_folder(channel):
    """Execnet worker that folds task results into a running sum.

    The set-up data returned by ``initialize_channel(channel)`` must provide:

    * ``instance`` and ``folder_name`` -- ``getattr(instance, folder_name)``
      is the callable applied to every task payload;
    * ``kwargs`` (optional) -- extra keyword arguments for that callable.

    For each ``('task', payload)`` item received, the callable's results are
    concatenated onto the running ``result`` and summed per index key
    (presumably pandas Series/DataFrames sharing index levels -- confirm
    against the callers). On every tenth intermediate result the lower half of
    the sorted accumulator is pickled and sent back as ``('result', ...)`` so
    the worker does not hold everything in memory. After every item a
    ``('message', 'send_next')`` is sent.

    On ``('message', 'terminate')`` whatever is still accumulated is sent and
    the loop ends.
    """
    import pickle
    import logging

    from more_itertools import peekable
    import pandas as pd

    from fowler.corpora.execnet import initialize_channel

    _, data = initialize_channel(channel)

    # NOTE(review): 'fum_folder' looks like a typo of 'sum_folder'; the name is
    # kept because logging configuration elsewhere may refer to it.
    logger = logging.getLogger('execnet.fum_folder')

    kwargs = data.get('kwargs', {})
    instance = data['instance']
    folder_name = data['folder_name']
    folder = getattr(instance, folder_name)

    result = None
    for item in channel:

        if item == ('message', 'terminate'):
            if result is not None:
                logger.debug('Sending the final result, size: %s', len(result))
                channel.send(('result', pickle.dumps(result)))
            break

        type_, data = item
        if type_ == 'task':

            # peekable() makes the lazy result stream truth-testable, so an
            # empty task can be skipped without consuming anything.
            intermediate_results = peekable(enumerate(folder(data, **kwargs)))

            if intermediate_results:
                if result is None:
                    # The very first result seeds the accumulator.
                    _, result = next(intermediate_results)

                # TODO: It would be nice to catch any exception here
                # (especially one that happens inside of the folder() call)
                # and report it to the master.
                # Same applies to the next() call above.
                for i, r in intermediate_results:
                    logger.debug('Iteration: %s, result size: %s', i,
                                 len(result))

                    # Merge the new piece and add up rows sharing an index key.
                    result = pd.concat(
                        [result, r],
                        copy=False,
                    ).groupby(level=result.index.names).sum()

                    if (i % 10) == 9:
                        # NOTE(review): legacy pandas API (removed in later
                        # pandas releases); kept to stay compatible with the
                        # pandas version this file targets.
                        result.sort(ascending=False, inplace=True)

                        half = len(result) // 2
                        # With fewer than two rows ``half`` is 0, and
                        # ``head(-0)`` is ``head(0)``: the accumulator would be
                        # emptied without its content ever being sent. Only
                        # flush when there is something to split off.
                        if half:
                            logger.debug(
                                'Sending a result. Result size: %s', half)
                            channel.send(
                                ('result', pickle.dumps(result.tail(half))))
                            result = result.head(-half)

        channel.send(('message', 'send_next'))
Exemplo n.º 2
0
def verb_space_builder(channel):
    """Execnet worker that builds verb tensors from subject/object vectors.

    The set-up data returned by ``initialize_channel(channel)`` must contain
    ``space_file``; the space read from it is indexed by ``(stem, tag)``.

    Every ``('task', payload)`` item carries a pickled
    ``((verb_stem, verb_tag), group)`` pair, where ``group`` exposes the
    columns ``subj_stem``, ``subj_tag``, ``obj_stem``, ``obj_tag`` and
    ``count`` (presumably a pandas DataFrame -- confirm against the master).
    For each row the Kronecker product of the subject and object vectors,
    scaled by ``count``, is added to the tensor stored under
    ``(verb_stem, verb_tag)``. After every item a ``('message', 'send_next')``
    is sent.

    On ``('message', 'terminate')`` the accumulated dictionary is pickled and
    sent as ``('result', ...)`` if it is non-empty, and the loop ends.
    """
    import pickle
    from scipy import sparse

    from fowler.corpora.execnet import logger, initialize_channel
    from fowler.corpora.models import read_space_from_file

    _, data = initialize_channel(channel)
    space = read_space_from_file(data['space_file'])

    result = {}
    for item in channel:

        if item == ('message', 'terminate'):
            if result:
                channel.send(('result', pickle.dumps(result)))
            break

        type_, data = item
        if type_ == 'task':
            # NOTE: pickle.loads() must only ever see payloads produced by the
            # trusted master process.
            (verb_stem, verb_tag), group = pickle.loads(data)

            logger.debug(
                'Processing verb %s_%s with %s argument pairs.',
                verb_stem,
                verb_tag,
                len(group),
                )

            for subj_stem, subj_tag, obj_stem, obj_tag, count in group[['subj_stem', 'subj_tag', 'obj_stem', 'obj_tag', 'count']].values:

                try:
                    subject_vector = space[subj_stem, subj_tag]
                    object_vector = space[obj_stem, obj_tag]
                except KeyError:
                    # Arguments missing from the space are skipped silently.
                    continue

                if not subject_vector.size:
                    logger.warning('Subject %s %s is empty!', subj_stem, subj_tag)
                    continue

                if not object_vector.size:
                    logger.warning('Object %s %s is empty!', obj_stem, obj_tag)
                    continue

                subject_object_tensor = sparse.kron(subject_vector, object_vector)
                t = subject_object_tensor * count

                # The dictionary is keyed by the (stem, tag) pair, so the
                # membership test has to use the same pair; testing the bare
                # stem never matches and every row would overwrite the sum.
                if (verb_stem, verb_tag) not in result:
                    result[verb_stem, verb_tag] = t
                else:
                    result[verb_stem, verb_tag] += t

        channel.send(('message', 'send_next'))
Exemplo n.º 3
0
def sum_folder(channel):
    """Accumulate per-task results on an execnet worker and stream them back.

    ``initialize_channel(channel)`` supplies the set-up data:

    * ``instance`` / ``folder_name`` -- the task handler is
      ``getattr(instance, folder_name)``;
    * ``kwargs`` (optional) -- keyword arguments forwarded to the handler.

    Each ``('task', payload)`` item is handed to the handler. Its results are
    concatenated with the running ``result`` and summed per index key
    (presumably pandas objects with common index levels -- confirm with the
    callers). Every tenth intermediate result triggers a partial flush: the
    accumulator is sorted and its tail half is pickled and sent as
    ``('result', ...)``. A ``('message', 'send_next')`` follows every item.

    ``('message', 'terminate')`` sends the remaining accumulator, if any, and
    stops the worker loop.
    """
    import pickle
    import logging

    from more_itertools import peekable
    import pandas as pd

    from fowler.corpora.execnet import initialize_channel

    _, data = initialize_channel(channel)

    # NOTE(review): probably meant 'execnet.sum_folder'; left unchanged since
    # existing logging configuration may depend on this exact name.
    logger = logging.getLogger('execnet.fum_folder')

    kwargs = data.get('kwargs', {})
    instance = data['instance']
    folder_name = data['folder_name']
    folder = getattr(instance, folder_name)

    result = None
    for item in channel:

        if item == ('message', 'terminate'):
            if result is not None:
                logger.debug('Sending the final result, size: %s', len(result))
                channel.send(('result', pickle.dumps(result)))
            break

        type_, data = item
        if type_ == 'task':

            # Wrapping in peekable() allows the emptiness check below without
            # consuming the first element of the lazy stream.
            intermediate_results = peekable(enumerate(folder(data, **kwargs)))

            if intermediate_results:
                if result is None:
                    # Nothing accumulated yet: start from the first result.
                    _, result = next(intermediate_results)

                # TODO: It would be nice to catch any exception here
                # (especially one that happens inside of the folder() call)
                # and report it to the master.
                # Same applies to the next() call above.
                for i, r in intermediate_results:
                    logger.debug('Iteration: %s, result size: %s', i, len(result))

                    # Append the new piece, then collapse duplicate index keys
                    # by summing them.
                    result = pd.concat(
                        [result, r],
                        copy=False,
                    ).groupby(level=result.index.names).sum()

                    if (i % 10) == 9:
                        # NOTE(review): this is the legacy pandas sorting API,
                        # retained for the pandas version the file was written
                        # against.
                        result.sort(ascending=False, inplace=True)

                        half = len(result) // 2
                        # ``head(-0)`` equals ``head(0)``, so a zero ``half``
                        # would wipe the accumulator while sending nothing
                        # useful. Skip the flush in that case.
                        if half:
                            logger.debug('Sending a result. Result size: %s', half)
                            channel.send(('result', pickle.dumps(result.tail(half))))
                            result = result.head(-half)

        channel.send(('message', 'send_next'))
Exemplo n.º 4
0
def verb_space_builder(channel):
    """Execnet worker summing subject-object Kronecker products per verb.

    The space is loaded from the ``space_file`` entry of the set-up data
    returned by ``initialize_channel(channel)`` and is looked up by
    ``(stem, tag)``.

    A ``('task', payload)`` item holds a pickled
    ``((verb_stem, verb_tag), group)`` pair. For every row of ``group`` the
    Kronecker product of the subject and object vectors is added to the entry
    stored under ``(verb_stem, verb_tag)``; rows whose arguments are missing
    from the space or have empty vectors are skipped. The ``count`` column is
    read but currently not used. Each received item is answered with
    ``('message', 'send_next')``.

    ``('message', 'terminate')`` sends the pickled accumulator as
    ``('result', ...)`` when it is non-empty and ends the loop.
    """
    import pickle

    from scipy import sparse

    from fowler.corpora.execnet import logger, initialize_channel
    from fowler.corpora.models import read_space_from_file

    _, data = initialize_channel(channel)
    space = read_space_from_file(data['space_file'])

    columns = ['subj_stem', 'subj_tag', 'obj_stem', 'obj_tag', 'count']

    result = {}
    for item in channel:

        if item == ('message', 'terminate'):
            if result:
                channel.send(('result', pickle.dumps(result)))
            break

        type_, data = item
        if type_ == 'task':
            (verb_stem, verb_tag), group = pickle.loads(data)
            verb = verb_stem, verb_tag

            logger.debug(
                'Processing verb %s_%s with %s argument pairs.',
                verb_stem,
                verb_tag,
                len(group),
            )

            for subj_stem, subj_tag, obj_stem, obj_tag, count in group[columns].values:
                # XXX consider only the triples for which `count > 1000`.

                try:
                    subject_vector = space[subj_stem, subj_tag]
                    object_vector = space[obj_stem, obj_tag]
                except KeyError:
                    # Don't log the exceptions as there are many of them!
                    continue

                # Empty argument vectors contribute nothing; they are skipped
                # without a warning.
                if not subject_vector.size or not object_vector.size:
                    continue

                # XXX multiply by the count?
                t = sparse.kron(subject_vector, object_vector)

                if verb in result:
                    result[verb] += t
                else:
                    result[verb] = t

        channel.send(('message', 'send_next'))