示例#1
0
def affinities_r():
    """Reduce step: collapse each group of vote records into a single
       affinity, i.e. the fraction of the group's votes whose dir is
       '1'"""
    @mr_tools.dataspec_r('account_id', 'link_id', 'sr_id', 'dir')
    def process(account_srid, affs):
        # every record in a group is assumed to share one account_id
        # and one sr_id, so both are captured from the first record
        # seen rather than from the (unused) grouping key
        voter = subreddit = None
        total = upvotes = 0.0

        for vote in affs:
            if voter is None:
                voter, subreddit = vote.account_id, vote.sr_id

            total += 1
            if vote.dir == '1':
                upvotes += 1

        # groups with fewer than three votes emit nothing
        if total < 3:
            return

        yield voter, subreddit, upvotes / total

    # presumably mr_reduce feeds grouped input records to process and
    # prints whatever it yields -- confirm against mr_tools
    mr_tools.mr_reduce(process)
示例#2
0
def affinities_r():
    """Reducer that turns the votes recorded for one group into an
       affinity score (upvotes divided by total votes)"""

    record_fields = ('account_id', 'link_id', 'sr_id', 'dir')

    @mr_tools.dataspec_r(*record_fields)
    def process(account_srid, affs):
        # the group key itself is not consulted: the ids are read off
        # the records, which are all assumed to agree with each other
        acct = srid = None
        n_votes = 0.0
        n_ups = 0.0

        for rec in affs:
            if acct is None:
                acct = rec.account_id
                srid = rec.sr_id

            n_votes += 1
            n_ups += 1 if rec.dir == '1' else 0

        # a minimum of three votes is required before anything is
        # reported for the group
        if n_votes >= 3:
            affinity = n_ups / n_votes
            yield acct, srid, affinity

    mr_tools.mr_reduce(process)
示例#3
0
def write_matrix(out_cm, out_clabel, out_rlabel):
    """Reformat the affinities coming out of the functions above to
       the format wanted by skmeans (which is the format used by
       CLUTO, documented at
       <http://glaros.dtc.umn.edu/gkhome/fetch/sw/cluto/manual.pdf> in
       section 3.3.1)

       out_cm     -- path of the matrix file: a header line
                     "<rows> <columns> <entries>" followed by one line
                     of "<column> <value>" pairs per kept account
       out_clabel -- path that receives one account_id per line, in
                     the same order as the matrix rows
       out_rlabel -- path that receives one sr_id per line, ordered by
                     the 1-based column number assigned to that sr_id

       Rows are staged in temporary files because the header counts
       are only known after every row has been reduced."""
    class Stats(object):
        # mutable holder so the nested reducer can update the running
        # counters
        __slots__ = ['num_srs', 'num_rows', 'sr_map', 'total_entries']

        def __init__(self):
            self.num_srs = self.num_rows = self.total_entries = 0

            # we can safely keep the whole sr_map around in memory
            # like this because we have fewer than 100k of them
            self.sr_map = {}

    stats = Stats()

    def cp_fds(infd, outfd, buffsize=1024 * 1024):
        # rewind the staged file and copy it over in bounded chunks
        infd.flush()
        infd.seek(0)
        while True:
            readed = infd.read(buffsize)
            if not readed:
                break
            outfd.write(readed)

    # The staging files are opened in text mode so they match the
    # text-mode output files they are copied into (the default mode,
    # 'w+b', is binary), and inside a with-block so that they are
    # closed even when the reduce or one of the writes fails.
    with tempfile.TemporaryFile(mode='w+') as f_cm, \
            tempfile.TemporaryFile(mode='w+') as f_cl:

        # presumably dataspec_r parses each record into an object with
        # an sr_id and a float affinity -- confirm against mr_tools
        @mr_tools.dataspec_r('sr_id', ('affinity', float))
        def _reduce(account_id, affs):
            affs = list(affs)
            for aff in affs:
                # the affinities we get are from 0..1, but skmeans
                # wants -1..1
                aff.affinity = aff.affinity * 2 - 1

            # skmeans really doesn't like rows consisting entirely of
            # zeroes, so near-zero entries are dropped.  A real list
            # is built (rather than calling filter()) so that the
            # emptiness check and len() below work even where filter()
            # is lazy.
            affs = [aff for aff in affs
                    if not (-0.001 < aff.affinity < 0.001)]
            if not affs:
                return []

            stats.num_rows += 1

            for aff in affs:
                if aff.sr_id not in stats.sr_map:
                    # CLUTO's matrices are 1-based
                    stats.num_srs += 1
                    stats.sr_map[aff.sr_id] = stats.num_srs

            stats.total_entries += len(affs)

            f_cl.write('%s\n' % (account_id, ))

            pairs = ['%s %s' % (stats.sr_map[aff.sr_id], aff.affinity)
                     for aff in affs]
            f_cm.write(' '.join(pairs))
            f_cm.write('\n')

            # everything is emitted through the staging files, so the
            # reducer itself yields no records
            return []

        mr_tools.mr_reduce(_reduce)

        with open(out_cm, 'w') as outfd:
            outfd.write('%d %d %d\n' % (stats.num_rows,
                                        len(stats.sr_map),
                                        stats.total_entries))
            cp_fds(f_cm, outfd)

        with open(out_clabel, 'w') as outfd:
            cp_fds(f_cl, outfd)

    with open(out_rlabel, 'w') as outfd:
        # one sr_id per line, in the order of their column numbers
        for sr_id in sorted(stats.sr_map, key=stats.sr_map.get):
            outfd.write('%s\n' % (sr_id, ))
示例#4
0
def write_matrix(out_cm, out_clabel, out_rlabel):
    """Reformat the affinities coming out of the functions above to
       the format wanted by skmeans (which is the format used by
       CLUTO, documented at
       <http://glaros.dtc.umn.edu/gkhome/fetch/sw/cluto/manual.pdf> in
       section 3.3.1)

       out_cm     -- path of the matrix file; its first line is
                     "<rows> <columns> <entries>" and every following
                     line holds the "<column> <value>" pairs of one
                     kept account
       out_clabel -- path that gets one account_id per line, in matrix
                     row order
       out_rlabel -- path that gets one sr_id per line, sorted by the
                     1-based column number given to that sr_id

       The matrix rows and account labels are buffered in temporary
       files first, since the header line needs totals that are only
       available once the whole input has been reduced."""

    class Stats(object):
        # counters that the nested reducer updates as it goes
        __slots__ = ['num_srs', 'num_rows', 'sr_map', 'total_entries']

        def __init__(self):
            self.num_srs = self.num_rows = self.total_entries = 0

            # we can safely keep the whole sr_map around in memory
            # like this because we have fewer than 100k of them
            self.sr_map = {}

    stats = Stats()

    def cp_fds(infd, outfd, buffsize=1024 * 1024):
        # copy infd from its beginning into outfd, buffsize at a time
        infd.flush()
        infd.seek(0)
        while True:
            readed = infd.read(buffsize)
            if readed:
                outfd.write(readed)
            else:
                break

    # mode='w+' keeps the temporary files in text mode, like the
    # output files opened with 'w' below (TemporaryFile defaults to
    # the binary 'w+b').  The with-block guarantees both temporary
    # files are closed on every exit path instead of being left open.
    with tempfile.TemporaryFile(mode='w+') as f_cm, \
            tempfile.TemporaryFile(mode='w+') as f_cl:

        # NOTE(review): dataspec_r looks like it maps each record onto
        # an object exposing sr_id and a float affinity -- confirm
        @mr_tools.dataspec_r('sr_id',
                             ('affinity', float))
        def _reduce(account_id, affs):
            affs = list(affs)
            for aff in affs:
                # the affinities we get are from 0..1, but skmeans
                # wants -1..1
                aff.affinity = aff.affinity * 2 - 1

            # skmeans really doesn't like rows consisting entirely of
            # zeroes.  Use a list comprehension, not filter(): the
            # result must be a list for the truth test and the len()
            # call below, and filter() is not guaranteed to return one.
            affs = [aff for aff in affs
                    if not (-0.001 < aff.affinity < 0.001)]
            if not affs:
                return []

            stats.num_rows += 1

            for aff in affs:
                if aff.sr_id not in stats.sr_map:
                    stats.num_srs += 1
                    # CLUTO's matrices are 1-based
                    stats.sr_map[aff.sr_id] = stats.num_srs

            stats.total_entries += len(affs)

            f_cl.write('%s\n' % (account_id,))

            f_cm.write(' '.join(
                '%s %s' % (stats.sr_map[aff.sr_id], aff.affinity)
                for aff in affs))
            f_cm.write('\n')

            # output goes to the temporary files; nothing is returned
            # to the reduce driver
            return []

        mr_tools.mr_reduce(_reduce)

        with open(out_cm, 'w') as outfd:
            header = (stats.num_rows, len(stats.sr_map),
                      stats.total_entries)
            outfd.write('%d %d %d\n' % header)
            cp_fds(f_cm, outfd)

        with open(out_clabel, 'w') as outfd:
            cp_fds(f_cl, outfd)

    with open(out_rlabel, 'w') as outfd:
        # sr_ids in ascending order of their assigned column number
        for sr_id in sorted(stats.sr_map, key=stats.sr_map.get):
            outfd.write('%s\n' % (sr_id,))