def affinities_r():
    """The reduction step of turning lists of votes on sr_ids into
    affinities.

    Defines a reducer over records carrying the fields ``account_id``,
    ``link_id``, ``sr_id`` and ``dir`` and passes it to
    ``mr_tools.mr_reduce``. For each group the reducer yields at most one
    ``(account_id, sr_id, affinity)`` tuple, where the affinity is the
    fraction of the group's votes whose ``dir`` equals ``'1'``.
    """
    @mr_tools.dataspec_r('account_id', 'link_id', 'sr_id', 'dir')
    def process(account_srid, affs):
        # every record in a group is assumed to carry the same account_id
        # and sr_id, so they are simply captured from the records as they
        # stream past
        account_id = None
        sr_id = None
        # floats, so the final division is never an integer division
        total_votes = 0.0
        up_votes = 0.0

        for vote in affs:
            if account_id is None:
                account_id, sr_id = vote.account_id, vote.sr_id
            total_votes += 1
            up_votes += 1 if vote.dir == '1' else 0

        # you must vote at least three times to ride
        if total_votes < 3:
            return
        yield account_id, sr_id, up_votes / total_votes

    mr_tools.mr_reduce(process)
def affinities_r():
    """Reduce grouped votes into one affinity per (account, sr_id) group.

    This is the reduction step of turning lists of votes on sr_ids into
    affinities: the inner reducer is handed to ``mr_tools.mr_reduce`` and
    yields ``(account_id, sr_id, ups / count)`` for every group that
    contains at least three votes. A vote counts towards ``ups`` when its
    ``dir`` field is the string ``'1'``.
    """
    @mr_tools.dataspec_r('account_id', 'link_id', 'sr_id', 'dir')
    def process(account_srid, affs):
        # All of the account_ids and sr_ids within one group are assumed
        # to be equal, so they are read off the incoming records rather
        # than parsed out of the group key.
        seen_account = seen_sr = None
        # Float counters keep the ratio below a true division.
        n_votes, n_ups = 0.0, 0.0

        for record in affs:
            if seen_account is None:
                seen_account = record.account_id
                seen_sr = record.sr_id
            n_votes += 1
            if record.dir == '1':
                n_ups += 1

        # you must vote at least three times to ride
        enough_votes = n_votes >= 3
        if enough_votes:
            affinity = n_ups / n_votes
            yield seen_account, seen_sr, affinity

    mr_tools.mr_reduce(process)
def write_matrix(out_cm, out_clabel, out_rlabel):
    """Reformat the affinities coming out of the functions above to the
    format wanted by skmeans (which is the format used by CLUTO,
    documented at
    <http://glaros.dtc.umn.edu/gkhome/fetch/sw/cluto/manual.pdf> in
    section 3.3.1).

    Three files are written:

    out_cm     -- a header line ``<rows> <distinct sr_ids> <entries>``
                  followed by one line per kept account made of
                  space-separated ``<sr index> <affinity>`` pairs
    out_clabel -- one account_id per line, in the order the matrix rows
                  were written
    out_rlabel -- one sr_id per line, ordered by the 1-based index it was
                  assigned in the matrix

    The matrix body is spooled through temporary files because the header
    counts are only known once every group has been reduced.
    """
    class Stats(object):
        __slots__ = ['num_srs', 'num_rows', 'sr_map', 'total_entries']

        def __init__(self):
            self.num_srs = self.num_rows = self.total_entries = 0
            # we can safely keep the whole sr_map around in memory
            # like this because we have fewer than 100k of them
            self.sr_map = {}

    def cp_fds(infd, outfd, buffsize=1024 * 1024):
        # rewind the spool file and copy it across in fixed-size chunks
        infd.flush()
        infd.seek(0)
        while True:
            readed = infd.read(buffsize)
            if not readed:
                break
            outfd.write(readed)

    stats = Stats()

    # The spool files are opened in text mode because only str is written
    # to them, and they are used as context managers so they are closed
    # (and therefore removed) even if the reduce step raises.
    with tempfile.TemporaryFile(mode='w+') as f_cm:
        with tempfile.TemporaryFile(mode='w+') as f_cl:

            @mr_tools.dataspec_r('sr_id', ('affinity', float))
            def _reduce(account_id, affs):
                kept = []
                for aff in affs:
                    # the affinities we get are from 0..1, but skmeans
                    # wants -1..1
                    aff.affinity = aff.affinity * 2 - 1
                    # skmeans really doesn't like rows consisting
                    # entirely of zeroes, so near-zero entries are dropped
                    if not (-0.001 < aff.affinity < 0.001):
                        kept.append(aff)

                # a real list (not a lazy filter object) so that the
                # emptiness check and len() below are reliable
                if not kept:
                    return []

                stats.num_rows += 1
                for aff in kept:
                    if aff.sr_id not in stats.sr_map:
                        stats.num_srs += 1
                        # CLUTO's matrices are 1-based
                        stats.sr_map[aff.sr_id] = stats.num_srs
                stats.total_entries += len(kept)

                f_cl.write('%s\n' % (account_id,))
                f_cm.write(' '.join('%s %s' % (stats.sr_map[aff.sr_id],
                                               aff.affinity)
                                    for aff in kept))
                f_cm.write('\n')

                # everything is written as a side effect; nothing is
                # emitted from the reducer itself
                return []

            mr_tools.mr_reduce(_reduce)

            with open(out_cm, 'w') as outfd:
                outfd.write('%d %d %d\n' % (stats.num_rows,
                                            len(stats.sr_map),
                                            stats.total_entries))
                cp_fds(f_cm, outfd)

            with open(out_clabel, 'w') as outfd:
                cp_fds(f_cl, outfd)

    with open(out_rlabel, 'w') as outfd:
        # indices are unique, so sorting the keys by their index lists
        # the sr_ids in matrix order
        for sr_id in sorted(stats.sr_map, key=stats.sr_map.get):
            outfd.write('%s\n' % (sr_id,))
def write_matrix(out_cm, out_clabel, out_rlabel):
    """Reformat the affinities coming out of the functions above to the
    format wanted by skmeans (which is the format used by CLUTO,
    documented at
    <http://glaros.dtc.umn.edu/gkhome/fetch/sw/cluto/manual.pdf> in
    section 3.3.1).

    Writes three files:

    * ``out_cm``: header ``<rows> <distinct sr_ids> <entries>``, then one
      line per kept account of space-separated ``<sr index> <affinity>``
      pairs.
    * ``out_clabel``: the account_id of each matrix row, one per line.
    * ``out_rlabel``: the sr_id behind each 1-based sr index, one per
      line, in index order.

    Row data is buffered in temporary files because the header can only
    be written after all groups have been processed.
    """
    class Stats(object):
        __slots__ = ['num_srs', 'num_rows', 'sr_map', 'total_entries']

        def __init__(self):
            self.num_srs = self.num_rows = self.total_entries = 0
            # we can safely keep the whole sr_map around in memory
            # like this because we have fewer than 100k of them
            self.sr_map = {}

    stats = Stats()

    # Text mode: only str is ever written to these buffers.
    f_cm = tempfile.TemporaryFile(mode='w+')
    f_cl = tempfile.TemporaryFile(mode='w+')

    @mr_tools.dataspec_r('sr_id', ('affinity', float))
    def _reduce(account_id, affs):
        affs = list(affs)
        for aff in affs:
            # the affinities we get are from 0..1, but skmeans wants
            # -1..1
            aff.affinity = aff.affinity * 2 - 1

        # skmeans really doesn't like rows consisting entirely of
        # zeroes; built as a list so that truthiness and len() work no
        # matter what filter() would have returned
        affs = [aff for aff in affs
                if not (-0.001 < aff.affinity < 0.001)]
        if not affs:
            return []

        stats.num_rows += 1
        for aff in affs:
            if aff.sr_id not in stats.sr_map:
                stats.num_srs += 1
                # CLUTO's matrices are 1-based
                stats.sr_map[aff.sr_id] = stats.num_srs
        stats.total_entries += len(affs)

        f_cl.write('%s\n' % (account_id,))
        f_cm.write(' '.join('%s %s' % (stats.sr_map[aff.sr_id], aff.affinity)
                            for aff in affs))
        f_cm.write('\n')

        # output happens via the buffers above, not via reducer results
        return []

    def cp_fds(infd, outfd, buffsize=1024 * 1024):
        # replay a buffer from its start into outfd, a chunk at a time
        infd.flush()
        infd.seek(0)
        while True:
            readed = infd.read(buffsize)
            if readed:
                outfd.write(readed)
            else:
                break

    try:
        mr_tools.mr_reduce(_reduce)

        with open(out_cm, 'w') as outfd:
            outfd.write('%d %d %d\n' % (stats.num_rows,
                                        len(stats.sr_map),
                                        stats.total_entries))
            cp_fds(f_cm, outfd)

        with open(out_clabel, 'w') as outfd:
            cp_fds(f_cl, outfd)
    finally:
        # release the temporary buffers even when the reduce or the
        # copies fail; previously they were never closed explicitly
        try:
            f_cm.close()
        finally:
            f_cl.close()

    with open(out_rlabel, 'w') as outfd:
        # each sr_id has a unique index, so ordering the keys by index
        # reproduces the matrix order
        for sr_id in sorted(stats.sr_map, key=stats.sr_map.get):
            outfd.write('%s\n' % (sr_id,))