Example #1
 def __init__(self):
   sqler = get_sqler()
   self.sq = sqler.db
   self.cursor = self.sq.cursor()
   self.fg = FeatureGetter()
             
   self.dump_table = 'outer_products'
   self.request = "INSERT INTO "+self.dump_table+" (req_id, data) VALUES (%s, %s)"
   req_per_node = self.NUM_COUCHREQUESTS/comm_size 
   lower = comm_rank*req_per_node + self.START_OFFSET
   upper = req_per_node
   print 'node %d computes %d to %d'%(comm_rank, lower, lower+upper)
   if comm_rank == comm_size-1:
     # The last rank doubles its slice so the LIMIT query covers the remaining rows
     upper *= 2
   # Get the req_ids for this node
   req = "SELECT req_id from " + self.dump_table
   self.cursor.execute(req)
   ids = self.cursor.fetchall()
   self.existent_req_ids = [int(x[0]) for x in ids]
   
   request = "SELECT id, host_user_id, surf_user_id FROM couchrequest limit "\
     + str(lower) + ", " + str(upper)
   print 'get the req_user_map database...'
   t = time.time()
   self.cursor.execute(request)
   t -= time.time()
   print 'took %f secs'%(-t)
   rows = self.cursor.fetchall()
   self.req_user_map = {int(row[0]):(int(row[1]),int(row[2])) for row in rows}
   print 'len user_map: %d'%len(self.req_user_map)
   for r in self.existent_req_ids:
     if r in self.req_user_map:
       self.req_user_map.pop(r)
   print 'reduced len user_map: %d'%len(self.req_user_map)
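The lower/upper arithmetic above splits the couchrequest table evenly across MPI ranks and feeds a SQL LIMIT clause, with the last rank doubling its slice to cover the remainder. A minimal standalone sketch of the same partitioning (comm_size is a made-up value here; the constants mirror the class attributes in Example #3):

NUM_COUCHREQUESTS = 10928173   # mirrors the class attribute in Example #3
START_OFFSET = 0
comm_size = 4                  # hypothetical number of MPI ranks

for comm_rank in range(comm_size):
  req_per_node = NUM_COUCHREQUESTS // comm_size
  lower = comm_rank * req_per_node + START_OFFSET
  upper = req_per_node
  if comm_rank == comm_size - 1:
    upper *= 2                 # last rank doubles its slice to cover the remainder
  print('rank %d reads LIMIT %d, %d' % (comm_rank, lower, upper))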
Example #2
File: dolearning.py  Project: sergeyk/csrec
from math import sqrt
from time import time
import numpy as np


# get data (Tobi)
testing = True
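# CompetitorSetCollection is provided elsewhere in the csrec project; its import is not part of this excerpt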
dataobject = CompetitorSetCollection(testing=testing,validation=False)
#print dataobject.get_nsamples() # N
#print dataobject.get_sample(17) # yields a competitorset
# TODO: Tobi - put your stuff here

# get featuremethod (Ron)
from features.user_features import FeatureGetter

fg = FeatureGetter(testing)
#print fg.get_features(907345, 907345, 1)
dimension = fg.get_dimension()


# create SGD object, sample different competitorsets, and do learning
from gradientdescent import SGDLearning
from gradientdescent_personalization import SGDLearningPersonalized
import random


# it is better to load all competitorsets at once and then do learning fast
traindataobject = CompetitorSetCollection(testing=True,validation=False)
Ntrain = traindataobject.get_nsamples()
competitorsets_train = [traindataobject.get_sample(i) for i in xrange(Ntrain)]
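The excerpt stops right after loading the training competitorsets; the comments promise "create SGD object ... and do learning", but the loop itself is not shown. A minimal sketch of how it could continue, assuming SGDLearning takes the feature dimension and feature function like the learner constructors in Example #4, and reusing that example's update(...) signature and 1/sqrt learning-rate decay (alpha, beta, and the lambda values are placeholders):

sgd = SGDLearning(dimension, fg.get_features)  # constructor arguments assumed by analogy with Example #4
alpha, beta = 1.0, 0.1                         # placeholder decay constants
random.shuffle(competitorsets_train)
for i, cs in enumerate(competitorsets_train):
    eta_t = 1 / sqrt(alpha + i * beta)         # same 1/sqrt decay as in train.py
    sgd.update(cs, eta=eta_t, lambda_winner=0.01, lambda_reject=0.01)  # placeholder lambdas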
Example #3
class OuterProductDumper():
  
  NUM_COUCHREQUESTS = 10928173
  START_OFFSET = 0

  def __init__(self):
    sqler = get_sqler()
    self.sq = sqler.db
    self.cursor = self.sq.cursor()
    self.fg = FeatureGetter()
              
    self.dump_table = 'outer_products'
    self.request = "INSERT INTO "+self.dump_table+" (req_id, data) VALUES (%s, %s)"
    req_per_node = self.NUM_COUCHREQUESTS/comm_size 
    lower = comm_rank*req_per_node + self.START_OFFSET
    upper = req_per_node
    print 'node %d computes %d to %d'%(comm_rank, lower, lower+upper)
    if comm_rank == comm_size-1:
      # The last rank doubles its slice so the LIMIT query covers the remaining rows
      upper *= 2
    # Get the req_ids for this node
    req = "SELECT req_id from " + self.dump_table
    self.cursor.execute(req)
    ids = self.cursor.fetchall()
    self.existent_req_ids = [int(x[0]) for x in ids]
    
    request = "SELECT id, host_user_id, surf_user_id FROM couchrequest limit "\
      + str(lower) + ", " + str(upper)
    print 'get the req_user_map database...'
    t = time.time()
    self.cursor.execute(request)
    t -= time.time()
    print 'took %f secs'%(-t)
    rows = self.cursor.fetchall()
    self.req_user_map = {int(row[0]):(int(row[1]),int(row[2])) for row in rows}
    print 'len user_map: %d'%len(self.req_user_map)
    for r in self.existent_req_ids:
      if r in self.req_user_map:
        self.req_user_map.pop(r)
    print 'reduced len user_map: %d'%len(self.req_user_map)
    #embed()
    
  def dump_outer_product(self, datas):
    try:        
      self.cursor.executemany(self.request, datas)
    except MySQLdb.IntegrityError:
      pass
  
  def commit(self):
    self.sq.commit()
    
  def get_dicts(self, req_id):
    user1 = self.req_user_map[req_id][0]
    dict1 = load_data_for_user(self.cursor, user1)
    user2 = self.req_user_map[req_id][1]
    dict2 = load_data_for_user(self.cursor, user2)
    return (dict1, dict2)
  
  def get_features(self, req_id):
    (dict1, dict2) = self.get_dicts(req_id)
    data = self.fg.get_features_from_dct(dict1, dict2, req_id)
    return np.nonzero(data)[0].tolist()
    
  def execute(self):
    total_time = 0
    counter = 0
    commit_count = 0
    all_keys = self.req_user_map.keys()
    datas = []
    for req_idx, req_id in enumerate(all_keys):
      t = time.time()
      
      data = self.get_features(req_id)
      
      #print 'took %f sec'%(-t)
      counter += 1
      
      #print '%d dumps 100 rows'%comm_rank
      thedata = cPickle.dumps(data)
      datas.append((req_id, thedata))      
      
      t -= time.time()
      if counter % 10000 == 0 or req_idx == len(all_keys)-1:
        print '%s finished %s/%s' % (comm_rank, counter, 
                                     len(self.req_user_map.keys()))

        self.dump_outer_product(datas)
        datas = []
      total_time -= t
      
    print 'mean time: %f sec'%(total_time/float(counter))
    t = time.time()
    self.commit()
    t -= time.time()
    print 'commit took %f sec'%(-t)
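OuterProductDumper relies on comm_rank and comm_size, which are never defined in the class itself. A minimal sketch of the module-level setup the class appears to assume, using mpi4py (an assumption; the original import is not shown):

from mpi4py import MPI         # assumed source of the communicator

comm = MPI.COMM_WORLD
comm_rank = comm.Get_rank()    # used to pick this rank's slice of couchrequest rows
comm_size = comm.Get_size()

dumper = OuterProductDumper()  # builds the req_id -> (host_user_id, surf_user_id) map for this rank
dumper.execute()               # computes sparse feature indices and batch-inserts them
dumper.commit()                # flush the INSERTs into outer_products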
Example #4
File: train.py  Project: sergeyk/csrec
def run(cfg):
    lambdas = cfg.lambdas
    memory_for_personalized_parameters = cfg.memory_for_personalized_parameters
    percentage = cfg.train_percentage
    outer_iterations = cfg.outer_iterations
    nepoches = cfg.nepoches
    alpha = cfg.alpha
    beta = cfg.beta
    verbose = cfg.verbose
    personalization = cfg.personalization
    rhostsize = cfg.rhostsize
    just_winning_sets = cfg.just_winning_sets
    testing = cfg.testing
    dirname = cfg.train_dirname

    if comm_rank == 0:
        print "using lambdas:", lambdas
    fg = FeatureGetter()
    if cfg.god_mode:
        featuredimension = 1
    else:
        featuredimension = fg.get_dimension()
    get_feature_function = fg.get_features
    sq = get_sqler()
    overallnum_sets = sq.get_num_compsets("train")
    num_sets = int(overallnum_sets * percentage)

    for i in range(2, comm_size + 2, 3):
        if comm_rank == i or comm_rank == i - 1 or comm_rank == i - 2:
            print ("Machine %d/%d - Start loading %s competitorsets for TRAIN" % (comm_rank + 1, comm_size, num_sets))
            t0 = time.time()
            cs_train = CompetitorSetCollection(num_sets=num_sets, mode="train")
            t1 = time.time()
            print "Machine %d/%d - Finished loading the competitorsets for TRAIN." % (comm_rank, comm_size)
            print "Loading competitorsets took %s." % (t1 - t0)

        safebarrier(comm)

    # sleeping so that we don't overload the database
    sec = comm_rank
    print "machine %d is sleeping for %d sec." % (comm_rank, sec)
    time.sleep(sec)

    trainerrors = np.zeros((len(lambdas), len(lambdas)))
    testerrors = np.zeros((len(lambdas), len(lambdas)))
    trainmeannrank = np.zeros((len(lambdas), len(lambdas)))
    testmeannrank = np.zeros((len(lambdas), len(lambdas)))

    for lw in range(len(lambdas)):
        lambda_winner, lambda_reject = lambdas[lw]
        # Create sgd object
        if personalization:
            sgd = SGDLearningPersonalized(featuredimension, get_feature_function, memory_for_personalized_parameters)
        else:
            sgd = SGDLearningRHOSTHASH(featuredimension, get_feature_function, rhostsize=rhostsize)

        N = cs_train.get_nsamples()
        niter = int(N * nepoches)

        for outit in range(outer_iterations):
            # for each outer iteration we draw new samples iid per node
            sampleindices = []
            for _ in range(int(nepoches) + 1):
                sampleindices += range(N)

            random.shuffle(sampleindices)
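            # prefetch feature data (presumably the precomputed outer products) for the first
            # LOOK_AHEAD_LENGTH samples; the prefetch window is refreshed every
            # LOOK_AHEAD_LENGTH inner iterations below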
            update_lookahead_cnt = 0
            req_ids = cs_train.get_req_ids_for_samples(sampleindices[0:LOOK_AHEAD_LENGTH])
            fg.upt_out_prod_get(req_ids)

            for innerit in range(niter):
                i = outit * niter + innerit
                eta_t = 1 / sqrt(alpha + i * beta)
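                # eta_t decays as 1/sqrt(alpha + beta*i): large steps early on,
                # progressively smaller refinements later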
                if not i % (niter / 5):
                    print (
                        "Machine %d/%d - Iterations out: %d/%d - in: %d/%d - eta %f - lambda %f"
                        % (comm_rank, comm_size, outit + 1, outer_iterations, innerit + 1, niter, eta_t, lambda_winner)
                    )

                update_lookahead_cnt += 1
                if update_lookahead_cnt == LOOK_AHEAD_LENGTH:
                    req_ids = cs_train.get_req_ids_for_samples(sampleindices[innerit : innerit + LOOK_AHEAD_LENGTH])
                    fg.upt_out_prod_get(req_ids)
                    update_lookahead_cnt = 0

                # draw random sample - UPDATE: now first get a random permutation, then do it
                sampleindex = sampleindices[innerit]
                competitorset = cs_train.get_sample(sampleindex)

                for l in competitorset.get_surferlist():
                    assert l[1] in req_ids

                if verbose and not i % (niter / 5) and i > 1:
                    print (
                        "Iterations \n\tout: %d/%d \n\tin: %d/%d - eta %f - lambda %f"
                        % (outit + 1, outer_iterations, innerit + 1, niter, eta_t, lambda_winner)
                    )
                    print "\ttheta", min(sgd.theta), max(sgd.theta)
                    print "\tr", sgd.r
                    print "\tr_hosts", min(sgd.r_hosts), max(sgd.r_hosts)
                    print "\ttrue", competitorset.get_winner()
                    print "\tpredicted", sgd.predict(competitorset)
                    print "\tranking", sgd.rank(competitorset)
                sgd.update(competitorset, eta=eta_t, lambda_winner=lambda_winner, lambda_reject=lambda_reject)

            # Now we aggregate theta(_host), r(_host)
            print ("outer iteration %d/%d: node %d at safebarrier" % (outit + 1, outer_iterations, comm_rank))
            safebarrier(comm)

            if comm_rank == 0:
                print "all nodes arrived and we start allreduce/broadcasting"
            theta = comm.allreduce(sgd.theta) / float(comm_size)
            if comm_rank == 0:
                print "allreduce done for theta"
            if personalization:
                theta_hosts = comm.allreduce(sgd.theta_hosts) / float(comm_size)
                if comm_rank == 0:
                    print "allreduce done for theta_hosts"
            r = comm.allreduce(sgd.r) / float(comm_size)
            if comm_rank == 0:
                print "allreduce done for r"

            r_hosts = comm.allreduce(sgd.r_hosts) / float(comm_size)
            if comm_rank == 0:
                print "allreduce done for r_hosts"

            print "spreading mean of parameters done!"
            sgd.theta = theta
            if personalization:
                sgd.theta_hosts = theta_hosts
            sgd.r = r
            sgd.r_hosts = r_hosts

        print "done with training"

        # Store the parameters to /tscratch/tmp/csrec
        if comm_rank == 0:
            if os.path.exists("/tscratch"):
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
            filename = "parameters_lwin_%f_lrej_%f_testing_%d_personalized_%d_numsets_%d_outerit_%d_nepoches_%d.pkl" % (
                lambda_winner,
                lambda_reject,
                testing,
                personalization,
                num_sets,
                outer_iterations,
                nepoches,
            )
            if not RON_MODE:
                os.system("chmod -R 777 " + dirname)
            if personalization:
                pickle.dump((sgd.theta, sgd.theta_hosts, sgd.r, sgd.r_hosts), open(dirname + filename, "wb"))
            else:
                pickle.dump((sgd.theta, sgd.r, sgd.r_hosts), open(dirname + filename, "wb"))
            print "Stored params at " + dirname + filename
        return filename
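run() pickles either a 4-tuple (personalized) or a 3-tuple of learned parameters into dirname + filename and returns the file name. A small sketch of reading the parameters back for evaluation (the directory is illustrative, taken from the comment in run(); run() itself returns only the file name):

import pickle

path = "/tscratch/tmp/csrec/" + filename   # illustrative directory
with open(path, "rb") as f:
    params = pickle.load(f)
if len(params) == 4:                       # personalized run: theta, theta_hosts, r, r_hosts
    theta, theta_hosts, r, r_hosts = params
else:                                      # theta, r, r_hosts
    theta, r, r_hosts = params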