示例#1
0
def user_agg(si=None):
  '''
  Aggregates the search table by UserID and stores the per-user features.

  Args:
    si: the search table to aggregate. When None, it is read with
        load('search.gl'). Three helper columns (SQexists, SQhash,
        SPexists) are added to it before grouping.

  For every UserID the aggregation applies agg.AVG to SQexists and SPexists,
  agg.COUNT to the rows, and collects the de-duplicated CategoryID and
  SQhash values. The grouped table is converted with sframe_to_dict and
  written through avito2_io.put_artifact as 'user_si.pkl'. Nothing is
  returned; the elapsed time is printed.

  NB: the original author noted that these features did not help.
  '''
  began = datetime.now()
  si = load('search.gl') if si is None else si
  n_buckets = 2**20

  # Derived columns: query present, query hashed into n_buckets buckets,
  # search params present.
  # NOTE(review): on Python 3, hash() of a str is salted per process unless
  # PYTHONHASHSEED is fixed, so SQhash may differ between runs - confirm
  # that nothing downstream relies on stable values.
  queries = si['SearchQuery']
  si['SQexists'] = queries.apply(lambda q: q != '')
  si['SQhash'] = queries.apply(lambda q: abs(hash(q)) % n_buckets)
  si['SPexists'] = si['SearchParams'].apply(lambda p: p is not None)

  keep = ['UserID', 'CategoryID', 'SearchParams',
          'SQexists', 'SPexists', 'SQhash']
  spec = {
      'pctSQE': agg.AVG('SQexists'),
      'pctSPE': agg.AVG('SPexists'),
      'numSearches': agg.COUNT(),
      'allCat': agg.CONCAT('CategoryID'),
      'allSQ': agg.CONCAT('SQhash'),
  }
  grouped = si[keep].groupby('UserID', spec)

  # Collapse the concatenated lists down to their distinct values.
  for col in ('allSQ', 'allCat'):
    grouped[col] = grouped[col].apply(lambda vals: list(set(vals)))

  avito2_io.put_artifact(sframe_to_dict('UserID', grouped), 'user_si.pkl')
  print('elapsed time: %s' % (datetime.now() - began))
示例#2
0
def user_agg(si=None):
  '''
  Loads search.gl and aggregates it by UserID to get some features.
  NB: this did not help.

  Args:
    si: the search table to aggregate. If None, it is read with
        load('search.gl'). The helper columns SQexists, SQhash and
        SPexists are added to it before grouping.

  The grouped result is converted with sframe_to_dict and written with
  avito2_io.put_artifact as 'user_si.pkl'. Nothing is returned; the
  elapsed time is printed.
  '''
  start = datetime.now()
  if si is None:
    si = load('search.gl')
  D = 2**20
  si['SQexists'] = si['SearchQuery'].apply(lambda s : s != '')
  # Bucket the query text into D hash buckets.
  # NOTE(review): on Python 3, hash() of a str is salted per process unless
  # PYTHONHASHSEED is fixed, so these values may differ between runs.
  si['SQhash']   = si['SearchQuery'].apply(lambda s : abs(hash(s)) % D)
  si['SPexists'] = si['SearchParams'].apply(lambda d : d is not None)
  
  # Per-user aggregations, keyed by output column name.
  f = {'pctSQE'      : agg.AVG('SQexists'),
       'pctSPE'      : agg.AVG('SPexists'),
       'numSearches' : agg.COUNT(),
       'allCat'      : agg.CONCAT('CategoryID'),
       'allSQ'       : agg.CONCAT('SQhash')}
       
  si = si[['UserID', 
           'CategoryID', 
           'SearchParams', 
           'SQexists', 
           'SPexists', 
           'SQhash']]
  usr = si.groupby('UserID', f)
  # Reduce the concatenated lists to their distinct values.
  usr['allSQ']  = usr['allSQ'].apply(lambda l : list(set(l)))
  usr['allCat'] = usr['allCat'].apply(lambda l : list(set(l)))
  usr_dict = sframe_to_dict('UserID', usr)
  avito2_io.put_artifact(usr_dict, 'user_si.pkl')
  # Parenthesised single-argument print works on both Python 2 and 3.
  print('elapsed time: %s' % (datetime.now() - start))
示例#3
0
def make_user_dict():
  '''
  Converts the user table into a dictionary keyed by UserID and stores it.

  The table is read with load('user.gl') and passed to
  sframe_to_dict('UserID', ...). Per the original notes, the result is a
  dict-of-dicts shaped like {UserID: {other_fields: other_values}}.

  The dictionary is written with avito2_io.put_artifact as 'user_dict.pkl'
  (the original notes place it under artifacts/ and say it can be read back
  with avito2_io.get_artifact). Nothing is returned; the elapsed time is
  printed.
  '''
  began = datetime.now()
  by_user = sframe_to_dict('UserID', load('user.gl'))
  avito2_io.put_artifact(by_user, 'user_dict.pkl')
  elapsed = datetime.now() - began
  print('elapsed time: %s' % elapsed)
示例#4
0
def make_user_dict():
  '''
  Loads user.gl and creates a dict-of-dicts {int: dict} like:
   {UserID: {other_fields:other_values}}
   
  Saves result at artifacts/user_dict.pkl. It can be loaded with 
  avito2_io.get_artifact.

  Takes no arguments and returns nothing; the elapsed time is printed.
  '''
  start = datetime.now()
  user = load('user.gl')
  user_dict = sframe_to_dict('UserID', user)
  avito2_io.put_artifact(user_dict, 'user_dict.pkl')
  # Parenthesised single-argument print works on both Python 2 and 3.
  print('elapsed time: %s' % (datetime.now() - start))
示例#5
0
    for (k, line) in enumerate(reader):
      user_id = int(line['UserID'])
      if user_id in out:
        vis_ct = out[user_id].setdefault('vis_ct', 0)
        out[user_id]['vis_ct'] = vis_ct + 1
      if (k + 1) % 1000000 == 0:
        print 'read %d lines from VisitsStream.tsv.gz' % (k + 1)
  for k in out:
    out[k].setdefault('vis_ct', 0)

  return out
      
if __name__ == '__main__':
  # Script entry point: parse the command line, collect the per-user counts
  # and store them as the 'user_counts.pkl' artifact.
  start = datetime.now()
  print('running at: ' + str(start))
  parser = argparse.ArgumentParser(description = 
                   'Collects counts of UserID from several data files.')
  parser.add_argument('--min_ct', type=int, default=None)
  # NOTE(review): --max_lines is parsed but never forwarded to
  # user_counts(); confirm whether that is intended.
  parser.add_argument('--max_lines', type=int, default=None)
  args = parser.parse_args()
  # Use a separate name so the user_counts function is not rebound to its
  # own result.
  counts = user_counts(args.min_ct)
  avito2_io.put_artifact(counts, 'user_counts.pkl')
  # Parenthesised single-argument print works on both Python 2 and 3.
  print('elapsed time: %s' % (datetime.now() - start))







  
示例#6
0
  '''
  out = {}
  with open(avito2_io.ADS_INFO) as f:
    reader = csv.DictReader(f, delimiter='\t')
    for (k, line) in enumerate(reader):
      if k == maxlines:
        break
      if (int(line['IsContext']) == 1):
        if line['Price'] == '':
          line['Price'] = -1
        if line['CategoryID'] == '':
          line['CategoryID'] = -1
        values = [int(line['CategoryID']),
                  int(float(line['Price'])),
                  line['Title'],
                  line['Params']]
        out[int(line['AdID'])] = values
      if (k + 1) % 1000000 == 0:
        print 'read %d lines' % (k + 1)
  return out

  
if __name__=='__main__':
  # Script entry point: parse the ads file with parse_ads() and store the
  # result as the 'context_ads.pkl' artifact, reporting progress and timing.
  # Parenthesised single-argument print works on both Python 2 and 3.
  start = datetime.now()
  print('parsing AdsInfo.tsv')
  out = parse_ads() 
  print('saving context ads to ARTIFACTS/')
  avito2_io.put_artifact(out, 'context_ads.pkl')
  print('Finished, elapsed time: %s' % (datetime.now() - start))
  
  
示例#7
0
                      (k + 1))
    for k in out:
        out[k].setdefault('ph_ct', 0)

    with gzip.open(avito2_io.VISIT) as f_vis:
        reader = csv.DictReader(f_vis, delimiter='\t')
        for (k, line) in enumerate(reader):
            user_id = int(line['UserID'])
            if user_id in out:
                vis_ct = out[user_id].setdefault('vis_ct', 0)
                out[user_id]['vis_ct'] = vis_ct + 1
            if (k + 1) % 1000000 == 0:
                print('read %d lines from VisitsStream.tsv.gz' % (k + 1))
    for k in out:
        out[k].setdefault('vis_ct', 0)

    return out


if __name__ == '__main__':
    # Script entry point: parse the command line, collect the per-user
    # counts and store them as the 'user_counts.pkl' artifact.
    start = datetime.now()
    print('running at: ' + str(start))
    parser = argparse.ArgumentParser(
        description='Collects counts of UserID from several data files.')
    parser.add_argument('--min_ct', type=int, default=None)
    # NOTE(review): --max_lines is parsed but never forwarded to
    # user_counts(); confirm whether that is intended.
    parser.add_argument('--max_lines', type=int, default=None)
    args = parser.parse_args()
    # Use a separate name so the user_counts function is not rebound to
    # its own result.
    counts = user_counts(args.min_ct)
    avito2_io.put_artifact(counts, 'user_counts.pkl')
    print('elapsed time: %s' % (datetime.now() - start))