def full_train():
    '''
    Trains a model on the full combined dataset.

    Loads combo.gl and extras.gl, merges the extra columns into the combo
    SFrame, prepares it, then streams chunks through the training pipeline.

    returns: the trained model from train()
    '''
    combo = sframes.load('combo.gl')
    extras = sframes.load('extras.gl')
    # BUG FIX: original called combo.add_columms(...) — a typo. The
    # SFrame method is add_columns; the misspelling raises AttributeError.
    combo.add_columns(extras)
    prepareSFrame(combo)
    cit = chunk_iterator(combo)
    sit = select_train(cit, True)
    model = train(sit)
    return model
def get_context_ads():
    '''
    Loads the ads.gl file (all of AdsInfo.tsv) and extracts the
    context ads.

    The IsContext flag and LocationID columns are dropped before the
    result is returned.
    '''
    all_ads = sframes.load('ads.gl')
    context = all_ads[all_ads['IsContext']]
    for unwanted in ('IsContext', 'LocationID'):
        del context[unwanted]
    return context
def basic_join(tss, si, user):
    '''
    A generator that performs a rolling join over Graphlab SFrames tss,
    which stores data from train/testSearchStream.tsv and si, which is
    from SearchInfo.tsv. SFrame context_ads.gl, which has the contextual
    ads from AdsInfo.tsv, is loaded and joined in. UserInfo.tsv is joined
    in from loading the artifact user_dict.pkl from artifacts/.

    args:
      tss  - an SFrame with data from trainSearchStream or
             testSearchStream, including samples or validation sets
      si   - an SFrame with data from SearchInfo. Must have all of the
             SearchIDs in tss, but it can be a sample
      user - dict or None. A dict from UserID to a dict of features for
             that user. Caller should construct this if used.

    generates:
      a dict that combines all of the fields from tss, si and ads for a row
    '''
    ctx = sframes.load('context_ads.gl')
    ctx = sframes.sframe_to_dict('AdID', ctx)
    si_it = iter(si)
    si_line = si_it.next()
    for tss_line in tss:
        search_id = tss_line['SearchID']
        ad_id = tss_line['AdID']
        # Advance si until its SearchID catches up with tss (rolling join;
        # assumes both inputs are ordered by SearchID).
        while search_id != si_line['SearchID']:
            si_line = si_it.next()
        # Now the SearchIDs match.
        # BUG FIX: UserID must be read AFTER the catch-up loop above; the
        # original read it before advancing, so user features came from
        # the previous (stale) SearchInfo row.
        user_id = si_line['UserID']
        tss_line.update(ctx[ad_id])
        # SearchInfo.CategoryID overwrites AdInfo.CategoryID in this line
        tss_line.update(si_line)
        if user is not None and user_id in user:
            tss_line.update(user[user_id])
        yield tss_line
help='# of passes over training data.') args = parser.parse_args() if args.users=='full': users = build_user_dict() print 'loading full user data' elif args.users=='counts': users = avito2_io.get_artifact('user_counts.pkl') print 'loading user counts only from user_counts.pkl' elif args.users == 'si': users = avito2_io.get_artifact('user_si.pkl') print 'loading user dict from user_si.pkl' else: users = None D = 2**args.bits if args.all: tr = sframes.load('train_context.gl') si = sframes.load('search.gl') if not args.sub: raise Warning('--all without --sub is not sensible.') else: tr = sframes.load('train_ds.gl') si = sframes.load('search_ds.gl') # no interactions; it'd take days model = train(tr, si, args.alpha, args.beta, args.l1, args.l2, D, users,
def features2(): ''' This function implements and records the construction of the second set of graphlab sframe features. These include all of the integer-valued raw and lightly processed features from train/test, SearchInfo, Category, Location, AdsInfo and UserInfo. Only contextual ads are considered. NB: This leaves SearchID in the output(to allow for validation). Run script must delete SearchID. ''' start = datetime.now() print 'loading context ads' ctx = get_context_ads() ctx['LogPrice'] = ctx['Price'].apply(lambda x : round(log(x+1), 1)) ctx['ParamLen'] = ctx['Params'].apply(lambda d : len(d)).fillna(0) ctx['TitleLen'] = ctx['Title'].apply(lambda s : len(s)).fillna(0) del ctx['Price'] del ctx['Title'] del ctx['Params'] print 'loading users' users = sframes.load('user.gl') print 'loading category and location' ctg = sframes.load('category.gl') # Admins said this field could be deleted. del ctg['SubcategoryID'] loc = sframes.load('location.gl') print 'small objects loaded, elapsed time: %s' % (datetime.now() - start) print 'ingesting train.gl' tr = sframes.load('train.gl') tr = tr[tr['ObjectType'] == 3] del tr['ObjectType'] tr['log_ctr'] = tr['HistCTR'].apply(lambda x : -10 * round(log(x), 1)) del tr['HistCTR'] print 'train.gl ingested, elapsed time: %s' % (datetime.now() - start) # SearchDate: I can't decide what to do with it, so I'm leaving it in, # as-is. The run script will have to remove it. This allows sorting by # date without doing this huge join again. 
print 'ingesting search.gl' si = sframes.load('search.gl') si['SQexists'] = si['SearchQuery'].apply(lambda x : len(x) > 0) del si['SearchQuery'] si['SPexists'] = (si['SearchParams'].apply(lambda d : int(d is not None)) .fillna(0)) del si['SearchParams'] print 'search.gl ingested, elapsed time: %s' % (datetime.now() - start) print 'joining user.gl into search.gl' si = si.join(users, how='left', on='UserID') print 'user.gl joined in, elapsed time: %s' % (datetime.now() - start) print 'joining location.gl to search.gl' si = si.join(loc, how='left', on='LocationID') print 'location.gl joined in, elapsed time: %s' % (datetime.now() - start) print 'joining category.gl to search.gl' si = si.join(ctg, how='left', on='CategoryID') print 'category.gl joined in, elapsed time: %s' % (datetime.now() - start) # join category into context ads and rename to avoid name clash print 'joining category into ads' ctx = ctx.join(ctg, how='left', on='CategoryID') ctx.rename({'CategoryID':'AdCat'}) print 'category.gl joined into ads, elapsed time: %s' % (datetime.now() - start) print 'joining context ads into train' tr = tr.join(ctx, how='left', on='AdID') print 'context ads joined into train, elapsed time: %s' % (datetime.now() - start) print 'joining up training set (search and train)...' 
tr = tr.join(si, how='left', on='SearchID') print 'join completed, elapsed time: %s' % (datetime.now() - start) print 'sorting train by SearchDate, SearchID, AdID tr = tr.sort(['SearchDate', 'SearchID', 'AdID']) path = os.path.join(avito2_io.PROCESSED, 'gl_train2.csv') print 'saving training features to %s' % path tr.save(path, format='csv') print 'training features saved, elapsed time: %s' % (datetime.now() - start) # test print 'ingesting test.gl' test = sframes.load('test.gl') test = test[test['ObjectType'] == 3] del test['ObjectType'] test['log_ctr'] = test['HistCTR'].apply(lambda x : -10 * round(log(x), 1)) del test['HistCTR'] print 'test.gl ingested, elapsed time: %s' % (datetime.now() - start) print 'joining context ads into test' test = test.join(ctx, how='left', on='AdID') print 'context ads joined into test, elapsed time: %s' % (datetime.now() - start) print 'joining up test set...' ftest = test.join(si, how='left', on='SearchID') del ftest['SearchID'] print 'join completed, elapsed time: %s' % (datetime.now() - start) print 'sorting test...' ftest = ftest.sort('ID') path = os.path.join(avito2_io.PROCESSED, 'gl_test2.csv') print 'saving test features to %s' % path ftest.save(path, format='csv') print 'finished, elapsed time: %s' % (datetime.now() - start)
def features1(): ''' This function implements and records the construction of the first sframe features, which are the same as the features used in the pure python/pypy run1.py. This uses just SearchInfo and trainSearchStream and runs row-wise on context ads only. ''' # process trainSearchStream start = datetime.now() print 'ingesting train.gl' tr = sframes.load('train.gl') tr = tr[tr['ObjectType'] == 3] del tr['ObjectType'] tr['log_ctr'] = tr['HistCTR'].apply(lambda x : -10 * round(log(x), 1)) del tr['HistCTR'] print 'train.gl ingested, elapsed time: %s' % (datetime.now() - start) # process SearchInfo print 'ingesting search.gl' si = sframes.load('search.gl') # In run1.py, we didn't use date or IPID del si['SearchDate'] del si['IPID'] si['SQexists'] = si['SearchQuery'].apply(lambda x : len(x) > 0) del si['SearchQuery'] # NB: lambda d : 0 if d is None else len(d) doesn't seem to work si['SPexists'] = (si['SearchParams'].apply(lambda d : int(d is not None)) .fillna(0)) del si['SearchParams'] print 'search.gl ingested, elapsed time: %s' % (datetime.now() - start) # join up training set # NB: due to lazy evaluation, this might not time accurately print 'joining up training set...' f = tr.join(si, how='left', on='SearchID') # This line makes validation impossible. Run script must delete SearchID. #del f['SearchID'] print 'join completed, elapsed time: %s' % (datetime.now() - start) # save training features path = os.path.join(avito2_io.PROCESSED, 'gl_train1.csv') print 'saving training features to %s' % path f.save(path, format='csv') print 'training features saved, elapsed time: %s' % (datetime.now() - start) # load test set print 'ingesting test.gl' test = sframes.load('test.gl') test = test[test['ObjectType'] == 3] del test['ObjectType'] test['log_ctr'] = test['HistCTR'].apply(lambda x : -10 * round(log(x), 1)) del test['HistCTR'] print 'test.gl ingested, elapsed time: %s' % (datetime.now() - start) # join up test set print 'joining up test set...' 
ftest = test.join(si, how='left', on='SearchID') del ftest['SearchID'] print 'join completed, elapsed time: %s' % (datetime.now() - start) # save test set path = os.path.join(avito2_io.PROCESSED, 'gl_test1.csv') print 'saving test features to %s' % path ftest.save(path, format='csv') print 'finished, elapsed time: %s' % (datetime.now() - start)
def run_test(model):
    '''
    Scores the combo test set with a trained model.

    Loads combo_test.gl, applies the same SFrame preparation used for
    training, then streams chunks through predict().

    args:
      model - a trained model, as produced by the training pipeline

    returns: the predictions from predict()
    '''
    test_sf = sframes.load('combo_test.gl')
    prepareSFrame(test_sf)
    chunks = chunk_iterator(test_sf)
    return predict(chunks, model)
help='# of passes over training data.') args = parser.parse_args() if args.users == 'full': users = build_user_dict() print('loading full user data') elif args.users == 'counts': users = avito2_io.get_artifact('user_counts.pkl') print('loading user counts only from user_counts.pkl') elif args.users == 'si': users = avito2_io.get_artifact('user_si.pkl') print('loading user dict from user_si.pkl') else: users = None D = 2**args.bits if args.all: tr = sframes.load('train_context.gl') si = sframes.load('search.gl') if not args.sub: raise Warning('--all without --sub is not sensible.') else: tr = sframes.load('train_ds.gl') si = sframes.load('search_ds.gl') # no interactions; it'd take days model = train(tr, si, args.alpha, args.beta, args.l1, args.l2, D, users, False, args.maxlines, args.passes) print('finished training') if args.all: offset = 0.0 else: offset = compute_offset(tr, args.maxlines)