Example #1
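These listings are scraped without their import headers. A minimal preamble that would make the regressor snippets below self-contained might look like the following sketch; the exact regex in normalize_text is an assumption modeled on the wordbatch demos, and the extractor import path can differ between wordbatch versions:

import gzip
import json
import os
import re
import threading
import pickle as pkl

import wordbatch
from wordbatch.models import FTRL
from wordbatch.extractors import WordBag, WordVec, Hstack

non_alphanums = re.compile(r'[\W]+')

def normalize_text(text):
    # Lowercase, replace non-alphanumerics with spaces, drop 1-character tokens.
    return " ".join(x for x in non_alphanums.sub(' ', text).lower().strip().split()
                    if len(x) > 1)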
class WordvecRegressor(object):
    def __init__(self, pickle_model="", datadir=None):
        self.wb= wordbatch.WordBatch(normalize_text,
                                     extractor=(Hstack, [
                                         (WordVec, {"wordvec_file": "../../../data/word2vec/glove.twitter.27B.100d.txt.gz",
                                                    "normalize_text": normalize_text}),
                                         (WordVec, {"wordvec_file": "../../../data/word2vec/glove.6B.50d.txt.gz",
                                                    "normalize_text": normalize_text})]))

        self.wb.dictionary_freeze= True

        self.clf= FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 25, iters=1, inv_link= "identity")

        if datadir==None:  (self.wb, self.clf)= pkl.load(gzip.open(pickle_model, 'rb'))
        else: self.train(datadir, pickle_model)

    def fit_batch(self, texts, labels, rcount):
        texts, labels = self.wb.shuffle_batch(texts, labels, rcount)
        print("Transforming", rcount)
        texts= self.wb.fit_transform(texts, reset= False)
        print("Training", rcount)
        self.clf.fit(texts, labels, reset= False)

    def train(self, datadir, pickle_model=""):
        texts= []
        labels= []
        training_data= os.listdir(datadir)
        rcount= 0
        batchsize= 100000

        p= None
        for jsonfile in training_data:
            with open(datadir + "/" + jsonfile, 'r') as inputfile:
                for line in inputfile:
                    #if rcount > 1000000: break
                    try: line= json.loads(line.strip())
                    except:  continue
                    for review in line["Reviews"]:
                        rcount+= 1
                        if rcount % 100000 == 0:  print(rcount)
                        if rcount % 6 != 0: continue
                        if "Overall" not in review["Ratings"]: continue
                        texts.append(review["Content"])
                        labels.append((float(review["Ratings"]["Overall"]) - 3) *0.5)
                        if len(texts) % batchsize == 0:
                            if p != None:  p.join()
                            p= threading.Thread(target=self.fit_batch, args=(texts, labels, rcount))
                            p.start()
                            texts= []
                            labels= []
        if p != None:  p.join()
        self.fit_batch(texts, labels, rcount)

        if pickle_model!="":
            with gzip.open(pickle_model, 'wb') as model_file:
                pkl.dump((self.wb, self.clf), model_file, protocol=2)

    def predict(self, texts):
        vecs= self.wb.transform(texts)
        return self.clf.predict(vecs)
Example #2
class WordbagRegressor(object):
    def __init__(self, pickle_model="", datadir=None):
        self.wordbatch = wordbatch.WordBatch(normalize_text, extractors=[(wordbatch.WordBag, {"hash_ngrams":3,
          "hash_ngrams_weights":[-1.0, -1.0, 1.0],"hash_size":2**23, "norm":'l2', "tf":'binary', "idf":50.0})])
        self.clf= FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 25, iters=1, inv_link="identity")
        if datadir==None:  (self.wordbatch, self.clf)= pkl.load(gzip.open(pickle_model, u'rb'))
        else: self.train(datadir, pickle_model)

    def fit_batch(self, texts, labels, rcount):
        texts, labels= self.wordbatch.shuffle_batch(texts, labels, rcount)
        print "Transforming", rcount
        texts= self.wordbatch.transform(texts)
        print "Training", rcount
        self.clf.fit(texts, labels)

    def train(self, datadir, pickle_model=""):
        texts= []
        labels= []
        training_data= os.listdir(datadir)
        rcount= 0
        batchsize= 100000

        p = None
        for jsonfile in training_data:
            with open(datadir + "/" + jsonfile, u'r') as inputfile:
                for line in inputfile:
                    #if rcount > 1000000: break
                    try: line = json.loads(line.strip())
                    except:  continue
                    for review in line["Reviews"]:
                        rcount+= 1
                        if rcount % 100000 == 0:  print(rcount)
                        if rcount % 7 != 0: continue
                        if "Overall" not in review["Ratings"]: continue
                        texts.append(review["Content"])
                        labels.append((float(review["Ratings"]["Overall"]) - 3) *0.5)
                        if len(texts) % batchsize == 0:
                            if p != None:  p.join()
                            p= threading.Thread(target=self.fit_batch, args=(texts, labels, rcount))
                            p.start()
                            texts= []
                            labels= []
        if p != None:  p.join()
        self.fit_batch(texts, labels, rcount)

        self.wordbatch.dictionary_freeze= True

        if pickle_model!="":
            with gzip.open(pickle_model, u'wb') as model_file:
                pkl.dump((self.wordbatch, self.clf), model_file, protocol=2)

    def predict(self, texts):
        counts= self.wordbatch.transform(texts)
        return self.clf.predict(counts)
Example #3
class vanila_FTRL_Regressor:
    def __init__(self, param_dict, feature_dim):
        alpha = param_dict['alpha']
        beta = param_dict['beta']
        L1 = param_dict['L1']
        L2 = param_dict['L2']
        iters = param_dict['iters']

        self.model = FTRL(alpha=alpha,
                          beta=beta,
                          L1=L1,
                          L2=L2,
                          D=feature_dim,
                          iters=iters,
                          inv_link="identity",
                          threads=6)

    def fit(self, X_train, y_train):
        self.model.fit(X_train, y_train)

    def predict(self, X_valid):
        return self.model.predict(X_valid)
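A hypothetical usage sketch for the wrapper above; X_train, y_train and X_valid are assumed to be a scipy CSR matrix and matching label arrays:

param_dict = {'alpha': 0.01, 'beta': 0.1, 'L1': 0.00001, 'L2': 1.0, 'iters': 10}
reg = vanila_FTRL_Regressor(param_dict, feature_dim=X_train.shape[1])
reg.fit(X_train, y_train)
preds = reg.predict(X_valid)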
Example #4
def FTRL_train(train_X, train_y, isQuickRun):

    # The quick run and the full run differ only in the number of iterations.
    iters = 9 if isQuickRun else 47

    model = FTRL(alpha=0.01,
                 beta=0.1,
                 L1=0.00001,
                 L2=1.0,
                 D=train_X.shape[1],
                 iters=iters,
                 inv_link="identity",
                 threads=4)

    model.fit(train_X, train_y)

    return model
Example #5
#from sklearn.feature_extraction.text import HashingVectorizer
#from sklearn.linear_model import *
#vct= HashingVectorizer()
#clf= SGDRegressor()

import wordbatch
from wordbatch.models import FTRL
from wordbatch.extractors import WordBag
wb= wordbatch.WordBatch(extractor=(WordBag, {"hash_ngrams":2, "hash_ngrams_weights":[0.5, -1.0], "hash_size":2**23, "norm":'l2', "tf":'log', "idf":50.0}))
clf= FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 25, iters=1)

train_texts= ["Cut down a tree with a herring? It can't be done.", "Don't say that word.", "How can we not say the word if you don't tell us what it is?"]
train_labels= [1, 0, 1]
test_texts= ["Wait! I said it! I said it! Ooh! I said it again!"]

values = wb.transform(train_texts)
clf.fit(values, train_labels)
preds= clf.predict(wb.transform(test_texts))
print("values={}".format(values))
print("values={}".format(len(values)))
print("texts={}".format(test_texts))
print("transformed={}".format(wb.transform(test_texts)))
print(preds)
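A note on dimensions in the quick-start above: FTRL's D=2**25 sets the size of its weight vector and must cover the extractor's output width; WordBag hashes into hash_size=2**23 columns here, so the features fit with room to spare.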
Example #6
class WordvecRegressor(object):
    def __init__(self, pickle_model="", datadir=None):
        self.wb = wordbatch.WordBatch(
            normalize_text,
            extractor=(Hstack, [
                (WordVec, {
                    "wordvec_file":
                    "../../../data/word2vec/glove.twitter.27B.100d.txt.gz",
                    "normalize_text": normalize_text
                }),
                (WordVec, {
                    "wordvec_file":
                    "../../../data/word2vec/glove.6B.50d.txt.gz",
                    "normalize_text": normalize_text
                })
            ]))

        self.wb.dictionary_freeze = True

        self.clf = FTRL(alpha=1.0,
                        beta=1.0,
                        L1=0.00001,
                        L2=1.0,
                        D=2**25,
                        iters=1,
                        inv_link="identity")

        if datadir == None:
            (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
        else:
            self.train(datadir, pickle_model)

    def fit_batch(self, texts, labels, rcount):
        texts, labels = self.wb.shuffle_batch(texts, labels, rcount)
        print("Transforming", rcount)
        texts = self.wb.transform(texts)
        print("Training", rcount)
        self.clf.fit(texts, labels)

    def train(self, datadir, pickle_model=""):
        texts = []
        labels = []
        training_data = os.listdir(datadir)
        rcount = 0
        batchsize = 100000

        p = None
        for jsonfile in training_data:
            with open(datadir + "/" + jsonfile, 'r') as inputfile:
                for line in inputfile:
                    #if rcount > 1000000: break
                    try:
                        line = json.loads(line.strip())
                    except:
                        continue
                    for review in line["Reviews"]:
                        rcount += 1
                        if rcount % 100000 == 0: print(rcount)
                        if rcount % 6 != 0: continue
                        if "Overall" not in review["Ratings"]: continue
                        texts.append(review["Content"])
                        labels.append(
                            (float(review["Ratings"]["Overall"]) - 3) * 0.5)
                        if len(texts) % batchsize == 0:
                            if p != None: p.join()
                            p = threading.Thread(target=self.fit_batch,
                                                 args=(texts, labels, rcount))
                            p.start()
                            texts = []
                            labels = []
        if p != None: p.join()
        self.fit_batch(texts, labels, rcount)

        if pickle_model != "":
            with gzip.open(pickle_model, 'wb') as model_file:
                pkl.dump((self.wb, self.clf), model_file, protocol=2)

    def predict(self, texts):
        vecs = self.wb.transform(texts)
        return self.clf.predict(vecs)
Example #7
def main():
    start_time = time.time()
    from time import gmtime, strftime
    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    # if 1 == 1:
    ###train = pd.read_table('../input/mercari-price-suggestion-challenge/train.tsv', engine='c')
    ###test = pd.read_table('../input/mercari-price-suggestion-challenge/test.tsv', engine='c')

    train = pd.read_table('../input/train.tsv', engine='c')
    test = pd.read_table('../input/test.tsv', engine='c')

    print('[{}] Finished loading data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]
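    # Naming note: nrow_test is actually the original train row count; since
    # merge below is concat([train, dftt, test]), rows from index nrow_test
    # onward in the stacked matrix are exactly the test rows.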
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    submission = test[['test_id']]

    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() -
                                                      start_time))

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del wb
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**28,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del wb
    X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))
    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name)).tocsr()

    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))

    #    pd.to_pickle((sparse_merge, y), "xy.pkl")
    # else:
    #    nrow_train, nrow_test= 1481661, 1482535
    #    sparse_merge, y = pd.read_pickle("xy.pkl")

    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    gc.collect()
    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(X,
                                                              y,
                                                              test_size=0.05,
                                                              random_state=100)

    model = FTRL(alpha=0.01,
                 beta=0.1,
                 L1=0.00001,
                 L2=1.0,
                 D=sparse_merge.shape[1],
                 iters=50,
                 inv_link="identity",
                 threads=1)

    model.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsF = model.predict(X_test)
    print('[{}] Predict FTRL completed'.format(time.time() - start_time))

    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=sparse_merge.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=15,
                    inv_link="identity",
                    threads=4)

    model.fit(train_X, train_y)
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))

    params = {
        'learning_rate': 0.6,
        'application': 'regression',
        'max_depth': 4,
        'num_leaves': 31,
        'verbosity': -1,
        'metric': 'RMSE',
        'data_random_seed': 1,
        'bagging_fraction': 0.6,
        'bagging_freq': 5,
        'feature_fraction': 0.6,
        'nthread': 4,
        'min_data_in_leaf': 100,
        'max_bin': 31
    }

    # Remove features with document frequency <=100
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 100, 0, 1),
                    dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(X,
                                                              y,
                                                              test_size=0.05,
                                                              random_state=100)

    d_train = lgb.Dataset(train_X, label=train_y)
    watchlist = [d_train]
    if develop:
        d_valid = lgb.Dataset(valid_X, label=valid_y)
        watchlist = [d_train, d_valid]

    model = lgb.train(params, train_set=d_train, num_boost_round=6000, valid_sets=watchlist, \
                      early_stopping_rounds=1000, verbose_eval=1000)

    if develop:
        preds = model.predict(valid_X)
        print("LGB dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsL = model.predict(X_test)

    print('[{}] Predict LGB completed.'.format(time.time() - start_time))

    preds = (predsF * 0.2 + predsL * 0.3 + predsFM * 0.5)

    submission['price'] = np.expm1(preds)
    submission.to_csv("submission_wordbatch_ftrl_fm_lgb.csv", index=False)
Example #8
class WordbagRegressor(object):
    def __init__(self, pickle_model="", datadir=None, batcher=None):
        self.wb = WordBatch(normalize_text=normalize_text,
                            extractor=WordBag(
                                hash_ngrams=3,
                                hash_ngrams_weights=[-1.0, -1.0, 1.0],
                                hash_size=2**23,
                                norm='l2',
                                tf='binary',
                                idf=50.0),
                            batcher=batcher)

        self.clf = FTRL(alpha=1.0,
                        beta=1.0,
                        L1=0.00001,
                        L2=1.0,
                        D=2**23,
                        iters=1,
                        inv_link="identity")
        if datadir == None:
            (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
        else:
            self.train(datadir, pickle_model)

    def fit_batch(self, texts, labels, rcount):
        texts, labels = self.wb.batcher.shuffle_batch(texts, labels, rcount)
        print("Transforming", rcount)
        texts = self.wb.fit_transform(texts, reset=False)
        print("Training", rcount)
        self.clf.fit(texts, labels, reset=False)

    def train(self, datadir, pickle_model=""):
        texts = []
        labels = []
        training_data = os.listdir(datadir)
        rcount = 0
        batchsize = 100000

        p = None
        for jsonfile in training_data:
            with open(datadir + "/" + jsonfile, 'r') as inputfile:
                for line in inputfile:
                    #if rcount > 1000000: break
                    try:
                        line = json.loads(line.strip())
                    except:
                        continue
                    for review in line["Reviews"]:
                        rcount += 1
                        if rcount % 100000 == 0: print(rcount)
                        if rcount % 7 != 0: continue
                        if "Overall" not in review["Ratings"]: continue
                        texts.append(review["Content"])
                        labels.append(
                            (float(review["Ratings"]["Overall"]) - 3) * 0.5)
                        if len(texts) % batchsize == 0:
                            if p != None: p.join()
                            p = threading.Thread(target=self.fit_batch,
                                                 args=(texts, labels, rcount))
                            p.start()
                            texts = []
                            labels = []
        if p != None: p.join()
        self.fit_batch(texts, labels, rcount)

        self.wb.dictionary_freeze = True

        if pickle_model != "":
            with gzip.open(pickle_model, 'wb') as model_file:
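                # The batcher's backend handle (e.g. a multiprocessing pool)
                # is not picklable, so detach it before dumping the model and
                # restore it afterwards.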
                backend = self.wb.batcher.backend
                backend_handle = self.wb.batcher.backend_handle
                self.wb.batcher.backend = "serial"
                self.wb.batcher.backend_handle = None
                pkl.dump((self.wb, self.clf), model_file, protocol=2)
                self.wb.batcher.backend = backend
                self.wb.batcher.backend_handle = backend_handle

    def predict(self, texts):
        counts = self.wb.transform(texts)
        return self.clf.predict(counts)
Example #9
def get_pred_ftrl(submission):
    start_time = time.time()
    from time import gmtime, strftime
    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    # if 1 == 1:
    train = pd.read_table(
        '../input/mercari-price-suggestion-challenge/train.tsv', engine='c')
    test = pd.read_table(
        '../input/mercari-price-suggestion-challenge/test.tsv', engine='c')

    #train = pd.read_table('../input/train.tsv', engine='c')
    #test = pd.read_table('../input/test.tsv', engine='c')

    print('[{}] Finished loading data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]
    train = train[train["price"] != 0]
    #Xtrain,Xvalid = train_test_split(train, test_size=0.01,random_state=1)
    nrow_train = train.shape[0]
    #nrow_valid = Xvalid.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge: pd.DataFrame = pd.concat([train, test])
    #submission: pd.DataFrame = test[['test_id']]

    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() -
                                                      start_time))

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del wb
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**28,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del wb
    X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))
    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name)).tocsr()

    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))

    #    pd.to_pickle((sparse_merge, y), "xy.pkl")
    # else:
    #    nrow_train, nrow_test= 1481661, 1482535
    #    sparse_merge, y = pd.read_pickle("xy.pkl")

    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_train:]
    print(sparse_merge.shape)

    gc.collect()
    train_X, train_y = X, y
    #'''
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(X,
                                                              y,
                                                              test_size=0.05,
                                                              random_state=100)

    model = FTRL(alpha=0.01,
                 beta=0.1,
                 L1=0.00001,
                 L2=1.0,
                 D=sparse_merge.shape[1],
                 iters=50,
                 inv_link="identity",
                 threads=1)

    model.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsF = model.predict(X_test)
    submission['price_FTRL'] = predsF
    #print(rmsle(np.expm1(predsF),y_valid))
    #'''
    print('[{}] Predict FTRL completed'.format(time.time() - start_time))
    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=sparse_merge.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=17,
                    inv_link="identity",
                    threads=4)

    model.fit(train_X, train_y)
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))
    submission['price_FM_FTRL'] = predsFM
Example #10
    "hash_size": 2**23,
    "norm": 'l2',
    "tf": 'log',
    "idf": 50.0
}))
clf = FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2**25, iters=1)

train_texts = [
    "Cut down a tree with a herring? It can't be done.",
    "Don't say that word.",
    "How can we not say the word if you don't tell us what it is?"
]
train_labels = [1, 0, 1]
test_texts = ["Wait! I said it! I said it! Ooh! I said it again!"]

clf.fit(wb.transform(train_texts), train_labels)
preds = clf.predict(wb.transform(test_texts))
print("wordbatch ok")

import pyltr
print("pyltr ok")

from tqdm import tqdm
import time

text = ""
for char in tqdm(["a", "b", "c", "d"]):
    time.sleep(0.25)
    text = text + char
print("tqdm ok")
Example #11
def main():
    start_time = time.time()
    from time import gmtime, strftime
    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    # if 1 == 1:
    train = pd.read_table(
        '../input/mercari-price-suggestion-challenge/train.tsv', engine='c')
    test = pd.read_table(
        '../input/mercari-price-suggestion-challenge/test.tsv', engine='c')

    #train = pd.read_table('../input/train.tsv', engine='c')
    #test = pd.read_table('../input/test.tsv', engine='c')

    print('[{}] Finished loading data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge: pd.DataFrame = pd.concat([train, dftt, test])
    submission: pd.DataFrame = test[['test_id']]

    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() -
                                                      start_time))

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del wb
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**28,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del wb
    X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))
    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name)).tocsr()

    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))
    del X_dummies, merge, X_description, lb, X_brand, X_category1, X_category2, X_category3, X_name
    gc.collect()

    #    pd.to_pickle((sparse_merge, y), "xy.pkl")
    # else:
    #    nrow_train, nrow_test= 1481661, 1482535
    #    sparse_merge, y = pd.read_pickle("xy.pkl")

    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)
    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(X,
                                                              y,
                                                              test_size=0.05,
                                                              random_state=100)

    model = FTRL(alpha=0.01,
                 beta=0.1,
                 L1=0.00001,
                 L2=1.0,
                 D=sparse_merge.shape[1],
                 iters=30,
                 inv_link="identity",
                 threads=1)
    del X
    gc.collect()
    model.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsF = model.predict(X_test)
    print('[{}] Predict FTRL completed'.format(time.time() - start_time))

    model = FM_FTRL(alpha=0.012,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=sparse_merge.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=17,
                    inv_link="identity",
                    threads=4)

    model.fit(train_X, train_y)
    del train_X, train_y
    gc.collect()
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))
    del X_test
    gc.collect()
    params = {
        'learning_rate': 0.65,
        'application': 'regression',
        'max_depth': 4,
        'num_leaves': 42,
        'verbosity': -1,
        'metric': 'RMSE',
        'data_random_seed': 1,
        'bagging_fraction': 0.71,
        'bagging_freq': 5,
        'feature_fraction': 0.67,
        'nthread': 4,
        'min_data_in_leaf': 120,
        'max_bin': 40
    }

    # Remove features with document frequency <=100
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 100, 0, 1),
                    dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)
    del sparse_merge
    gc.collect()
    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(X,
                                                              y,
                                                              test_size=0.05,
                                                              random_state=100)
    del X, y
    gc.collect()
    d_train = lgb.Dataset(train_X, label=train_y)
    # del train_X, train_y; gc.collect()
    watchlist = [d_train]
    if develop:
        # valid_y is still needed for the RMSLE check after training,
        # so it must not be deleted here.
        d_valid = lgb.Dataset(valid_X, label=valid_y)
        gc.collect()
        watchlist = [d_train, d_valid]

    #model = lgb.train(params, train_set=d_train, num_boost_round=7500, valid_sets=watchlist, \
    #                  early_stopping_rounds=1000, verbose_eval=1000)

    model = lgb.train(params, train_set=d_train, num_boost_round=3000, valid_sets=watchlist, \
                      early_stopping_rounds=1000, verbose_eval=1000)

    del d_train
    gc.collect()
    if develop:
        preds = model.predict(valid_X)
        del valid_X
        gc.collect()
        print("LGB dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsL = model.predict(X_test)
    # del X_test; gc.collect()
    print('[{}] Predict LGB completed.'.format(time.time() - start_time))

    #--- BEGIN Huber
    # Details: http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.HuberRegressor.html

    # class sklearn.linear_model.HuberRegressor(epsilon=1.35, max_iter=100,
    #     alpha=0.0001, warm_start=False, fit_intercept=True, tol=1e-05)

    setup_Huber = 2

    if (setup_Huber == 1):
        model = HuberRegressor(fit_intercept=True,
                               alpha=0.01,
                               max_iter=80,
                               epsilon=363)

    if (setup_Huber == 2):
        model = HuberRegressor(fit_intercept=True,
                               alpha=0.05,
                               max_iter=200,
                               epsilon=1.2)

    model.fit(train_X, train_y)
    print('[{}] Predict Huber completed.'.format(time.time() - start_time))
    predsH = model.predict(X=X_test)
    #--- END Huber

    # original
    # preds = (predsF * 0.2 + predsL * 0.3 + predsFM * 0.5)

    # modified setup (IT NEEDS MORE TUNING TESTS)
    w = (0.09, 0.11, 0.23, 0.57)

    preds = predsH * w[0] + predsF * w[1] + predsL * w[2] + predsFM * w[3]

    submission['price'] = np.expm1(preds)
    submission.to_csv("sub ftrl_fm_lgb_huber v3.csv", index=False)

    nm = (time.time() - start_time) / 60
    print("Total processing time %s min" % nm)
Example #12
def main(test, logger):

    logger.info('Start . . .')
    train = pd.read_table('../input/train.tsv', engine='c')
    logger.info('Load train')
    logger.info('train shape {}'.format(train.shape))
    logger.info('test shape {}'.format(test.shape))
    nrow_full_train = train.shape[0]  # original row count: test rows start here in df_full

    train_low_price = train.loc[train['price'] < 1.]
    train = train.drop(train[train['price'] < 1.].index)
    del train_low_price['price']
    # Compute these only after dropping the low-price rows, so that y stays
    # aligned with the first nrow_train rows of df_full (the retained train rows).
    nrow_train = train.shape[0]
    y = np.log1p(train['price'])
    logger.info('train_low_price shape {}'.format(train_low_price.shape))

    df_full = pd.concat([train, train_low_price, test])
    logger.info('df_full shape {}'.format(df_full.shape))

    sub = test[['test_id']]
    logger.info('sub shape {}'.format(sub.shape))

    del train, test
    gc.collect()

    df_full['general_cat'], df_full['subcat_1'], df_full['subcat_2'] = zip(
        *df_full['category_name'].apply(lambda x: split_category(x)))
    df_full.drop(['category_name'], axis=1, inplace=True)
    logger.info('Split category_name')
    gc.collect()

    df_full = impute_missing_value(df_full)
    logger.info('Impute missing value')
    gc.collect()

    df_full = cut_df(df_full)
    logger.info('Cut categories')
    gc.collect()

    df_full = to_categorical(df_full)
    logger.info('Convert to categorical features')
    gc.collect()

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(df_full['name'])
    del wb
    gc.collect()
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    logger.info('Vectorize name')
    gc.collect()

    cnt_vec = CountVectorizer()
    X_cat_1 = cnt_vec.fit_transform(df_full['general_cat'])
    X_cat_2 = cnt_vec.fit_transform(df_full['subcat_1'])
    X_cat_3 = cnt_vec.fit_transform(df_full['subcat_2'])
    df_full.drop(['general_cat', 'subcat_1', 'subcat_2'], axis=1, inplace=True)
    del cnt_vec
    gc.collect()
    logger.info('Vectorize category (general_cat, subcat_1, subcat_2)')

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**29,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=2)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(df_full['item_description'])
    del wb
    gc.collect()
    X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    logger.info('Vectorize item_description')
    gc.collect()

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(df_full['brand_name'])
    df_full.drop(['brand_name'], axis=1, inplace=True)
    del lb
    gc.collect()
    logger.info('Label binarize brand_name')

    X_dummies = csr_matrix(
        pd.get_dummies(df_full[['item_condition_id', 'shipping']],
                       sparse=True).values)
    df_full.drop(['item_condition_id', 'shipping'], axis=1, inplace=True)
    logger.info('Get dummies on item_condition_id and shipping')
    gc.collect()

    sparse_merge = hstack((X_dummies, X_description, X_brand, X_cat_1, X_cat_2,
                           X_cat_3, X_name)).tocsr()
    logger.info('Create sparse features')
    logger.info('sparse_merge shape {}'.format(sparse_merge.shape))
    del X_dummies, X_description, X_brand, X_cat_1, X_cat_2, X_cat_3, X_name
    gc.collect()

    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    logger.info('Remove features with doc frequency <= 1')
    logger.info('sparse_merge shape {}'.format(sparse_merge.shape))

    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_full_train:]  # slice past the re-appended low-price rows
    sparse_merge_shape = sparse_merge.shape
    del sparse_merge
    gc.collect()

    model = FTRL(alpha=0.01,
                 beta=0.1,
                 L1=0.00001,
                 L2=1.0,
                 D=sparse_merge_shape[1],
                 iters=30,
                 inv_link="identity",
                 threads=1)
    model.fit(X, y)
    logger.info('Fit FTRL')
    preds_FTRL = model.predict(X_test)
    logger.info('Predict FTRL')

    model = FM_FTRL(alpha=0.01,
                    beta=0.1,
                    L1=0.00001,
                    L2=0.1,
                    D=sparse_merge_shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=20,
                    inv_link="identity",
                    threads=4)
    model.fit(X, y)
    logger.info('Fit FM_FTRL')
    preds_FM_FTRL = model.predict(X_test)
    logger.info('Predict FM_FTRL')

    preds = (np.expm1(preds_FTRL) * 0.15 + np.expm1(preds_FM_FTRL) * 0.85)
    logger.info('Final predictions generated')
    return preds
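Note that this example blends in price space: np.expm1 is applied to each model's predictions before weighting, whereas the other pipelines in this listing average log-price predictions and apply np.expm1 once at the end. The two schemes are not equivalent.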
Example #13
def main():
    start_time = time.time()
    from time import gmtime, strftime
    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    # if 1 == 1:
    train = pd.read_table('../input/mercari-price-suggestion-challenge/train.tsv', engine='c')
    test = pd.read_table('../input/mercari-price-suggestion-challenge/test.tsv', engine='c')

    #train = pd.read_table('../input/train.tsv', engine='c')
    #test = pd.read_table('../input/test.tsv', engine='c')

    print('[{}] Finished loading data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge: pd.DataFrame = pd.concat([train, dftt, test])
    submission: pd.DataFrame = test[['test_id']]

    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() - start_time))

    wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.5, 1.0],
                                                                  "hash_size": 2 ** 29, "norm": None, "tf": 'binary',
                                                                  "idf": None,
                                                                  }), procs=8)
    wb.dictionary_freeze= True
    X_name = wb.fit_transform(merge['name'])
    del wb
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))

    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0],
                                                                  "hash_size": 2 ** 28, "norm": "l2", "tf": 1.0,
                                                                  "idf": None})
                             , procs=8)
    wb.dictionary_freeze= True
    X_description = wb.fit_transform(merge['item_description'])
    del wb
    X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))

    X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']],
                                          sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.format(time.time() - start_time))
    print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2, X_category3, X_name)).tocsr()

    print('[{}] Create sparse merge completed'.format(time.time() - start_time))

    #    pd.to_pickle((sparse_merge, y), "xy.pkl")
    # else:
    #    nrow_train, nrow_test= 1481661, 1482535
    #    sparse_merge, y = pd.read_pickle("xy.pkl")

    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    gc.collect()
    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.05, random_state=100)

    model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=sparse_merge.shape[1], iters=50, inv_link="identity", threads=1)

    model.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsF = model.predict(X_test)
    print('[{}] Predict FTRL completed'.format(time.time() - start_time))

    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1, D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0, init_fm=0.01,
                    D_fm=200, e_noise=0.0001, iters=15, inv_link="identity", threads=4)

    model.fit(train_X, train_y)
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))

    params = {
        'learning_rate': 0.6,
        'application': 'regression',
        'max_depth': 4,
        'num_leaves': 31,
        'verbosity': -1,
        'metric': 'RMSE',
        'data_random_seed': 1,
        'bagging_fraction': 0.6,
        'bagging_freq': 5,
        'feature_fraction': 0.6,
        'nthread': 4,
        'min_data_in_leaf': 100,
        'max_bin': 31
    }

    # Remove features with document frequency <=100
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 100, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.05, random_state=100)

    d_train = lgb.Dataset(train_X, label=train_y)
    watchlist = [d_train]
    if develop:
        d_valid = lgb.Dataset(valid_X, label=valid_y)
        watchlist = [d_train, d_valid]

    model = lgb.train(params, train_set=d_train, num_boost_round=6000, valid_sets=watchlist, \
                      early_stopping_rounds=1000, verbose_eval=1000)

    if develop:
        preds = model.predict(valid_X)
        print("LGB dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsL = model.predict(X_test)

    print('[{}] Predict LGB completed.'.format(time.time() - start_time))

    preds = (predsF * 0.2 + predsL * 0.3 + predsFM * 0.5)

    submission['price'] = np.expm1(preds)
    submission.to_csv("submission_wordbatch_ftrl_fm_lgb.csv", index=False)
Example #14
    if enable_validation:
        truth_sr = np.log1p(truth)

    del train, target
    gc.collect()

    FTRL_model = FTRL(alpha=0.01,
                      beta=0.1,
                      L1=0.00001,
                      L2=1.0,
                      D=X.shape[1],
                      iters=50,
                      inv_link="identity",
                      threads=1)
    FTRL_model.fit(X, y)
    print("[{}] Train FTRL completed".format(time.time() - start_time))

    FM_FTRL_model = FM_FTRL(alpha=0.01,
                            beta=0.01,
                            L1=0.00001,
                            L2=0.1,
                            D=X.shape[1],
                            alpha_fm=0.01,
                            L2_fm=0.0,
                            init_fm=0.01,
                            D_fm=200,
                            e_noise=0.0001,
                            iters=17,
                            inv_link="identity",
                            threads=4)
Example #15
X_description = X_description[:, np.where(X_description.getnnz(axis=0) > 1)[0]]
lb = LabelBinarizer(sparse_output=True)
X_brand = lb.fit_transform(merge['Source'])
X_dummies = csr_matrix(pd.get_dummies(merge[['IDLink', 'Facebook']],
                                      sparse=True).values)
sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_name)).tocsr()
sparse_merge = sparse_merge[:, np.where(sparse_merge.getnnz(axis=0) > 100)[0]]
X = sparse_merge[:nrow_train]
X_test = sparse_merge[nrow_test:]
print(sparse_merge.shape)
gc.collect()
train_X, train_y = X, y
if develop:
    train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.05, random_state=100)
model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=sparse_merge.shape[1], iters=50, inv_link="identity", threads=1)
model.fit(train_X, train_y)
print('[{}] Train FTRL completed'.format(time.time() - start_time))
if develop:
    preds = model.predict(X=valid_X)
    print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))
predsF = model.predict(X_test)
print('[{}] Predict FTRL completed'.format(time.time() - start_time))
model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1, D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0, init_fm=0.01,
                D_fm=200, e_noise=0.0001, iters=17, inv_link="identity", threads=4)
model.fit(train_X, train_y)
print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
if develop:
    preds = model.predict(X=valid_X)
    print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))
predsFM = model.predict(X_test)
print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))
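These kernels repeatedly call an rmsle helper that is defined outside the excerpts. A common definition, shown here as an assumption rather than the original code:

import numpy as np

def rmsle(y_true, y_pred):
    # Root mean squared logarithmic error between positive-valued arrays.
    return np.sqrt(np.mean(np.square(np.log1p(y_true) - np.log1p(y_pred))))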
Example #16
class WordvecRegressor(object):
    def __init__(self, pickle_model="", datadir=None, batcher=None):
        self.wb = WordBatch(
            normalize_text,
            extractor=Hstack([
                WordVec(wordvec_file=
                        "../../../data/word2vec/glove.twitter.27B.100d.txt.gz",
                        normalize_text=normalize_text,
                        encoding="utf8"),
                WordVec(
                    wordvec_file="../../../data/word2vec/glove.6B.50d.txt.gz",
                    normalize_text=normalize_text,
                    encoding="utf8")
            ]))
        # from wordbatch.pipelines import FeatureUnion
        # from wordbatch.transformers import Dictionary, TextNormalizer
        # from sklearn.pipeline import Pipeline
        # tn= TextNormalizer(normalize_text=normalize_text)
        # dct= Dictionary()
        # vec1= WordVec(wordvec_file="../../../data/word2vec/glove.twitter.27B.100d.txt.gz",
        # 			  normalize_text=normalize_text, encoding="utf8", dictionary= dct)
        # vec2= WordVec(wordvec_file="../../../data/word2vec/glove.6B.50d.txt.gz",
        # 			  normalize_text=normalize_text, encoding="utf8", dictionary= dct)
        # self.wb = Pipeline(steps= [("tn", tn), ("dct", dct), ("vecs", FeatureUnion([("vec1", vec1), ("vec2", vec2)]))])
        self.batcher = batcher

        self.clf = FTRL(alpha=1.0,
                        beta=1.0,
                        L1=0.00001,
                        L2=1.0,
                        D=100 + 50,
                        iters=1,
                        inv_link="identity")
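        # D = 100 + 50 matches the concatenated width of the two GloVe
        # embeddings loaded above (100d Twitter + 50d Wikipedia vectors).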

        if datadir == None:
            (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
        else:
            self.train(datadir, pickle_model)

    def fit_batch(self, texts, labels, rcount):
        texts, labels = shuffle(texts, labels)
        print("Transforming", rcount)
        #texts= self.wb.fit_transform(texts, tn__batcher=self.batcher, dct__reset= False, dct__batcher= self.batcher)
        texts = self.wb.fit_transform(texts)
        print("Training", rcount)
        self.clf.fit(texts, labels, reset=False)

    def train(self, datadir, pickle_model=""):
        texts = []
        labels = []
        training_data = os.listdir(datadir)
        rcount = 0
        batchsize = 80000

        p = None
        for jsonfile in training_data:
            with open(datadir + "/" + jsonfile, 'r') as inputfile:
                for line in inputfile:
                    #if rcount > 1000000: break
                    try:
                        line = json.loads(line.strip())
                    except:
                        continue
                    for review in line["Reviews"]:
                        rcount += 1
                        if rcount % 100000 == 0: print(rcount)
                        if rcount % 6 != 0: continue
                        if "Overall" not in review["Ratings"]: continue
                        texts.append(review["Content"])
                        labels.append(
                            (float(review["Ratings"]["Overall"]) - 3) * 0.5)
                        if len(texts) % batchsize == 0:
                            if p != None: p.join()
                            p = threading.Thread(target=self.fit_batch,
                                                 args=(texts, labels, rcount))
                            p.start()
                            texts = []
                            labels = []
        if p != None: p.join()
        self.fit_batch(texts, labels, rcount)

        # if pickle_model!="":
        # 	with gzip.open(pickle_model, 'wb') as model_file:
        # 		backend = self.wb.batcher.backend
        # 		backend_handle = self.wb.batcher.backend_handle
        # 		self.wb.batcher.backend = "serial"
        # 		self.wb.batcher.backend_handle = None
        # 		pkl.dump((self.wb, self.clf), model_file, protocol=2)
        # 		self.wb.batcher.backend = backend
        # 		self.wb.batcher.backend_handle = backend_handle

    def predict(self, texts):
        vecs = self.wb.transform(texts)
        return self.clf.predict(vecs)
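A minimal usage sketch for the class above, assuming a directory of TripAdvisor-style JSON review files matching the layout the train loop expects (paths and texts are illustrative, not from the original source):

model = WordvecRegressor(datadir="../data/json")
scores = model.predict(["Great location and friendly staff",
                        "The room was dirty and noisy"])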
Example #17
class WordbagRegressor(object):
    def __init__(self, pickle_model="", datadir=None):
        from pyspark import SparkContext
        self.sc= SparkContext()
        self.wordbatch = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams":3,
          "hash_ngrams_weights":[-1.0, -1.0, 1.0],"hash_size":2**23, "norm":'l2', "tf":'binary', "idf":50.0}))
        self.clf= FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 25, iters=1, inv_link="identity")
        self.wordbatch.use_sc= True
        if datadir==None:  (self.wordbatch, self.clf)= pkl.load(gzip.open(pickle_model, 'rb'))
        else: self.train(datadir, pickle_model)

    def fit_batch(self, texts, labels, rcount):
        texts, labels= self.wordbatch.shuffle_batch(texts, labels, rcount)
        print("Transforming", rcount)
        if self.sc != None:
            data_rdd= self.wordbatch.lists2rddbatches([texts, labels], self.sc)
            data_rdd= self.wordbatch.transform(data_rdd)
            [texts, labels]= self.wordbatch.rddbatches2lists(data_rdd)
        else:
            texts= self.wordbatch.transform(texts)
        print("Training", rcount)
        self.clf.fit(texts, labels)

    def train(self, datadir, pickle_model=""):
        texts= []
        labels= []
        training_data= os.listdir(datadir)
        rcount= 0
        batchsize= 20000

        p = None
        for jsonfile in training_data:
            with open(datadir + "/" + jsonfile, 'r') as inputfile:
                for line in inputfile:
                    #if rcount > 1000000: break
                    try: line = json.loads(line.strip())
                    except:  continue
                    for review in line["Reviews"]:
                        rcount+= 1
                        if rcount % 100000 == 0:  print(rcount)
                        if rcount % 7 != 0: continue
                        if "Overall" not in review["Ratings"]: continue
                        texts.append(review["Content"])
                        labels.append((float(review["Ratings"]["Overall"]) - 3) *0.5)
                        if len(texts) % batchsize == 0:
                            if p != None:  p.join()
                            p= threading.Thread(target=self.fit_batch, args=(texts, labels, rcount))
                            p.start()
                            texts= []
                            labels= []
        if p != None:  p.join()
        self.fit_batch(texts, labels, rcount)

        self.wordbatch.dictionary_freeze= True

        if pickle_model!="":
            with gzip.open(pickle_model, 'wb') as model_file:
                pkl.dump((self.wordbatch, self.clf), model_file, protocol=2)

    def predict(self, texts):
        if self.sc != None:
            data_rdd= self.wordbatch.lists2rddbatches([texts, []], self.sc)
            data_rdd= self.wordbatch.transform(data_rdd)
            [counts, labels]= self.wordbatch.rddbatches2lists(data_rdd)
        else: counts= self.wordbatch.transform(texts)
        return self.clf.predict(counts)

    def predict_parallel(self, texts):
        if self.sc != None:
            data_rdd= self.wordbatch.lists2rddbatches([texts, []], self.sc)
            counts_rdd= self.wordbatch.transform(data_rdd)
            return self.wordbatch.rddbatches2lists(self.wordbatch.predict_parallel(counts_rdd, self.clf))[0]
        counts= self.wordbatch.transform(texts)
        return self.wordbatch.predict_parallel(counts, self.clf)
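A minimal sketch of driving the Spark-backed regressor above (paths illustrative); predict_parallel distributes the transform and prediction over the SparkContext created in __init__:

model = WordbagRegressor(datadir="../data/json")
scores = model.predict_parallel(["Great location", "Terrible service"])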
Example #18
def wordbatch_algo():
    import time

    # print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    train = pd.read_table('../input/train.tsv', engine='c')
    # Drop rows where price = 0
    train = train[train.price != 0].reset_index(drop=True)
    print('[{}] Finished loading data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)

    y = np.log1p(train["price"])

    nrow_train = train.shape[0]

    # Training
    train['general_cat'], train['subcat_1'], train['subcat_2'] = \
        zip(*train['category_name'].apply(lambda x: split_cat(x)))
    train.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))
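    # split_cat is defined earlier in the kernel (not shown here); a common
    # implementation, given as an assumption, splits "A/B/C" category paths
    # into three levels and backfills missing values:
    #   def split_cat(text):
    #       try: return text.split("/")
    #       except AttributeError: return ("No Label", "No Label", "No Label")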

    handle_missing_inplace(train)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(train)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(train)
    print('[{}] Convert categorical completed'.format(time.time() -
                                                      start_time))

    # Add some new features: text lengths (note: computed here but never
    # hstacked into sparse_merge below). .to_numpy() replaces the
    # long-deprecated .as_matrix().
    X_len_desc = train['item_description'].apply(len).to_numpy().reshape(-1, 1)
    X_len_name = train['name'].apply(len).to_numpy().reshape(-1, 1)

    print('[{}] Length of text completed.'.format(time.time() - start_time))

    # Name
    wb_name = wordbatch.WordBatch(normalize_text,
                                  extractor=(WordBag, {
                                      "hash_ngrams": 2,
                                      "hash_ngrams_weights": [1.5, 1.0],
                                      "hash_size": 2**29,
                                      "norm": None,
                                      "tf": 'binary',
                                      "idf": None,
                                  }),
                                  procs=8)

    wb_name.dictionary_freeze = True
    wb_name.fit(train['name'])
    X_name = wb_name.transform(train['name'])

    # X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    wb_cat1 = CountVectorizer()
    wb_cat2 = CountVectorizer()
    wb_cat3 = CountVectorizer()
    wb_cat1.fit(train['general_cat'])
    wb_cat2.fit(train['subcat_1'])
    wb_cat3.fit(train['subcat_2'])

    X_category1 = wb_cat1.transform(train['general_cat'])
    X_category2 = wb_cat2.transform(train['subcat_1'])
    X_category3 = wb_cat3.transform(train['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb_desc = wordbatch.WordBatch(normalize_text,
                                  extractor=(WordBag, {
                                      "hash_ngrams": 2,
                                      "hash_ngrams_weights": [1.0, 1.0],
                                      "hash_size": 2**28,
                                      "norm": "l2",
                                      "tf": 1.0,
                                      "idf": None
                                  }),
                                  procs=8)
    wb_desc.dictionary_freeze = True
    wb_desc.fit(train['item_description'])
    X_description = wb_desc.transform(train['item_description'])

    # X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))

    lb = LabelBinarizer(sparse_output=True)
    lb.fit(train['brand_name'])
    X_brand = lb.transform(train['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    X_cond, d_cond = fit_dummy(train['item_condition_id'].tolist())
    X_ship, d_ship = fit_dummy(train['shipping'].tolist())

    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))
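    # fit_dummy/transform_dummy are helpers defined outside this excerpt; the
    # apparent contract (an assumption) is that fit_dummy returns a one-hot
    # CSR matrix plus the value->column mapping, and transform_dummy reuses
    # that mapping so test columns stay aligned with train.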

    del train
    gc.collect()

    print(X_cond.shape, X_ship.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape)
    sparse_merge = hstack((X_cond, X_ship, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name)).tocsr()

    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))
    del X_description, X_brand, X_category1, X_category2, X_category3, X_name
    gc.collect()

    # Remove features with document frequency <=1

    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    print(sparse_merge.shape)
    X = sparse_merge

    # ---------------------------------------
    # FM model fit
    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(
            X, y, test_size=TEST_SIZE, random_state=SPLIT_SEED)

    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=train_X.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=FM_iter,
                    inv_link="identity",
                    threads=4)

    model.fit(train_X, train_y)
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    print('-' * 20)
    if develop:
        preds = model.predict(X=valid_X)
        print("->>>>  FM_FTRL dev RMSLE:",
              rmsle(np.expm1(valid_y), np.expm1(preds)))

    # ---------------------------------------
    # FTRL model fit
    model2 = FTRL(alpha=0.01,
                  beta=0.01,
                  L1=0.00001,
                  L2=1.0,
                  D=train_X.shape[1],
                  iters=FTRL_iter,
                  inv_link="identity",
                  threads=1)
    # del X; gc.collect()
    model2.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model2.predict(X=valid_X)
        print("->>>>  FTRL dev RMSLE:",
              rmsle(np.expm1(valid_y), np.expm1(preds)))

    # Clear variables:
    del X, train_X, train_y, sparse_merge
    gc.collect()

    # ---------------------------------------
    # Testing by chunk
    print(' FM/FTRL: ...reading the test data...')
    predsFM = []
    predsF = []

    for test in load_test():
        test['general_cat'], test['subcat_1'], test['subcat_2'] = \
            zip(*test['category_name'].apply(lambda x: split_cat(x)))
        test.drop('category_name', axis=1, inplace=True)

        handle_missing_inplace(test)
        #print('[{}] Handle missing completed.'.format(time.time() - start_time))

        cutting(test)
        # print('[{}] Cut completed.'.format(time.time() - start_time))

        to_categorical(test)
        # print('[{}] Convert categorical completed'.format(time.time() - start_time))

        # Add some new features: text lengths (computed but not included in
        # X_test below).
        X_len_desc_test = test['item_description'].apply(len).to_numpy().reshape(-1, 1)
        X_len_name_test = test['name'].apply(len).to_numpy().reshape(-1, 1)

        X_name_test = wb_name.transform(test['name'])
        # X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]

        X_category1_test = wb_cat1.transform(test['general_cat'])
        X_category2_test = wb_cat2.transform(test['subcat_1'])
        X_category3_test = wb_cat3.transform(test['subcat_2'])

        X_description_test = wb_desc.transform(test['item_description'])
        # X_description_test = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]

        X_brand_test = lb.transform(test['brand_name'])

        X_cond_test = transform_dummy(test['item_condition_id'].tolist(),
                                      d_cond)
        X_ship_test = transform_dummy(test['shipping'].tolist(), d_ship)


        X_test = hstack((X_cond_test, X_ship_test, X_description_test, X_brand_test, X_category1_test,
                         X_category2_test, X_category3_test, X_name_test)).tocsr()
        X_test = X_test[:, mask]

        # Clear variables:
        del X_cond_test, X_ship_test, X_description_test, X_brand_test, X_category1_test, X_category2_test, X_category3_test, X_name_test
        del test
        gc.collect()

        predsFM_batch = model.predict(X_test)
        predsFM += np.array(predsFM_batch).flatten().tolist()

        predsF_batch = model2.predict(X_test)
        predsF += np.array(predsF_batch).flatten().tolist()

    print(np.array(predsFM))
    print('-' * 20)

    print(np.array(predsF))
    print('-' * 20)

    return np.array(predsFM), np.array(predsF)
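A minimal sketch of consuming the two prediction arrays returned above; the blend weights are illustrative, not from the original source:

predsFM, predsF = wordbatch_algo()
blended = 0.6 * predsFM + 0.4 * predsF  # both are log1p(price) predictions
prices = np.expm1(blended)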
Example #19
####
# model.fit(train_X, train_y)
# print('[{}] Train FTRL completed'.format(time.time() - start_time))
# if develop:
#     preds = model.predict(X=valid_X)
#     print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

# predsF = model.predict(X_test)
print('[{}] Predict FTRL completed'.format(time.time() - start_time))

model = FM_FTRL(alpha=0.1410, beta=0.1896, L1=4.9447, L2=9.8198, D=d_shape, alpha_fm=0.0498, L2_fm=0.0027, 
    init_fm=0.0040, D_fm=int(99), e_noise=0.0172, iters=int(3), inv_link="identity", 
    threads=4, seed=2017)
gc.collect() 
model.fit(train_X, train_y)
print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
del train_X
del train_y
gc.collect()

if develop:
    preds = model.predict(X=valid_X)
    print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

predsFM = model.predict(X_test)
print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))



preds = predsFM