import codecs
import json

import numpy as np


def createListStopWords():
    # Load the key -> word-list mapping and flatten it into a single list.
    with codecs.open('stopwords.json', 'r', 'utf-8-sig') as f:
        stopwords = json.load(f)
    listStopwords = []
    for key in stopwords.keys():
        for word in stopwords[key]:
            listStopwords.append(word)
    # Deduplicate and persist as a NumPy array (written to listStopWords.npy).
    listStopwords = np.unique(np.array(listStopwords))
    np.save("listStopWords", listStopwords)
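# Minimal usage sketch (an assumption, not part of the original code): np.save
# above writes "listStopWords.npy", so the deduplicated array can be reloaded
# and turned into a set for fast membership tests.
def loadListStopWords():
    return set(np.load("listStopWords.npy").tolist())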
'''A word that carries no meaning in a text is called a stopword.'''
import nltk  # word_tokenize requires the NLTK 'punkt' data: nltk.download('punkt')
from nltk.tokenize import word_tokenize

example_sentence = "এটা হল একটা উধারন যেটা দিয়ে আমরা কাজ করব এবং যেটা অনেক প্যাড়া দিবে ।"
stopwords = {}

if __name__ == '__main__':
    # Map the stopwords from file into a dictionary.
    with open('stopwords-bn.txt', 'r', encoding='utf-8') as f:
        for line in f:
            word = line.rstrip('\n')
            #print(word)
            stopwords[word] = 1

    # Now tokenize the sentence and check each token's existence in the dictionary.
    for token in word_tokenize(example_sentence):
        if token in stopwords:
            print(token)
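# A small follow-on sketch (an assumption, not in the original script): rather
# than only printing the stopwords found in the sentence, drop them and keep
# the remaining tokens.
def remove_stopwords(sentence, stopword_dict):
    return ' '.join(t for t in word_tokenize(sentence) if t not in stopword_dict)
# e.g. remove_stopwords(example_sentence, stopwords)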
# Imports assumed by this excerpt; helpers such as split_cat, handle_missing_inplace,
# cutting, to_categorical, replace_maps, normalize_text, rmsle and cpuStats are
# defined elsewhere in the script, as are `start_time` and the `develop` flag.
import gc
import os
import time

import numpy as np
import pandas as pd
import wordbatch
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from wordbatch.extractors import WordBag
from wordbatch.models import FM_FTRL


def getFMFTRL():
    #os.chdir('/Users/dhanley2/Documents/mercari/data')
    os.chdir('/home/darragh/mercari/data')
    train = pd.read_csv('../data/train.tsv', sep='\t', encoding='utf-8')
    test = pd.read_csv('../data/test.tsv', sep='\t', encoding='utf-8')
    glove_file = '../feat/glove.6B.50d.txt'
    threads = 4
    save_dir = '../feat'
    print('[{}] Finished loading data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)

    # nrow_test marks where the test rows begin in `merge`: the train rows kept
    # below plus the price < 1.0 rows that are re-appended before `test`.
    nrow_test = train.shape[0]  # -dftt.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    merge['target'] = np.log1p(merge["price"])
    submission = test[['test_id']]

    # Prepend the brand to the name when the brand is known but not already
    # contained in the name (brand_name == brand_name filters out NaN).
    ix = (merge['brand_name'] == merge['brand_name']) & (
        ~merge['brand_name'].str.lower().fillna('ZZZZZZ').isin(
            merge['name'].str.lower()))
    merge['name'][ix] = merge['brand_name'][ix] + ' ' + merge['name'][ix]

    # EXTRACT DEVELOPMENT TEST
    trnidx, validx = train_test_split(range(train.shape[0]),
                                      random_state=233,
                                      train_size=0.90)
    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    #merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))
    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))
    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() - start_time))

    # dict.keys() cannot be concatenated with a list in Python 3; take the union instead.
    stopw = set(stopwords.keys()) | {'&'}
    merge['intersection'] = [
        list(set(n.lower().split()) & set(d.lower().split()))
        for (n, d) in zip(merge.name.values, merge.item_description.values)
    ]
    merge['intersection'] = [
        ' '.join([w for w in l if w not in stopw])
        for l in merge['intersection']
    ]

    # Expand common marketplace abbreviations before vectorizing.
    tech_mapper = {
        'unblocked': 'unlocked',
        'bnwt': 'brand new with tags',
        'nwt': 'new with tags',
        'bnwot': 'brand new without tags',
        'bnwob': 'brand new without box',
        'nwot': 'new without tags',
        'bnip': 'brand new in packet',
        'nip': 'new in packet',
        'bnib': 'brand new in box',
        'nib': 'new in box',
        'mib': 'mint in box',
        'mwob:': 'mint without box',
        'mip': 'mint in packet',
        'mwop': 'mint without packet',
    }
    import multiprocessing as mp
    pool = mp.Pool(processes=4)
    for col in ['name', 'item_description']:
        merge[col] = merge[col].str.lower()
        merge[col] = pool.map(replace_maps, merge[col].values)
    pool.close()  # was `pool.close` without parentheses, so the pool was never closed

    ''' Crossed columns '''
    # My understanding of how to replicate what layers.crossed_column does. One
    # can read here: https://www.tensorflow.org/tutorials/linear.
    def cross_columns(x_cols):
        """Simple helper to build the crossed columns in a pandas dataframe."""
        crossed_columns = dict()
        colnames = ['_'.join(x_c) for x_c in x_cols]
        for cname, x_c in zip(colnames, x_cols):
            crossed_columns[cname] = x_c
        return crossed_columns

    merge['item_condition_id_str'] = merge['item_condition_id'].astype(str)
    merge['shipping_str'] = merge['shipping'].astype(str)
    x_cols = (
        ['brand_name', 'item_condition_id_str'],
        ['brand_name', 'subcat_1'],
        ['brand_name', 'subcat_2'],
        ['brand_name', 'general_cat'],
        #['brand_name', 'subcat_1', 'item_condition_id_str'],
        #['brand_name', 'subcat_2', 'item_condition_id_str'],
        #['brand_name', 'general_cat', 'item_condition_id_str'],
        ['brand_name', 'shipping_str'],
        ['shipping_str', 'item_condition_id_str'],
        ['shipping_str', 'subcat_2'],
        ['item_condition_id_str', 'subcat_2'],
    )
    crossed_columns_d = cross_columns(x_cols)
    categorical_columns = list(merge.select_dtypes(include=['object']).columns)

    # Hash each member column into D buckets, offset the columns so equal hash
    # values in different columns stay distinct, then sum to get one id per row
    # for the crossed feature.
    D = 2**30
    for k, v in crossed_columns_d.items():
        print('Crossed column ', k)
        outls_ = []
        indicator = 0
        for col in v:
            outls_.append((np.array(merge[col].apply(hash))) % D + indicator)
            indicator += 10**6
        merge[k] = sum(outls_).tolist()

    ''' Count crossed cols '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    lb = LabelBinarizer(sparse_output=True)
    x_col = lb.fit_transform(merge[cross_nm[0]])
    for i in range(1, len(cross_nm)):
        x_col = hstack((x_col, lb.fit_transform(merge[cross_nm[i]])))
    del lb

    ''' Encode Original Strings '''
    '''
    for col in ['item_description', 'name']:
        lb = LabelBinarizer(sparse_output=True)
        if 'X_orig' not in locals():
            X_orig = lb.fit_transform(merge[col].apply(hash))
        else:
            X_orig = hstack((X_orig, lb.fit_transform(merge[col].apply(hash))))
    X_orig = hstack((X_orig, lb.fit_transform((merge['item_description'] + merge['name']).apply(hash))))
    X_orig = hstack((X_orig, lb.fit_transform((merge['brand_name'] + merge['name']).apply(hash))))
    X_orig = hstack((X_orig, lb.fit_transform((merge['subcat_2'] + merge['name']).apply(hash))))
    X_orig = hstack((X_orig, lb.fit_transform((merge['brand_name'] + merge['name'] + merge['item_description']).apply(hash))))
    X_orig = X_orig.tocsr()
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 2, 0, 1), dtype=bool)]
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 5000, 1, 0), dtype=bool)]
    print('Shape of original hash', X_orig.shape)
    X_orig = X_orig.tocoo()
    '''
    gc.collect()
    cpuStats()

    ''' Hash name '''
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del wb
    # Keep only hashed n-grams that appear in more than one document.
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1),
                                dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    ''' Hash category '''
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**20,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    cat = merge["category_name"].str.replace('/', ' ')
    X_cat = wb.fit_transform(cat)
    del wb
    X_cat = X_cat[:, np.array(np.clip(X_cat.getnnz(axis=0) - 1, 0, 1),
                              dtype=bool)]
    print('[{}] Vectorize `category` completed.'.format(time.time() - start_time))

    ''' Count category '''
    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))

    # wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**28,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del wb
    X_description = X_description[:, np.array(np.clip(
        X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))

    '''
    print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape,
          X_category2.shape, X_category3.shape, X_name.shape, X_cat.shape,
          x_col.shape, X_orig.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name, X_cat, x_col,
                           X_orig)).tocsr()
    '''
    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape, X_cat.shape, x_col.shape)
    sparse_merge = hstack(
        (X_dummies, X_description, X_brand, X_category1, X_category2,
         X_category3, X_name, X_cat, x_col)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() - start_time))

    # Remove features with document frequency <= 1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)
    gc.collect()

    if develop:
        #train_X1, valid_X1, train_y1, valid_y1 = train_test_split(X, y, train_size=0.90, random_state=233)
        train_X, valid_X, train_y, valid_y = (X[trnidx], X[validx],
                                              y.values[trnidx], y.values[validx])

    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1,
                    D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0,
                    init_fm=0.01, D_fm=200, e_noise=0.0001, iters=1,
                    inv_link="identity", threads=threads)  #iters=15

    baseline = 1.
    # Train one pass at a time and stop as soon as the validation RMSLE stops improving.
    for i in range(15):
        model.fit(train_X, train_y, verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline:
            baseline = score_
        else:
            break

    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        predsfm = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(predsfm)))

    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))

    return merge, trnidx, validx, nrow_train, nrow_test, glove_file, predsFM, predsfm
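# A hedged, standalone illustration (not part of the original pipeline) of the
# crossed-column hashing trick used inside getFMFTRL() above: each member column
# of a cross is hashed into D buckets, the columns are offset so identical hash
# values in different columns stay distinct, and the per-column codes are summed
# into a single id for the combined categorical feature. Collisions are possible
# by design; D controls how rare they are. The column values here are toy data.
def toy_crossed_column_demo():
    import pandas as pd

    df = pd.DataFrame({
        'brand_name': ['nike', 'apple', 'nike'],
        'shipping_str': ['0', '1', '0'],
    })
    D = 2**30
    outls_, indicator = [], 0
    for col in ['brand_name', 'shipping_str']:
        outls_.append(df[col].apply(hash).values % D + indicator)
        indicator += 10**6
    # Rows 0 and 2 share brand and shipping, so they receive the same crossed id.
    df['brand_name_shipping_str'] = sum(outls_)
    return df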
merge["seq_item_description"] = fit_sequence(
    merge.description_token.str.lower(), tok_raw_dsc)
merge['seq_item_description_rev'] = [
    list(reversed(l)) for l in merge.seq_item_description
]
gc.collect()
merge["seq_name"] = fit_sequence(merge.name.str.lower(), tok_raw_nam)
gc.collect()
merge["seq_name_token"] = fit_sequence(merge.name_token.str.lower(),
                                       tok_raw_ntk, filt=False)
gc.collect()
print('[{}] Finished PROCESSING TEXT DATA...'.format(time.time() - start_time))
merge.head()

# EXTRACT DEVELOPMENT TEST
stopw = set(stopwords.keys())


# Build sparse row/column/data triplets of embedding positions, later used to
# take the dot product with the embedding matrix.
def posn_to_sparse(dt, embedding_map):
    sprow = []
    spcol = []
    spdata = []
    embw = set(embedding_map.keys())
    embid_lens = []
    for c, (nm, ct) in enumerate(
            zip(dt['name_token'].values, dt['category_token'].values)):
        sent = " ".join([nm, ct])
        ids = [
            embedding_map[s] for s in sent.split(' ')
            if (s in embw) and (s not in stopw)
tok_raw_cat = myTokenizerFit(
    mergetrn.category_name_split[:nrow_train].str.lower().unique(),
    max_words=800)
gc.collect()
tok_raw_nam = myTokenizerFit(mergetrn.name[:nrow_train].str.lower().unique(),
                             max_words=25000)
gc.collect()
tok_raw_dsc = myTokenizerFit(
    mergetrn.description_token[:nrow_train].str.lower().unique(),
    max_words=25000)
gc.collect()
tok_raw_ntk = myTokenizerFit(
    mergetrn.name_token[:nrow_train].str.lower().unique(), max_words=50000)
gc.collect()

stoppers = set(stopwords.keys())
# Invert each tokenizer to map token id -> word for the tokens to drop:
# single-character words and stopwords.
tok_raw_cat_drop = dict([(v, k) for (k, v) in tok_raw_cat.items()
                         if len(k) == 1 or k in stoppers])
tok_raw_nam_drop = dict([(v, k) for (k, v) in tok_raw_nam.items()
                         if len(k) == 1 or k in stoppers])
tok_raw_dsc_drop = dict([(v, k) for (k, v) in tok_raw_dsc.items()
                         if len(k) == 1 or k in stoppers])
tok_raw_ntk_drop = dict([(v, k) for (k, v) in tok_raw_ntk.items()
                         if len(k) == 1 or k in stoppers])

mergetrn = seqTokenDf(mergetrn)
tstls = [seqTokenDf(df) for df in tstls]
nrow_test = sum([df.shape[0] for df in tstls])
gc.collect()

''' Pretrained embeddings '''
wordlist = []
for col in ['name_token', 'category_token']:
    flat_counter = list_flatten(mergetrn[[col]].values[:, 0])
    wordlist += [k for (k, v) in flat_counter.items() if v > 0]
wordlist = set(wordlist)

# Read the GloVe file, keeping only words that occur in the corpus and are not
# stopwords; embedding_map records each kept word's row in embeddings_matrix.
embeddings_matrix = []
embedding_map = {}
stopword_set = set(stopwords.keys())
#f = open('../feat/wiki.en.vec')
f = open(glove_file)
counter = 0
for line in f:
    values = line.split()
    word = values[0]
    if word not in wordlist:
        continue
    if word in stopword_set:
        continue
    #coefs = np.asarray(values[1:], dtype='float32')
    embedding_map[word] = counter
    embeddings_matrix.append(values[1:])
    counter += 1
    if (counter % 10000 == 0) and (counter != 0):
        print('Found %s word vectors.' % counter)
f.close()
print('Found %s word vectors.' % counter)
embeddings_matrix = np.array(embeddings_matrix, dtype='float16')

''' Embeddings to dense '''
mergetrn.head()
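# Hedged sketch (an assumption about the "Embeddings to dense" step above, not
# the original implementation): with embedding_map (word -> row) and
# embeddings_matrix built from the GloVe file, a sentence can be collapsed to a
# dense vector by averaging the rows of its in-vocabulary, non-stopword tokens.
# The pipeline itself builds a sparse occurrence matrix (posn_to_sparse) and
# takes its dot product with embeddings_matrix, which is the batched form of
# the same idea.
def sentence_to_dense(sentence, embedding_map, embeddings_matrix, stopw):
    rows = [embedding_map[w] for w in sentence.lower().split()
            if w in embedding_map and w not in stopw]
    if not rows:
        return np.zeros(embeddings_matrix.shape[1], dtype=embeddings_matrix.dtype)
    return embeddings_matrix[rows].mean(axis=0)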