import codecs
import json

import numpy as np


def createListStopWords():
    # Read stopwords.json (keyed by language), flatten all word lists into one
    # array, de-duplicate it, and save it as listStopWords.npy.
    stopwords = json.load(codecs.open('stopwords.json', 'r', 'utf-8-sig'))
    listStopwords = list()
    for key in stopwords.keys():
        for word in stopwords[key]:
            listStopwords.append(word)
    listStopwords = np.unique(np.array(listStopwords))
    np.save("listStopWords", listStopwords)
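
# A minimal usage sketch (assumption: stopwords.json maps language keys to word
# lists, as read above): build the array once, then reload it and filter tokens.
if __name__ == '__main__':
    createListStopWords()
    stopset = set(np.load('listStopWords.npy').tolist())
    tokens = [t for t in 'this is an example sentence'.split() if t not in stopset]
    print(tokens)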
Example #2
'''
A stopword is a word that carries little meaning in a text.
'''
import nltk  # nltk.download('punkt') may be required once for word_tokenize

from nltk.tokenize import word_tokenize

example_sentence = "এটা হল একটা উধারন যেটা দিয়ে আমরা কাজ করব এবং যেটা অনেক প্যাড়া দিবে ।"

stopwords = {}

if __name__ == '__main__':

    # Map the stopwords into a dictionary, one word per line of the file.
    with open('stopwords-bn.txt', 'r', encoding='utf-8') as f:
        for line in f:
            word = line.rstrip('\n')
            if word:
                stopwords[word] = 1

    # Now tokenize the sentence and print every token that is a stopword.
    for token in word_tokenize(example_sentence):
        if token in stopwords:
            print(token)
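
    # A small follow-up sketch: build the sentence with the stopwords removed,
    # instead of only printing the matches.
    filtered = [w for w in word_tokenize(example_sentence) if w not in stopwords]
    print(' '.join(filtered))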
Example #3
def getFMFTRL():
    #os.chdir('/Users/dhanley2/Documents/mercari/data')
    os.chdir('/home/darragh/mercari/data')
    train = pd.read_csv('../data/train.tsv', sep='\t', encoding='utf-8')
    test = pd.read_csv('../data/test.tsv', sep='\t', encoding='utf-8')

    glove_file = '../feat/glove.6B.50d.txt'
    threads = 4
    save_dir = '../feat'

    print('[{}] Finished loading data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]

    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    merge['target'] = np.log1p(merge["price"])
    submission = test[['test_id']]
    # Prepend the brand to the name when the brand is known (NaN != NaN filters
    # missing brands) and the lower-cased brand does not already appear as a
    # full item name.
    ix = (merge['brand_name'] == merge['brand_name']) & (
        ~merge['brand_name'].str.lower().fillna('ZZZZZZ').isin(
            merge['name'].str.lower()))
    merge.loc[ix, 'name'] = merge['brand_name'][ix] + ' ' + merge['name'][ix]

    #EXTRACT DEVELOPMENT TEST
    trnidx, validx = train_test_split(range(train.shape[0]),
                                      random_state=233,
                                      train_size=0.90)

    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    #merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() -
                                                      start_time))

    stopw = set(stopwords.keys()) | {'&'}
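    # Build an `intersection` feature: the non-stopword tokens shared by the
    # item name and its description.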
    merge['intersection'] = [
        list(set(n.lower().split()) & set(d.lower().split()))
        for (n, d) in zip(merge.name.values, merge.item_description.values)
    ]
    merge['intersection'] = [
        ' '.join([w for w in l if w not in stopw])
        for l in merge['intersection']
    ]

    tech_mapper = {
        'unblocked': 'unlocked',
        'bnwt': 'brand new with tags',
        'nwt': 'new with tags',
        'bnwot': 'brand new without tags',
        'bnwob': 'brand new without box',
        'nwot': 'new without tags',
        'bnip': 'brand new in packet',
        'nip': 'new in packet',
        'bnib': 'brand new in box',
        'nib': 'new in box',
        'mib': 'mint in box',
        'mwob': 'mint without box',
        'mip': 'mint in packet',
        'mwop': 'mint without packet',
    }
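    # tech_mapper expands common marketplace abbreviations (nwt, bnib, ...); it
    # is applied to `name` and `item_description` below via `replace_maps`,
    # which is defined elsewhere in the original kernel.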

    import multiprocessing as mp
    pool = mp.Pool(processes=4)
    for col in ['name', 'item_description']:
        merge[col] = merge[col].str.lower()
        merge[col] = pool.map(replace_maps, merge[col].values)
    pool.close()
    '''
    Crossed columns
    '''

    # Helper to replicate what TensorFlow's crossed feature columns do; see
    # https://www.tensorflow.org/tutorials/linear.
    def cross_columns(x_cols):
        """simple helper to build the crossed columns in a pandas dataframe
        """
        crossed_columns = dict()
        colnames = ['_'.join(x_c) for x_c in x_cols]
        for cname, x_c in zip(colnames, x_cols):
            crossed_columns[cname] = x_c
        return crossed_columns
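    # For example, cross_columns((['brand_name', 'shipping_str'],)) returns
    # {'brand_name_shipping_str': ['brand_name', 'shipping_str']}.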

    merge['item_condition_id_str'] = merge['item_condition_id'].astype(str)
    merge['shipping_str'] = merge['shipping'].astype(str)
    x_cols = (
        ['brand_name', 'item_condition_id_str'],
        ['brand_name', 'subcat_1'],
        ['brand_name', 'subcat_2'],
        ['brand_name', 'general_cat'],
        #['brand_name',  'subcat_1',  'item_condition_id_str'],
        #['brand_name',  'subcat_2',  'item_condition_id_str'],
        #['brand_name',  'general_cat',  'item_condition_id_str'],
        ['brand_name', 'shipping_str'],
        ['shipping_str', 'item_condition_id_str'],
        ['shipping_str', 'subcat_2'],
        ['item_condition_id_str', 'subcat_2'])
    crossed_columns_d = cross_columns(x_cols)
    categorical_columns = list(merge.select_dtypes(include=['object']).columns)

    D = 2**30
    for k, v in crossed_columns_d.items():
        print('Crossed column ', k)
        outls_ = []
        indicator = 0
        for col in v:
            outls_.append((np.array(merge[col].apply(hash))) % D + indicator)
            indicator += 10**6
        merge[k] = sum(outls_).tolist()
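    # Each member column is hashed modulo D and shifted by a different multiple
    # of 10**6 before summing, so each combination of values maps (up to hash
    # collisions) to its own integer id for the crossed column.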
    '''
    Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    lb = LabelBinarizer(sparse_output=True)
    x_col = lb.fit_transform(merge[cross_nm[0]])
    for i in range(1, len(cross_nm)):
        x_col = hstack((x_col, lb.fit_transform(merge[cross_nm[i]])))
    del (lb)
    '''
    Encode Original Strings
    '''
    '''
    for col in ['item_description', 'name']:    
        lb = LabelBinarizer(sparse_output=True)
        if 'X_orig' not in locals():
            X_orig = lb.fit_transform(merge[col].apply(hash))
        else:
            X_orig = hstack((X_orig, lb.fit_transform(merge[col].apply(hash))))
    X_orig = hstack((X_orig, lb.fit_transform((merge['item_description']+merge['name']).apply(hash))))
    X_orig = hstack((X_orig, lb.fit_transform((merge['brand_name']+merge['name']).apply(hash))))
    X_orig = hstack((X_orig, lb.fit_transform((merge['subcat_2']+merge['name']).apply(hash))))
    X_orig = hstack((X_orig, lb.fit_transform((merge['brand_name']+merge['name']+merge['item_description']).apply(hash))))
    X_orig = X_orig.tocsr()
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 2, 0, 1), dtype=bool)]
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 5000, 1, 0), dtype=bool)]    
    print ('Shape of original hash', X_orig.shape)
    X_orig = X_orig.tocoo()
    '''
    gc.collect()
    cpuStats()
    '''
    Hash name
    '''

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del (wb)
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1),
                                dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))
    '''
    Hash category
    '''

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**20,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    cat = merge["category_name"].str.replace('/', ' ')
    X_cat = wb.fit_transform(cat)
    del (wb)
    X_cat = X_cat[:, np.array(np.clip(X_cat.getnnz(axis=0) - 1, 0, 1),
                              dtype=bool)]
    print('[{}] Vectorize `category` completed.'.format(time.time() -
                                                        start_time))
    '''
    Count category
    '''

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**28,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del (wb)
    X_description = X_description[:, np.array(
        np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)

    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))
    '''
    print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape, X_cat.shape, x_col.shape, X_orig.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2, X_category3, X_name, X_cat,
                           x_col, X_orig)).tocsr()
    '''

    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape, X_cat.shape, x_col.shape)
    sparse_merge = hstack(
        (X_dummies, X_description, X_brand, X_category1, X_category2,
         X_category3, X_name, X_cat, x_col)).tocsr()

    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))

    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
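    # Rows in `merge` are ordered [kept train rows, dropped price<1 rows, test
    # rows], so the test block starts at the original train length (nrow_test).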
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    gc.collect()
    if develop:
        #train_X1, valid_X1, train_y1, valid_y1 = train_test_split(X, y, train_size=0.90, random_state=233)
        train_X, valid_X, train_y, valid_y = X[trnidx], X[validx], y.values[
            trnidx], y.values[validx]

    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=sparse_merge.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=1,
                    inv_link="identity",
                    threads=threads)  #iters=15

    baseline = 1.
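    # Run up to 15 passes over the training data, continuing while the dev
    # RMSLE keeps improving and stopping at the first pass that does not beat
    # the best score so far.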
    for i in range(15):
        model.fit(train_X, train_y, verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline:
            baseline = score_
        else:
            break

    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        predsfm = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y),
                                          np.expm1(predsfm)))

    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))

    return merge, trnidx, validx, nrow_train, nrow_test, glove_file, predsFM, predsfm
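
# The snippet above relies on helpers defined elsewhere in the original kernel
# (split_cat, handle_missing_inplace, cutting, to_categorical, replace_maps,
# cpuStats, rmsle). Two minimal sketches, consistent with how they are called
# above and offered as assumptions rather than the original definitions:
def split_cat(text):
    # Split "general/subcat_1/subcat_2" into its three levels; fall back to a
    # placeholder triple when category_name is missing.
    try:
        return text.split("/")
    except AttributeError:
        return ("No Label", "No Label", "No Label")


def rmsle(y_true, y_pred):
    # Root mean squared logarithmic error, computed on the raw price scale.
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))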
Example #4
merge["seq_item_description"] = fit_sequence(
    merge.description_token.str.lower(), tok_raw_dsc)
merge['seq_item_description_rev'] = [
    list(reversed(l)) for l in merge.seq_item_description
]
gc.collect()
merge["seq_name"] = fit_sequence(merge.name.str.lower(), tok_raw_nam)
gc.collect()
merge["seq_name_token"] = fit_sequence(merge.name_token.str.lower(),
                                       tok_raw_ntk,
                                       filt=False)
gc.collect()
print('[{}] Finished PROCESSING TEXT DATA...'.format(time.time() - start_time))
merge.head()
#EXTRACT DEVELOPMENT TEST

stopw = set(stopwords.keys())


# Get the dot product
def posn_to_sparse(dt, embedding_map):
    sprow = []
    spcol = []
    spdata = []
    embw = set(embedding_map.keys())
    embid_lens = []
    for c, (nm, ct) in enumerate(
            zip(dt['name_token'].values, dt['category_token'].values)):
        sent = " ".join([nm, ct])
        ids = [
            embedding_map[s] for s in sent.split(' ')
            if (s in embw) and (s not in stopw)
Example #5
tok_raw_cat = myTokenizerFit(
    mergetrn.category_name_split[:nrow_train].str.lower().unique(),
    max_words=800)
gc.collect()
tok_raw_nam = myTokenizerFit(mergetrn.name[:nrow_train].str.lower().unique(),
                             max_words=25000)
gc.collect()
tok_raw_dsc = myTokenizerFit(
    mergetrn.description_token[:nrow_train].str.lower().unique(),
    max_words=25000)
gc.collect()
tok_raw_ntk = myTokenizerFit(
    mergetrn.name_token[:nrow_train].str.lower().unique(), max_words=50000)
gc.collect()

stoppers = set(stopwords.keys())
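# Build reverse maps (token id -> word) for the tokens that should be dropped:
# single-character words and stopwords (assuming myTokenizerFit returns a
# word -> id dictionary).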
tok_raw_cat_drop = dict([(v, k) for (k, v) in tok_raw_cat.items()
                         if len(k) == 1 or k in stoppers])
tok_raw_nam_drop = dict([(v, k) for (k, v) in tok_raw_nam.items()
                         if len(k) == 1 or k in stoppers])
tok_raw_dsc_drop = dict([(v, k) for (k, v) in tok_raw_dsc.items()
                         if len(k) == 1 or k in stoppers])
tok_raw_ntk_drop = dict([(v, k) for (k, v) in tok_raw_ntk.items()
                         if len(k) == 1 or k in stoppers])

mergetrn = seqTokenDf(mergetrn)
tstls = [seqTokenDf(df) for (df) in tstls]
nrow_test = sum([df.shape[0] for (df) in tstls])
gc.collect()
'''
Pretrained embeddings
'''
Example #6
wordlist = []  # assumed initialization; the original snippet starts mid-stream
for col in ['name_token', 'category_token']:
    flat_counter = list_flatten(mergetrn[[col]].values[:, 0])
    wordlist += [k for (k, v) in flat_counter.items() if v > 0]
    wordlist = list(set(wordlist))
wordlist = set(wordlist)
embeddings_matrix = []
embedding_map = {}
#f = open('../feat/wiki.en.vec')
f = open(glove_file)
counter = 0
for line in f:
    values = line.split()
    word = values[0]
    if word not in wordlist:
        continue
    if word in stopwords:
        continue
    #coefs = np.asarray(values[1:], dtype='float32')
    embedding_map[word] = counter
    embeddings_matrix.append(values[1:])
    counter += 1
    if (counter % 10000 == 0) and (counter != 0):
        print('Found %s word vectors.' % counter)
f.close()
print('Found %s word vectors.' % counter)
embeddings_matrix = np.array(embeddings_matrix, dtype='float16')
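
# A small usage sketch (assumption: glove_file points at glove.6B.50d.txt as in
# the earlier snippet, so each row of embeddings_matrix has 50 dimensions).
if 'shirt' in embedding_map:
    vec = embeddings_matrix[embedding_map['shirt']]  # float16 vector, shape (50,)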
'''
Embeddings to dense
'''
mergetrn.head()