def convert(x):
    # Map a VADER compound score to a 3-class label:
    # 0 = negative, 1 = neutral, 2 = positive
    if x < -0.05:
        return 0
    elif x < 0.05:
        return 1
    else:
        return 2


# Labeling based on returned values
data_df['label_stemmed'] = data_df['sentiment_stemmed'].apply(lambda x: convert(x['compound']))

# Importing HashingVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import train_test_split

# Hashing vectorization
X = data_df['tweet_stemmed']
hashing_vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False)
hash_stem = hashing_vectorizer.fit_transform(X)
y = data_df['label_stemmed']
# print("Data vectorized")

# Vectorization time
Vectorizing_time = time.time()
print("Vectorizing_time :", Vectorizing_time - start_time)

# Train and test sets formed
hashing_trainset = hash_stem[:319685, :]
hashing_testset = hash_stem[319685:, :]
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
x_train = hashing_trainset[y_train.index]
x_test = hashing_trainset[y_test.index]
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

# Import HashingVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

# Get text data: text_data
text_data = combine_text_columns(X_train)

# Create the token pattern: TOKENS_ALPHANUMERIC
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# Instantiate the HashingVectorizer: hashing_vec
hashing_vec = HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC)

# Fit and transform the Hashing Vectorizer
hashed_text = hashing_vec.fit_transform(text_data)

# Create DataFrame and print the head
hashed_df = pd.DataFrame(hashed_text.data)
print(hashed_df.head())

# Import the hashing vectorizer
from sklearn.feature_extraction.text import HashingVectorizer

# Instantiate the winning model pipeline: pl
pl = Pipeline([
print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
print()

labels = dataset.target
true_k = np.unique(labels).shape[0]

print("Extracting features from the training dataset "
      "using a sparse vectorizer")
t0 = time()
if opts.use_hashing:
    if opts.use_idf:
        # Perform an IDF normalization on the output of HashingVectorizer
        hasher = HashingVectorizer(n_features=opts.n_features,
                                   stop_words='english', alternate_sign=False,
                                   norm=None, binary=False)
        vectorizer = make_pipeline(hasher, TfidfTransformer())
    else:
        vectorizer = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english',
                                       alternate_sign=False, norm='l2',
                                       binary=False)
else:
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
                                 min_df=2, stop_words='english',
                                 use_idf=opts.use_idf)
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer


def func():
    hv = HashingVectorizer()
    # np.nan is not a string, so fit_transform raises an error on this input
    hv.fit_transform(['hello world', np.nan, 'hello hello'])
from sklearn.feature_extraction.text import HashingVectorizer

# Use a list (not a set) so document order is deterministic
corpus = ['ass', 'bdfs', 'cer', 'dsssdf']
vectorizer = HashingVectorizer(n_features=14)
print(vectorizer.transform(corpus).todense())
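# Follow-up sketch (not in the original snippet): the hashing vectorizer keeps
# no vocabulary, so transform works on unseen text without refitting, and with
# only 14 columns distinct tokens can collide into the same index.
X_new = vectorizer.transform(['bdfs cer'])
print(X_new.shape)         # (1, 14)
print(X_new.nonzero()[1])  # hashed column indices hit by the two tokens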
print("%d documents - %0.3fMB (training set)" % (
    len(data_train.data), data_train_size_mb))
print("%d documents - %0.3fMB (test set)" % (
    len(data_test.data), data_test_size_mb))
print("%d categories" % len(categories))
print()

# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

print("Extracting features from the training data using a sparse vectorizer")
t0 = time()
if opts.use_hashing:
    vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False,
                                   n_features=opts.n_features)
    X_train = vectorizer.transform(data_train.data)
else:
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    X_train = vectorizer.fit_transform(data_train.data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(data_test.data)
def __init__(self):
    self.vectorizer = HashingVectorizer(n_features=2**4)
    self.H = {}
    self.model = None
    self.period = 1000
        return self

    def transform(self, X):
        return X.toarray()


# ------- Feature Builder -----------------
def is_list_or_tuple(obj):
    return isinstance(obj, tuple) or isinstance(obj, list)


# Feature model specifications
# For Chinese
fm_spec = {
    'hashing': HashingVectorizer(tokenizer=tokenize_zh),
    'count': Count(ngram_range=(1, 5), min_df=5, max_df=0.9,
                   max_features=4000, tokenizer=tokenize_zh),
    'tfidf': ['count', Tfidf()],
    'tfidf_dense': ['tfidf', SparseToDense()],
    'lsa_200': ['tfidf', SVD(n_components=200)],
    'lsa_500': ['tfidf', SVD(n_components=500)],
    'lsa_1k': ['tfidf', SVD(n_components=1000)],
    'lsa_500_minmax': ['lsa_500', MinMaxScaler()],
    'lsa_1k_minmax': ['lsa_1k', MinMaxScaler()],
    # smaller vocabulary (removed more stop and infrequent words)
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io import ReadFromText, ReadAllFromText
from dsba6155project.constants import Constants
import os
import re
from collections import defaultdict

import numpy as np
from scipy.sparse import vstack
from sklearn.feature_extraction.text import HashingVectorizer

vectorizer = HashingVectorizer(n_features=20000,
                               strip_accents='unicode',
                               stop_words="english",
                               norm=None)


class PerformIncrementalPCA(beam.DoFn):
    def process(self, elem):
        ipca = IncrementalPCA(n_components=n_components, batch_size=10)
        X_ipca = ipca.fit_transform(X)


class ReadBooks(beam.DoFn):
    def process(self, elem):
        return ReadFromTextWithFilename(elem)


class Hashing(beam.DoFn):
    def iterdocs(self):
        """Iterate doc by doc, yield a dict."""
        for root, _dirnames, filenames in os.walk(self.data_path):
            for filename in fnmatch.filter(filenames, '*.sgm'):
                path = os.path.join(root, filename)
                parser = ReutersParser()
                for doc in parser.parse(open(path)):
                    yield doc


###############################################################################
# Main
###############################################################################

# Create the hasher and limit the number of features to a reasonable maximum
# (decode_error replaces the old charset_error parameter name)
hasher = HashingVectorizer(decode_error='ignore', n_features=2**18)

# Create an online classifier i.e. supporting `partial_fit()`
classifier = SGDClassifier()

# Create the data_streamer that parses Reuters SGML files and iterates on
# documents as a stream
data_streamer = ReutersStreamReader('reuters').iterdocs()

# Here we propose to learn a binary classification between the positive class
# and all other documents.
all_classes = np.array([0, 1])

# NB: the 'acq' class was chosen as it is more or less evenly distributed in
# the Reuters files. For other datasets, one should take care of creating a
# test set with a realistic portion of positive instances.
positive_class = 'acq'
def initHashVectorization(n_features=2**16):
    return HashingVectorizer(n_features=n_features)
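# Minimal usage sketch (not part of the original snippet; assumes the
# HashingVectorizer import that this helper relies on). The vectorizer is
# stateless, so transform needs no prior fitting.
vec = initHashVectorization(n_features=2**10)
X = vec.transform(["hashing trick", "no vocabulary is stored"])
print(X.shape)  # (2, 1024)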
def get_word_feature(Train, Test):
    # alternate_sign=False replaces the removed non_negative=True option
    hv = HashingVectorizer(n_features=80000, alternate_sign=False)
    vectorizer = make_pipeline(hv, TfidfTransformer())
    train_feature = vectorizer.fit_transform(Train).toarray()
    test_feature = vectorizer.transform(Test)
    return train_feature, test_feature
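# Hypothetical usage sketch with toy inputs (assumes the HashingVectorizer,
# TfidfTransformer and make_pipeline imports the fragment above relies on).
train_docs = ["cheap flights to paris", "win a free phone"]
test_docs = ["free flights"]
train_feature, test_feature = get_word_feature(train_docs, test_docs)
print(train_feature.shape)  # (2, 80000) dense array
print(test_feature.shape)   # (1, 80000) sparse matrix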
print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
print()

labels = dataset.target
true_k = np.unique(labels).shape[0]

print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()
if opts.use_hashing:
    if opts.use_idf:
        # Perform an IDF normalization on the output of HashingVectorizer
        # (alternate_sign=False replaces the removed non_negative=True option)
        hasher = HashingVectorizer(n_features=opts.n_features,
                                   stop_words='english',
                                   alternate_sign=False, norm=None,
                                   binary=False)
        vectorizer = make_pipeline(hasher, TfidfTransformer())
    else:
        vectorizer = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english',
                                       norm='l2', binary=False)
else:
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
                                 min_df=2, stop_words='english',
                                 use_idf=opts.use_idf)
def wordVec():
    dataset = fetch_20newsgroups(subset='all', categories=categories,
                                 shuffle=True, random_state=42)

    print("%d documents" % len(dataset.data))
    print("%d categories" % len(dataset.target_names))
    print()

    labels = dataset.target
    true_k = np.unique(labels).shape[0]

    print("Extracting features from the training dataset "
          "using a sparse vectorizer")
    t0 = time.time()
    if opts.use_hashing:
        if opts.use_idf:
            # Perform an IDF normalization on the output of HashingVectorizer
            hasher = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english',
                                       alternate_sign=False, norm=None)
            vectorizer = make_pipeline(hasher, TfidfTransformer())
        else:
            vectorizer = HashingVectorizer(n_features=opts.n_features,
                                           stop_words='english',
                                           alternate_sign=False, norm='l2')
    else:
        vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
                                     min_df=2, stop_words='english',
                                     use_idf=opts.use_idf)
    X = vectorizer.fit_transform(dataset.data)

    print("done in %fs" % (time.time() - t0))
    print("n_samples: %d, n_features: %d" % X.shape)
    print()

    if opts.n_components:
        print("Performing dimensionality reduction using LSA")
        t0 = time.time()
        svd = TruncatedSVD(opts.n_components)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)
        print("done in %fs" % (time.time() - t0))
        explained_variance = svd.explained_variance_ratio_.sum()
        print("Explained variance of the SVD step: {}%".format(
            int(explained_variance * 100)))
        print()

    if opts.minibatch:
        km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                             init_size=1000, batch_size=1000,
                             verbose=opts.verbose)
    else:
        km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100,
                    n_init=1, verbose=opts.verbose)

    print("Clustering sparse data with %s" % km)
    t0 = time.time()
    km.fit(X)
    print("done in %0.3fs" % (time.time() - t0))
    print()

    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
    print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
    print("Adjusted Rand-Index: %.3f"
          % metrics.adjusted_rand_score(labels, km.labels_))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, km.labels_, sample_size=1000))
    print()

    if not opts.use_hashing:
        print("Top terms per cluster:")
        if opts.n_components:
            original_space_centroids = svd.inverse_transform(km.cluster_centers_)
            order_centroids = original_space_centroids.argsort()[:, ::-1]
        else:
            order_centroids = km.cluster_centers_.argsort()[:, ::-1]
        terms = vectorizer.get_feature_names()
        for i in range(true_k):
            print("Cluster %d:" % i, end='')
            for ind in order_centroids[i, :10]:
                print(' %s' % terms[ind], end='')
            print()
    print(string)


# open data-set and divide it
data = pandas.read_csv(
    'C:/Users/egedi/OneDrive/Belgeler/GitHub/ceng-407-408-2019-2020-Spam-SMS-Detection/SpamSmsDetection/spamSmsDetectionMobileApp/api/spam.csv',
    encoding='latin-1')
learn = data[:4400]  # 4400 items
test = data[4400:]   # 1172 items

perform([
    BernoulliNB(),
    RandomForestClassifier(n_estimators=100, n_jobs=-1),
    AdaBoostClassifier(),
    BaggingClassifier(),
    ExtraTreesClassifier(),
    GradientBoostingClassifier(),
    DecisionTreeClassifier(),
    CalibratedClassifierCV(),
    DummyClassifier(),
    PassiveAggressiveClassifier(),
    RidgeClassifier(),
    RidgeClassifierCV(),
    SGDClassifier(),
    OneVsRestClassifier(SVC(kernel='linear')),
    OneVsRestClassifier(LogisticRegression()),
    KNeighborsClassifier()
], [CountVectorizer(), TfidfVectorizer(), HashingVectorizer()], learn, test)
        if x % 20 == 0:
            print('building classifier number %d' % x)
        ensemble.append(naive_bayes_builder(pos_sample, neg_sample, vectorizer))

    votes = np.zeros(len(test_y))
    for clf in ensemble:
        votes += clf.predict(test_x)
    votes = [1 if x > voting_threshhold else 0 for x in votes]
    print(precision_recall_fscore_support(test_y, votes))


# alternate_sign=False replaces the removed non_negative=True option
vectorizer = HashingVectorizer(decode_error='ignore', n_features=2**18,
                               alternate_sign=False, ngram_range=(1, 1))
test_x, test_y = create_test_set(test_pos, test_neg, vectorizer)
create_ensemble(pos_sample=train_pos,
                neg_sample=train_neg,
                number_of_classifiers=500,
                vectorizer=vectorizer,
                voting_threshhold=250,
                test_x=test_x,
                test_y=test_y)

# clf1 = naive_bayes_builder(train_pos, train_neg, vectorizer)
# y_pred = clf1.predict(test_x)
cursor = None
config = DBConfig(root_dir + "/db.ini").read_db_config()
try:
    # Open database connection
    db = MySQLdb.connect(**config)

    # prepare a cursor object using cursor() method
    cursor = db.cursor()

    pd = sql.read_sql(queryGetClassifiedMembers, db)
    data = pd.replace(np.nan, '', regex=True)
    labels = pd.Type
    trainData = data.drop(['Type'], axis=1)

    # alternate_sign=False replaces the removed non_negative=True option
    email_pipe = Pipeline([
        ('data', DataFrameColumnExtracter('Email')),
        ('vectorizer', HashingVectorizer(alternate_sign=False))
    ])
    fname_pipe = Pipeline([
        ('data', DataFrameColumnExtracter('FirstName')),
        ('vectorizer', HashingVectorizer(alternate_sign=False))
    ])
    lname_pipe = Pipeline([
        ('data', DataFrameColumnExtracter('Surname')),
        ('vectorizer', HashingVectorizer(alternate_sign=False))
    ])
    bio_pipe = Pipeline([
        ('data', DataFrameColumnExtracter('Bio')),
        ('preprocessor', StripHTMLTransformer()),
    'the', 'a', 'an', 'is', 'it', 'this',
])  # 'i', 'so', 'its', 'am', 'are'])

vectorizer = FeatureUnion([
    ('name', Pipeline([
        ('select', ItemSelector('name', start_time=start_time)),
        ('transform', HashingVectorizer(ngram_range=(1, 2),
                                        n_features=2**27,
                                        norm='l2',
                                        lowercase=False,
                                        stop_words=stopwords)),
        ('drop_cols', DropColumnsByDf(min_df=2))
    ])),
    ('category_name', Pipeline([
        ('select', ItemSelector('category_name', start_time=start_time)),
        ('transform', HashingVectorizer(ngram_range=(1, 1),
                                        token_pattern='.+',
                                        tokenizer=split_cat,
                                        n_features=2**27,
                                        norm='l2',
                                        lowercase=False)),
        ('drop_cols', DropColumnsByDf(min_df=2))
    ])),
print("Training :: %d documents - " % (len(data_train.data)))
print("Testing :: %d documents - " % (len(data_test.data)))
print("%d categories" % len(categories))

# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

print("For training : Extracting features from the training data "
      "using a sparse vectorizer")
t0 = time()
if False:  # opts.use_hashing
    # alternate_sign=False replaces the removed non_negative=True option
    vectorizer = HashingVectorizer(stop_words='english',
                                   alternate_sign=False,
                                   n_features=opts.n_features)
    X_train = vectorizer.transform(data_train.data)
else:
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 ngram_range=(1, 3),
                                 stop_words='english')
    X_train = vectorizer.fit_transform(data_train.data)
duration = time() - t0
print("done in %fs" % (duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print()
print(
    "For testing : Extracting features from the test data using the same vectorizer"
    :return: array of strings, array of ints
        array of strings contains the requested number of records
        array of ints contains the corresponding labels
    """
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y


vect = HashingVectorizer(decode_error='ignore',
                         tokenizer=tokenizer,  # Callable tokenizer function
                         n_features=2**21)     # Reduce hash collisions

clf = SGDClassifier(loss='log', random_state=1, max_iter=1, tol=1e-3)
doc_stream = stream_docs(path='movie_data.csv')

# OUT-OF-CORE LEARNING
classes = np.array([0, 1])

# Allocate 45000 records for the training
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)

# Allocate 5000 records for the testing
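# Sketch only (not part of the original fragment): one way the remaining
# records might be scored, assuming the stream still holds unread documents
# and get_minibatch / vect / clf are defined as above.
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))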
    for col in df.columns:
        print(df[col].dtypes)

    # Return the recipeIds of the 100 most relevant recipes as JSON
    return jsonify(recomandResult=responseData)


# Lazy loading to shorten the time spent loading the model
if __name__ == '__main__':
    # Ingredients the model can currently recognize
    labels = [
        'chilli', 'egg', 'pork meat', 'potato', 'pa', 'onion', 'carrot',
        'cucumber'
    ]

    # Declare the vectorizer
    vectorize = HashingVectorizer()

    # Database integration
    # Create the DB connection
    engine = create_engine(
        'mysql://*****:*****@localhost:3307/testDB?charset=utf8',
        convert_unicode=True, encoding='UTF-8')
    # engine = create_engine('mysql://*****:*****@sts.c2yt44rkrmcp.us-east-2.rds.amazonaws.com:3306/finalproject?charset=utf8', convert_unicode=True, encoding='UTF-8')
    conn = engine.connect()

    # Read the recipe table
    data = pd.read_sql_table('recipe', conn)
    # Replace missing values with 0
    data = data.fillna(0)
    # Convert int32/int64 columns to float so they can be returned as JSON
    data["id"] = data['id'].astype("float")
reviews_wvs_test = yelp.to_word_level_idx(test_reviews, global_gb, WORDS_PER_TEXT)

# -- testing data save
np.save('Yelp_useful_test_fulltext_glove_300_X.npy', reviews_wvs_test)
np.save('Yelp_useful_test_fulltext_glove_300_y.npy', test_labels)

reviews_wvs_test = yelp.to_word_level_idx(test_reviews, yelp_gb, WORDS_PER_TEXT)

# -- testing data save
np.save('Yelp_useful_test_fulltext_Yelp_glove_300_X.npy', reviews_wvs_test)
np.save('Yelp_useful_test_fulltext_Yelp_glove_300_y.npy', test_labels)

del reviews_wvs_test

log('Hashing BOW features, might be used by some NN models')
hv = HashingVectorizer(n_features=BOW_HASH_DIMENSION)  # Int: maybe try without normalization
train_bow_hash = hv.transform(train_reviews)
test_bow_hash = hv.transform(test_reviews)

np.save('Yelp_useful_train_hashbow.npy', train_bow_hash.todense())
np.save('Yelp_useful_test_hashbow.npy', test_bow_hash.todense())

##################################
### YELP FUNNY
##################################

log('Saving "funny" votes data')
(train_reviews, train_labels, test_reviews, test_labels) = \
    yelp.get_data(YELP_FUNNY_TRAIN, YELP_FUNNY_DEV, YELP_FUNNY_TEST)

reviews_wvs_train = yelp.to_word_level_idx(train_reviews, global_gb, WORDS_PER_TEXT)

# -- training data save
from sklearn.feature_extraction.text import HashingVectorizer
import re
import os
import pickle

cur_dir = os.path.dirname(__file__)
stop = pickle.load(open(os.path.join(cur_dir, 'pkl_objects', 'stopwords.pkl'), 'rb'))


def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|p)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized


vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)
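# Hypothetical usage sketch (not from the original file): hash one raw review
# into a sparse feature row; assumes the stopwords pickle path above exists.
example = ["This movie was <br /> surprisingly good :)"]
X = vect.transform(example)
print(X.shape)  # (1, 2097152) -- one row, 2**21 hashed columns
print(X.nnz)    # number of non-zero hashed features for this document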
def main():
    start_time = time.time()

    train = pd.read_table('../input/train.tsv', engine='c')
    test = pd.read_table('../input/test.tsv', engine='c')

    print('[{}] Finished loading data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)

    nrow_train = train.shape[0]
    y = np.log1p(train["price"])
    merge: pd.DataFrame = pd.concat([train, test])
    submission: pd.DataFrame = test[['test_id']]

    del train
    del test
    gc.collect()

    handle_missing_inplace(merge)
    print('[{}] Finished handling missing values'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Finished cutting'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Finished converting to categorical'.format(time.time() - start_time))

    merge.item_description = merge.item_description.str.lower()
    normalize_desc(merge)
    print('[{}] Finished normalizing'.format(time.time() - start_time))

    # handle_no_description(merge)
    # print('[{}] Finished copying names to missing desc'.format(time.time() - start_time))

    cv = CountVectorizer(min_df=NAME_MIN_DF)
    X_name = cv.fit_transform(merge['name'])
    print('[{}] Finished count vectorizing `name`'.format(time.time() - start_time))

    cv = CountVectorizer()
    X_category = cv.fit_transform(merge['category_name'])
    print('[{}] Finished count vectorizing `category_name`'.format(time.time() - start_time))

    # FF
    from nltk.corpus import stopwords
    interesting_words = [
        'new', 'perfect', 'fit', 'used',  # 'super', 'cute',
        'excellent', 'great', 'retail', '[rm]', 'never used', 'bundle',  # 'diamond', 'ruby', 'platinum', 'gold',
        'set', 'case', 'unused', 'unopened', 'sealed'
    ]
    X_intcol = pd.DataFrame()
    for word in interesting_words:
        X_intcol[word] = merge['item_description'].apply(lambda x: word in x)

    X_des = merge['item_description'].apply(lambda x: len(x)).astype('float32')
    X_des = X_des[:, np.newaxis]
    scaler = MaxAbsScaler()
    X_des = scaler.fit_transform(X_des)

    ignore_words = [
        'cant', 'ask', 'size', 'inch', 'inches', 'already', 'inside', 'easy'
    ]
    stop = stopwords.words('english') + ignore_words  # FF

    # FF version 5: n_features=2**18 --> n_features=2**17
    hv = HashingVectorizer(input='content', stop_words=stop,
                           n_features=2**17, lowercase=False)
    X_description = hv.transform(merge['item_description'])
    print('[{}] Finished hash vectorizing `item_description`'.format(time.time() - start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Finished label binarizing `brand_name`'.format(time.time() - start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)
    print('[{}] Finished getting dummies on `item_condition_id` and `shipping`'.
          format(time.time() - start_time))

    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category,
                           X_name, X_intcol, X_des)).tocsr()
    print('[{}] Finished creating sparse merge'.format(time.time() - start_time))

    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_train:]

    # def rmsle(y, y0):
    #     assert len(y) == len(y0)
    #     return np.sqrt(np.mean(np.power(np.log1p(y) - np.log1p(y0), 2)))

    modelR1 = Ridge(solver="sag", fit_intercept=True, random_state=2,
                    alpha=4, tol=0.0006, max_iter=800)
    modelR1.fit(X, y)
    print('[{}] Finished training ridge sag'.format(time.time() - start_time))
    predsR = modelR1.predict(X=X_test)
    print('[{}] Finished predicting ridge sag'.format(time.time() - start_time))

    modelR2 = Ridge(solver="sag", fit_intercept=True, random_state=145, alpha=0.4)
    modelR2.fit(X, y)
    print('[{}] Finished training ridge lsqrt'.format(time.time() - start_time))
    predsR2 = modelR2.predict(X=X_test)
    print('[{}] Finished predicting ridge lsqrt'.format(time.time() - start_time))

    train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.01,
                                                          random_state=144)
    d_train = lgb.Dataset(train_X, label=train_y, max_bin=8192)
    d_valid = lgb.Dataset(valid_X, label=valid_y, max_bin=8192)
    watchlist = [d_train, d_valid]

    params = {
        'learning_rate': 0.56,
        'application': 'regression',
        'max_depth': 5,
        'num_leaves': 40,
        'verbosity': -1,
        'metric': 'RMSE',
        'feature_fraction': 0.8,  # changed from 0.6
        'nthread': 4
    }
    params2 = {
        'learning_rate': 0.85,
        'application': 'regression',
        'max_depth': 3,
        'num_leaves': 110,
        'verbosity': -1,
        'metric': 'RMSE',
        'nthread': 4
    }

    modelL1 = lgb.train(params, train_set=d_train, num_boost_round=8000,
                        valid_sets=watchlist, early_stopping_rounds=None,
                        verbose_eval=500)
    predsL = modelL1.predict(X_test)
    print('[{}] Finished predicting lgb 1'.format(time.time() - start_time))

    train_X2, valid_X2, train_y2, valid_y2 = train_test_split(X, y, test_size=0.01,
                                                              random_state=101)
    d_train2 = lgb.Dataset(train_X2, label=train_y2, max_bin=8192)
    d_valid2 = lgb.Dataset(valid_X2, label=valid_y2, max_bin=8192)
    watchlist2 = [d_train2, d_valid2]

    modelL2 = lgb.train(params2, train_set=d_train2, num_boost_round=3200,
                        valid_sets=watchlist2, early_stopping_rounds=None,
                        verbose_eval=500)
    predsL2 = modelL2.predict(X_test)
    print('[{}] Finished predicting lgb 2'.format(time.time() - start_time))

    preds = predsR2 * 0.2 + predsR * 0.1 + predsL * 0.5 + predsL2 * 0.2
    submission['price'] = np.expm1(preds)
    submission.to_csv("submission_lgbm_ridge_8.csv", index=False)
def vectorize_hash(mails):
    vec = HashingVectorizer(n_features=2**10)
    data = vec.fit_transform(mails)
    return (vec, data)
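# Hypothetical usage of vectorize_hash on two toy messages (assumes the
# HashingVectorizer import of the enclosing module). The helper returns the
# stateless vectorizer together with a 1024-column sparse matrix.
vec, data = vectorize_hash(["free prize, click now", "meeting at 10am tomorrow"])
print(data.shape)  # (2, 1024) -- n_features=2**10 hashed columns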
# y_train = [['New York'], ['New York'], ['New York'], ['New York'], ['New York'], ['New York'],
#            ['London'], ['London'], ['London'], ['London'], ['London'], ['London'],
#            ['New York', 'London'], ['New York', 'London']]
y_train = mlb.fit_transform(y_train)

# print(y_train)
print("classes", list(mlb.classes_))
print(len(list(mlb.classes_)))
print(len(X_train))
# print("-----Binarize y_train----------")
# print(y_train)

# Pipeline (vectorization, tf-idf weighting and classifier)
# ppl = Pipeline([
#     ('vectorizer', HashingVectorizer()),
#     ('tfidf', TfidfTransformer()),
#     ('clf', OneVsRestClassifier(LinearSVC()))])

ppl = Pipeline([('vectorizer', HashingVectorizer()),
                ('clf', OneVsRestClassifier(LinearSVC()))])
ppl.fit(X_train, y_train)

# # Test
# X_test = np.array(['nice day in nyc',
#                    'welcome to london',
#                    'hello welcome to new york. enjoy it here and london too'])
# target_names = ['New York', 'London']  # index --> names

y_predict = ppl.predict(X_test)
labels_predicted = mlb.inverse_transform(y_predict)
print(labels_predicted)
# print("\tInverting binary encoding......")
# print("=============y_predicted============")
# print(labels_predicted)
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')


def pre_processing(row):
    first_process = re.sub(combined_pat, '', row)
    second_process = re.sub(www_pat, '', first_process)
    third_process = second_process.lower()
    fourth_process = neg_pattern.sub(lambda x: negations_dic[x.group()],
                                     third_process)
    result = re.sub(r'[^A-Za-z ]', '', fourth_process)
    return result.strip()


df.ws_content = df.ws_content.apply(pre_processing)
print("Preprocessed")
print(df.shape)

vectorizer = HashingVectorizer(stop_words="english", ngram_range=(2, 5),
                               n_features=75000)
text_vector = vectorizer.fit_transform(df.ws_content)
print("text hashed")
print("text vector:", text_vector.shape)

tfifd_vectorizer = TfidfTransformer()
tfifd_vector = tfifd_vectorizer.fit_transform(text_vector)

scaler = MinMaxScaler()
int_values = df[["time_since_col", "stock_price_col"]]
def __init__(self, MAX_LEN, vec='hash', aug=True): self.arrGood = [ 'absolutely', 'abundant', 'accept', 'acclaimed', 'accomplishment', 'achievement', 'action', 'active', 'activist', 'acumen', 'adjust', 'admire', 'adopt', 'adorable', 'adored', 'adventure', 'affirmation', 'affirmative', 'affluent', 'agree', 'airy', 'alive', 'alliance', 'ally', 'alter', 'amaze', 'amity', 'animated', 'answer', 'appreciation', 'approve', 'aptitude', 'artistic', 'assertive', 'astonish', 'astounding', 'astute', 'attractive', 'authentic', 'basic', 'beaming', 'beautiful', 'believe', 'benefactor', 'benefit', 'bighearted', 'blessed', 'bliss', 'bloom', 'bountiful', 'bounty', 'brave', 'bright', 'brilliant', 'bubbly', 'bunch', 'burgeon', 'calm', 'care', 'celebrate', 'certain', 'change', 'character', 'charitable', 'charming', 'cheer', 'cherish', 'clarity', 'classy', 'clean', 'clever', 'closeness', 'commend', 'companionship', 'complete', 'comradeship', 'confident', 'connect', 'connected', 'constant', 'content', 'conviction', 'copious', 'core', 'coupled', 'courageous', 'creative', 'cuddle', 'cultivate', 'cure', 'curious', 'cute', 'dazzling', 'delight', 'direct', 'discover', 'distinguished', 'divine', 'donate', 'eager', 'earnest', 'easy', 'ecstasy', 'effervescent', 'efficient', 'effortless', 'electrifying', 'elegance', 'embrace', 'encompassing', 'encourage', 'endorse', 'energized', 'energy', 'enjoy', 'enormous', 'enthuse', 'enthusiastic', 'entirely', 'essence', 'established', 'esteem', 'everyday', 'everyone', 'excited', 'exciting', 'exhilarating', 'expand', 'explore', 'express', 'exquisite', 'exultant', 'faith', 'familiar', 'family', 'famous', 'feat', 'fit', 'flourish', 'fortunate', 'fortune', 'freedom', 'fresh', 'friendship', 'full', 'funny', 'gather', 'generous', 'genius', 'genuine', 'give', 'glad', 'glow', 'good', 'gorgeous', 'grace', 'graceful', 'gratitude', 'green', 'grin', 'group', 'grow', 'handsome', 'happy', 'harmony', 'healed', 'healing', 'healthful', 'healthy', 'heart', 'hearty', 'heavenly', 'helpful', 'here', 'hold', 'holy', 'honest', 'honor', 'hug', 'idea', 'ideal', 'imaginative', 'increase', 'incredible', 'independent', 'ingenious', 'innate', 'innovate', 'inspire', 'instantaneous', 'instinct', 'intellectual', 'intelligence', 'intuitive', 'inventive', 'joined', 'jovial', 'joy', 'jubilation', 'keen', 'key', 'kind', 'kiss', 'knowledge', 'laugh', 'leader', 'learn', 'legendary', 'light', 'lively', 'love', 'loveliness', 'lucidity', 'lucrative', 'luminous', 'maintain', 'marvelous', 'master', 'meaningful', 'meditate', 'mend', 'metamorphosis', 'mind-blowing', 'miracle', 'mission', 'modify', 'motivate', 'moving', 'natural', 'nature', 'nourish', 'nourished', 'novel', 'now', 'nurture', 'nutritious', 'one', 'open', 'openhanded', 'optimistic', 'paradise', 'party', 'peace', 'perfect', 'phenomenon', 'pleasure', 'plenteous', 'plentiful', 'plenty', 'plethora', 'poise', 'polish', 'popular', 'positive', 'powerful', 'prepared', 'pretty', 'principle', 'productive', 'project', 'prominent', 'prosperous', 'protect', 'proud', 'purpose', 'quest', 'quick', 'quiet', 'ready', 'recognize', 'refinement', 'refresh', 'rejoice', 'rejuvenate', 'relax', 'reliance', 'rely', 'remarkable', 'renew', 'renowned', 'replenish', 'resolution', 'resound', 'resources', 'respect', 'restore', 'revere', 'revolutionize', 'rewarding', 'rich', 'robust', 'rousing', 'safe', 'secure', 'see', 'sensation', 'serenity', 'shift', 'shine', 'show', 'silence', 'simple', 'sincerity', 'smart', 'smile', 'smooth', 'solution', 'soul', 'sparkling', 'spirit', 'spirited', 
'spiritual', 'splendid', 'spontaneous', 'still', 'stir', 'strong', 'style', 'success', 'sunny', 'support', 'sure', 'surprise', 'sustain', 'synchronized', 'team', 'thankful', 'therapeutic', 'thorough', 'thrilled', 'thrive', 'today', 'together', 'tranquil', 'transform', 'triumph', 'trust', 'truth', 'unity', 'unusual', 'unwavering', 'upbeat', 'value', 'vary', 'venerate', 'venture', 'very', 'vibrant', 'victory', 'vigorous', 'vision', 'visualize', 'vital', 'vivacious', 'voyage', 'wealthy', 'welcome', 'well', 'whole', 'wholesome', 'willing', 'wonder', 'wonderful', 'wondrous', 'xanadu', 'yes', 'yippee', 'young', 'youth', 'youthful', 'zeal', 'zest', 'zing', 'zip' ] self.arrBad = [ 'acrotomophilia', 'anal', 'anilingus', 'anus', 'arsehole', 'ass', 'asshole', 'assmunch', 'autoerotic', 'babeland', 'bangbros', 'bareback', 'barenaked', 'bastardo', 'bastinado', 'bbw', 'bdsm', 'bestiality', 'bimbos', 'birdlock', 'bitch', 'blumpkin', 'bollocks', 'bondage', 'boner', 'boob', 'boobs', 'bukkake', 'bulldyke', 'bunghole', 'busty', 'butt', 'buttcheeks', 'butthole', 'camgirl', 'camslut', 'camwhore', 'carpetmuncher', 'circlejerk', 'c**t', 'clitoris', 'clusterfuck', 'c**k', 'cocks', 'coprolagnia', 'coprophilia', 'cornhole', 'cum', 'cumming', 'cunnilingus', 'c**t', 'darkie', 'daterape', 'deepthroat', 'dick', 'd***o', 'doggiestyle', 'doggystyle', 'dolcett', 'domination', 'dominatrix', 'dommes', 'ecchi', 'e*********n', 'erotic', 'erotism', 'escort', 'eunuch', 'f****t', 'fecal', 'felch', 'f******o', 'feltch', 'femdom', 'figging', 'fingering', 'fisting', 'footjob', 'frotting', 'f**k', 'f*****g', 'f*********r', 'futanari', 'gay', 'genitals', 'goatcx', 'g****e', 'gokkun', 'goodpoop', 'goregasm', 'grope', 'guro', 'handjob', 'hardcore', 'hentai', 'homoerotic', 'honkey', 'hooker', 'kill', 'murder', 'fat', 'humping', 'incest', 'intercourse', 'jack', 'jerk', 'jigaboo', 'jiggaboo', 'jiggerboo', 'j**z', 'juggs', 'kike', 'kinbaku', 'kinkster', 'kinky', 'knobbing', 'lolita', 'lovemaking', 'm********e', 'm**********r', 'muffdiving', 'nambla', 'nawashi', 'negro', 'neonazi', 'n***a', 'nigger', 'nimphomania', 'nipple', 'nipples', 'nude', 'nudity', 'nympho', 'nymphomania', 'octopussy', 'omorashi', 'o****m', 'orgy', 'paedophile', 'panties', 'panty', 'pedobear', 'pedophile', 'pegging', 'penis', 'pissing', 'pisspig', 'playboy', 'ponyplay', 'poof', 'poopchute', 'p**n', 'porno', 'pornography', 'pthc', 'pubes', 'pussy', 'queaf', 'raghead', 'rape', 'raping', 'rapist', 'rectum', 'cowgirl', 'rimjob', 'rimming', 'sadism', 'scat', 'schlong', 'scissoring', 's***n', 'sex', 'sexo', 'sexy', 'beaver', 'pussy', 'shemale', 'shibari', 'shit', 'shota', 'shrimping', 'slanteye', 's**t', 'smut', 'snatch', 'snowballing', 'sodomize', 'sodomy', 'spic', 'spooge', 'strapon', 'strappado', 'strip', 'suck', 'sucks', 'suicide', 'sultry', 'swastika', 'swinger', 'threesome', 'throating', 'tit', 't**s', 'titties', 'titty', 'topless', 'tosser', 'towelhead', 'tranny', 'tribadism', 'tubgirl', 'tushy', 'twat', 'twink', 'twinkie', 'undressing', 'upskirt', 'urophilia', 'v****a', 'vibrator', 'vorarephilia', 'voyeur', 'vulva', 'wank', 'wetback', 'xx', 'xxx', 'yaoi', 'yiffy', ] self.CHARS_TO_REMOVE = '!¡"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\×™√²—' self.MAX_LEN = MAX_LEN self.STOP_WORDS = list(stopwords.words('english')) self.num_partitions = psutil.cpu_count() * 8 self.num_cores = psutil.cpu_count() # self.stemmer = PorterStemmer() self.aug = aug if vec == 'hash': self.vectorizer = HashingVectorizer(n_features=self.MAX_LEN, analyzer='word', lowercase=False) 
else: self.vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.3, strip_accents='unicode', use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=self.MAX_LEN, lowercase=False)
X = np.array(["numpy", "scipy", "sklearn"])
vectorizer = TfidfVectorizer(dtype=vectorizer_dtype)

warning_msg_match = "'dtype' should be used."
warning_cls = UserWarning
expected_warning_cls = warning_cls if warning_expected else None
with pytest.warns(expected_warning_cls, match=warning_msg_match) as record:
    X_idf = vectorizer.fit_transform(X)
if expected_warning_cls is None:
    relevant_warnings = [w for w in record
                         if isinstance(w, warning_cls)]
    assert len(relevant_warnings) == 0
assert X_idf.dtype == output_dtype


@pytest.mark.parametrize("vec", [
    HashingVectorizer(ngram_range=(2, 1)),
    CountVectorizer(ngram_range=(2, 1)),
    TfidfVectorizer(ngram_range=(2, 1))
])
def test_vectorizers_invalid_ngram_range(vec):
    # vectorizers could be initialized with invalid ngram range
    # test for raising error message
    invalid_range = vec.ngram_range
    message = ("Invalid value for ngram_range=%s "
               "lower boundary larger than the upper boundary."
               % str(invalid_range))
    if isinstance(vec, HashingVectorizer) and IS_PYPY:
        pytest.xfail(reason='HashingVectorizer is not supported on PyPy')

    assert_raise_message(ValueError, message, vec.fit, ["good news everyone"])
    assert_raise_message(ValueError, message, vec.fit_transform,
def __init__(self, m):
    self.fh = FeatureHasher(n_features=m, input_type='string')
    self.hv = HashingVectorizer(n_features=m)
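# Standalone sketch (hypothetical, not from the original class) contrasting the
# two hashers configured above: FeatureHasher expects pre-tokenized input,
# while HashingVectorizer works on raw document strings.
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import HashingVectorizer

m = 2**10
fh = FeatureHasher(n_features=m, input_type='string')
hv = HashingVectorizer(n_features=m)

X_fh = fh.transform([['hello', 'world'], ['hello']])  # lists of tokens
X_hv = hv.transform(['hello world', 'hello'])         # raw documents
print(X_fh.shape, X_hv.shape)  # both (2, 1024)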