Example #1
File: utils.py  Project: wonyonyon/nlpia
import os

import pandas as pd
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE

from nlpia.constants import DATA_PATH
from nlpia.data import get_data


def embed_wordvecs(w2v=None, df=None, vocab='name', embedder=TSNE, **kwargs):
    """Embed word vectors for a vocabulary into 2-D with TSNE (or any embedder exposing `embedding_`)."""
    w2v = os.path.join(DATA_PATH, 'GoogleNews-vectors-negative300.bin') if w2v is None else w2v
    try:
        model = KeyedVectors.load_word2vec_format(w2v, binary=True) if isinstance(w2v, str) else w2v
    except IOError:
        # Retry with the path interpreted relative to DATA_PATH
        model = KeyedVectors.load_word2vec_format(os.path.join(DATA_PATH, w2v), binary=True)
    if df is None:
        df = get_data('cities')
    if isinstance(vocab, str) and vocab in df.columns:
        # GoogleNews tokens use underscores in place of spaces, e.g. "New_York"
        vocab = set([s.replace(' ', '_') for s in df[vocab]] + [s.replace(' ', '_') for s in df.country])
    wv = getattr(model, 'wv', model)  # accept a full Word2Vec model or bare KeyedVectors
    vocab = [word for word in vocab if word in wv]
    vectors = pd.DataFrame([wv[word] for word in vocab], index=vocab, columns=range(300))
    embedding = embedder(**kwargs).fit(vectors)
    return pd.DataFrame(embedding.embedding_, index=vocab, columns=['x', 'y'])
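A minimal usage sketch (an addition, assuming the GoogleNews vectors are available under DATA_PATH; keyword arguments pass straight through to the embedder):

from matplotlib import pyplot as plt

df2d = embed_wordvecs(n_components=2, random_state=42)  # hypothetical call with TSNE kwargs
df2d.plot(kind='scatter', x='x', y='y')
plt.show()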
Example #2

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from nltk.tokenize.casual import casual_tokenize
from matplotlib import pyplot as plt
import seaborn  # noqa

from nlpia.data import get_data

from nltk.sentiment import SentimentIntensityAnalyzer
from nlpia.models import LinearRegressor
from sklearn.linear_model import SGDRegressor

sms = get_data('sms-spam')
tfidf = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_docs = tfidf.fit_transform(raw_documents=sms.text).toarray()
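# Recover the column (term) names in feature-index order from the vectorizer's vocabulary_ mapping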
tfidf_docs = pd.DataFrame(tfidf_docs, columns=list(zip(*sorted([(v, k) for (k, v) in tfidf.vocabulary_.items()])))[1])

# LDA fit directly on the full TF-IDF matrix
tfidf_lda = LDA(n_components=1)
tfidf_lda.fit(tfidf_docs, sms.spam)
# Emits "UserWarning: Variables are collinear." -- many TF-IDF columns are linearly dependent
sms['tfidf_lda_spam_prob'] = tfidf_lda.predict_proba(tfidf_docs)[:, 1]
# Predicted probabilities saturate: nearly all are ~0.000...001 or ~0.999...
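A quick sanity check one might add here (not in the original snippet): LDA's in-sample accuracy on the raw TF-IDF features.

round(float(tfidf_lda.score(tfidf_docs, sms.spam)), 3)  # training-set accuracy, likely optimistic (overfit)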

# TF-IDF -> PCA: compress the sparse TF-IDF vectors into 256 dense topic dimensions
pca = PCA(n_components=256)
pca = pca.fit(tfidf_docs)
pca_topic_vectors = pca.transform(tfidf_docs)
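A plausible follow-on (a sketch mirroring the TF-IDF fit above, not part of the original snippet): train the same LDA classifier on the 256 PCA topic vectors, which sidesteps the collinearity warning.

pca_lda = LDA(n_components=1)
pca_lda.fit(pca_topic_vectors, sms.spam)
sms['pca_lda_spam_prob'] = pca_lda.predict_proba(pca_topic_vectors)[:, 1]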
Example #3
File: ch04.py  Project: tom-d/nlpia
import numpy as np

# Toy setup so the assignments run: treat the three topics as an orthonormal basis (illustrative assumption)
topic = dict(zip(['pet', 'animal', 'city'], np.eye(3)))
word_vector = {}
word_vector['cat']   =  .3 * topic['pet'] + .1 * topic['animal'] +  0 * topic['city']
word_vector['dog']   =  .3 * topic['pet'] + .1 * topic['animal'] - .1 * topic['city']
word_vector['apple'] =   0 * topic['pet'] - .1 * topic['animal'] + .2 * topic['city']
word_vector['lion']  =   0 * topic['pet'] + .5 * topic['animal'] - .1 * topic['city']
word_vector['NYC']   = -.2 * topic['pet'] + .1 * topic['animal'] + .5 * topic['city']
word_vector['love']  =  .2 * topic['pet'] - .1 * topic['animal'] + .1 * topic['city']
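With the toy vectors defined, a short follow-up sketch (an addition, not in ch04.py) shows what topic vectors buy you: words that share topics end up close in the space.

def cosine_sim(a, b):
    return float(a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b)))

cosine_sim(word_vector['cat'], word_vector['dog'])  # high: both weight the 'pet' and 'animal' topics
cosine_sim(word_vector['cat'], word_vector['NYC'])  # slightly negative: mostly opposing topic weights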



import pandas as pd
from sklearn.decomposition import PCA
import seaborn
from matplotlib import pyplot as plt
from nlpia.data import get_data

df = get_data('pointcloud').sample(1000)
pca = PCA(n_components=2)
df2d = pd.DataFrame(pca.fit_transform(df), columns=list('xy'))
df2d.plot(kind='scatter', x='x', y='y')
plt.show()
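One line worth checking after the fit (an addition): how much of the point cloud's variance the two principal components retain.

print(pca.explained_variance_ratio_)  # fraction of total variance captured by each of the 2 components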



from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.casual import casual_tokenize
from nlpia.data import get_data

sms = get_data('sms-spam')
sms.head(3)
#    spam                                               text
# 0     0  Go until jurong point, crazy.. Available only ...
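The example is cut off here; a plausible continuation, following the same pattern as Example #2's spam pipeline above (a sketch, not recovered from the original file):

tfidf = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_docs = tfidf.fit_transform(raw_documents=sms.text).toarray()
tfidf_docs.shape  # (number of messages, vocabulary size)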