Example No. 1
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def answers(User_resp):
    chatbot_response = ''
    sentense_tokens.append(User_resp)

    # Vectorize every stored sentence plus the user's input, then compare the
    # user's sentence (the last row) against all others with cosine similarity.
    vectorizer = TfidfVectorizer(tokenizer=Lmnormalize, stop_words='english')
    tfidf = vectorizer.fit_transform(sentense_tokens)
    vals = cosine_similarity(tfidf[-1], tfidf)
    flaten = vals.flatten()
    flaten.sort()
    req_tfidf = flaten[-2]
    if req_tfidf == 0:
        chatbot_response = chatbot_response + "Sorry, I don't have an idea about that rule"
    else:
        # Assumed completion: reply with the closest-matching stored sentence.
        idx = vals.argsort()[0][-2]
        chatbot_response = chatbot_response + sentense_tokens[idx]
    return chatbot_response
Example No. 2
    # Requires at module level: import itertools, TfidfVectorizer from
    # sklearn.feature_extraction.text, and cosine_similarity from sklearn.metrics.pairwise.
    def similarity_results(seen_items, not_seen_items):
        """
        This method calculates the similarity between seen items and not
        seen items based on their keywords.
        :param seen_items: List of tuples with the URL of seen items and
        their associated keywords
        :param not_seen_items: List of tuples with the URL of not seen items
        and their associated keywords
        :return: A list of tuples with the similarity score and the URL of
        the not seen item
        """
        seen_items_length = len(seen_items)
        seen_items_keywords = [y for x, y in seen_items]
        not_seen_items_keywords = [y for x, y in not_seen_items]
        vectorizer = TfidfVectorizer(tokenizer=CalculateSimilarity.tokens)
        # Fit on the keywords of both groups so they share a single vocabulary.
        tfidf = vectorizer.fit_transform(seen_items_keywords + not_seen_items_keywords)
        seen_items_vectors = [x.reshape(1, -1) for x in tfidf.toarray()[:seen_items_length]]
        seen_items_vectors = list(zip(seen_items_vectors, seen_items))
        not_seen_items_vectors = [x.reshape(1, -1) for x in tfidf.toarray()[seen_items_length:]]
        not_seen_items_vectors = list(zip(not_seen_items_vectors, not_seen_items))
        paired_seen_unseen_items = []
        similarities_result = []

        # Pair every seen item with every not-seen item.
        for r in itertools.product(seen_items_vectors, not_seen_items_vectors):
            paired_seen_unseen_items.append((r[0], r[1]))

        for key, value in paired_seen_unseen_items:
            keywords1, seen_pair = key
            keywords2, not_seen_pair = value
            name2, des2 = not_seen_pair
            similarity = cosine_similarity(keywords1, keywords2)
            similarities_result.append((similarity.item(0, 0), name2))

        return similarities_result
Example No. 3
      Unscramble the letters to make a word.
      (press the enter key at prompt to quit)
      """)
print("The jumble is:", jumble)
guess = input("Your guess: ")
while guess != correct and guess != "":
    print("Sorry, that's not it")
    guess = input("Your guess: ")
if guess == correct:
    print("That's it, you guessed it!\n")
print("Thanks for playing")

input("\n\nPress the enter key to exit")

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
texts = [
    "good movie", "nota good movie", "did not like", "i like it", "good one"
]

tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))
features = tfidf.fit_transform(texts)
pd.DataFrame(features.todense(), columns=tfidf.get_feature_names())

import nltk
import sklearn

print('The nltk version is {}.'.format(nltk.__version__))
print('The scikit-learn version is {}.'.format(sklearn.__version__))
Example No. 4
One of the simplest methods of encoding data is by word counts: you take each snippet of text, count the occurrences of
each word within it, and put the results in a table.

import pandas as pd

sample = ['problem of evil', 'evil queen', 'horizon problem']
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
X = vec.fit_transform(sample)

pd.DataFrame(X.toarray(), columns=vec.get_feature_names())

However, raw word counts put too much weight on words that appear very frequently, which can be suboptimal for some
classification algorithms. One approach to fix this is Term Frequency - Inverse Document Frequency (TF-IDF),
which weights the word counts by a measure of how often they appear in the documents.

from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer()
X = vec.fit_transform(sample)
pd.DataFrame(X.toarray(), columns=vec.get_feature_names())


# Image Features

The simplest approach is to use the pixel values themselves, although this may not always be optimal.
The Scikit-Image project provides much more detail on this.
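
As a minimal sketch of the pixel-value approach (the digits dataset here is just an illustrative choice, not part of the original example):

from sklearn.datasets import load_digits

digits = load_digits()  # 1797 grayscale images of size 8x8
n_samples = len(digits.images)

# Flatten each image so that every pixel intensity becomes one feature.
pixel_features = digits.images.reshape(n_samples, -1)
print(pixel_features.shape)  # (1797, 64)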


# Imputation of Missing Data

Another common need in feature engineering is handling missing data.

Some options include dropping the rows or columns that contain missing values, or imputing the missing entries with a
placeholder value such as the column mean; a sketch of the imputation route follows.
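
A minimal sketch of mean imputation with scikit-learn's SimpleImputer (the toy array is invented for illustration):

import numpy as np
from sklearn.impute import SimpleImputer

# Toy feature matrix with missing entries marked as np.nan.
X = np.array([[1.0, np.nan, 3.0],
              [4.0, 5.0, np.nan],
              [7.0, 8.0, 9.0]])

# Replace each missing value with the mean of its column.
imputer = SimpleImputer(strategy='mean')
X_filled = imputer.fit_transform(X)
print(X_filled)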
Example No. 5
hillary_tweets = miner.mine_user_tweets("HillaryClinton")
hillary_df = pd.DataFrame(hillary_tweets)
print("Hillary_df = ", hillary_df,
      '\n ##########################################\n', hillary_df.shape)

tweets = pd.concat([donald_df, hillary_df], axis=0)

print("tweet_shape = ", tweets.shape)

############################################ Any interesting ngrams going on with Trump or Hillary? ##########################

from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

# We can use the TfidfVectorizer to find ngrams for us
vect = TfidfVectorizer(ngram_range=(2, 5), stop_words='english')

# Pulls all of Trump's tweet text into one giant string
summaries = ''.join(donald_df['text'])
ngrams_summaries = vect.build_analyzer()(summaries)

print("Common ngrams = ", Counter(ngrams_summaries).most_common(20))

############################ Fake news....figures ############################

vect = TfidfVectorizer(ngram_range=(2, 5), stop_words='english')

summaries = ''.join(hillary_df['text'])
ngrams_summaries = vect.build_analyzer()(summaries)

print("Common ngrams for hillary = ",
data["preprocessed_tweet"] = preprocess(data["tweet"])
print(data["tweet"].head())

y = data["class"].values


X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.3, stratify=y)

print("Train data: ", X_train, y_train)
print("Test data: ", X_test, y_test)


vect = CountVectorizer(min_df=1) # min_df is min # of times a word can appear in doc

X_train_phrase_bow = vect.fit_transform(X_train["tweet"]) # change to preprocessed column
X_test_phrase_bow = vect.transform(X_test["tweet"]) # transform only, so the test set reuses the training vocabulary

vectorizer = TfidfVectorizer(min_df=1)

X_train_phrase_tfidf = vectorizer.fit_transform(X_train["preprocessed_tweet"])
X_test_phrase_tfidf = vectorizer.transform(X_test["preprocessed_tweet"])


# LinearRegression has no "penalty" argument and accuracy_score expects class
# labels, so an L1-penalised LogisticRegression is presumably what was intended.
log_reg = LogisticRegression(penalty="l1", solver="liblinear")

log_reg.fit(X_train_phrase_tfidf, y_train)

predicted_results = log_reg.predict(X_test_phrase_tfidf)

print("Accuracy: ", accuracy_score(y_test, predicted_results))
Example No. 7
__author__ = 'admin'
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from math import fabs

i = 2
j = 1
dataset = fetch_20newsgroups(categories = ['alt.atheism', 'talk.religion.misc', 'sci.space'])
vector = TfidfVectorizer()
X = vector.fit_transform(dataset.data).toarray()
a = []
for c in range(X.shape[0]):
    a.append(fabs(X[c][i] - X[c][j]))
result = 0
for x in a:
    result += x
print(result)
Example No. 8
import pandas as pd

train_data = pd.read_csv('../input/sales_train.csv')
test_data = pd.read_csv('../input/test.csv')

print('train: ', train_data.shape,'test: ', test_data.shape)
print(test_data.head())

items = pd.read_csv('../input/items.csv')
item_cat = pd.read_csv('../input/item_categories.csv')
shops = pd.read_csv('../input/shops.csv')


from sklearn.feature_extraction.text import TfidfVectorizer

feature_cnt = 30
tfid = TfidfVectorizer(max_df=0.6, max_features=feature_cnt, ngram_range=(1, 2))
item_cat['item_category_name_len'] = item_cat['item_category_name'].apply(len)
item_cat['item_category_name_wc'] = item_cat['item_category_name'].apply(lambda x: len(str(x).split(' ')))
print(item_cat.head())

txtFeatures = pd.DataFrame(tfid.fit_transform(item_cat['item_category_name']).toarray())
cols = txtFeatures.columns
for i in range(feature_cnt):
    item_cat['item_category_name_tfid_' + str(i)] = txtFeatures[cols[i]]


items['item_name_len'] = items['item_name'].apply(len)
items['item_name_wc'] = items['item_name'].apply(lambda x: len(str(x).split(' ')))
txtFeatures = pd.DataFrame(tfid.fit_transform(items['item_name']).toarray())

cols = txtFeatures.columns
Example No. 9
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel

movies = pd.read_csv("movies_metadata.csv")
movies.head()

# Assumed companion file for the credits data referenced below.
credits = pd.read_csv("credits.csv")

print("Credits: ", credits.shape)
print("Movies: ", movies.shape)

# moviesMerge = movies.merge(credits, on = "id")
# moviesMerge.head()

moviesSorted = movies.drop(columns = ["homepage", "production_countries", "status"])
moviesSorted.info()
# moviesSorted.head(1)["overview"]


tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents="unicode", analyzer="word", token_pattern=r"\w{1,}", ngram_range=(1, 3), stop_words="english")

moviesSorted["overview"] = movies["overview"].fillna("")

tfvMatrix = tfv.fit_transform(moviesSorted["overview"])
tfvMatrix
tfvMatrix.shape

sig = sigmoid_kernel(tfvMatrix, tfvMatrix)
sig[0]

indices = pd.Series(moviesSorted.index, index=moviesSorted["original_title"]).drop_duplicates()
indices

def giveRec(title, sig=sig):
    idx = indices[title]
    # Assumed completion: return the ten titles most similar to the request.
    scores = sorted(enumerate(sig[idx]), key=lambda x: x[1], reverse=True)[1:11]
    return moviesSorted["original_title"].iloc[[i for i, _ in scores]]
Example No. 10
__author__ = 'admin'
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from math import fabs

i = 2
j = 1
dataset = fetch_20newsgroups(
    categories=['alt.atheism', 'talk.religion.misc', 'sci.space'])
vector = TfidfVectorizer()
X = vector.fit_transform(dataset.data).toarray()
a = []
for c in range(X.shape[0]):
    a.append(fabs(X[c][i] - X[c][j]))
result = 0
for x in a:
    result += x
print(result)