Example No. 1
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def answers(User_resp):
    chatbot_response = ''
    sentense_tokens.append(User_resp)

    # Vectorize every stored sentence plus the user's input, then compare the
    # user's sentence (the last row) against all others with cosine similarity.
    vectorizer = TfidfVectorizer(tokenizer=Lmnormalize, stop_words='english')
    tfidf = vectorizer.fit_transform(sentense_tokens)
    vals = cosine_similarity(tfidf[-1], tfidf)
    flaten = vals.flatten()
    flaten.sort()
    req_tfidf = flaten[-2]
    if req_tfidf == 0:
        chatbot_response = chatbot_response + "Sorry, I don't have an idea about that rule"
    else:
        # Assumed completion: reply with the closest-matching stored sentence.
        idx = vals.argsort()[0][-2]
        chatbot_response = chatbot_response + sentense_tokens[idx]
    return chatbot_response
Example No. 2
    # Requires at module level: import itertools, TfidfVectorizer from
    # sklearn.feature_extraction.text, and cosine_similarity from sklearn.metrics.pairwise.
    def similarity_results(seen_items, not_seen_items):
        """
        This method calculates the similarity between seen items and not
        seen items based on their keywords.
        :param seen_items: List of tuples with the URL of seen items and
        their associated keywords
        :param not_seen_items: List of tuples with the URL of not seen items
        and their associated keywords
        :return: A list of tuples with the similarity score and the URL of
        the not seen item
        """
        seen_items_length = len(seen_items)
        seen_items_keywords = [y for x, y in seen_items]
        not_seen_items_keywords = [y for x, y in not_seen_items]
        vectorizer = TfidfVectorizer(tokenizer=CalculateSimilarity.tokens)
        # Fit on the keywords of both groups so they share a single vocabulary.
        tfidf = vectorizer.fit_transform(seen_items_keywords + not_seen_items_keywords)
        seen_items_vectors = [x.reshape(1, -1) for x in tfidf.toarray()[:seen_items_length]]
        seen_items_vectors = list(zip(seen_items_vectors, seen_items))
        not_seen_items_vectors = [x.reshape(1, -1) for x in tfidf.toarray()[seen_items_length:]]
        not_seen_items_vectors = list(zip(not_seen_items_vectors, not_seen_items))
        paired_seen_unseen_items = []
        similarities_result = []

        # Pair every seen item with every not-seen item.
        for r in itertools.product(seen_items_vectors, not_seen_items_vectors):
            paired_seen_unseen_items.append((r[0], r[1]))

        for key, value in paired_seen_unseen_items:
            keywords1, seen_pair = key
            keywords2, not_seen_pair = value
            name2, des2 = not_seen_pair
            similarity = cosine_similarity(keywords1, keywords2)
            similarities_result.append((similarity.item(0, 0), name2))

        return similarities_result
Example No. 3
      Unscramble the letters to make a word.
      (press the enter key at prompt to quit)
      """)
print("The jumble is:", jumble)
guess = input("Your guess: ")
while guess != correct and guess != "":
    print("Sorry, that's not it")
    guess = input("Your guess: ")
if guess == correct:
    print("That's it, you guessed it!\n")
print("Thanks for playing")

input("\n\nPress the enter key to exit")

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
texts = [
    "good movie", "nota good movie", "did not like", "i like it", "good one"
]

tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))
features = tfidf.fit_transform(texts)
pd.DataFrame(features.todense(), columns=tfidf.get_feature_names())

import nltk
import sklearn

print('The nltk version is {}.'.format(nltk.__version__))
print('The scikit-learn version is {}.'.format(sklearn.__version__))
Example No. 4
One of the simplest methods of encoding data is by word counts: you take each snippet of text, count the occurrences of
each word within it, and put the results in a table.

import pandas as pd

sample = ['problem of evil', 'evil queen', 'horizon problem']
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
X = vec.fit_transform(sample)

pd.DataFrame(X.toarray(), columns=vec.get_feature_names())

However, raw word counts put too much weight on words that appear very frequently, which can be suboptimal for some
classification algorithms. One approach to fix this is Term Frequency - Inverse Document Frequency (TF-IDF),
which weights the word counts by a measure of how often they appear in the documents.

from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer()
X = vec.fit_transform(sample)
pd.DataFrame(X.toarray(), columns=vec.get_feature_names())


# Image Features

The simplest approach is to use the pixel values themselves, although this may not always be optimal.
The Scikit-Image project provides much more detail on this.
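
As a minimal sketch of the pixel-value approach (the digits dataset here is just an illustrative choice, not part of the original example):

from sklearn.datasets import load_digits

digits = load_digits()  # 1797 grayscale images of size 8x8
n_samples = len(digits.images)

# Flatten each image so that every pixel intensity becomes one feature.
pixel_features = digits.images.reshape(n_samples, -1)
print(pixel_features.shape)  # (1797, 64)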


# Imputation of Missing Data

Another common need in feature engineering is handling missing data.

Some options include dropping the rows or columns that contain missing values, or imputing the missing entries with a
placeholder value such as the column mean; a sketch of the imputation route follows.
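
A minimal sketch of mean imputation with scikit-learn's SimpleImputer (the toy array is invented for illustration):

import numpy as np
from sklearn.impute import SimpleImputer

# Toy feature matrix with missing entries marked as np.nan.
X = np.array([[1.0, np.nan, 3.0],
              [4.0, 5.0, np.nan],
              [7.0, 8.0, 9.0]])

# Replace each missing value with the mean of its column.
imputer = SimpleImputer(strategy='mean')
X_filled = imputer.fit_transform(X)
print(X_filled)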
Example No. 5
hillary_tweets = miner.mine_user_tweets("HillaryClinton")
hillary_df = pd.DataFrame(hillary_tweets)
print("Hillary_df = ", hillary_df,
      '\n ##########################################\n', hillary_df.shape)

tweets = pd.concat([donald_df, hillary_df], axis=0)

print("tweet_shape = ", tweets.shape)

############################################ Any interesting ngrams going on with Trump or Hillary? ##########################

from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

# We can use the TfidfVectorizer to find ngrams for us
vect = TfidfVectorizer(ngram_range=(2, 5), stop_words='english')

# Pulls all of Trump's tweet text into one giant string
summaries = ''.join(donald_df['text'])
ngrams_summaries = vect.build_analyzer()(summaries)

print("Common ngrams = ", Counter(ngrams_summaries).most_common(20))

############################ Fake news....figures ############################

vect = TfidfVectorizer(ngram_range=(2, 5), stop_words='english')

summaries = ''.join(hillary_df['text'])
ngrams_summaries = vect.build_analyzer()(summaries)

print("Common ngrams for hillary = ",
data["preprocessed_tweet"] = preprocess(data["tweet"])
print(data["tweet"].head())

y = data["class"].values


X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.3, stratify=y)

print("Train data: ", X_train, y_train)
print("Test data: ", X_test, y_test)


vect = CountVectorizer(min_df=1) # min_df is min # of times a word can appear in doc

X_train_phrase_bow = vect.fit_transform(X_train["tweet"]) # change to preprocessed column
X_test_phrase_bow = vect.transform(X_test["tweet"]) # transform only, so the test set reuses the training vocabulary

vectorizer = TfidfVectorizer(min_df=1)

X_train_phrase_tfidf = vectorizer.fit_transform(X_train["preprocessed_tweet"])
X_test_phrase_tfidf = vectorizer.transform(X_test["preprocessed_tweet"])


# LinearRegression has no "penalty" argument and accuracy_score expects class
# labels, so an L1-penalised LogisticRegression is presumably what was intended.
log_reg = LogisticRegression(penalty="l1", solver="liblinear")

log_reg.fit(X_train_phrase_tfidf, y_train)

predicted_results = log_reg.predict(X_test_phrase_tfidf)

print("Accuracy: ", accuracy_score(y_test, predicted_results))
Example No. 7
__author__ = 'admin'
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from math import fabs

i = 2
j = 1
dataset = fetch_20newsgroups(categories = ['alt.atheism', 'talk.religion.misc', 'sci.space'])
vector = TfidfVectorizer()
X = vector.fit_transform(dataset.data).toarray()
a = []
for c in range(X.shape[0]):
    a.append(fabs(X[c][i] - X[c][j]))
result = 0
for x in a:
    result += x
print(result)
Example No. 8
import pandas as pd

train_data = pd.read_csv('../input/sales_train.csv')
test_data = pd.read_csv('../input/test.csv')

print('train: ', train_data.shape,'test: ', test_data.shape)
print(test_data.head())

items = pd.read_csv('../input/items.csv')
item_cat = pd.read_csv('../input/item_categories.csv')
shops = pd.read_csv('../input/shops.csv')


from sklearn.feature_extraction.text import TfidfVectorizer

feature_cnt = 30
tfid = TfidfVectorizer(max_df=0.6, max_features=feature_cnt, ngram_range=(1, 2))
item_cat['item_category_name_len'] = item_cat['item_category_name'].apply(len)
item_cat['item_category_name_wc'] = item_cat['item_category_name'].apply(lambda x: len(str(x).split(' ')))
print(item_cat.head())

txtFeatures = pd.DataFrame(tfid.fit_transform(item_cat['item_category_name']).toarray())
cols = txtFeatures.columns
for i in range(feature_cnt):
    item_cat['item_category_name_tfid_' + str(i)] = txtFeatures[cols[i]]


items['item_name_len'] = items['item_name'].apply(len)
items['item_name_wc'] = items['item_name'].apply(lambda x: len(str(x).split(' ')))
txtFeatures = pd.DataFrame(tfid.fit_transform(items['item_name']).toarray())

cols = txtFeatures.columns
Example No. 9
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel

movies = pd.read_csv("movies_metadata.csv")
movies.head()

# Assumed companion file for the credits data referenced below.
credits = pd.read_csv("credits.csv")

print("Credits: ", credits.shape)
print("Movies: ", movies.shape)

# moviesMerge = movies.merge(credits, on = "id")
# moviesMerge.head()

moviesSorted = movies.drop(columns = ["homepage", "production_countries", "status"])
moviesSorted.info()
# moviesSorted.head(1)["overview"]


tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents="unicode", analyzer="word", token_pattern=r"\w{1,}", ngram_range=(1, 3), stop_words="english")

moviesSorted["overview"] = movies["overview"].fillna("")

tfvMatrix = tfv.fit_transform(moviesSorted["overview"])
tfvMatrix
tfvMatrix.shape

sig = sigmoid_kernel(tfvMatrix, tfvMatrix)
sig[0]

indices = pd.Series(moviesSorted.index, index=moviesSorted["original_title"]).drop_duplicates()
indices

def giveRec(title, sig=sig):
    idx = indices[title]
    # Assumed completion: return the ten titles most similar to the request.
    scores = sorted(enumerate(sig[idx]), key=lambda x: x[1], reverse=True)[1:11]
    return moviesSorted["original_title"].iloc[[i for i, _ in scores]]
Example No. 10
__author__ = 'admin'
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from math import fabs

i = 2
j = 1
dataset = fetch_20newsgroups(
    categories=['alt.atheism', 'talk.religion.misc', 'sci.space'])
vector = TfidfVectorizer()
X = vector.fit_transform(dataset.data).toarray()
a = []
for c in range(X.shape[0]):
    a.append(fabs(X[c][i] - X[c][j]))
result = 0
for x in a:
    result += x
print(result)