class Chatbot:
    def __init__(self, questions):
        # `questions` is expected to be a sequence of objects exposing a
        # `.Title` attribute (e.g. question records); an earlier variant of
        # this constructor read one question per line from a UTF-8 text file.
        questions_text = [question.Title for question in questions]
        status = list(range(len(questions_text)))
        self.__data = {'text': questions_text, 'status': status}

    def frame(self):
        frame = pandas.DataFrame(self.__data)
        self.frame_x = frame['text']
        self.frame_y = frame['status']

    def learning(self):
        self.vect = TfidfVectorizer(min_df=1)
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            self.frame_x, self.frame_y, test_size=0.2, random_state=4)
        self.x_trainvect = self.vect.fit_transform(self.x_train)
        # Sanity check: map the first training row back to its vocabulary terms
        # (inverse_transform expects a 2-D row).
        self.vect.inverse_transform(self.x_trainvect[:1])

    def bayes(self):
        self.mnb = MultinomialNB()
        self.y_train = self.y_train.astype('int')
        self.mnb.fit(self.x_trainvect, self.y_train)

    def ask(self, sentence):
        logger.debug('ask to bot for: {question}'.format(question=sentence))
        start = datetime.datetime.now()
        # Note: the model is rebuilt on every call; cache it if latency matters.
        self.frame()
        self.learning()
        self.bayes()
        x_testvect = self.vect.transform([sentence])
        pred = self.mnb.predict(x_testvect)
        end = datetime.datetime.now()
        logger.debug('time elapsed {time}'.format(time=end - start))
        return self.frame_x[pred[0]]
def preprocess(train, test, max_feature=3000, stop_word=True): if stop_word: vectorizer = TfidfVectorizer(stop_words='english', max_features=max_feature) else: vectorizer = TfidfVectorizer(max_features=max_feature) # this may take a while tmp = vectorizer.fit_transform(train) tmp2 = vectorizer.transform(test) # inverse back to normal words tmp = vectorizer.inverse_transform(tmp) tmp2 = vectorizer.inverse_transform(tmp2) return tmp, tmp2
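# Hedged usage sketch for preprocess() above (not part of the original snippet).
# `train_docs` and `test_docs` are illustrative stand-ins for real corpora.
train_docs = ["the cat sat on the mat", "dogs and cats are friendly"]
test_docs = ["a dog sat on the cat"]

train_tokens, test_tokens = preprocess(train_docs, test_docs, max_feature=50)
# Each element is the array of vocabulary terms that received a non-zero
# TF-IDF weight in that document (not in the original word order).
print(train_tokens[0])
print(test_tokens[0])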
def get(self, request, *args, **kwargs): lines = [] contents = [] dup_lines = [] file = File.objects.get(id=self.kwargs['file_id']) df = pd.read_csv(file.file.url, sep='delimiter', header=None) start_time = time.time() # Performance test # removing stopwords df[0] = df[0].apply(lambda x: ' '.join( [word for word in x.split() if word not in (stop)])) # removing special characters df[0] = df[0].str.replace(r"[^a-zA-Z ]+", " ").str.strip() pks = df[0].values pattern = '(?u)[^ ]+' tv = TfidfVectorizer(encoding='utf-8', token_pattern=pattern) vect = tv.fit_transform(df[0]) vect_array = vect.toarray() distances = pd.DataFrame( distance.cdist(vect_array, vect_array, 'hamming')) for i in range(0, len(distances.columns)): # Finding duplicates dup_index = distances.index[distances[i] < file.threshold].tolist() if len(dup_index) > 1: for d in dup_index: if d != i: print(distances[i][d]) print('Line', i) lines.append(i) print(df[0][i]) print(tv.inverse_transform(vect_array[i])) print('Line', d) dup_lines.append(d) print(tv.inverse_transform(vect_array[d])) print(df[0][d]) print('-----------------------------') if d == i: print('here') contents = (df[0][lines].tolist()) print("TFIDF vector generated.\nTime: %s seconds\n" % (time.time() - start_time)) # Performance test return render(request, self.template_name, { 'lines': lines, 'contents': contents, 'dup_lines': dup_lines })
def main(mbox_path: ('MBox Path', 'option', 'm')): messages = [] texts = [] for idx, message in enumerate(mailbox.mbox(mbox_path)): content = message.get_payload()[0].get_payload() stripped_content = striphtml(content).replace('\r', '').replace( '\n', '').replace('=2C', '').replace('=', '') matches = re.findall(r'"([^"]*)"', stripped_content) if len(matches) == 0: print("{}: Failed to extract message.".format(idx)) continue messages.append(message) texts.append({'text': matches[0]}) df = pd.DataFrame(texts) vectorizer = TfidfVectorizer() vectorized = vectorizer.fit_transform(df['text'].values).toarray() indexes_to_keep = np.flip(vectorized.argsort(axis=-1), -1)[:, :5] arr = np.zeros(vectorized.shape) for idx, set_one_idxs in enumerate(indexes_to_keep): arr[idx][set_one_idxs] = 1. terms_per_document = vectorizer.inverse_transform(arr) all_terms = [] for terms in terms_per_document: all_terms += terms.tolist() count = Counter(all_terms) print(count.most_common(20))
def __vectorize(self):
        """
        Vectorize the training data: extract uni- and bi-gram features and
        weight them with TF-IDF.
        :return: a numeric, weighted feature vector for each item
        """
        logging.debug("Vectorizing text contents...")
        tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=2,
                                max_df=0.5,
                                stop_words=stopwords.words('portuguese'))
        self.__tfidf_matrix = tfidf.fit_transform(
            self.__dataframe['video_contents'])
        for i, (_, row) in enumerate(self.__dataframe.iterrows()):
            # inverse_transform expects a 2-D (1 x n_features) row, so pass the
            # sparse matrix row rather than a flat vector.
            tokens = ", ".join(tfidf.inverse_transform(self.__tfidf_matrix[i])[0])
            video_id = row['video_id']
            # videotokens = VideoTokens.objects.filter(video_id=video_id)
            # if videotokens.count() == 0:
            #     videotokens = VideoTokens(video_id=video_id, tokens=tokens)
            #     videotokens.save()
            # else:
            #     videotokens[0].tokens = tokens
            #     videotokens[0].save()
        logging.debug("Number of features found: %s" % len(tfidf.vocabulary_))
def keyword_extract(self):
    '''Extract keywords from each project tagline using TF-IDF.'''
    tagline_doc = []
    slug_list = []
    sql = "SELECT project_slug, tagline FROM project"
    self._cursor.execute(sql)
    for row in self._cursor.fetchall():
        slug_list.append(row[0])
        tagline_doc.append(row[1])
    vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)
    response = vectorizer.fit_transform(tagline_doc)
    # One array of non-zero vocabulary terms per tagline.
    keyword_array = vectorizer.inverse_transform(response)
    for i in range(len(keyword_array)):
        try:
            # Parameterized query avoids quoting/escaping problems in taglines.
            sql = "UPDATE project SET keywords = %s WHERE project_slug = %s"
            self._cursor.execute(
                sql, (json.dumps(keyword_array[i].tolist()), slug_list[i]))
            self._db.commit()
        except MySQLdb.Error as e:
            try:
                logging.error("MySQL Error [%d]: %s; Error SQL: %s",
                              e.args[0], e.args[1], sql)
            except IndexError:
                logging.error("MySQL Error %s", str(e))
def weibo(): f_stop = open('dataset/stop_word', 'r', encoding='utf-8') stopwords=['ru0b1os', 'ruyjamg', 'zy1qwp0', 'oxukk', '\t'] for line in f_stop.readlines(): stopwords.append(line[:-1]) train = [] for (root, dir, files) in os.walk("dataset/weibo_train/"): for f in files: train += pkl.load(open('dataset/weibo_train/' + f, 'rb')) vec = TfidfVectorizer(max_features=5000, min_df=5, tokenizer=jieba.cut, stop_words=stopwords) vec.fit(train) attention = pkl.load(open('attention.pkl','rb')) l=vec.inverse_transform(attention)[0] fout = open('attention','w', encoding='utf-8') for w in l: fout.write(str(w)) # print(np.sum(vec.transform(train).toarray()[:5])) f_train = open("dataset/weibo_train.pkl", 'wb') f_test = open("dataset/weibo_test.pkl", 'wb') print(vec.vocabulary_) for (root, dir, files) in os.walk("dataset/weibo_train/"): for f in files: l = pkl.load(open('dataset/weibo_train/' + f, 'rb')) pkl.dump(vec.transform(l).toarray(), f_train) # print(f.split('_')[1][0]) pkl.dump(int(f.split('_')[1][0]), f_train) for (root, dir, files) in os.walk("dataset/weibo_test/"): for f in files: l = pkl.load(open('dataset/weibo_test/' + f, 'rb')) pkl.dump(vec.transform(l).toarray(), f_test) # print(f.split('_')[1][0]) pkl.dump(int(f.split('_')[1][0]), f_test)
class WeightedWordVectors(TransformerMixin): """ Libraries & Versions: Python==3.6.5 Pandas=='0.23.1' as pd nltk=='3.3' numpy=='1.14.5' Keyword arguments: X -- Pandas Series of text as strings """ def __init__(self, model=None, meta=None, disk=None): self.meta = meta if model == None: self.word2vec = Word2Vec(sentences=self.meta.str.split().tolist()) else: self.word2vec = model self.word2vec.train(sentences=self.meta.str.split().tolist()) #Fit TFIDF self.tfidf = TfidfVectorizer() self.idfs = self.tfidf.fit_transform(self.meta) self.inverse_idfs = self.tfidf.inverse_transform(self.idfs) return self def fit(self, X, y=None, meta=None, P=None): #Split into sentences for i in X.index: self.word2vec.build_vocab(X.loc[i].str.split().tolist(), keep_raw_vocab=True, update=True) self.word2vec.model_trimmed_post_training = False self.word2vec.min_alpha_yet_reached = False self.word2vec.batch_words = X.loc[i].apply(lambda x: len(x.split())).max() self.word2vec.train(X.loc[i].str.split().tolist(), total_examples=X.loc[i].shape[0], start_alpha=0.05, end_alpha=0.01, epochs=1, compute_loss=True) disk.append( (euclidean_distances(self.transform(self, meta)) @ P).sum(1) ) return self def transform(self, meta): #Weighted Vectors weighted_docs = [] for idf, inv in zip(self.idfs, self.inverse_idfs): try: weight = idf[idf!=0] vector = self.word2vec[inv] weighted_doc = weight.dot(vector) except: weighted_doc = np.empty((1, 300), np.float64) weighted_docs.append(weighted_doc.tolist()[0]) return pd.DataFrame(weighted_docs).fillna(0) '''
class MachineLearning():
    def __init__(self):
        questions = []
        with open("deneme.txt", "r") as f:
            for line in f:
                questions.append(line)
        status = list(range(len(questions)))
        self.__data = {'text': questions, 'status': status}

    def frame(self):
        frame = pandas.DataFrame(self.__data)
        self.frame_x = frame["text"]
        self.frame_y = frame["status"]

    def learning(self):
        self.vect = TfidfVectorizer(min_df=1)
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            self.frame_x, self.frame_y, test_size=0.2, random_state=4)
        self.x_trainvect = self.vect.fit_transform(self.x_train)
        # Sanity check: map the first training row back to its vocabulary terms
        # (inverse_transform expects a 2-D row).
        self.vect.inverse_transform(self.x_trainvect[:1])

    def bayes(self):
        self.mnb = MultinomialNB()
        self.y_train = self.y_train.astype('int')
        self.mnb.fit(self.x_trainvect, self.y_train)

    def find(self, sentence):
        # Note: the model is rebuilt on every call; cache it if latency matters.
        self.frame()
        self.learning()
        self.bayes()
        x_testvect = self.vect.transform([sentence])
        pred = self.mnb.predict(x_testvect)
        return self.frame_x[pred[0]]
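# Hedged usage sketch for the class above (not part of the original snippet).
# "deneme.txt" is assumed to hold several candidate responses, one per line.
bot = MachineLearning()
print(bot.find("hello, how are you?"))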
class TopicModeling(object): def __init__(self, n_topics, method='LSA'): assert method in ['LSA', 'LDA'] if method == 'LDA': raise NotImplementedError self.method = method self.n_topics = n_topics self.tfidf = TfidfVectorizer() if method == 'LSA': self.model = TruncatedSVD(n_components=n_topics) else: self.model = LDA(n_components=n_topics) def __call__(self, corpus): self.term_matrix = self.tfidf.fit_transform(corpus) self.topic_matrix = self.model.fit_transform(self.term_matrix) self.topic_keys = self.topic_matrix.argmax(axis=1).tolist() def get_count_pairs(self): return np.unique(self.topic_keys, return_counts=True) def get_top_n_words(self, n): ''' returns a list of n_topic strings, where each string contains the n most common words in a predicted category (topic), in order ''' top_word_indices = [] for topic in range(self.n_topics): temp_vector_sum = 0 for j in range(len(self.topic_keys)): if self.topic_keys[j] == topic: temp_vector_sum += self.term_matrix[j] temp_vector_sum = temp_vector_sum.toarray() top_n_word_indices = np.flip( np.argsort(temp_vector_sum)[0][-n:], 0) top_word_indices.append(top_n_word_indices) top_words = [] for topic in top_word_indices: topic_words = [] for index in topic: temp_word_vector = np.zeros((1, self.term_matrix.shape[1])) temp_word_vector[:, index] = 1 the_word = self.tfidf.inverse_transform(temp_word_vector)[0][0] topic_words.append(the_word.encode('ascii').decode('utf-8')) top_words.append(" ".join(topic_words)) return top_words def plot_tsne(self, n_components=2): topic_embedding = TSNE(n_components=n_components).fit_transform( self.topic_matrix) _, ax = plt.subplots(figsize=(16, 10)) scatter = ax.scatter(topic_embedding[:, 0], topic_embedding[:, 1], c=self.topic_keys, cmap='tab20') legend = ax.legend(*scatter.legend_elements(), title='Topics') ax.add_artist(legend)
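# Hedged usage sketch for TopicModeling above; the toy corpus is illustrative
# only, and the class's imports (TfidfVectorizer, TruncatedSVD, numpy) are
# assumed to be available.
corpus = [
    "machine learning models learn from data",
    "neural networks are machine learning models",
    "soccer players scored three goals",
    "the football match ended with two goals",
]
tm = TopicModeling(n_topics=2, method='LSA')
tm(corpus)                   # fits TF-IDF and TruncatedSVD, assigns one topic per document
print(tm.get_count_pairs())  # (topic ids, document counts per topic)
# If every topic received at least one document, the strongest words per topic
# can be inspected with tm.get_top_n_words(3).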
from sklearn.feature_extraction.text import TfidfVectorizer


def tfidf1(c1, c2):
    tfidf = TfidfVectorizer()
    data = tfidf.fit_transform([c1, c2])
    print(tfidf.get_feature_names_out())  # get_feature_names() on scikit-learn < 1.0
    print(data.toarray())
    print(tfidf.inverse_transform(data))
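# Illustrative call of tfidf1(); the two sentences are stand-in documents.
# inverse_transform(data) returns, per document, the vocabulary terms with a
# non-zero TF-IDF weight (in feature-index order, not word order), roughly:
#   [array(['brown', 'fox', 'quick', 'the']), array(['brown', 'dog', 'lazy', 'the'])]
tfidf1("the quick brown fox", "the lazy brown dog")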
def test_add_hashtag_bow_to_graph(self): g = IU.add_hastag_bow_to_graph(self.g_undecom) tfidf = TfidfVectorizer(preprocessor=None, tokenizer=lambda s: s.split(), stop_words=None) tfidf.fit([' '.join(g.node[n]['hashtags']) for n in g.nodes_iter()]) for n in g.nodes_iter(): assert_true(issparse(g.node[n]['hashtag_bow'])) assert_equal( sorted(g.node[n]['hashtags']), sorted( tfidf.inverse_transform( g.node[n]['hashtag_bow'])[0].tolist()))
def processing_question(ques, paragraphs, domain_lemma_cache, domain_pickle):
    """Return a dataframe of candidate answers for `ques`."""
    # Lemmatize the whole text column once and cache it.
    lemma_cache = domain_lemma_cache
    if not os.path.isfile(lemma_cache):
        lemmas = [lemmatize(par) for par in tqdm(paragraphs)]
        df = pd.DataFrame(data={'context': paragraphs, 'lemmas': lemmas})
        df.to_feather(lemma_cache)
    df = pd.read_feather(lemma_cache)
    paragraphs = df.context
    lemmas = df.lemmas
    # Vectorizer cache (load it first so it is always defined before use).
    if os.path.isfile(VEC_PICKLE_LOC):
        vectorizer = pickle.load(open(VEC_PICKLE_LOC, "rb"))
    else:
        vectorizer = TfidfVectorizer(stop_words='english', min_df=5, max_df=.5,
                                     ngram_range=(1, 3))
        vectorizer.fit(lemmas)
        pickle.dump(vectorizer, open(VEC_PICKLE_LOC, "wb"))
    # Cache of the vectorized lemmas.
    if os.path.isfile(domain_pickle):
        tfidf = pickle.load(open(domain_pickle, "rb"))
    else:
        tfidf = vectorizer.transform(lemmas)
        pickle.dump(tfidf, open(domain_pickle, "wb"))
    question = ques
    # Transform the lemmatized question into the same TF-IDF space.
    query = vectorizer.transform([lemmatize(question)])
    # Debug info: how many query terms matched the vocabulary, and which ones.
    n_matched, matched_terms = (query > 0).sum(), vectorizer.inverse_transform(query)
    # Similarity of the question to every paragraph (both sides are
    # L2-normalized TF-IDF vectors, so this is a cosine-style score).
    scores = (tfidf * query.T).toarray()
    # Rank paragraphs by similarity and feed the top 10 into the QA pipeline.
    results = np.flip(np.argsort(scores, axis=0))
    qapipe = pipeline('question-answering',
                      model='distilbert-base-uncased-distilled-squad',
                      tokenizer='bert-base-uncased',
                      device=0)
    candidate_idxs = [(i, scores[i]) for i in results[0:10, 0]]
    contexts = [(paragraphs[i], s) for (i, s) in candidate_idxs if s > 0.01]
    question_df = pd.DataFrame.from_records([{
        'question': question,
        'context': ctx
    } for (ctx, s) in contexts])
    preds = qapipe(question_df.to_dict(orient="records"))
    answer_df = pd.DataFrame.from_records(preds)
    answer_df["context"] = question_df["context"]
    answer_df = answer_df.sort_values(by="score", ascending=False)
    # Return a dataframe containing the ranked answers.
    return answer_df
def calculateNgramAccuracy(range_min, range_max): vectorClassifier = TfidfVectorizer(min_df=1, stop_words='english', ngram_range=(range_min, range_max)) messages_trainData, messages_testData, label_trainData, label_testData = train_test_split( messagesDataSet, labelDataSet, test_size=0.2, random_state=4) x_traincv = vectorClassifier.fit_transform(messages_trainData) trainData = x_traincv.toarray() print(trainData) featureNames = vectorClassifier.get_feature_names() print(featureNames) values = trainData[0] print(values) length = len(trainData[0]) print(length) data = vectorClassifier.inverse_transform(trainData[0]) print(data) actualData = messages_trainData.iloc[0] print(actualData) multiNB = MultinomialNB() label_trainData = label_trainData.astype('int') multiNBData = multiNB.fit(x_traincv, label_trainData) print(multiNBData) x_testcv = vectorClassifier.transform(messages_testData) predict = multiNB.predict(x_testcv) print(predict) actualTestDataLabels = np.array(label_testData) print(actualTestDataLabels) testEqualResult = 0 for i in range(len(label_testData)): if (actualTestDataLabels[i] == predict[i]): testEqualResult += 1 TtestEqualResult = testEqualResult PredictionDataLength = len(predict) #n-gram print("range", range_min, "-", range_max, "Ngram Equal Data count = ", TtestEqualResult) print("range", range_min, "-", range_max, "Ngram tested data count = ", PredictionDataLength) print("range", range_min, "-", range_max, "Ngram accuracy = ", TtestEqualResult * 100.0 / PredictionDataLength, "% ~> ", TtestEqualResult * 100.0 // PredictionDataLength, "%")
def processing_question(ques, paragraphs, domain_lemma_cache, domain_pickle): """Return answer""" #Lemmatizing whole csv text column lemma_cache = domain_lemma_cache if not os.path.isfile(lemma_cache): lemmas = [lemmatize(par) for par in tqdm(paragraphs)] df = pd.DataFrame(data={'context': paragraphs, 'lemmas': lemmas}) df.to_feather(lemma_cache) df = pd.read_feather(lemma_cache) paragraphs = df.context lemmas = df.lemmas #Vectorizor cache if not os.path.isfile(VEC_PICKLE_LOC): vectorizer = TfidfVectorizer(stop_words='english', min_df=5, max_df=.5, ngram_range=(1, 3)) vectorizer.fit_transform(lemmas) pickle.dump(vectorizer, open(VEC_PICKLE_LOC, "wb")) #Vectorized lemmas cache cache if not os.path.isfile(domain_pickle): tfidf = vectorizer.fit_transform(lemmas) pickle.dump(tfidf, open(domain_pickle, "wb")) vectorizer = pickle.load(open(VEC_PICKLE_LOC, "rb")) tfidf = pickle.load(open(domain_pickle, "rb")) question = ques query = vectorizer.transform([lemmatize(question)]) (query > 0).sum(), vectorizer.inverse_transform(query) scores = (tfidf * query.T).toarray() results = (np.flip(np.argsort(scores, axis=0))) qapipe = pipeline('question-answering', model='distilbert-base-uncased-distilled-squad', tokenizer='bert-base-uncased', device=0) candidate_idxs = [(i, scores[i]) for i in results[0:10, 0]] contexts = [(paragraphs[i], s) for (i, s) in candidate_idxs if s > 0.01] question_df = pd.DataFrame.from_records([{ 'question': question, 'context': ctx } for (ctx, s) in contexts]) preds = qapipe(question_df.to_dict(orient="records")) torch.cuda.empty_cache() gc.collect() answer_df = pd.DataFrame.from_records(preds) #torch.cuda.empty_cache() #gc.collect() answer_df['context'] = question_df['context'] answer_df = answer_df.sort_values(by="score", ascending=False) return answer_df
def fit(self, preload=False):
    # `preload` is accepted for API compatibility but is not used here.
    vectorizer = TfidfVectorizer(tokenizer=self.__custom,
                                 strip_accents='ascii',
                                 stop_words='english',
                                 token_pattern=None)
    vectorizer.fit(self.corpus)
    self.X = vectorizer.transform(self.corpus)
    self.vocab = vectorizer.get_feature_names_out()  # get_feature_names() on scikit-learn < 1.0
    self.V = len(self.vocab)
    self.word_to_ix = vectorizer.vocabulary_
    self.idf = vectorizer.idf_
    # Per-document list of the vocabulary terms with non-zero TF-IDF weight.
    self.doc_terms = vectorizer.inverse_transform(self.X)
def get_semantic_features(self, data): # Determine which tokens are used for each data point vectorizer_to_find_tokens = TfidfVectorizer(use_idf=True, norm='l2', binary=False, sublinear_tf=True, \ min_df=2, max_df=0.2, ngram_range=(1, 1), stop_words = self.stop_word_list_longer) transformed_to_find_tokens = vectorizer_to_find_tokens.fit_transform( data) semantic_features = [] for data_point in data: transformed_point = vectorizer_to_find_tokens.transform( [data_point]) inversed = vectorizer_to_find_tokens.inverse_transform( transformed_point)[0] summed_tokens = numpy.copy(self.default_vector) for token in inversed: summed_tokens = summed_tokens + self.get_vector(token) semantic_features.append(summed_tokens) return semantic_features
def test11(self):
    corpus = [
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
    ]
    vectorizer = TfidfVectorizer()
    # Equivalent to X = vectorizer.fit_transform(corpus)
    model = vectorizer.fit(corpus)
    X = model.transform(corpus)
    print(vectorizer.get_feature_names_out())  # get_feature_names() on scikit-learn < 1.0
    print(vectorizer.get_stop_words())
    print(vectorizer.inverse_transform(X))
    print(X.shape)
    print(X)
    print(model.vocabulary_)
    print(model.idf_)
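# For reference, the corpus above is the classic scikit-learn example, so the
# printed values should look like this (default tokenization assumed):
# get_feature_names_out() ->
#   ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
# inverse_transform(X)[0] -> ['document', 'first', 'is', 'the', 'this']
assert X.shape == (4, 9)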
def tfidf_vectorizer():
    """
    Extract text features from Chinese sentences using TF-IDF.
    :return: None (prints the intermediate results)
    """
    # Segment the Chinese sentences into tokens.
    c1, c2 = cut_words()
    tf = TfidfVectorizer()
    # fit_transform takes the list of documents and returns a sparse matrix.
    response = tf.fit_transform([c1, c2])
    print(response)                    # sparse representation
    print('*' * 15)
    print(tf.get_feature_names_out())  # the unique terms in the corpus
    print('*' * 15)
    print(response.toarray())          # dense array form
    print('*' * 15)
    # inverse_transform expects the transformed matrix, not the raw sentences.
    print(tf.inverse_transform(response))
def buildVectorizer(bio): nounlist = [] for doc in bio: st = "" for (word, pos) in tag(doc): if pos in ["JJ", "NNS", "NN", "NNP"]: st = st + word + " " else: if st != "": st = st[0:-1] + " " #print "got one" nounlist.extend([st]) sciencestopwords = set([ u'model', 'according', 'data', u'models', 'function', 'properties', 'approach', 'parameters', 'systems', 'number', 'order', u'data', 'analysis', u'information', u'journal', 'results', 'using', 'research', 'consumers', 'scientists', 'model', 'models', 'journal', 'researchers', 'paper', 'new', 'study', 'time', 'case', 'simulation', u'simulation', 'equation', 'based', 'years', 'better', 'theory', 'particular', 'many', 'due', 'much', 'set', 'studies', 'systems', 'simple', 'example', 'work', 'non', 'experiments', 'large', 'small', 'experiment', u'experiments', 'provide', 'analysis', 'problem', 'method', 'used', 'methods' ]) #now doing the new vectorizer from sklearn.feature_extraction.text import TfidfVectorizer english = nltk.corpus.stopwords.words('english') newstop = english + list(sciencestopwords) vectorizer = TfidfVectorizer(min_df=1, max_df=.5, stop_words=newstop, decode_error='ignore') X = vectorizer.fit_transform(nounlist) Xinv = vectorizer.inverse_transform(X) #X is a sparse matrix of docs x vocab size (7638). #so X[doc_num] is the sparse vector of its words. #the ||X[doc_num]|| = 1 there are 7638 unique words and 755 docs. with a total number of 38888 non-zeros. #Xinv[doc_num] is the list of words in the doc. return nounlist, vectorizer, X, Xinv
def vectorizerAllData(newsgroups_data):
    # Create the vectorizer.
    vectorizer = TfidfVectorizer()

    # Check the dataset.
    print('Total Datasets : ')
    print(len(newsgroups_data.data))
    print('First Dataset : ')
    print(newsgroups_data.data[0])

    arr = vectorizer.fit_transform(newsgroups_data.data).toarray()
    print(arr[0])

    # What's the length of the feature vector?
    print('First Dataset (vectorized) length: ')
    print(len(arr[0]))

    # Check which words the first vector maps back to
    # (inverse_transform expects a 2-D row).
    print('To the source:')
    print(vectorizer.inverse_transform(arr[0:1]))
    print()
tfidf = TfidfVectorizer(ngram_range=(1, 1))
tfidf.fit(example_doc)
top = 10

# Vocabulary terms with the smallest IDF scores (i.e. the most common terms).
idf = tfidf.idf_
feature_names = tfidf.get_feature_names_out()  # get_feature_names() on scikit-learn < 1.0
print('[vocabularies with smallest idf scores]')
sorted_idx = idf.argsort()
for i in range(top):
    print('%s: %.2f' % (feature_names[sorted_idx[i]], idf[sorted_idx[i]]))

# Vocabulary terms with the highest summed TF-IDF scores across all documents.
doc_tfidf = tfidf.transform(example_doc).toarray()
tfidf_sum = np.sum(doc_tfidf, axis=0)
print("\n[vocabularies with highest tf-idf scores]")
# inverse_transform on an all-ones (1 x n_features) row returns every vocabulary
# term in column order, which can then be indexed by the sorted score positions.
all_terms = tfidf.inverse_transform(np.ones((1, tfidf_sum.shape[0])))[0]
for tok, v in zip(all_terms[tfidf_sum.argsort()[::-1]][:top],
                  np.sort(tfidf_sum)[::-1][:top]):
    print('%s: %f' % (tok, v))

# [vocabularies with smallest idf scores]
# 蟋蟀: 2.87
# 可以: 4.36
# 就是: 4.41
# 聲音: 4.46
# 這樣: 4.46
# 你們: 4.56
# 真的: 4.62
# 還有: 4.68
# 豆油伯: 4.68
# 比較: 4.68
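# Equivalent "highest tf-idf" ranking without the inverse_transform trick,
# reusing feature_names, tfidf_sum, and top from the snippet above
# (assumes scikit-learn >= 1.0 for get_feature_names_out).
for idx in tfidf_sum.argsort()[::-1][:top]:
    print('%s: %f' % (feature_names[idx], tfidf_sum[idx]))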
df.head()
df_x = df["messege"]
df_y = df["class"]
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y,
                                                    test_size=0.5, random_state=4)
x_train = x_train.astype('str')
x_test = x_test.astype('str')
x_train.head()
cv1 = TfidfVectorizer(min_df=1, stop_words='english')
x_traincv = cv1.fit_transform(x_train)
a = x_traincv.toarray()
# Terms with non-zero TF-IDF weight in the first training message
# (inverse_transform expects a 2-D row).
cv1.inverse_transform(a[0:1])
x_train.iloc[0]
x_testcv = cv1.transform(x_test)
# mnb = MultinomialNB()
mnb = svm.SVC()
mnb.fit(x_traincv, y_train)
y_sc = mnb.decision_function(x_testcv)
y_predict = mnb.predict(x_testcv)
X = df['CONTENT']
y = df['CLASS']

# Split the dataset into training and testing data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=2)

# Use a TF-IDF vectorizer to turn the comments into weighted word features.
# This is what the classifier will use to distinguish between ham and spam.
cv = TfidfVectorizer(min_df=1, stop_words='english')

# Fit the vectorizer; each word becomes a feature.
X_traincv = cv.fit_transform(X_train)
X_traincv = X_traincv.toarray()
# Words present in the first training comment (inverse_transform expects a 2-D row).
cv.inverse_transform(X_traincv[0:1])

X_testcv = cv.transform(X_test)
X_testcv = X_testcv.toarray()
cv.inverse_transform(X_testcv[0:1])

# Train the classifier.
clf = MultinomialNB()
clf.fit(X_traincv, y_train)
pred = clf.predict(X_testcv)

# Accuracy on the *training* split (see the held-out sketch below).
accuracy = clf.score(X_traincv, y_train)
print(accuracy)
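# Hedged follow-up sketch: a held-out accuracy estimate, reusing y_test and
# pred from the snippet above (pred was made on X_testcv).
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, pred))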
df.head()
df_x = df["Message"]
df_y = df["label"]
tfd = TfidfVectorizer(min_df=1, stop_words='english')
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y,
                                                    test_size=0.20, random_state=0)

x_traincv = tfd.fit_transform(x_train)
a = x_traincv.toarray()
# Terms with non-zero TF-IDF weight in the first training message
# (inverse_transform expects a 2-D row).
tfd.inverse_transform(a[0:1])
x_train.iloc[0]

# from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
# clf = RandomForestClassifier(max_depth=2, random_state=0)
def main(args): logger.debug("Arguments: %r", args) tfidf_vect = TfidfVectorizer( preprocessor=get_preprocessor(args.fields), analyzer='word', # maybe callable token_pattern=r'\b[a-z]\w+\b', ngram_range=(args.min_ngrams, args.max_ngrams), max_df=args.max_df, max_features=args.max_features, sublinear_tf=args.sublinear_tf, stop_words=STOP_WORDS, norm=args.norm, ) with LogRuntime("Loaded input data in {elapsed} seconds", logger): data = get_data(args) if data: logger.debug("Corpus size: {0}".format(len(data))) else: logger.error("Empty data") return with LogRuntime("Fitted in {0.elapsed} seconds", logger): X = tfidf_vect.fit_transform(data) logger.debug("Vocabulary size: {}".format(len(tfidf_vect.vocabulary_))) logger.debug("Max DF stop words size: {}".format( len(tfidf_vect.stop_words_))) logger.debug("Stop words size: {}".format(len(tfidf_vect.stop_words))) if args.clusters: true_k = args.clusters else: # ref: http://en.wikipedia.org/wiki/Determining_the_number_of_clusters_in_a_data_set#Finding_Number_of_Clusters_in_Text_Databases m_docs, n_terms = X.shape t_nonzeros = len(X.nonzero()[0]) true_k = (m_docs * n_terms) / t_nonzeros logger.debug("Calculated number of clusters: {}".format(true_k)) if args.minibatch: km = MiniBatchKMeans( n_clusters=true_k, init='k-means++', n_init=10, init_size=1000, batch_size=1000, verbose=-1) else: km = KMeans( n_clusters=args.clusters, init='random', max_iter=100, n_init=10, verbose=1, n_jobs=-1) with LogRuntime("KMeans Fitted in {0.elapsed} seconds", logger): km.fit(X) if args.sample_random and args.sample_size: sample = [ data[i] for i in np.random.random_integers(0, len(data), args.sample_size) ] elif args.sample_size: sample = data[args.sample_skip:args.sample_size] else: sample = data Y = tfidf_vect.transform(sample) sample_terms = tfidf_vect.inverse_transform(Y) labels = km.predict(Y) distances = km.transform(Y) center_terms = tfidf_vect.inverse_transform(km.cluster_centers_) clusters = defaultdict(list) vocabulary = tfidf_vect.vocabulary_ for i, doc in enumerate(sample): clusters[labels[i]].append((i, doc)) truncate = lambda t: t[:100] + '...' if len(t) > 100 else t for label, result in sorted(clusters.iteritems()): # skip single results if len(result) < args.cluster_minsize: continue terms_joined = ', '.join( sorted( center_terms[label], reverse=True, key=lambda t: km.cluster_centers_[label, vocabulary[t]])) print '=' * 79 print '=' * 79 print '=' * 79 print '-> ' + truncate(terms_joined) + '\n\n' result = sorted( result, key=lambda (i, _): distances[i, label], ) j = 0 for i, doc in result: j += 1 doc_terms = ', '.join( sorted( sample_terms[i], reverse=True, key=lambda t: Y[i, vocabulary[t]], )) print doc['headline'] print get_corpus_key(doc) print doc['url'] print truncate(doc_terms) print if j > 10: print '...' break print if args.shell: from IPython import embed embed()
npindL = np.array(indL) freq_th = max(3, int(X.shape[0]*0.0025)) cluster_score = {} # score_tweet = {} for clfreq in freqTwCl.most_common(50): cl = clfreq[0] freq = clfreq[1] cluster_score[cl] = 0 # only get cluster have frequent appear higher than frequent threshold if freq >= freq_th: clidx = (npindL == cl).nonzero()[0].tolist() cluster_centroid = X[clidx].sum(axis=0) # print("center ", cluster_centroid.shape) try: cluster_tweet = vectorizer.inverse_transform(cluster_centroid) # print("ttt ", cluster_tweet) for term in np.nditer(cluster_tweet): try: # cluster_score[cl] = max(cluster_score[cl], boosted_wtfVoc[str(term).strip()]) # print(term) cluster_score[cl] += boosted_wtfVoc[str(term).strip()] except: pass except: pass # print("cscs, ", cluster_score) cluster_score[cl] /= freq else: break sorted_clusters = sorted( ((v,k) for k,v in cluster_score.items()), reverse=True) # print ("sorted cluster_score:") # print (sorted_clusters) # print(cluster_score)
class CommentsAnalyzer(pmlutil.Configurable): def configTypes(self): return dict(amount=int, min_ngram=int, max_ngram=int, min_df=int, max_df=float, use_idf=int, alpha=readArray, l1_ratio=readArray, n_folds=int) def _loadData(self): logging.info("loading data") self.data = [] count = 0 for fn in os.listdir(self._datafolder): if not self._amount < 1 and count >= self._amount: break if fn.endswith(self._metaextension): mfn = self._datafolder + "/" + fn ddm = pml.Datum(mfn,None) if len(ddm.meta()['comments'])>0: self.data.append(ddm) count +=1 logging.info("loaded %d data" % count) def __init__(self): self.data=[] def _aggregateComments(self, subset): allcomments = [] for datum in subset: comments = [] for comment in datum.meta()['comments']: comments.append(comment['text']) allcomments.append(" ".join(comments)) return np.array(allcomments) def _buildDictionary(self, allcomments): print allcomments self.vectorizer = TfidfVectorizer(analyzer=self._analyzer, ngram_range=(self._min_ngram,self._max_ngram), min_df=self._min_df, max_df=self._max_df, norm='l2', smooth_idf=True, use_idf=bool(self._use_idf)) self.vectorizer.fit(allcomments) def run(self): allcomments = self._aggregateComments(self.data) self._buildDictionary(allcomments) # create representation of documents tfidfArray = self.vectorizer.transform(allcomments) # create labelling labels = [] for datum in self.data: labels.append(len(datum.meta()['favorites'])) labels = np.array(labels) print self.vectorizer.get_params() print self.vectorizer.get_feature_names() # training self.elasticNet = ElasticNetCV(alphas=self._alpha, l1_ratio=self._l1_ratio, fit_intercept=True, normalize=False, precompute='auto', max_iter=1000, copy_X=True, tol=0.0001, rho=None, cv=self._n_folds) self.elasticNet.fit(tfidfArray,labels) for i,l1_ratio in enumerate(self._l1_ratio): for j,alpha in enumerate(self._alpha): print "alpha: %f, l1_ratio: %f --> %f" % (alpha,l1_ratio,np.mean(self.elasticNet.mse_path_[i,j,:])) print self.vectorizer.inverse_transform(self.elasticNet.coef_)
def summarize_cisco_support_forum_texts(): # cisco_plain_text = LazyCorpusLoader( # 'content', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin_1') cisco_plain_text = LazyCorpusLoader( "cisco_forum_subset", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin_1" ) token_dict = {} for article in cisco_plain_text.fileids(): token_dict[article] = cisco_plain_text.raw(article) tfidf = TfidfVectorizer(tokenizer=tokenize_and_stem, stop_words="english", decode_error="ignore") sys.stdout.flush() # creates Compressed Sparse Row format numpy matrix tdm = tfidf.fit_transform(token_dict.values()) feature_names = tfidf.get_feature_names() # problem_statement_#1 - summarize support_forum articles automatically for article_id in range(0, tdm.shape[0] - 2): article_text = cisco_plain_text.raw(cisco_plain_text.fileids()[article_id]) sent_scores = [] for sentence in nltk.sent_tokenize(article_text): score = 0 sent_tokens = tokenize_and_stem(sentence) for token in (t for t in sent_tokens if t in feature_names): score += tdm[article_id, feature_names.index(token)] sent_scores.append((score / len(sent_tokens), sentence)) summary_length = int(math.ceil(len(sent_scores) / 5)) sent_scores.sort(key=lambda sent: sent[0]) print "\n*** SUMMARY ***" for summary_sentence in sent_scores[:summary_length]: print summary_sentence[1] print "\n*** ORIGINAL ***" print article_text # problem_statement_#2 - automatically categorize forum posts by tags into various groups reduce_dimensionality_and_cluster_docs(tfidf, tdm, num_features=200) # problem_statement_#3 - find similar documents to a current document (that user is reading) automatically # eg - quora: find similar questions, find similar answers cosine_similarity(tdm[0:1], tdm) """ output looks like this array([[ 1. , 0.22185251, 0.0215558 , 0.03805012, 0.04796646, 0.05069365, 0.05507056, 0.03374501, 0.03643342, 0.05308392, 0.06002623, 0.0298806 , 0.04177088, 0.0844478 , 0.07951179, 0.02822186, 0.03036787, 0.11022385, 0.0535391 , 0.10009412, 0.07432719, 0.03753424, 0.06596462, 0.01256566, 0.02135591, 0.13931643, 0.03062681, 0.02595649, 0.04897851, 0.06276997, 0.03173952, 0.01822134, 0.04043555, 0.06629454, 0.05436211, 0.0549144 , 0.04400169, 0.05157118, 0.05409632, 0.09541703, 0.02473209, 0.05646599, 0.05728387, 0.04672681, 0.04519217, 0.04126276, 0.06289187, 0.03116767, 0.04828476, 0.04745193, 0.01404426, 0.04201325, 0.023492 , 0.07138136, 0.03778315, 0.03677206, 0.02553581]]) The first document is compared to the rest, with the most similar to it being itself with score of 1, next most similar to it is document with score 0.22185251 """ cosine_similarities = linear_kernel(tdm[0:1], tdm).flatten() # mapping back to document_name space related_docs_indices = cosine_similarities.argsort() """ document_ids array([23, 50, 31, 24, 2, 52, 40, 56, 27, 15, 11, 16, 26, 47, 30, 7, 8, 55, 21, 54, 3, 32, 45, 12, 51, 36, 44, 43, 49, 4, 48, 28, 5, 37, 9, 18, 38, 34, 35, 6, 41, 42, 10, 29, 46, 22, 33, 53, 20, 14, 13, 39, 19, 17, 25, 1, 0]) docs 0 and 1 are very similar which are the following posts (last 2 array elements above when sorted) https://supportforums.cisco.com/discussion/11469881/aniserver-failed-run-lms-40 and supportforums.cisco.com/discussion/11469606/eos-lms-31-support-quest """ cosine_similarities[related_docs_indices] for key, value in token_dict.iteritems(): print key, value # find the actual posts which are the most similar tfidf.inverse_transform(tdm)[0] tfidf.inverse_transform(tdm)[1]
#!/usr/bin/env python
# encoding: utf-8
"""
@author: payneLi
@time: 18-7-11 2:29 PM
@email: [email protected]
"""

from sklearn.feature_extraction.text import TfidfVectorizer
import jieba

str_1 = "今天很残酷,明天更残酷,后天很美好,但绝对大部分是死在明天晚上,所以每个人不要放弃今天"
str_2 = "我们看到的从很远星系来的光是在几百万年之前发出的,这样当我们看到宇宙时,我们是在看它的过去"
str_3 = "如果只用一种方式了解某样事物,你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系"

tf_idf = TfidfVectorizer()
data = [str_1, str_2, str_3]
# jieba segments each Chinese sentence into whitespace-joined tokens so that
# TfidfVectorizer can tokenize it.
content = [" ".join(jieba.cut(sentence)) for sentence in data]

result = tf_idf.fit_transform(content)
feature_names = tf_idf.get_feature_names_out()  # get_feature_names() on scikit-learn < 1.0
target = result.toarray()
content_inverse = tf_idf.inverse_transform(result)

print("feature_name:", feature_names,
      "\ntarget:", target,
      "\ncontent_inverse:", content_inverse)
class DSOM(object): def __init__(self, inputFile=None, fileType=None, widthOfMap=2, useGPU=True): self.inputFile = inputFile self.fileType = fileType self.widthOfMap = widthOfMap self.useGPU = useGPU self.arrayTrain = [] self.Y = None self.vectorizer = None self.nodeHolder = dict() self.text = "" self.dataset = "" def readDocument(self): if(self.fileType == 'pdf'): self.text = readPDF.pdfparser(self.inputFile) else: self.text = open(self.inputFile, "r").read() self.dataset = self.text.split("\n\n") def train(self, inputFile=None): ############################################################################### #clean_file = open("data/paragraph_vector_output (copy).txt") #dataset = clean_file.read().split("\n\n") # print(dataset) # print("%d Paragraphs " % len(dataset)) # print() # print("Extracting features from the dataset using a sparse vectorizer") #t0 = time() self.vectorizer = TfidfVectorizer(max_df=0.5, max_features=1000, min_df=2, stop_words='english', use_idf=True, sublinear_tf=True) self.Y = self.vectorizer.fit_transform(self.dataset) #arrayTrain = X.toarray() svd = TruncatedSVD(n_components=100, random_state=42) X = svd.fit_transform(self.Y) self.arrayTrain = X #print("done in %fs" % (time() - t0)) #print("n_samples: %d, n_features: %d" % X.shape) #print() ############################################################################### ## SOM #For plotting the images #Train a 20x30 SOM with 400 iterations #print("<-- Starting SOM -- >") mapSide = self.widthOfMap som = SOM.SOM(DATA=self.arrayTrain, num_units=mapSide*mapSide, width=mapSide, height=mapSide) #print("<-- Training SOM -- >") #t0 = time() if(self.useGPU == True): try: import theano.sandbox.cuda theano.sandbox.cuda.use('gpu') except: print("Switching to GPU didn't work, will fallback to CPU.") som.train_batch_theano(verbose=False) else: som.train_batch(verbose=False) #print("<-- Done Training SOM %fs -- >" %(time()-t0)) #Get output grid #print("<-- Testing SOM -- >") #print("<-- Begin Output -- >") #np.set_printoptions(threshold='nan') clusters = som.ins_unit_assign #print(clusters) for i in range(mapSide*mapSide): self.nodeHolder[i] = [] for i, m in enumerate(clusters): if (m) in self.nodeHolder: self.nodeHolder[m].append(i) else: self.nodeHolder[m] = [i] def getClusters(self): return self.nodeHolder def getDataset(self): return self.dataset def tfIDFArray(self): inverse = self.vectorizer.inverse_transform(self.Y) outList = [] for x in inverse: outList.append([y.encode('UTF8') for y in x]) return outList
def build(summary, genre, text_feature=1, baseline=1, top_genre=10, top_phrases=10): """parameter tuned classify models""" #remove punctuation tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+') no_punct = summary.apply(lambda x: tokenizer.tokenize(x)) #label binarizer multilabel_binarizer = sklearn.preprocessing.MultiLabelBinarizer() multilabel_binarizer.fit(genre) label = multilabel_binarizer.transform(genre) #split training and validation set xtrain, xval, ytrain, yval = train_test_split(summary, label, test_size=0.2, random_state=1000) tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words={'english'}, max_df=0.8, max_features=10000) xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain) xval_tfidf = tfidf_vectorizer.transform(xval) #hyperparameter grid search parameters = { "estimator__C": [0.1, 1, 5, 10, 15], } if baseline: lr = sklearn.linear_model.LogisticRegression() clf = OneVsRestClassifier(lr, n_jobs=8) clf.fit(xtrain_tfidf, ytrain) y_pred = clf.predict(xval_tfidf) else: svc = svm.LinearSVC() clf = OneVsRestClassifier(svc, n_jobs=8) clf = GridSearchCV(clf, param_grid=parameters, cv=3, verbose=3, scoring='f1_micro', refit=True) clf.fit(xtrain_tfidf, ytrain) y_pred = clf.predict(xval_tfidf) clf = clf.best_estimator_ # Predicted label actual_genre = multilabel_binarizer.inverse_transform(yval) predicted_genre = multilabel_binarizer.inverse_transform(y_pred) #evaluation f1 = "f1-score: " + str( sklearn.metrics.f1_score(yval, y_pred, average="micro")) e1 = 'percentage of genres that are correctly predicted: '+ str(np.sum([len(set(a).intersection(b)) for a, b in \ zip(pd.Series(predicted_genre), pd.Series(actual_genre))])/sum(genre.apply(len))) e2 = 'percentage of movies that have at least one gnere predicted right: '+str(np.sum([len(set(a).intersection(b))>0 for a, b in\ zip(pd.Series(predicted_genre), pd.Series(actual_genre))])/len(genre)) lst = [] new_genre_label = [] genre_label = multilabel_binarizer.classes_ for a, b in zip(clf.estimators_, genre_label): try: lst.append(a.coef_) new_genre_label.append(b) except: pass dist = genre.explode().value_counts(ascending=False) genre_coef = dict(zip(new_genre_label, np.vstack(lst))) fig, ax = plt.subplots(top_genre // 3 + 1, 3, figsize=(20, top_genre * 2)) for o, g in enumerate(dist[:top_genre].index): c = genre_coef[g] words = tfidf_vectorizer.inverse_transform(c)[0] evd = [t for t in c if t > 0] d = dict(zip(words, evd)) sorted_words = sorted(d.items(), key=lambda item: item[1])[-top_phrases:] x = [i[0] for i in sorted_words] y = [i[1] for i in sorted_words] ax[o // 3][o % 3].barh(x, y) ax[o // 3][o % 3].set_title(g) fig.tight_layout() if text_feature: if baseline: fig.savefig( 'data/figures/baseline model with summary text results.png') else: fig.savefig( 'data/figures/final model with summary text results.png') else: if baseline: fig.savefig('data/figures/baseline model with phrases results.png') else: fig.savefig('data/figures/final model with phrases results.png') return (f1 + "\n" + e1 + "\n" + e2 + "\n")
def main(): """ Read sgm files and parse each article from the individual documents :return: """ t0 = time() article_list = [] article_info = {} for i in range(0, 22): filename = 'data{}'.format(str(i).zfill(2)) with open('dataset/{}.sgm'.format(filename), 'r') as f: data = f.read() parser = BeautifulSoup(data, 'html.parser') ''' Looping over each article distinguished by reuters tag , creating a dictionary out of each article of the format : { 'Body': [u'jaguar', u'jaguar', u'plc', u'jagrl', u'sell', u'new', u'xj', u'model', u'us', u'japanes' ], 'Places': [u'uk'], 'Title': [u'jaguar', u'see', u'strong', u'growth', u'new', u'model', u'sale'], 'Topics': [u'earn'], u'topics': u'YES', u'lewissplit': u'TRAIN', u'newid': u'2001', u'oldid': u'18419', 'Date': [u'mar'], u'cgisplit': u'TRAINING-SET' } The content of each dictionary tag is after removing stop words and stemming the contents ''' for article in parser.findAll('reuters'): try: article_list.append(article.body.text) except AttributeError: continue article_info[article['newid']] = {} article_info[article['newid']]['topic'] = [] article_info[article['newid']]['place'] = [] place_parser = article.places topic_parser = article.topics topic_list = [] for topic in topic_parser.findAll('d'): topic_list.append(topic.text) for place in place_parser.findAll('d'): article_info[article['newid']]['place'].append(place.text) article_info[article['newid']]['label'] = article['lewissplit'] if(len(topic_list)==0): article_list.pop() article_info.popitem() else: article_info[article['newid']]['topic'].append(topic_list) ''' Extracting the dictionary of features into a .csv file Format : Article ID,Topic,Place, Label 20057,[u'south-korea'],[],TEST ''' # with open('dictionary.csv', 'wb') as f: # f.write('Article ID,Topic,Place,Label') # f.write('\n') # for key, value in article_info.iteritems(): # f.write(key) # f.write(',') # for inner_key,inner_value in value.items(): # f.write(str(inner_value)) # f.write(',') # f.write('\n') print 'No of valid articles = {}'.format(len(article_list)) #To create a global list of topics used while tokenization(used in tokenize function) to make sure feature words do not belong to topic list global topics_list topics_list = list() topics = getTopics(article_info,[]) for topic_article in topics: if topic_article: for topic in topic_article: if topic: for top in topic: topics_list.append(top) with open('topic_labels', 'wb') as outfile: pickle.dump(topics, outfile, pickle.HIGHEST_PROTOCOL) # with open('initial_word_count.txt', 'wb') as ini: # sum =0 # for word in article_list: # sum += len(word.split()) # ini.write('Total words in body tag of all the 21578 documents initially :'+str(sum)) vectorizer = TfidfVectorizer(min_df= 0.001,max_df=0.9, tokenizer=tokenize, strip_accents='unicode', smooth_idf=True) feature_vector = vectorizer.fit_transform(article_list) feature_list = vectorizer.get_feature_names() with open('feature_vector', 'wb') as outfile: pickle.dump(feature_vector, outfile, pickle.HIGHEST_PROTOCOL) with open('features_list', 'wb') as f: pickle.dump(feature_list, f, pickle.HIGHEST_PROTOCOL) # with open('feature_list.csv','wb') as feature: # for value in feature_list: # feature.write(str(value)+'\n') counter_vectorizer = CountVectorizer(vocabulary=vectorizer.vocabulary_, strip_accents='unicode') # for the word frequency counts data_matrix = counter_vectorizer.fit_transform(article_list) # data matrix transaction_matrix = vectorizer.inverse_transform(feature_vector) # transaction matrix # terms = 
counter_vectorizer.get_feature_names() # freqs = data_matrix # result = dict(zip(terms, freqs)) # print result # print(len(result)) ## Un-comment from here to generate data_matrix and transaction_matrix # with open('data_matrix.dat', 'wb') as outfile: # pickle.dump(data_matrix, outfile, pickle.HIGHEST_PROTOCOL) # # with open('transaction_matrix.dat', 'wb') as outfile: # pickle.dump(transaction_matrix, outfile, pickle.HIGHEST_PROTOCOL) # with open('unigram_word_count.txt','wb') as ini: # sum = len(vectorizer.get_feature_names()) # ini.write('Total words in body tag remaining after stemming , removing stop words and computing tf-idf counts :'+str(sum)) bigram_vectorizer = TfidfVectorizer(min_df=0.001, tokenizer=tokenize, ngram_range=(2,2), strip_accents='unicode', max_df=0.9, smooth_idf=True) bigram_feature_vector = bigram_vectorizer.fit_transform(article_list) indices = np.argsort(bigram_vectorizer.idf_)[::-1] features = bigram_vectorizer.get_feature_names() top_n = 20 top_features = [features[i] for i in indices[:top_n]] print top_features # with open('top_20_bigrams.txt','wb') as ini: # ini.write(str(top_features)) print("Done in %0.3fs" % (time() - t0))
if __name__ == '__main__': corpus = build_corpus_from_dir('.') #corpus=["Hi how are you, what you doingg?", "Hey what's up bro? you are cool","Hi what are you up to? Such a cool day"] vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words='english', max_features=250, min_df=5, max_df=0.5) train_cv = vectorizer.fit_transform(corpus) a = train_cv.toarray() #print('\nThis is A:\n',a) b = vectorizer.inverse_transform(a) features = vectorizer.get_feature_names() #print('\nThis is B:\n',b) print('\n\n\n\n\n\n') #print(features) #print(len(features)) #dist = 1 - cosine_similarity(a) num_clusters = 4 km = KMeans(n_clusters=num_clusters) km.fit(a) centroids = km.cluster_centers_
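# Optional follow-up sketch (not part of the original script): inspect how many
# documents landed in each cluster, reusing km and num_clusters from above.
labels = km.labels_
for cluster_id in range(num_clusters):
    members = [i for i, lab in enumerate(labels) if lab == cluster_id]
    print('cluster %d: %d documents' % (cluster_id, len(members)))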
class Chatbot(): def __init__(self, method=None): self.method = method self.spacy_model = spacy.load('en') self.labels = self.get_labels("annotations.json") self.library_df = pd.read_csv("library_data.csv") self.library_df.drop_duplicates(subset=['name']) self.lemmatized_library_descriptions = self.lemmatize_text( list(self.library_df['description'])) self.library_vectorizer = TfidfVectorizer( stop_words=list(stop_words.ENGLISH_STOP_WORDS) + [ 'a', 'python', 'framework', 'library', 'should', 'import', 'want', 'use', 'pron' ], ngram_range=(1, 1)) self.library_desc_vectors = self.library_vectorizer.fit_transform( self.lemmatized_library_descriptions) self.library_desc_vectors = csr_matrix( self.library_desc_vectors).toarray() self.error_df = pd.read_csv( "C:\\Users\\user\\EECE 634\\chatbot\\error_data.csv") self.error_lemmatized_descriptions = self.lemmatize_text( list(self.error_df['error'])) self.error_vectorizer = TfidfVectorizer( stop_words=list(stop_words.ENGLISH_STOP_WORDS) + ['python', 'should', 'want', 'use', 'pron'], ngram_range=(1, 1)) self.error_desc_vectors = self.error_vectorizer.fit_transform( self.error_lemmatized_descriptions) self.error_desc_vectors_arr = csr_matrix( self.error_desc_vectors).toarray() self.k = [] self.threshold = [0.8, 0.5, 0.55, 0.55, 0.5] self.vectorizers = [] self.dff = [] self.df = pd.read_csv("data.csv", encoding="ISO-8859-1") for cat in range(2, 7): if cat == 2: # represents category 0 vectorizer = TfidfVectorizer(stop_words=None, ngram_range=(1, 1)) self.vectorizers.append(vectorizer) df1 = self.df[self.df['Type'] == 0] else: vectorizer = TfidfVectorizer(stop_words=[ 'a', 'the', 'python', 'should', 'want', 'use', 'pron' ], ngram_range=(1, 1)) self.vectorizers.append(vectorizer) df1 = self.df[self.df['Type'] == cat] df1 = df1.reset_index(drop=True) self.dff.append(df1) corpus = list(df1['user1']) lemmatized_corpus = self.lemmatize_text(corpus) X = vectorizer.fit_transform(lemmatized_corpus) self.k.append(csr_matrix(X).toarray()) def lemmatize_text(self, input_list): lemmatized_descriptions = [] for desc in input_list: current_desc = [] doc = self.spacy_model(desc) for token in doc: current_desc.append(token.lemma_) lemmatized_descriptions.append(" ".join(current_desc)) return lemmatized_descriptions def get_labels(self, arg): with open(arg) as json_file: data = json.load(json_file) labels = { "Greetings": [0, []], "Library": [1, []], "Error": [2, []], "Syntax": [3, []], "Interpreted": [4, []], "Methods": [5, []], "Directory": [6, []] } for item in data["entities"]: value = item["offsets"][0]["text"] if (item["classId"] == "e_7"): if value not in labels["Greetings"][1]: labels["Greetings"][1].append(value) elif (item["classId"] == "e_8"): if value not in labels["Library"][1]: labels["Library"][1].append(value) elif (item["classId"] == "e_9"): if value not in labels["Error"][1]: labels["Error"][1].append(value) elif (item["classId"] == "e_10"): if value not in labels["Syntax"][1]: labels["Syntax"][1].append(value) elif (item["classId"] == "e_11"): if value not in labels["Interpreted"][1]: labels["Interpreted"][1].append(value) elif (item["classId"] == "e_12"): if value not in labels["Methods"][1]: labels["Methods"][1].append(value) elif (item["classId"] == "e_13"): if value not in labels["Directory"][1]: labels["Directory"][1].append(value) for category in labels: txt_file = "features/annotated_" + str( labels[category][0]) + "_" + category + ".txt" with open(txt_file, 'w') as file: file.write(json.dumps(labels[category][1])) for category in labels: 
txt_file = "features/added_" + str( labels[category][0]) + "_" + category + ".txt" with open(txt_file, 'r') as file: x = file.read().splitlines() for value in x: if x not in labels[category][1]: labels[category][1].append(value) file.close() return labels def answer(self, question, cat): if cat == 1: v = self.library_vectorizer.transform( self.lemmatize_text([question.lower()])) isAnswered = 0 if self.library_vectorizer.inverse_transform( self.library_vectorizer.transform( self.lemmatize_text([question.lower() ])))[0].shape[0] == 0: scores = [0] * len(self.library_desc_vectors) else: scores = [] for item in self.library_desc_vectors: scores.append( 1 - spatial.distance.cosine(item, csr_matrix(v).toarray())) scores = np.array(scores) answer_list = [] for item in scores.argsort()[-3:][::-1]: if scores[item] > 0.173: if isAnswered: answer_list.append("Maybe " + self.library_df['name'][item] + " would help") else: answer_list.append(self.library_df['name'][item] + " is a good choice") isAnswered = 1 elif 0.173 > scores[item] > 0.129: answer_list.append("I'm not sure, but " + self.library_df['name'][item] + " may help") isAnswered = 1 if isAnswered == 0: return 'Sorry i cannot answer this question yet :)' else: return ". ".join(answer_list) elif cat == 2: lemmatized_qs = self.lemmatize_text([question]) for i, qs in enumerate(lemmatized_qs): v = self.error_vectorizer.transform([qs.lower()]) isAnswered = 0 if self.error_vectorizer.inverse_transform( self.error_vectorizer.transform([qs ]))[0].shape[0] == 0: scores = [0] * len(self.error_desc_vectors_arr) else: scores = [] for item in self.error_desc_vectors_arr: scores.append( 1 - spatial.distance.cosine(item, csr_matrix(v).toarray())) scores = np.array(scores) for item in scores.argsort()[-3:][::-1]: if scores[item] > 0.45: isAnswered = 1 if "pip install <package>" in self.error_df[ 'how to solve'][item]: try: return self.error_df['how to solve'][ item].replace( '<package>', re.search( r'(?<=named\s)\s*(.)*?(?=([\s;,\.\n]|$))', question.lower().replace( "'", "")).group(0)) except: return self.error_df['how to solve'][item] else: return self.error_df['how to solve'][item] break if isAnswered == 0: return 'Sorry i cannot answer this question yet :)' else: c = 0 if cat == 0 else cat - 2 lemmatized_qs = self.lemmatize_text([question]) for i, qs in enumerate(lemmatized_qs): v = self.vectorizers[c].transform([qs.lower()]) scores = [] for item in self.k[c]: scores.append( 1 - spatial.distance.cosine(item, csr_matrix(v).toarray())) scores = np.array(scores) index = scores.argsort()[-3:][::-1][0] if scores[index] > self.threshold[c]: return self.dff[c]['user2'][index] else: return 'Sorry i cannot answer this question yet :)' def classify_functional(self, question): cat = -1 cat_found = [] for category in self.labels: for phrase in self.labels[category][1]: x = re.search("(^|[^a-zA-Z])" + phrase + "($|[^a-zA-Z])", question, re.IGNORECASE) if (x is not None): cat_found.append(category) break if (cat_found == []): cat = -1 elif (cat_found == ["Greetings"]): cat = 0 elif (len(cat_found) >= 1): if ("Greetings" in cat_found): cat_found.remove("Greetings") if (len(cat_found) == 1): cat = self.labels[cat_found[0]][0] elif ("Error" in cat_found): cat = 2 elif ("Syntax" in cat_found): cat = 3 elif ("Interpreted" in cat_found): cat = 4 elif ("Directory" in cat_found): cat = 6 elif ("Methods" in cat_found): cat = 5 else: cat = 1 if (cat == -1): return "I don't understand, please be more specific." else: return self.answer(question, cat)