def infer(self):
    """Rank ``self.Courses`` against ``self.File_class`` by TF-IDF cosine similarity.

    Returns:
        list[tuple]: ``(score, course_name)`` pairs sorted highest-score first.

    Side effects:
        Trains an LdaMulticore model on the same corpus and prints its topics
        and the per-topic scores for the query document.
    """
    # NOTE(review): stop_words(...).remove() with no argument is unusual —
    # presumably the project helper returns the filtered token list; confirm.
    courses = [
        list(set(stop_words(item).remove()))
        for item in [w.split() for w in self.Courses]
    ]
    classes = list(set(stop_words(self.File_class).remove()))

    dictionary = corpora.Dictionary(courses)
    feature_cnt = len(dictionary.token2id)
    corpus = [dictionary.doc2bow(text) for text in courses]
    tfidf = models.TfidfModel(corpus)
    kw_vector = dictionary.doc2bow(classes)
    index = similarities.SparseMatrixSimilarity(
        tfidf[corpus], num_features=feature_cnt)
    sim = index[tfidf[kw_vector]]

    # Bug fix: the original built dict(zip(sim, self.Names)), keying on the
    # float similarity score — any two courses with equal scores collapsed
    # into one entry.  Sorting the (score, name) pairs directly keeps them all.
    course_sort = sorted(zip(sim, self.Names), reverse=True)

    lda_model = models.LdaMulticore(
        tfidf[corpus], num_topics=10, id2word=dictionary, passes=2, workers=2)
    for idx, topic in lda_model.print_topics(-1):
        print('Topic: {} \nWords: {}'.format(idx, topic))
    # Renamed loop variable (was `index`) — it shadowed the similarity index above.
    for topic_id, score in sorted(lda_model[tfidf[kw_vector]],
                                  key=lambda tup: -1 * tup[1]):
        print("\nScore: {}\t \nTopic: {}".format(
            score, lda_model.print_topic(topic_id, 10)))
    return course_sort
def tf_idf(text):
    """Fit a TF-IDF vectorizer on *text*.

    Returns:
        tuple: ``(vectorizer, matrix)`` — the fitted vectorizer and the
        document-term TF-IDF matrix it produced.
    """
    vec = TfidfVectorizer(
        min_df=5,
        max_df=0.95,
        max_features=8000,
        stop_words=stop_words(),
        tokenizer=tokenize,
    )
    return vec, vec.fit_transform(text)
def send():
    """Flask view: recommend businesses for a posted userID.

    On POST, asks the collaborative-filtering model for 100 products; if a
    keyword was supplied, re-ranks them by Word2Vec similarity between the
    keyword and each business's LSI topic terms.  Renders index.html with
    the top-10 ``Recommendation`` list (or with no list on GET).
    """
    if request.method == 'POST':
        userID = request.form['userID']
        keyword = request.form['keyword']
        result = sameModel.recommendProducts(int(userID), 100)

        # key = business_num, value = business_rating
        ratings = {}
        lda_list = []
        for line in result:
            ratings[line[1]] = line[2]
            lda_list.append(line[1])

        [bn, br] = openfile()
        # Default: first ten recommended business names, CF order.
        Recommendation = [bn[key] for key in lda_list][:10]

        if len(keyword) >= 1:
            rawList = tokenize(br, lda_list)
            stopped_result = stop_words(rawList)
            [corpus, dictionary] = doc_term_matrix(stopped_result)
            lsiList = lsi(corpus, dictionary)

            # Bug fix: the original hard-coded range(0, 99), which covers only
            # 99 of the 100 recommendations — drive every loop off the data.
            n = len(lda_list)
            document = [
                [','.join([str(word[0])]) for word in lsiList[i]]
                for i in range(n)
            ]
            model = gensim.models.Word2Vec(document, min_count=1)

            # Bug fix: the original stored scores in a dict keyed by the float
            # score sum, so tied scores collapsed and produced duplicate
            # recommendations.  Keep (score, business_id) pairs instead.
            scored = []
            for i in range(n):
                total = sum(model.similarity(keyword, item)
                            for item in document[i])
                scored.append((total, lda_list[i]))
            scored.sort(reverse=True)
            Recommendation = [bn[biz] for _, biz in scored[:10]]

        return render_template('index.html', Recommendation=Recommendation)
    return render_template('index.html')
def top10_words(essay_v, remove_stopwords):
    """Count word frequencies in an essay.

    Strips non-letters, lower-cases, and counts every word; when
    *remove_stopwords* is truthy, NLTK English stop words plus the project's
    ``preprocess.stop_words()`` list are excluded from the count.

    Note: despite the name, this returns counts for *all* kept words, not
    just the ten most frequent — callers select the top entries themselves.

    Returns:
        dict-like: word -> occurrence count.
    """
    essay_v = re.sub("[^a-zA-Z]", " ", essay_v)
    words = essay_v.lower().split()

    # Bug fix: the original only counted words inside the remove_stopwords
    # branch, so passing False returned an empty dict.  Counting now always
    # happens; the stop-word set is simply empty when filtering is off.
    stops = set()
    if remove_stopwords:
        stops = set(stopwords.words("english"))   # set: O(1) membership tests
        stops.update(preprocess.stop_words())

    top10 = collections.defaultdict(int)
    for word in words:
        if word not in stops:
            top10[word] += 1
    return top10
def build_model(lines_num=-1):
    """Build an LDA topic model from rawdata.csv and pickle it to 'outfile'.

    Args:
        lines_num: number of input lines to process; -1 (default) means all.
    """
    # Bug fix: the original opened rawdata.csv and never closed it — the
    # with-statement guarantees the handle is released.
    with open('rawdata.csv', 'r') as dataset:
        [bidList, rawList] = tokenize(dataset, lines_num)
    stopped_result = stop_words(rawList, lines_num)
    stem_result = stem(stopped_result, lines_num)
    # NOTE(review): original comment said the words make more sense without
    # stemming — still being evaluated.
    [corpus, dictionary] = doc_term_matrix(stem_result)

    ldaList = lda(corpus, dictionary, lines_num)
    print("load data...")
    with open('outfile', 'wb') as fp:
        pickle.dump(ldaList, fp)
def build_model(lines_num=-1):
    """Tokenize the raw data, drop stop words, fit an LSI model, and save it.

    Args:
        lines_num: number of input lines to process; -1 (default) means all.
    """
    raw_docs = tokenize(lines_num)
    print(raw_docs[0])
    cleaned = stop_words(raw_docs, lines_num)
    # Stemming is skipped here: the un-stemmed terms read better downstream.
    corpus, dictionary = doc_term_matrix(cleaned)
    topics = lsi(corpus, dictionary, lines_num)
    save_file(topics)