Example #1
    # Requires: from gensim import corpora, models, similarities
    def infer(self):
        # Tokenize each course description and drop stop words;
        # stop_words() is a project-specific helper.
        courses = [
            list(set(stop_words(item).remove()))
            for item in [w.split() for w in self.Courses]
        ]
        classes = list(set(stop_words(self.File_class).remove()))

        # Build a TF-IDF similarity index over the course descriptions.
        dictionary = corpora.Dictionary(courses)
        feature_cnt = len(dictionary.token2id)
        corpus = [dictionary.doc2bow(text) for text in courses]
        tfidf = models.TfidfModel(corpus)
        kw_vector = dictionary.doc2bow(classes)
        index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                    num_features=feature_cnt)
        sim = index[tfidf[kw_vector]]

        # Rank courses by similarity; note that courses with identical
        # scores collapse onto a single dict key here.
        course_rec = dict(zip(sim, self.Names))
        course_sort = sorted(course_rec.items(), reverse=True)

        # Fit an LDA topic model on the same TF-IDF corpus and report
        # the topics plus the query's topic distribution.
        lda_model = models.LdaMulticore(tfidf[corpus],
                                        num_topics=10,
                                        id2word=dictionary,
                                        passes=2,
                                        workers=2)

        for idx, topic in lda_model.print_topics(-1):
            print('Topic: {}\nWords: {}'.format(idx, topic))

        for index, score in sorted(lda_model[tfidf[kw_vector]],
                                   key=lambda tup: -tup[1]):
            print('\nScore: {}\nTopic: {}'.format(
                score, lda_model.print_topic(index, 10)))

        return course_sort
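
The similarity half of this method can be exercised on its own. A minimal sketch with toy data (the course token lists and names are invented; the real code gets them from self.Courses and self.Names after stop-word filtering):

from gensim import corpora, models, similarities

courses = [["intro", "machine", "learning"],
           ["advanced", "database", "systems"],
           ["machine", "learning", "systems"]]
query = ["machine", "learning"]

dictionary = corpora.Dictionary(courses)
corpus = [dictionary.doc2bow(text) for text in courses]
tfidf = models.TfidfModel(corpus)
index = similarities.SparseMatrixSimilarity(
    tfidf[corpus], num_features=len(dictionary.token2id))

# Cosine similarity between the query and every course description.
sim = index[tfidf[dictionary.doc2bow(query)]]
print(sorted(zip(sim, ["Intro ML", "Databases", "ML Systems"]),
             reverse=True))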
Example #2
from sklearn.feature_extraction.text import TfidfVectorizer


def tf_idf(text):
    # stop_words() and tokenize are project-specific helpers.
    vectorizer = TfidfVectorizer(min_df=5,      # drop terms in fewer than 5 docs
                                 max_df=0.95,   # drop terms in over 95% of docs
                                 max_features=8000,
                                 stop_words=stop_words(),
                                 tokenizer=tokenize)
    matrix = vectorizer.fit_transform(text)
    return vectorizer, matrix
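
A hedged usage sketch: the project's stop_words() and tokenize helpers are not shown, so this substitutes scikit-learn's built-in English stop list and default tokenizer, and repeats a toy corpus so that min_df=5 keeps some terms:

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat on the mat",
        "dogs and cats play outside",
        "the dog barked at the cat"] * 5

vectorizer = TfidfVectorizer(min_df=5, max_df=0.95, max_features=8000,
                             stop_words="english")
matrix = vectorizer.fit_transform(docs)
print(matrix.shape)  # (15, number of retained terms)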
Example #3
def send():
    # Flask view; expects 'userID' and 'keyword' form fields on POST.
    userID = None
    if request.method == 'POST':
        userID = request.form['userID']
        keyword = request.form['keyword']
        # Top-100 collaborative-filtering recommendations for this user.
        result = sameModel.recommendProducts(int(userID), 100)
        ratings = {}   # key = business id, value = predicted rating
        lda_list = []  # business ids, in recommendation order
        for line in result:
            ratings[line[1]] = line[2]
            lda_list.append(line[1])
        [bn, br] = openfile()  # business names, business reviews
        name_list = [bn[key] for key in lda_list]
        Recommendation = name_list[:10]

        # LSI keyword re-ranking, only when a keyword was supplied
        if len(keyword) >= 1:
            rawList = tokenize(br, lda_list)
            stopped_result = stop_words(rawList)
            [corpus, dictionary] = doc_term_matrix(stopped_result)
            lsiList = lsi(corpus, dictionary)

            # Word matching: collect each business's topic words and
            # train Word2Vec on them.
            document = [[] for _ in range(100)]
            for i in range(100):
                for word in lsiList[i]:
                    document[i].append(str(word[0]))
            model = gensim.models.Word2Vec(document, min_count=1)

            # Score each business by summed similarity between the
            # keyword and its topic words (raises KeyError if the
            # keyword is absent from the Word2Vec vocabulary).
            a = [0] * 100
            dic = {}  # score -> business id; equal scores overwrite
            for i in range(100):
                for item in document[i]:
                    a[i] += model.wv.similarity(keyword, item)
                dic[a[i]] = lda_list[i]

            a = sorted(a, reverse=True)
            Recommendation = [bn[dic[key2]] for key2 in a[:10]]

        return render_template('index.html', Recommendation=Recommendation)
    return render_template('index.html')
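
The keyword re-ranking step is the trickiest part of this view, so here is an isolated sketch of it under toy data. The topic-word lists, keyword, and vector_size are invented; it also sidesteps the snippet's float-keyed dict by sorting (score, index) pairs and skips out-of-vocabulary words instead of raising KeyError:

from gensim.models import Word2Vec

# Toy per-business topic-word lists, like those extracted from lsiList.
documents = [["pizza", "pasta", "wine"],
             ["sushi", "ramen", "sake"],
             ["burger", "fries", "milkshake"]]
model = Word2Vec(documents, min_count=1, vector_size=50)

keyword = "pizza"
scored = []
for i, words in enumerate(documents):
    # Sum keyword similarity over each business's topic words.
    total = sum(model.wv.similarity(keyword, w)
                for w in words if w in model.wv)
    scored.append((total, i))

for total, i in sorted(scored, reverse=True)[:10]:
    print(i, total)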
Example #4
import collections
import re

from nltk.corpus import stopwords

import preprocess  # project-specific module supplying extra stop words


def top10_words(essay_v, remove_stopwords):
    """Get the 10 most frequent words in the essay."""
    essay_v = re.sub("[^a-zA-Z]", " ", essay_v)
    words = essay_v.lower().split()
    if remove_stopwords:
        # NLTK's English list plus the project's extra stop words.
        stops = set(stopwords.words("english"))
        stops.update(preprocess.stop_words())
        words = [word for word in words if word not in stops]
    counts = collections.Counter(words)
    # Keep only the ten most frequent words.
    return dict(counts.most_common(10))
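
The counting logic itself needs nothing project-specific; a self-contained equivalent using collections.Counter, with a made-up sample text and a stand-in stop list:

import collections
import re

text = "The model learns, the model predicts, and the model is evaluated."
words = re.sub("[^a-zA-Z]", " ", text).lower().split()
stops = {"the", "and", "is"}  # stand-in for the NLTK + project stop lists
counts = collections.Counter(w for w in words if w not in stops)
print(counts.most_common(10))  # [('model', 3), ('learns', 1), ...]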
Example #5
import pickle


def build_model(lines_num=-1):
    # tokenize, stop_words, stem, doc_term_matrix and lda are project
    # helpers (see the other examples on this page).
    with open('rawdata.csv', 'r') as dataset:
        [bidList, rawList] = tokenize(dataset, lines_num)

    stopped_result = stop_words(rawList, lines_num)
    stem_result = stem(stopped_result, lines_num)
    [corpus, dictionary] = doc_term_matrix(stem_result)
    # The words seem to make more sense without stemming; still working on it.

    ################
    ####  LDA  #####
    ################
    ldaList = lda(corpus, dictionary, lines_num)

    print("saving model...")
    with open('outfile', 'wb') as fp:
        pickle.dump(ldaList, fp)
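
For completeness, the natural counterpart: reloading the topic list that build_model() pickled (this assumes build_model() has already written 'outfile'):

import pickle

with open('outfile', 'rb') as fp:
    ldaList = pickle.load(fp)
print(ldaList[:3])  # first few LDA topics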
Example #6
def build_model(lines_num=-1):
    # Variant of Example #5: reads via tokenize() directly, skips the
    # stemming step, and fits LSI instead of LDA.
    rawList = tokenize(lines_num)
    print(rawList[0])  # sanity check on the first tokenized document

    stopped_result = stop_words(rawList, lines_num)
    [corpus, dictionary] = doc_term_matrix(stopped_result)
    # The words seem to make more sense without stemming; still working on it.

    ################
    ####  LSI  #####
    ################
    lsiList = lsi(corpus, dictionary, lines_num)

    save_file(lsiList)
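
The lsi() call here is a project wrapper, but its core is presumably gensim's LsiModel; a minimal stand-in sketch with invented tokens:

from gensim import corpora, models

texts = [["good", "pizza", "friendly", "service"],
         ["slow", "service", "cold", "pizza"],
         ["good", "pasta", "great", "wine"]]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]

lsi_model = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
for topic_id, topic in lsi_model.print_topics():
    print(topic_id, topic)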