Example #1
def LDA(docs, topics, passes, save_filename):
    lda = LatentDirichletAllocation(docs, workers=3)
    model, corpus, dictionary = lda.compute(topics, passes)
    lda.save_to_file(save_filename, model, corpus, dictionary)
    return model, corpus, dictionary
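
# A rough standalone sketch (an assumption, not the project's actual
# LatentDirichletAllocation class) of what the compute step above presumably
# wraps in plain gensim: build a dictionary and bag-of-words corpus from
# tokenised documents, then fit a multicore LDA model with the same
# workers/passes settings.
from gensim import corpora
from gensim.models import LdaMulticore


def lda_gensim_sketch(tokenized_docs, topics, passes):
    # tokenized_docs: list of token lists, one per document
    dictionary = corpora.Dictionary(tokenized_docs)
    corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]
    model = LdaMulticore(corpus, num_topics=topics, id2word=dictionary,
                         passes=passes, workers=3)
    return model, corpus, dictionary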


def LSA(docs, topics, save_filename):
    lsi = LatentSemanticAnalyser(docs)
    return lsi.compute(topics, save_filename)
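
# Likewise, a hedged sketch of what LatentSemanticAnalyser presumably wraps:
# an LSI model fitted over a bag-of-words corpus and persisted with gensim's
# own save(). The function name and layout are illustrative, not the
# project's actual implementation.
from gensim.models import LsiModel


def lsa_gensim_sketch(corpus, dictionary, topics, save_filename):
    lsi = LsiModel(corpus, id2word=dictionary, num_topics=topics)
    lsi.save(save_filename)
    return lsi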


dbmg = DatabaseHelper(connection_string)
#replies_db = dbmg.select_query("select * from reply", None, fetch_to_dict=True)
questions = dbmg.select_query("""select * from question join forum_details on
                                question.forum_details_id = forum_details.forum_details_id
                                where community_id = %s
                              """,
                              dbmg.get_community_id('Business'),
                              fetch_to_dict=True)
#replies_by_question_db = dbmg.select_query("""select reply_id, text, question_id
#from reply
#where question_id in
#    ( select question_id
#    from reply group by question_id)
#    order by question_id asc""", None, fetch_to_dict=True)

#replies_question_forum = dbmg.get_replies_question_forum()
dbmg.close()

questions_contents = []

for question in questions:
    # collect non-empty question text
    if question['content'] is not None:
        questions_contents.append(question['content'])
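
# Hypothetical call site (the topic count, pass count and file names below are
# illustrative assumptions, not values from the original script): fit both
# topic models on the collected question texts.
lda_model, lda_corpus, lda_dictionary = LDA(questions_contents, 20, 10, "business_lda")
lsa_result = LSA(questions_contents, 20, "business_lsa")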
Example #2
        # write every document pair whose word-vector similarity exceeds the threshold
        file = open("similarity_res_th_{}.txt".format(self.__id), 'w+')
        for doc in working_docs:
            file.write("NEW_QUESTION" + doc + "\n")
            for other_doc in self.__docs:
                if doc != other_doc:
                    # keep only tokens the word2vec model actually knows
                    s1 = set(doc.split()).intersection(self.__model.wv.vocab)
                    s2 = set(other_doc.split()).intersection(self.__model.wv.vocab)
                    similarity = self.__model.n_similarity(s1, s2)
                    if similarity > 0.95:
                        print(similarity)
                        file.write(str(similarity) + other_doc + "\n")
        file.close()
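
        # For reference: n_similarity is gensim's cosine similarity between the
        # mean word vectors of the two token sets, i.e. roughly (assuming a
        # gensim Word2Vec model named `model` and token sets s1/s2 as above):
        #
        #   import numpy as np
        #   v1 = np.mean([model.wv[w] for w in s1], axis=0)
        #   v2 = np.mean([model.wv[w] for w in s2], axis=0)
        #   cosine = v1.dot(v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))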

connect_string = "dbname=uoa-nlp user=admin"
db = DatabaseHelper(connect_string)

questions_db = db.select_query("select question_id, content from question", None, fetch_to_dict=True)

questions_content = []
tagged_questions = []

tokenizer = InputPreprocessor(None)

# tokenize the data before applying the model

for question in questions_db:
    if question['content'] is not None:
        questions_content.append(question['content'])
        tokens = tokenizer.tokenize(question['content'])
        tagged_questions.append(doc2vec.TaggedDocument(words=tokens, tags=[question['question_id']]))
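
# What presumably comes next (an assumption; vector_size, min_count, epochs and
# the output file name are illustrative): train a Doc2Vec model on the tagged
# questions so each question_id gets a document vector.
from gensim.models import doc2vec  # same import the TaggedDocument call above already relies on

d2v_model = doc2vec.Doc2Vec(tagged_questions, vector_size=100, min_count=2, epochs=20)
d2v_model.save("questions_doc2vec.model")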