import numpy as np
import pandas as pd
from gensim import corpora, models

# WordData is defined elsewhere in this project.


def main_new_dataset():

    # Load the preprocessed sensor dataset and keep only the signals that
    # will be encoded as symbolic words.
    newData = pd.read_csv(
        '../xsense_data/global_dataset_abs_speed_diff_yaw.txt', sep=';')
    newDataToWord = newData.loc[:, [
        'Acc_X', 'Acc_Y', 'Speed_X', 'Speed_Y', 'Diff_Yaw'
    ]]

    # Discretize each row of signals into a word and attach it to the dataset.
    worder = WordData(newDataToWord)
    words = worder.create_words(worder.dataset)
    colWords = pd.Series(words, name='Word')
    wordDataset = pd.concat([newData, colWords], axis=1)
    # wordDataset.to_csv('../xsense_data/word_global_dataset.txt', sep=';')

    # Group the words into documents and build the gensim dictionary/corpus.
    docs = worder.create_text_corpus(wordDataset)
    texts = [doc.lower().split() for doc in docs]

    dictionary = corpora.Dictionary(texts)
    dictionary.save('data_topic_modeling/new_dataset/doc_dictionary.dict')
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('data_topic_modeling/new_dataset/documents.mm',
                               corpus)

    # Fit a hierarchical Dirichlet process model and inspect the topics.
    hdp = models.HdpModel(corpus, dictionary, T=50, K=10)
    print(hdp.show_topics(num_topics=20, num_words=5))

    # Per-document topic distributions.
    topicDocs = hdp[corpus]
    for x in topicDocs:
        print(x)
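The dictionary and bag-of-words corpus serialized above can be reloaded later without re-running the preprocessing. A minimal sketch using gensim's standard load APIs (paths as in the function above):

    # Restore the dictionary and the serialized corpus from disk.
    dictionary = corpora.Dictionary.load(
        'data_topic_modeling/new_dataset/doc_dictionary.dict')
    corpus = corpora.MmCorpus('data_topic_modeling/new_dataset/documents.mm')

    # Refit the HDP model on the restored corpus.
    hdp = models.HdpModel(corpus, dictionary, T=50, K=10)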
Example No. 3
def main():

    newData = pd.read_csv('../xsense_data/global_dataset.txt', sep=';')

    ############################## LONG WORD TRY ###############################
    ############################## 15 SIGNALS    ###############################
    ## Choose the features to represent as words
    ## (all signals except altitude):
    ## dataPartOne = newData.loc[:, 'Acc_X':'Pitch']
    ## dataPartTwo = newData.loc[:, 'Speed_X':'Speed_Z']
    ## newDataToWord = pd.concat([dataPartOne, dataPartTwo], axis=1)

    ############################## REDUCED WORD TRY ############################
    ############################## 5 SIGNALS       #############################
    newDataToWord = newData.loc[:,
                                ['Acc_X', 'Acc_Y', 'Acc_Z', 'Speed_X', 'Roll']]

    worder = WordData(newDataToWord)
    words = worder.create_words(worder.dataset)

    colWords = pd.Series(words, name='Word')
    wordDataset = pd.concat([newData, colWords], axis=1)
    wordDataset.to_csv('../xsense_data/word_global_dataset.txt', sep=';')

    docs = worder.create_text_corpus(wordDataset)

    # docs = ['aaabacdb abababdb addbaedb daecabdb badbccdb',
    #         'aeaaacdb abebabdb acdbaedc dbecadda addbbccb',
    #         'aeaaacdb abebabdb acdbaedc dbecadda addbbccb']

    texts = [doc.lower().split() for doc in docs]

    dictionary = corpora.Dictionary(texts)
    dictionary.save('data_topic_modeling/doc_dictionary.dict')
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('data_topic_modeling/documents.mm', corpus)

    hdp = models.HdpModel(corpus, dictionary, T=50, K=10)
    print(hdp.show_topics(num_topics=20, num_words=5))

    topicDocs = hdp[corpus]
    for x in topicDocs:
        print(x)

    # Convert the HDP posterior into an equivalent truncated LDA model.
    alpha, beta = hdp.hdp_to_lda()
    print(alpha)
    lda_model = models.LdaModel(id2word=hdp.id2word,
                                num_topics=len(alpha),
                                alpha=alpha,
                                eta=hdp.m_eta)
    lda_model.expElogbeta = np.array(beta, dtype=np.float32)
    print(lda_model.show_topic(1))
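With the HDP posterior copied into lda_model as above, unseen documents can be scored through the usual bag-of-words interface. A minimal sketch that could follow the last lines of main(), reusing its dictionary and lda_model (the encoded document string is hypothetical):

    # Hypothetical word-encoded document, same alphabet as the corpus above.
    new_doc = 'aaabacdb abababdb addbaedb'
    bow = dictionary.doc2bow(new_doc.lower().split())
    print(lda_model[bow])  # [(topic_id, probability), ...]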
Example No. 5
    def setUp(self):
        # Scrape actor names and scores from the Naver movie page.
        self.actors = Crowling.get_actors(
            Crowling,
            'https://movie.naver.com/movie/bi/mi/point.nhn?code=145162#tab')
        self.scores = Crowling.get_score(
            Crowling,
            'https://movie.naver.com/movie/bi/mi/point.nhn?code=145162#tab')

        # Load the cached movie info and the pickled review data.
        self.data1 = WordData.getWord(WordData, 'master.info')
        self.biggestWords, self.reivew = WordData.getReview(
            WordData, 'master.p')

        # setReviewData returns formatted word counts and review sentences;
        # keep only the top word and the first sentence for the assertions.
        self.maxWords, self.reivews = buttons.setReviewData('master')
        self.maxWords = self.maxWords.split('\n')[0].split(':')[0].strip()
        self.reivews = self.reivews.split('\n')[0]
Example No. 6
    def setData(self):
        # Show the info and review data for the movie selected in the combo box.
        text = self.choiceMovie.currentText()
        self.tx1.setText(text)

        infoData = WordData.getWord(WordData, text + '.info')
        self.tx2.setText(infoData[0])
        self.tx3.setText(infoData[1])

        reviewTexts, reviews = setReviewData(text)
        self.tx4.setText(reviewTexts)
        self.tx5.setText(reviews)
Example No. 7
def setReviewData(text):
    # Build a "word : count" listing of review words, most frequent first,
    # skipping the bookkeeping keys 'actors' and 'score'.
    words = ''
    reviewData = WordData.getReview(WordData, text + '.p')
    reviewWords = sorted(reviewData[0].items(),
                         key=lambda x: x[1],
                         reverse=True)

    for word, nums in reviewWords:
        if word != 'actors' and word != 'score':
            words += word + ' : ' + str(nums) + '\n'

    # Join the review sentences, one per line.
    sentences = ''
    for i in reviewData[1]:
        sentences += str(i) + '.' + '\n'

    return words, sentences
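A quick usage sketch (the 'master' pickle name follows the test fixture in Example No. 5):

    words, sentences = setReviewData('master')
    print(words.split('\n')[0])      # top review word, formatted as "word : count"
    print(sentences.split('\n')[0])  # first review sentence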
Example No. 8
    def get_synset_tokens(self, tagged):
        # Requires: from nltk.stem import WordNetLemmatizer
        #           from nltk.corpus import wordnet
        lemmatzr = WordNetLemmatizer()
        for token in tagged:
            wordnet_tag = self.penn_to_wordnet(token[1])
            if not wordnet_tag:
                continue

            lemma = lemmatzr.lemmatize(token[0], pos=wordnet_tag)

            # If the lemma has no synsets, indexing [0] raises IndexError;
            # treat such tokens as probable spelling errors.
            try:
                self.ConversionData["synsets"].append(
                    WordData(token[0], wordnet.synsets(lemma, pos=wordnet_tag)[0]))
            except IndexError:
                self.ConversionData["spellingErrors"].append(token[0])

        return self.ConversionData
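The penn_to_wordnet helper called above is not shown in this snippet. A typical implementation, assumed here, maps the first letter of a Penn Treebank tag to the matching WordNet POS constant and returns None for anything else:

    # At module level: from nltk.corpus import wordnet

    def penn_to_wordnet(self, penn_tag):
        # Assumed mapping: JJ* -> adjective, VB* -> verb, NN* -> noun, RB* -> adverb.
        tag_map = {'J': wordnet.ADJ, 'V': wordnet.VERB,
                   'N': wordnet.NOUN, 'R': wordnet.ADV}
        return tag_map.get(penn_tag[0])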