Пример #1
0
def main():
    """Run the hotel-review text-mining script end to end.

    Steps, in order:

    1. Configure file logging to ``logging_Miner.txt`` (overwritten each run).
    2. Read the input/output paths from ``MinerConfig.yml``
       (``folder.readpath`` and ``folder.writepath``).
    3. Load the hotel table and the review table with pandas.
    4. Tokenize the first 1/20th of the review bodies, drop stop words and
       build a frequency distribution via ``Miner.getFreqDist``.
    5. Feed the most frequent words to ``Miner.getConditionalDist``.
    6. Join hotel tags with review text on ``hotel_id`` and dump it to CSV.
    7. Extract entities from the first 10 joined rows, draw the first result
       of each extractor and write the entities to ``writepath``.
    8. Fit the word2vec model on the hotel ``tags`` column and log
       ``accuracy()``.

    All warnings are suppressed for the duration of the run.  The function
    takes no arguments and returns ``None``; everything it produces is written
    to the log file, stdout, CSV files or drawing windows.
    """
    # Domain words that are too common in hotel reviews to be informative.
    extraStopWords = ['hotel', 'hotels', 'near']

    # TODO(review): machine-specific paths; consider moving them into
    # MinerConfig.yml next to readpath/writepath.
    reviewPath = 'C:\\Users\\fhokhold\\Documents\\Projects\\Vulcan\\vulcan-data\\hotel_review.csv'
    combinePath = 'C:\\Users\\fhokhold\\Documents\\Projects\\Vulcan\\vulcan-data\\combineData_text.csv'

    # The whole pipeline runs inside the context manager: catch_warnings()
    # restores the previous filters on exit, so filters installed in an
    # otherwise empty ``with`` block would have no effect on the work below.
    with warnings.catch_warnings():
        # "ignore" with no category silences every warning, including
        # DeprecationWarning.
        warnings.simplefilter("ignore")

        preTime = time.time()

        LogFile = 'logging_Miner.txt'
        logging.basicConfig(
            filename=LogFile,
            level=logging.DEBUG,
            filemode='w',
            format='%(filename)s: %(levelname)s: %(funcName)s(): %(lineno)d:\t%(message)s',
        )

        logging.info('Start Miner')

        # remember two spaces for yaml file and no tabs
        with open("MinerConfig.yml", 'r') as ymlFile:
            # safe_load: plain mappings only, never constructs arbitrary
            # Python objects from the config file.
            config = yaml.safe_load(ymlFile)
        readPath = config['folder']['readpath']
        writePath = config['folder']['writepath']
        logging.info('reading from path: %s', readPath)

        #nltk.download()
        #'C:\\Users\\fhokhold\\Documents\\Projects\\Vulcan\\vulcan-data\\hotel_info.csv'
        data = pd.read_csv(readPath)
        colnames = list(data.columns.values)

        # Rename the first column to 'hotel_id' and keep the others as-is
        # (assumes MyFlattten flattens the nested list -- confirm in Miner).
        data.columns = Miner.MyFlattten([['hotel_id'], colnames[1:]])

        reviewData = pd.read_csv(reviewPath)

        ##############################Freq Dist##################################################################
        # NOTE(review): the original called a bare getDocuments(); every other
        # call site in this function goes through the Miner namespace.
        reviewDocs = Miner.getDocuments(reviewData, 'body', True)

        # Only the first 1/20th of the reviews is used.
        # NOTE(review): documents are joined without a separator, exactly as
        # before; adjacent reviews may fuse into one token at the boundary.
        sampleSize = len(reviewDocs) // 20
        totalText = ''.join(str(reviewDocs[k]) for k in range(sampleSize))

        rawTokens = nltk.word_tokenize(totalText)
        #rawTokens = Miner.tokenStemmer(rawTokens) #stemmer is not working very well

        # Filter against a local copy so the shared Miner.stopWords set is
        # never mutated (and therefore never needs to be restored).
        stopWords = set(Miner.stopWords).union(extraStopWords)
        loweredTokens = (token.lower() for token in rawTokens)
        reviewText = [token for token in loweredTokens if token not in stopWords]

        freqDistText = Miner.getFreqDist(reviewText, True) #pickle freqDist

        #print('frequency plot')
        #freqDistText[0].plot(30,cumulative=True)

        # freqDistText[1] is indexed as a sequence of (word, count) pairs; the
        # first three entries are skipped (reason not visible here -- TODO confirm).
        topPairs = freqDistText[1][3:10]

        logging.info('Top words: ')
        logging.info(topPairs)

        logging.info('cfDist predictions: ')

        #############################################word predictions######################################################
        wordsPred = [word for word, _ in topPairs]
        topWords = [word for word, _ in freqDistText[1][3:100]]

        print('top words')
        print(wordsPred)

        print('topWords')
        print(topWords)
        print('wordsPred')
        print(wordsPred)

        wordsPredictions = Miner.getConditionalDist(reviewText, topWords, wordsPred)

        logging.info(wordsPredictions)

        # Ngrams = Miner.getNgram(reviewText, zip(wordsPred,wordsPred[::-1]), True)
        #
        # logging.info('Ngrams')
        # logging.info(' ')
        # logging.info(Ngrams[1])
        #
        # .loc replaces the removed DataFrame.ix; both select these columns by label.
        hotelTags = data.loc[:, ['hotel_id', 'tags']]
        reviewCols = reviewData.loc[:, ['hotel_id', 'title', 'body']]
        combineData = pd.merge(hotelTags, reviewCols, on=['hotel_id'], how='inner')

        combineData.to_csv(combinePath)

        ###############################Topic Modeling######################################################################
        # topicData = (combineData , 'body')
        # lda = Miner.getTopicsLDAandLSA(topicData[0],topicData[1],'lda')
        # logging.info('lda topics')
        # logging.info(lda[0].print_topics(10))
        # logging.info('LDA perplexity: ' + str(lda[1]))
        #
        # lsa = Miner.getTopicsLDAandLSA(topicData[0],topicData[1],'lsa')
        # logging.info('lsa topics')
        # logging.info(lsa.print_topics(10))
        #
        # dataText = Miner.getDocuments(topicData[0],topicData[1],True)
        # tfidf_searchTerms, modelTfIdf = Miner.tf_idf(dataText)
        #
        # print('tfidf_searchTerms.T.toarray()')
        # print(tfidf_searchTerms.T.toarray()) #word by doc, before transpose doc by word (row by col format)
        #
        # logging.info('tfidf_searchTerms transposed')
        # logging.info(tfidf_searchTerms.T.toarray())
        #
        # tfidf_review, reviewModelTfIdf = Miner.tf_idf(Miner.getDocuments(combineData,'body',True))
        #
        # topicsNMF = Miner.getTopicNMF(tfidf_searchTerms, modelTfIdf)
        # logging.info('NMF topics')
        #
        #################Similiarity testing###################################################################################
        colNames = ['title', 'body']  # only referenced by the disabled code below

        logging.info('Pairwise similiarity')
        #logging.info(Miner.similiarity(np.array([1,2,0,1]), np.array([0,2,2,1]), None, None))
        logging.info(' ')
        #
        # combinePhraseDoc = Miner.CombineDocumentPhrase(combineData, colNames, True) #takes a long time to compute
        #
        # tfidf_review, combineModelTfIdf  = Miner.tf_idf(combinePhraseDoc)
        #
        # dimReview = tfidf_review.toarray().shape
        #
        # tfidf_review_matrix = tfidf_review.toarray()
        #
        # for i in xrange(0,int(dimReview[0]/10),2): #loop through by twos
        #     try:
        #         logging.info('phrase: ' + str(combinePhraseDoc[i]))
        #         logging.info('document: ' + str(combinePhraseDoc[i+1]))
        #         #logging.info('phrase vector: ' )
        #         #logging.info(tfidf_review_matrix[i,:])
        #         #logging.info('doc vector: ' )
        #         #logging.info(tfidf_review_matrix[i+1,:])
        #         logging.info('similiarity: ' + str(Miner.similiarity(tfidf_review_matrix[i,:], tfidf_review_matrix[i+1,:], None, None)))
        #     except Exception, e:
        #         logging.warn('Error: ' + str(e))
        #
        # logging.info(' ')
        # logging.info('Pairwise similiarity')
        # logging.info(' ')
        # dimCombineData = combineData.shape
        #
        # phrasesText = Miner.getDocuments(combineData,colNames[0],True)
        # documentText = Miner.getDocuments(combineData,colNames[1], True)
        #
        # for j in xrange(int(dimCombineData[0]/10)):
        #     try:
        #         logging.info('phrase: ' + str(phrasesText[j]))
        #         logging.info('document: ' + str(documentText[j]))
        #         tfidf_pair, pairTfIdf = Miner.tf_idf([phrasesText[j],documentText[j]])
        #         tfidf_pair_matrix = tfidf_pair.toarray()
        #         logging.info('similiarity: ' + str(Miner.similiarity(tfidf_pair_matrix[0,:], tfidf_pair_matrix[1,:], None, None)))
        #     except Exception, e:
        #         logging.warn('Error: ' + str(e))
        #
        #logging.info(topicsNMF.)

        ###########################Entity Extraction ###################################################################################
        # Only the first 10 joined rows are tagged.
        tagToken = Miner.ExtractTags(combineData[:10], 'body')
        Entities = Miner.ExtractEntity(tagToken)

        # Elapsed time is logged before the drawing calls below.
        logging.info('compute time: {0}'.format(time.time() - preTime))
        Entities[0].draw()

        RegexEntities = Miner.grammarEntity(tagToken) #takes long
        RegexEntities[0].draw()

        logging.info('Entities')
        for entity in Entities:
            logging.info(entity)
            logging.info(' ')

        #'C:\\Users\\fhokhold\\Documents\\Projects\\Vulcan\\vulcan-data\\entity_test.csv'
        logging.info('write to path: %s', writePath)
        pd.DataFrame(Entities).to_csv(writePath, index=False)

        w2vec = Miner.getTopicWord2VecNeuralNet(data, 'tags')
        logging.info('word2vec features')
        logging.info(w2vec.accuracy())