def extractKeywords(input_list, bigram_model):
    """Extract keywords from each message in input_list; returns a list of keyword lists."""
    input_list = clean_text_ms.cleanText(input_list)
    # output = []
    # output = splitText(input_list, output)
    keywds_list = []
    i = 0
    status = 0
    for input in input_list:
        keywds, graph, text, ngrams = genspacyrank.extract_keywords(
            text=input,
            # lang='en',
            lang='fr',
            # bigram_model=bigram_model,
            bigram_model=None,
            trigram_model=None,
            selected_pos=['V', 'N', 'J'],
            # rm_stopwords=True
            rm_stopwords=False)
        keywords = []
        for k in keywds:
            keywords.append(k[0])
        keywds_list.append(keywords)
        print "Processing message " + str(i) + " of " + str(len(input_list))
        i = i + 1

        # DEBUGGING: stop early after 1000 messages when debugging is enabled
        debugging = conf.get('MAIN', 'debugging')
        if (debugging == 'True'):
            if i == 1000:
                return keywds_list

    # keywds = utilities.convertLisfOfListToList(keywds_list)
    # return keywds
    return keywds_list

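# Usage sketch (illustrative only; `messages` is assumed to be a list of raw
# message strings, and the module-level `conf` must provide MAIN/debugging as
# elsewhere in this module):
#
#   keywords_per_message = extractKeywords(messages, bigram_model=None)
#   # keywords_per_message[i] is the list of keywords extracted from messages[i]
#
# Note that extractKeywords currently hard-codes lang='fr' and ignores its
# bigram_model argument.
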
def predictTopics(input_list, w2v_model, som_model, cluster_model,
                  dried_topics, type_chart="d3"):
    codebook2cluster = cluster_model.predict(som_model.W)

    cleaned_tweet_list = clean_text_ms.cleanText(input_list)
    embedded_words, dict_index2word, dict_word2index = word2vec_ms.getEmbeddedWords(
        cleaned_tweet_list, w2v_model)

    graphs = []
    for index in xrange(codebook2cluster.max() + 1):
        M, words_list = getDriedTopicMatrix(index, dried_topics,
                                            embedded_words, dict_word2index)
        if len(words_list) > 10:
            # file_name_index = './data/output/dried_' + str(index) + '.json'
            file_name = conf.get('MAIN', 'MST_dried_topics_d3_base_file')
            file_name_index = file_name + str(index) + '.html'
            graph = plot_graph.plot_similarity_graph(M, words_list,
                                                     file_name_index,
                                                     type_chart)
            graphs.append(graph)
            print file_name_index
    return graphs

def doSomAndDryTopics(input_list, w2v_model, som_model, clustering_model):
    cleaned_tweet_list = clean_text_ms.cleanText(input_list)
    embedded_words, dict_index2word, dict_word2index = word2vec_ms.getEmbeddedWords(
        cleaned_tweet_list, w2v_model)

    data2unit, data2cell, data2dist, data2saliency, data2saliency_index, data2maps = som_model.predict(
        embedded_words)

    log.info("fit cluster...")
    codebook2cluster = clustering_model.predict(som_model.W)

    topics = getTopics(som_model, embedded_words, dict_index2word)

    save_obj(
        stopwordsDictFromFile(conf.ConfigSectionMap('STOPWORDS_FILES')),
        conf.get('MAIN', 'path_pickle_stopwords_dict'))

    dried_topics = dryTopics(topics, codebook2cluster, embedded_words,
                             dict_word2index, dict_index2word, 1, conf)
    return dried_topics

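# Pipeline sketch (illustrative only; it assumes the pickled models referenced
# by the MAIN config section have already been trained and saved, and that a
# load_obj helper for pickled objects is available in this namespace):
#
#   w2v_model = Word2Vec.load(conf.get('MAIN', 'path_pickle_w2v_model'))
#   som_model = load_obj(conf.get('MAIN', 'path_pickle_som_model'))
#   cluster_model = load_obj(conf.get('MAIN', 'path_pickle_cluster_model'))
#   dried_topics = doSomAndDryTopics(input_list, w2v_model, som_model, cluster_model)
#   graphs = predictTopics(input_list, w2v_model, som_model, cluster_model,
#                          dried_topics, type_chart="d3")
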
def createCorpus(cleaned_input_list):
    # -------------------------GET ENTITIES----------------------------------------------------
    log.info("GET ENTITIES")
    entity_list = []
    confidence = conf.getfloat('ENTITY', 'confidence')
    entity_list, tweet_with_entity_list, all_uri = getEntities(
        cleaned_input_list, confidence=confidence)

    # -------------------------GET WIKIPEDIA PAGES---------------------------------------------
    log.info("GET WIKIPEDIA PAGES")
    wikipage_list = getWikipediaPages(all_uri)
    wikipage_list = clean_text_ms.cleanText(wikipage_list)

    # -------------------------CREATE CORPUS---------------------------------------------------
    print
    log.info("CREATE CORPUS")
    tweet_corpus = createTweetCorpus(wikipage_list, cleaned_input_list,
                                     tweet_with_entity_list)
    corpus = tweet_corpus
    corpus += wikipage_list
    return corpus

def cleanText():
    """
    Clean text
    Get a list of tweet messages, return a list of cleaned messages
    ---
    parameters:
      - in: body
        name: body
        schema:
          type: array
          items:
            type: object
            properties:
              message:
                type: string
                description: tweet message
        required: true
    responses:
      200:
        description: Text cleaned
        schema:
          type: array
          items:
            type: string
      500:
        description: Internal Server Error
        schema:
          type: object
          properties:
            error:
              type: string
    """
    log.info("/analytics-backend/cleanText")

    data_json = json.dumps(request.get_json(silent=True))
    input_list = pd.read_json(data_json, encoding='utf8')['message'].tolist()

    cleaned_tweet_list = clean_text_ms.cleanText(input_list)
    return jsonify(cleaned_tweet_list)

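# Minimal client sketch for this endpoint (illustrative only; it assumes the
# service is reachable at http://localhost:5000 -- host and port are not
# defined in this file):
#
#   curl -X POST http://localhost:5000/analytics-backend/cleanText \
#        -H "Content-Type: application/json" \
#        -d '[{"message": "RT @user: a raw tweet http://t.co/x"},
#             {"message": "another tweet to clean"}]'
#
# The response is a JSON array with one cleaned string per input message.
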
def __init__(self, tweets, w2v_model=None):
    # load the trained W2V model from disk when no model is passed in
    if (w2v_model == None):
        self.model = Word2Vec.load(
            conf.get('MAIN', 'path_pickle_w2v_model'))
    else:
        self.model = w2v_model

    self.vec2tweets = {}
    self.vec2word = {}
    self.word2tweet = {}
    self.tweets = tweets
    self.cleaned_tweets = clean_text_ms.cleanText(tweets)

    if os.path.exists(conf.get('MAIN', 'path_vec2tweets')):
        self.vec2tweets = load_obj(conf.get('MAIN', 'path_vec2tweets'))
    if os.path.exists(conf.get('MAIN', 'path_vec2word')):
        self.vec2word = load_obj(conf.get('MAIN', 'path_vec2word'))
    if os.path.exists(conf.get('MAIN', 'path_word2tweet')):
        self.word2tweet = load_obj(conf.get('MAIN', 'path_word2tweet'))

    # use self.model so the model loaded above is used when w2v_model is None
    self.embedded_words, self.index2word, self.word2index = getEmbeddedWords(
        self.cleaned_tweets, self.model)

def main():
    log.info(
        "----------------------------------START------------------------------------"
    )
    reload(sys)
    sys.setdefaultencoding('utf-8')

    document_path_file = conf.get('MAIN', 'path_document')
    log.info("reading input file: " + document_path_file)

    # ------------------------READ INPUT-------------------------------------------------------
    # read csv into list of strings
    input_list = pd.read_csv(document_path_file,
                             encoding='utf-8',
                             error_bad_lines=False)
    # read which rows are from twitter
    source_value = np.array(input_list['idriferimento_ricerca'])
    tweet_rows_bool = (source_value == 5) | (source_value == 6)
    # read all input
    input_list = input_list['messaggio'].tolist()

    # ------------------------CLEANING TEXT---------------------------------------------------
    cleaned_input_list = []
    # read csv file
    path_csv_output_folder = conf.get('MAIN', 'path_csv_output_folder')
    file = path_csv_output_folder + 'cleaned_tweet_list.csv'
    if (os.path.isfile(file)):
        log.info("reading input from file " + file)
        cleaned_input_list = pd.read_csv(file,
                                         encoding='utf-8',
                                         error_bad_lines=False)
        cleaned_input_list = cleaned_input_list['colummn'].tolist()

    if (cleaned_input_list == [] or cleaned_input_list == [[]]):
        log.info("CLEANING TEXT")
        cleaned_input_list = clean_text_ms.cleanText(input_list)
        # write output to csv
        df = pd.DataFrame(cleaned_input_list, columns=["colummn"])
        df.to_csv(file, index=False)
        log.info("file saved in " + file)

    # if word2vec does not exist or rebuild is set, train the w2v model
    if not os.path.exists(conf.get('MAIN', 'path_pickle_w2v_model')):
        #-------------------------GET ENTITIES----------------------------------------------------
        log.info("GET ENTITIES")
        entity_list = []
        file_entity_list = path_csv_output_folder + 'entity_list.csv'
        file = file_entity_list
        if (os.path.isfile(file)):
            log.info("reading input from file " + file)
            # read csv file
            entity_list = pd.read_csv(file,
                                      encoding='utf-8',
                                      error_bad_lines=False)
            entity_list = entity_list['colummn'].tolist()

        tweet_with_entity_list = []
        file_tweet_with_entity_list = path_csv_output_folder + 'tweet_with_entity_list.csv'
        file = file_tweet_with_entity_list
        if (os.path.isfile(file)):
            log.info("reading input from file " + file)
            tweet_with_entity_list = pd.read_csv(file,
                                                 encoding='utf-8',
                                                 error_bad_lines=False)
            tweet_with_entity_list = tweet_with_entity_list['colummn'].tolist()

        all_uri = []
        file_all_uri = path_csv_output_folder + 'all_uri.csv'
        file = file_all_uri
        if (os.path.isfile(file)):
            log.info("reading input from file " + file)
            all_uri = pd.read_csv(file,
                                  encoding='utf-8',
                                  error_bad_lines=False)
            all_uri = all_uri['colummn'].tolist()

        # get entities
        if (entity_list == [] or entity_list == [[]]):
            confidence = conf.getfloat('ENTITY', 'confidence')
            entity_list, tweet_with_entity_list, all_uri = Corpus.getEntities(
                cleaned_input_list, confidence=confidence)

            # write output to csv
            file = file_entity_list
            df = pd.DataFrame(entity_list, columns=["colummn"])
            df.to_csv(file, index=False)
            log.info("file saved in " + file)

            file = file_tweet_with_entity_list
            df = pd.DataFrame(tweet_with_entity_list, columns=["colummn"])
            df.to_csv(file, index=False)
            log.info("file saved in " + file)

            file = file_all_uri
            df = pd.DataFrame(all_uri, columns=["colummn"])
            df.to_csv(file, index=False)
            log.info("file saved in " + file)

        with_wiki_pages = conf.get('MAIN', 'with_wiki_pages')
        if (with_wiki_pages == 'False'):
            corpus = cleaned_input_list
        else:
            #-------------------------GET WIKIPEDIA PAGES---------------------------------------------
            log.info("GET WIKIPEDIA PAGES")
            wikipage_list = []
            wikipage_list_file = path_csv_output_folder + 'wikipage_list.csv'
            file = wikipage_list_file
            if (os.path.isfile(file)):
                log.info("reading input from file " + file)
                # read csv file
                wikipage_list = pd.read_csv(file,
                                            encoding='utf-8',
                                            error_bad_lines=False)
                wikipage_list = wikipage_list['colummn'].tolist()

            # get wikipedia pages
            if (wikipage_list == [] or wikipage_list == [[]]):
                wikipage_list = Corpus.getWikipediaPages(all_uri)
                wikipage_list = clean_text_ms.cleanText(wikipage_list)
                # write csv
                df = pd.DataFrame(wikipage_list, columns=["colummn"])
                df.to_csv(file, index=False)
                log.info("file saved in " + file)

            #-------------------------CREATE CORPUS---------------------------------------------------
            print
            log.info("CREATE CORPUS")
            corpus = []
            # read csv file
            corpus_file = path_csv_output_folder + 'corpus.csv'
            file = corpus_file
            if (os.path.isfile(file)):
                log.info("reading input from file " + file)
                # read csv file
                corpus = pd.read_csv(file,
                                     encoding='utf-8',
                                     error_bad_lines=False)
                corpus = corpus['colummn'].tolist()

            # create corpus
            if (corpus == [] or corpus == [[]]):
                tweet_corpus = Corpus.createTweetCorpus(
                    wikipage_list, cleaned_input_list, tweet_with_entity_list)
                corpus = tweet_corpus
                if (USE_WIKIPEDIA_FOR_W2V):
                    corpus += wikipage_list

                # write corpus to csv
                corpus_file = path_csv_output_folder + 'corpus.csv'
                file = corpus_file
                df = pd.DataFrame(corpus, columns=["colummn"])
                df.to_csv(file, index=False)
                log.info("file saved in " + file)

        #-------------------------TRAIN MODEL W2V-------------------------------------------------
        # train model W2V
        log.info("TRAIN W2V")
        trainW2Vmodel(corpus)

    #----------------------TRAINING SOM------------------------------------------------
    # load trained model W2V
    w2v_model = Word2Vec.load(conf.get('MAIN', 'path_pickle_w2v_model'))
    log.info("loading W2V model " + conf.get('MAIN', 'path_pickle_w2v_model'))

    # get w2v words, dict words and vectors only for tweets
    embedded_words_t_w, dict_index2word_t_w, dict_word2indext_w = collectWords(
        w2v_model)

    # train SOM: get codebook matrix
    doTrainSom = conf.getboolean('ADVANCED_ASOM', 'do_trainSom')
    if doTrainSom or not os.path.exists(
            conf.get('MAIN', 'path_pickle_som_model')):
        width = int(conf.get('ADVANCED_ASOM', 'width'))
        height = int(conf.get('ADVANCED_ASOM', 'height'))
        empty_codebook_threshold = int(
            conf.getboolean('ADVANCED_ASOM', 'empty_codebook_threshold'))

        log.info("training som [" + str(width) + "x" + str(height) + "]")
        mySom = som_ms.trainSOM(embedded_words_t_w, dict_index2word_t_w, conf,
                                width, height)

        min_size_codebook_mtx = int(
            conf.get('ADVANCED_ASOM', 'min_size_codebook_mtx'))
        step_codebook_mtx = int(conf.get('ADVANCED_ASOM', 'step_codebook_mtx'))

        # decrease som dimensions if more than one codebook is empty
        while (not som_ms.isGoodResult(mySom, width, height,
                                       empty_codebook_threshold)
               and width > min_size_codebook_mtx + step_codebook_mtx):
            log.info("training som [" + str(width) + "x" + str(height) + "]")
            width = height = height - 2
            mySom = som_ms.trainSOM(embedded_words_t_w, dict_index2word_t_w,
                                    conf, width, height)

        save_obj(mySom, conf.get('MAIN', 'path_pickle_som_model'))

    #--------- PREDICT: only on tweets------------------------------------------------
    cleaned_input_list = clean_text_ms.cleanText(input_list)

    # keep only the rows flagged as tweets
    cleaned_tweet_rows = []
    tweet_rows = []
    index = 0
    for item in tweet_rows_bool:
        if item == True:
            cleaned_tweet_rows.append(cleaned_input_list[index])
            tweet_rows.append(input_list[index])
        index = index + 1

    # get embedded words from input
    embedded_words, dict_index2word, dict_word2index = word2vec_ms.getEmbeddedWords(
        cleaned_tweet_rows, w2v_model)

    word2VecMS = Word2VecMS(tweet_rows, w2v_model)
    word2VecMS.computeWord2Tweets()
    word2VecMS.saveObject()

    # load SOM
    mySom = load_obj(conf.get('MAIN', 'path_pickle_som_model'))
    log.info("loading SOM model " + conf.get('MAIN', 'path_pickle_som_model'))

    # predict SOM codebooks and plot
    file_name = conf.get('MAIN', 'MST_html_output_file')
    url = som_ms.doSomAndPlot(mySom, embedded_words, dict_index2word,
                              file_name, "netx")
    file_name = conf.get('MAIN', 'MST_html_d3_output_file')
    url = som_ms.doSomAndPlot(mySom, embedded_words, dict_index2word,
                              file_name, "d3")

    #--------------------PLOT/PRINT INFO ON SOM---------------------------------
    png = som_ms.getCodebookActivation()
    num_of_topic = conf.getint('GRAPH_IMG', 'num_of_topic_for_frequencies')
    html = som_ms.getCellFrequencyDistribution(cleaned_tweet_rows, w2v_model,
                                               mySom, num_of_topic, 'bar')
    html = som_ms.getCellFrequencyDistribution(cleaned_tweet_rows, w2v_model,
                                               mySom, num_of_topic, 'bubble')
    png = som_ms.getUmatrix()
    plt.show()
    print som_ms.getCostOfSom()

    #-------------------------KMEANS --------------------------------------------------
    if not os.path.exists(conf.get('MAIN', 'path_pickle_cluster_model')):
        log.info("START CLUSTERING")
        mySom = load_obj(conf.get('MAIN', 'path_pickle_som_model'))
        make_figure = False
        mySom.fit_cluster(cluster_model=None,
                          num_cluster_min=conf.getint('ADVANCED_ASOM',
                                                      'num_cluster_min'),
                          num_cluster_max=conf.getint('ADVANCED_ASOM',
                                                      'num_cluster_max'))
        save_obj(mySom.cluster_model,
                 conf.get('MAIN', 'path_pickle_cluster_model'))
        log.info("saved cluster model in " +
                 conf.get('MAIN', 'path_pickle_cluster_model'))

    # make clustering and plot
    file_name = conf.get('MAIN', 'MST_cluster_csv_output_file')
    url = som_ms.doClusteringAndPlot(cleaned_tweet_rows, file_name)

def doSomAndPlot1():
    """
    Apply SOM and plot the MST of the codebook
    Get the word2vec model id, the SOM model id, the list of tweet messages or
    the url of the csv with messages and the type of result, return a result graph
    ---
    parameters:
      - in: body
        name: body
        schema:
          type: object
          properties:
            w2v_model_id:
              type: string
              description: id of model to use for word embedding
            som_model_id:
              type: string
              description: id of SOM model
            type_chart:
              type: string
              description: type of result, "d3" (html) or "json"
            url_input:
              type: string
              description: url of the csv with messages
            tweets:
              type: array
              items:
                type: object
                properties:
                  message:
                    type: string
                    description: tweet message
        required: true
    responses:
      200:
        description: graph of the codebook MST
        schema:
          type: object
          properties:
            directed:
              type: boolean
            graph:
              type: object
            links:
              type: array
              items:
                type: object
                properties:
                  source:
                    type: integer
                  target:
                    type: integer
            multigraph:
              type: boolean
            nodes:
              type: array
              items:
                type: object
                properties:
                  id:
                    type: integer
                  name:
                    type: string
                  pos:
                    type: array
                    items:
                      type: integer
      500:
        description: Internal Server Error
        schema:
          type: object
          properties:
            error:
              type: string
      299:
        description: Model is still training or not trained
        schema:
          type: object
          properties:
            warning:
              type: string
    """
    log.info("/analytics-backend/doSomAndPlot")

    # read json input
    data_json = json.dumps(request.get_json(silent=True))
    data_json = json.loads(data_json)

    type_chart = data_json["type_chart"]
    w2v_model_id = data_json["w2v_model_id"]
    som_model_id = data_json["som_model_id"]

    if 'url_input' in data_json:
        url_input = data_json["url_input"]
        df = pd.read_csv(url_input)

        # DEBUGGING
        debugging = conf.get('MAIN', 'debugging')
        if (debugging == 'True'):
            document_path_file = conf.get('MAIN', 'path_document')
            df = pd.read_csv(document_path_file,
                             encoding='utf-8',
                             error_bad_lines=False)
            df = df.head()

        input_list = df['message'].tolist()
    else:
        input_list = json.dumps(data_json["tweets"])
        input_list = pd.read_json(input_list,
                                  encoding='utf8')['message'].tolist()

    filename = conf.get('MAIN', 'path_pickle_w2v_model_incr_fold'
                        ) + "word2vec_" + str(w2v_model_id) + ".pickle"
    try:
        model = word2vec_ms.Word2Vec.load(filename)
    except:
        filename = conf.get(
            'MAIN', 'path_pickle_w2v_model_incr_fold') + "word2vec_" + str(
                w2v_model_id) + "_training.txt"
        return returnModelStatus(filename, w2v_model_id)

    # get embedded words from input
    cleaned_tweet_list = clean_text_ms.cleanText(input_list)
    embedded_words, dict_index2word, dict_word2index = word2vec_ms.getEmbeddedWords(
        cleaned_tweet_list, model)

    filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                        ) + "som_" + str(som_model_id) + ".pickle"
    try:
        som_model = som_ms.load_obj(filename)
    except:
        filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                            ) + "som_" + str(som_model_id) + "_training.txt"
        return returnModelStatus(filename, som_model_id)

    file_name = conf.get('MAIN', 'MST_html_d3_output_file')
    response = som_ms.doSomAndPlot(som_model, embedded_words, dict_index2word,
                                   file_name, type_chart)

    if (type_chart == "d3"):
        return render_template('MST_d3.html')
    elif (type_chart == "json"):
        return jsonify(response)
    else:
        return internalServerError(500)

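# Example request body for /analytics-backend/doSomAndPlot (illustrative only;
# the model ids and messages are placeholders, and either "tweets" or
# "url_input" may be supplied as described in the docstring above):
#
#   {
#       "w2v_model_id": "<id returned by /analytics-backend/trainWord2Vec>",
#       "som_model_id": "<id of a trained SOM model>",
#       "type_chart": "json",
#       "tweets": [{"message": "a raw tweet"}, {"message": "another tweet"}]
#   }
#
# With "type_chart": "d3" the endpoint renders the MST_d3.html template instead
# of returning the graph as JSON.
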
def trainWord2Vec():
    """
    Train word2vec model
    Get a list of tweet messages or the url of the csv with tweet messages,
    return the id of the model that will be trained and start the training in
    a new thread (the training process takes hours)
    ---
    parameters:
      - in: body
        name: body
        schema:
          type: object
          properties:
            url_input:
              type: string
              description: url of csv with tweet messages
            tweets:
              type: array
              items:
                type: object
                properties:
                  message:
                    type: string
                    description: tweet message
        required: true
    responses:
      200:
        description: Id of the model being trained
        schema:
          type: object
          properties:
            w2v_model_id:
              type: string
      500:
        description: Internal Server Error
        schema:
          type: object
          properties:
            error:
              type: string
    """
    log.info("/analytics-backend/trainWord2Vec")

    data_json = json.dumps(request.get_json(silent=True))
    data_json = json.loads(data_json)

    if 'url_input' in data_json:
        url_input = data_json["url_input"]
        df = pd.read_csv(url_input)

        # DEBUGGING
        debugging = conf.get('MAIN', 'debugging')
        if (debugging == 'True'):
            document_path_file = conf.get('MAIN', 'path_document')
            df = pd.read_csv(document_path_file,
                             encoding='utf-8',
                             error_bad_lines=False)
            df = df.head(100)

        input_list = df['message'].tolist()
    else:
        input_list = json.dumps(data_json["tweets"])
        input_list = pd.read_json(input_list,
                                  encoding='utf8')['message'].tolist()

    cleaned_input_list = clean_text_ms.cleanText(input_list)
    corpus = Corpus.createCorpus(cleaned_input_list)

    identifier = core.utility.utilities.getUniqueIdentifier()
    thread.start_new_thread(word2vec_ms.trainNewModelW2Vmodel,
                            (corpus, identifier))

    response = jsonify({"w2v_model_id": identifier})
    return response

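# Example request body for /analytics-backend/trainWord2Vec (illustrative only;
# the URL is a placeholder -- either "url_input" or "tweets" can be supplied):
#
#   {"url_input": "http://example.com/tweets.csv"}
#
# or
#
#   {"tweets": [{"message": "a raw tweet"}, {"message": "another tweet"}]}
#
# The endpoint responds immediately with {"w2v_model_id": "<identifier>"} while
# training continues in a background thread.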