def __init__(self, language_name, pre_model_name, our_corpus_name):
    """
    The language of pre_model_name and our_corpus_name should be identical!
    :param language_name:
    :param pre_model_name: a pre-trained model from udpipe
    :param our_corpus_name: our own corpus file
    """
    self.language_name = language_name
    self.pre_model_name = pre_model_name
    self.our_corpus_name = our_corpus_name
    try:
        self.store_data = StoreData(db_config['user'], db_config['password'],
                                    db_host=db_config['db_host'], db_name=db_config['db_name'])
        self.cursor = self.store_data.db_connect().cursor()
    except Exception as ex:
        print('error logging in to database: %s' % ex)
def __init__(self, language_name, pre_model_name, our_corpus_name):
    """
    The language of pre_model_name and our_corpus_name should be identical!
    :param language_name:
    :param pre_model_name: a pre-trained model from udpipe
    :param our_corpus_name: our own corpus file
    """
    self.language_name = language_name
    self.pre_model_name = pre_model_name
    self.our_corpus_name = our_corpus_name
    try:
        self.store_data = StoreData(db_config['user'], db_config['password'],
                                    db_config['host'], db_config['database'])
        self.cursor = self.store_data.db_connect().cursor()
        # second: load the udpipe pre-trained model
        self.model = Model(self.pre_model_name)
        self._word_count, self.MAX_WORD_COUNT = 0, 500000
        print('\nlogging into the database will start\n')
    except Exception as ex:
        print('error logging in to database: %s' % ex)
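# Editor's sketch (assumption): StoreData lives in src.train.store and is not shown in
# this section. The stand-in below only illustrates the shape implied by the calls above
# (a constructor taking user/password plus host and database name, a db_connect() that
# returns a connection, with insert_data() omitted); it is not the project's actual class.
import pymysql


class StoreDataSketch:
    """Hypothetical stand-in for src.train.store.StoreData."""

    def __init__(self, user, password, db_host='127.0.0.1', db_name='corpus'):
        self.user, self.password = user, password
        self.db_host, self.db_name = db_host, db_name
        self.cnx = None

    def db_connect(self):
        # open a MySQL connection on first use and reuse it afterwards
        if self.cnx is None:
            self.cnx = pymysql.connect(host=self.db_host, user=self.user,
                                       password=self.password, database=self.db_name)
        return self.cnx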
class UdpipeTrain(ITrain):
    def __init__(self, language_name, pre_model_name, our_corpus_name):
        """
        The language of pre_model_name and our_corpus_name should be identical!
        :param language_name:
        :param pre_model_name: a pre-trained model from udpipe
        :param our_corpus_name: our own corpus file
        """
        self.language_name = language_name
        self.pre_model_name = pre_model_name
        self.our_corpus_name = our_corpus_name
        try:
            self.store_data = StoreData(db_config['user'], db_config['password'],
                                        db_host=db_config['db_host'], db_name=db_config['db_name'])
            self.cursor = self.store_data.db_connect().cursor()
            # second: load the udpipe pre-trained model
            self.model = Model(self.pre_model_name)
        except Exception as ex:
            print('error logging in to database: %s' % ex)

    def load_data(self) -> str:
        with open(self.our_corpus_name, 'r') as f:
            for sen in f:
                print('loading one sentence: %s' % (sen,))
                yield sen
        print('loading done for our corpus')

    def clean_data(self, data: str) -> str:
        """
        data is one or several sentence(s); if data contains \n, \t, empty strings, etc.,
        replace them
        :param data: raw data
        :return: data after cleaning
        """
        cleaned_data = re.sub('[\n\t]+', '', data)
        return cleaned_data

    def do_train(self) -> List[TResult]:
        """
        Use the pre-trained udpipe models to get the results for our corpus.
        These udpipe models can be downloaded here:
        https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3131
        :return:
        """
        # train on our corpus to get the POS for each word
        line_no = 1
        for sen in self.load_data():
            # if line_no < 1811:
            #     line_no += 1
            #     continue
            sen_clean = self.clean_data(sen)
            if not sen_clean:
                continue
            word_pos = list(self.model.process(sen_clean))
            # pprint(word_pos)
            for i, one_sentence in enumerate(word_pos):
                sentence_text = self.extract_one_sentence(one_sentence)
                results = self.extract_one_word(one_sentence, sentence_text)
                self.store_data.insert_data(self.cursor, results, self.language_name)
                print('line %d, batch %d for %s written successfully' % (line_no, i, self.language_name))
            line_no += 1
        print('all written successfully for corpus %s' % self.our_corpus_name)

    def extract_one_sentence(self, sentence) -> str:
        """
        This private method is mainly used to extract the sentence text.

        An instance of a udpipe Sentence:
        Sentence(
            comments=[
                '# sent_id = 3',
                '# text = 黄土高原严寒而漫长的冬天看来就要过去,但那真正温暖的春天还远远地没有到来。'],
            words=[
                Word(id=0, <root>),
                Word(id=1, form='黄土', lemma='黄土', xpostag='NNP', upostag='PROPN', head=3, deprel='nmod', misc='SpaceAfter=No'),
                Word(id=2, form='高原', lemma='高原', xpostag='NN', upostag='NOUN', head=3, deprel='nmod', misc='SpaceAfter=No'),
                Word(id=3, form='严寒', lemma='严寒', xpostag='NN', upostag='NOUN', head=22, deprel='nsubj', misc='SpaceAfter=No'),
                ... (remaining words omitted)
            ])
        :param sentence: udpipe Sentence
        :return: str 黄土高原严寒而漫长的冬天看来就要过去,但那真正温暖的春天还远远地没有到来。
        """
        comment = ''.join(sentence.comments)
        try:
            cs = re.findall(r'text = (.*)', comment)[0]
            return cs
        except Exception as e:
            # TODO: need to write a warning log
            print('error: sentence text not found', e)
            return ''

    def extract_one_word(self, sentence, sentence_text: str) -> List[TResult]:
        """
        This private method is mainly used to extract one word and its POS.
        :param sentence_text:
        :param sentence:
        :return: [TResult]
        """
        r = []
        for word in sentence.words:
            if word.lemma and word.lemma not in ITrain.FILTER_WORD:
                if word.lemma and word.upostag and sentence_text:
                    r.append(TResult(word.lemma, word.upostag, sentence_text))
        return r

    def word_segmentation(self, sentence) -> List[str]:
        """
        :param sentence:
        :return: word list
        """
        sen_clean = self.clean_data(sentence)
        if not sen_clean:
            return []
        word_pos = list(self.model.process(sen_clean))
        words = []
        for i, one_sentence in enumerate(word_pos):
            sentence_text = self.extract_one_sentence(one_sentence)
            results = self.extract_one_word(one_sentence, sentence_text)
            words.extend([res.word for res in results])
        return words
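# Usage sketch (editor's illustration, not part of the project): how UdpipeTrain is
# meant to be driven, assuming a downloaded udpipe model file and a plain-text corpus
# with one sentence per line. Both file names below are placeholders.
if __name__ == '__main__':
    trainer = UdpipeTrain('english',
                          'english-ewt.udpipe',      # hypothetical udpipe model path
                          'english_corpus.txt')      # hypothetical corpus path
    # segment a single sentence into lemmas without touching the database
    print(trainer.word_segmentation('The wheels started to sink into the mud.'))
    # run the full pipeline: POS-tag every corpus sentence and store the results
    trainer.do_train()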
class AppService(object):
    def __init__(self):
        self.pos_dict = None
        self.sel_result = None
        self.udt_pre_model = None

    def config_udpipe(self, language_name):
        # first: load udpipe to segment words for each sentence;
        # all of this needs to happen at the preprocessing level
        self.udt_pre_model = UdpipeTrain(language_name,
                                         udpipe_language[language_name],
                                         corpus_language[language_name])
        return self

    def find_service(self, language_name: str, sel_word: str):
        """This method gets results from the database for the specified language_name and
        input word, and assigns values to self.pos_dict and self.sel_result.
        :param language_name:
        :param sel_word:
        :return: None
        """
        # select
        sql_str = "select * from " + language_name + "_wordpos as w left join " \
                  + language_name + "_sentences as s on w.sentence = s.id " \
                  "where w.word = %s "
        try:
            cursor.execute(sql_str, (sel_word, ))
            self.sel_result = cursor.fetchall()
            cnx.commit()
        except Exception as e:
            print(e)
        # convert to the following data structure:
        # sel_result = (("sink", "NOUN", ["Don't just leave your dirty plates in the sink!"]),
        #               ("sink", "VERB", ["The wheels, started to sink into the mud.", "How could you sink so low?"]))
        self.pos_dict = defaultdict(list)
        for row in self.sel_result:
            pos_sentences = self.pos_dict[row[POS_COLUMN_INDEX]]
            if row[SENTENCE_COLUMN_INDEX] not in pos_sentences:
                pos_sentences.append(row[SENTENCE_COLUMN_INDEX])
        self.sel_result = [(sel_word, k, self.pos_dict[k]) for k in self.pos_dict]

    def database(self):
        self.store_data = StoreData(db_config['user'], db_config['password'],
                                    db_config['host'], db_config['database'])
        self.cursor = self.store_data.db_connect().cursor()
        query_info = "SELECT sentence FROM english_sentences"
        self.cursor.execute(query_info)
        sentences_df = pd.DataFrame(self.cursor.fetchall(), columns=['Sentences'])
        return sentences_df

    def clusteringData(self):
        self.store_data = StoreData(db_config['user'], db_config['password'],
                                    db_config['host'], db_config['database'])
        self.cursor = self.store_data.db_connect().cursor()
        query_info = "SELECT sentence FROM english_sentences"
        self.cursor.execute(query_info)
        sentences_dataframe = pd.DataFrame(self.cursor.fetchall(), columns=['Sentences'])
        return sentences_dataframe

    def cluster_sentences(self, language_name: str, save_path: str,
                          sentences: List[str], n_clusters: int) -> List[str]:
        """
        Cluster sentences to get example sentences.
        :param language_name:
        :param save_path: the path of our trained word2vec model used for clustering
        :param sentences:
        :param n_clusters:
        :return:
        """
        no_n_input = False
        if n_clusters == '':
            n_clusters, no_n_input = 2, True
        n_clusters = int(n_clusters)
        if n_clusters <= 0:
            print('parameter is invalid')
            return
        if n_clusters > len(sentences):
            # TODO add log
            print('number of clusters is bigger than the sentence count')
            return
        if len(self.sel_result) <= 0:
            print('no sentence')
            return

        # first: load the trained word2vec model
        word2vec_model = load_model(save_path)

        # second: get a vector for each sentence
        sent_vectors = []
        default_dimn = 100
        # iterate over sentences
        for sent in sentences:
            words = self.udt_pre_model.word_segmentation(sent)
            word_vectors = []
            # iterate over the words in a window around the keyword
            window_words = get_keyword_window(self.sel_result[0][0], words, 5)
            for word in window_words:
                if word in word2vec_model.wv:
                    word_vectors.append(word2vec_model.wv[word])
                # else:  # not in the vocabulary, fill with zeros
                #     word_vectors.append([0] * default_dimn)
            to_array = np.array(word_vectors)
            sent_vectors.append(to_array.mean(axis=0).tolist())

        # third: cluster the sentence vectors with kmeans and agglomerative clustering
        best_score, best_labels = -1, None
        evaluator = Evaluator(sent_vectors)
        labels1 = evaluator.kmeans_strategy(n_clusters)
        score1 = evaluator.higher_better_score(labels1)
        labels2 = evaluator.agglomerative_strategy(n_clusters)
        score2 = evaluator.higher_better_score(labels2)
        if score1 < score2:
            best_score = score2
            best_labels = labels2
            print('agglomerative is better than kmeans')
        else:
            best_score = score1
            best_labels = labels1
            print('kmeans is better than agglomerative')

        # fourth: select one sentence for each label
        examples = self._get_examples(sentences, best_labels, n_clusters)

        # also try the recommended number of clusters
        labels3, recommend_clusters = evaluator.get_best_n_clusters()
        score3 = evaluator.higher_better_score(labels3)
        if best_score < score3:
            print('recommend %d sentences' % (recommend_clusters, ))
        recommend_sentences = self._get_examples(sentences, labels3, recommend_clusters)
        if no_n_input:
            examples = recommend_sentences
        return examples, recommend_sentences

    def _get_examples(self, sentences: List[str], best_labels, n_clusters: int):
        tmp_labels, examples = [], []
        for sent, label in zip(sentences, best_labels):
            if label not in tmp_labels:
                tmp_labels.append(label)
                examples.append(sent)
            if len(examples) == n_clusters:
                break
        # fallback logic when there are fewer distinct labels than n_clusters
        if len(examples) < n_clusters:
            for sent in sentences:
                if sent not in examples:
                    examples.append(sent)
                if len(examples) >= n_clusters:
                    break
        return examples
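# Usage sketch (editor's illustration): the intended call order for AppService, based on
# the methods above. The language key, query word, and model path are placeholders;
# cluster_sentences relies on find_service having populated self.sel_result, and returns
# (examples, recommend_sentences) only when its input checks pass.
if __name__ == '__main__':
    service = AppService().config_udpipe('english')
    service.find_service('english', 'sink')
    # flatten sel_result = [(word, pos, [sentences]), ...] into one sentence list
    sentences = [sent for _, _, sents in service.sel_result for sent in sents]
    examples, recommended = service.cluster_sentences('english',
                                                      'models/english_word2vec.model',  # hypothetical path
                                                      sentences,
                                                      3)
    print(examples)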
import pandas as pd
import numpy as np
from typing import List
from sklearn.cluster import KMeans
from collections import defaultdict
import pymysql

from src.train.result_model import TResult
from src.train.store import StoreData
from src.util import *
from src.train.train_cluster import load_model
from src.train.train_model import UdpipeTrain
from src.train.cluster import Evaluator

try:
    store_data = StoreData(db_config['user'], db_config['password'],
                           db_config['host'], db_config['database'])
    cnx = store_data.db_connect()
    cursor = cnx.cursor()
except Exception as ex:
    print('error logging in to database: %s' % ex)

POS_COLUMN_INDEX, SENTENCE_COLUMN_INDEX = 2, 6


class AppService(object):
    def __init__(self):
        self.pos_dict = None
        self.sel_result = None
        self.udt_pre_model = None

    def config_udpipe(self, language_name):
        # first: load udpipe to segment words for each sentence
        self.udt_pre_model = UdpipeTrain(language_name,
                                         udpipe_language[language_name],
                                         corpus_language[language_name])
        return self
import json
from typing import List
import sys
import os
import numpy as np
from sklearn.cluster import KMeans

from src.train.result_model import TResult
from src.train.store import StoreData
from src.util import language_dict, language_list, db_config
from src.train.train_cluster import load_model
from src.train.train_model import UdpipeTrain

try:
    store_data = StoreData(db_config['user'], db_config['password'],
                           db_host=db_config['db_host'], db_name=db_config['db_name'])
    cnx = store_data.db_connect()
    cursor = cnx.cursor()
except Exception as ex:
    print('error logging in to database: %s' % ex)

POS_COLUMN_INDEX, SENTENCE_COLUMN_INDEX = 2, 6


class AppService(object):
    def __init__(self):
        self.pos_dict = None
        self.sel_result = None
        self.udt_pre_model = None
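# Editor's sketch (assumption): the shape of the db_config dict provided by src.util,
# inferred only from the keys read above. Values are placeholders, not real credentials.
# Note that both key styles appear in this code base: 'host'/'database' in some snippets
# and 'db_host'/'db_name' in others.
db_config_example = {
    'user': 'corpus_user',
    'password': 'change-me',
    'db_host': '127.0.0.1',   # other snippets read this key as 'host'
    'db_name': 'corpus_db',   # other snippets read this key as 'database'
}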