def config_udpipe(self, language_name):
    """Attach the udpipe word segmenter for *language_name* to this service.

    Resolves the pre-model and corpus paths from the module-level
    ``udpipe_language`` / ``corpus_language`` tables. Word segmentation for
    each sentence happens at the preprocessing level. Returns ``self`` so
    the call can be chained.
    """
    model_path = udpipe_language[language_name]
    corpus_path = corpus_language[language_name]
    self.udt_pre_model = UdpipeTrain(language_name, model_path, corpus_path)
    return self
def config_udpipe(self, language_name):
    """Attach the udpipe word segmenter to this service; returns self for chaining.

    Loads udpipe first so each sentence can be segmented into words — all of
    this belongs at the preprocessing level.

    TODO: once language_name is known, look up the matching udpipe model and
    corpus instead of the hard-coded English paths below.
    """
    pre_model_path = '/home/zglg/SLU/psd/pre-model/english-ewt-ud-2.5-191206.udpipe'
    corpus_path = '/home/zglg/SLU/psd/corpus/english/wiki_en.txt'
    self.udt_pre_model = UdpipeTrain(language_name, pre_model_path, corpus_path)
    return self
def batch():
    """Train a word2vec model for every configured language except Chinese and English."""
    skipped = ('Chinese', 'English')
    for language in language_list:
        if language in skipped:
            continue
        # resolve per-language resources from the module-level lookup tables
        pre_model_path = udpipe_language[language]
        corpus_path = corpus_language[language]
        # first: load udpipe to segment words for each sentence
        segmenter = UdpipeTrain(language, pre_model_path, corpus_path)
        # second: train to get the word2vec model
        result_prefix = 'input//word2vecmodel//gensim-word2vec-model-'
        train_model(language, corpus_path, result_prefix, segmenter)
# file_path = '/home/zglg/SLU/psd/cluster_pre_train/gensim-word2vec-model-'
parser = argparse.ArgumentParser(
    description='train corpus to get our word2vec for multiple languages')
parser.add_argument('-udfp', help='udpipe pre-model filepath')
parser.add_argument('-cfp', help='corpus filepath for a specific language')
parser.add_argument('-wvfp', help='word vector filepath after finishing train')
args = parser.parse_args()

# BUG FIX: argparse sets every declared option on the Namespace (None when the
# flag is omitted), so `'udfp' in args` was always True and the "please input"
# messages could never fire. Test the parsed value instead.
if args.udfp is not None:
    udpipe_pre_model_path = args.udfp
else:
    print('please input udpipe pre-model filepath')
if args.cfp is not None:
    corpus_filepath = args.cfp
else:
    print('please input corpus filepath')
if args.wvfp is not None:
    file_path = args.wvfp
else:
    print('please input word vector filepath')

# first loading udpipe to segment words for each sentence
# NOTE(review): `languange_name` (sic) is not defined anywhere in this chunk —
# presumably set elsewhere in the file; confirm before running.
udt_english = UdpipeTrain(languange_name, udpipe_pre_model_path, corpus_filepath)
# second train to get the word2vec model
train_model(languange_name, corpus_filepath, file_path, udt_english)
# finally, after train we can load model to use directly
load_model(file_path)
print('All done')
parser = argparse.ArgumentParser(description='train corpus to get our word2vec for multiple languages')
parser.add_argument('-udfp', help='udpipe pre-model filepath')
parser.add_argument('-cfp', help='corpus filepath for a specific language')
parser.add_argument('-wvfp', help='word vector filepath after finishing train')
args = parser.parse_args()

# BUG FIX: `'udfp' in args` on an argparse Namespace is always True because
# every declared option exists on the Namespace (None when omitted), so the
# warning branches were unreachable. Check the parsed value instead.
if args.udfp is not None:
    udpipe_pre_model_path = args.udfp
else:
    print('please input udpipe pre-model filepath')
if args.cfp is not None:
    corpus_filepath = args.cfp
else:
    print('please input corpus filepath')
if args.wvfp is not None:
    file_path = args.wvfp
else:
    print('please input word vector filepath')

# first loading udpipe to segment words for each sentence
# NOTE(review): `languange_name` (sic) is undefined in this chunk —
# presumably set elsewhere in the file; confirm before running.
udt_chinese = UdpipeTrain(languange_name, udpipe_pre_model_path, corpus_filepath)
# second train to get the word2vec model
train_model(languange_name, corpus_filepath, file_path, udt_chinese)
# finally, after train we can load model to use directly
load_model(file_path)
print('All done')
class AppService(object):
    """Application service: word/POS lookups from the database plus clustering
    of example sentences via word2vec vectors and multiple cluster strategies."""

    def __init__(self):
        # POS -> [sentence, ...] mapping filled in by find_service
        self.pos_dict = None
        # raw DB rows; find_service reshapes this into (word, pos, [sentences]) tuples
        self.sel_result = None
        # udpipe word segmenter installed by config_udpipe
        self.udt_pre_model = None

    def config_udpipe(self, language_name):
        """Install the udpipe segmenter/corpus for *language_name*; returns self for chaining."""
        # first loading udpipe to segment words for each sentence
        # all these need to be at preprocessed level
        self.udt_pre_model = UdpipeTrain(language_name, udpipe_language[language_name], corpus_language[language_name])
        return self

    def find_service(self, language_name: str, sel_word: str):
        """This method gets results from database by specified language_name and
        input word, and assigns values to self.pos_dict and self.sel_result.

        :param language_name: table-name prefix of the language tables to query
        :param sel_word: the word to look up
        :return: None
        """
        # select
        # NOTE(review): the table names are interpolated directly into the SQL
        # string — safe only if language_name comes from a trusted fixed list; confirm.
        sql_str = "select * from " + language_name + "_wordpos as w left join " + language_name + "_sentences as s on " \
                  "w.sentence = s.id " \
                  "where w.word = %s "
        try:
            cursor.execute(sql_str, (sel_word, ))
            self.sel_result = cursor.fetchall()
            cnx.commit()
        except Exception as e:
            print(e)
        # convert to data structure following
        # sel_result = (("sink", "NOUN", ["Don't just leave your dirty plates in the sink!"]),
        #               ("sink", "VERB", ["The wheels, started to sink into the mud.", "How could you sink so low?"]))
        self.pos_dict = defaultdict(list)
        for row in self.sel_result:
            pos_sentences = self.pos_dict[row[POS_COLUMN_INDEX]]
            # de-duplicate sentences per POS
            if row[SENTENCE_COLUMN_INDEX] not in pos_sentences:
                pos_sentences.append(row[SENTENCE_COLUMN_INDEX])
        self.sel_result = [(sel_word, k, self.pos_dict[k]) for k in self.pos_dict]

    def database(self):
        """Fetch all English sentences from the DB into a one-column DataFrame."""
        self.store_data = StoreData(db_config['user'], db_config['password'], db_config['host'], db_config['database'])
        self.cursor = self.store_data.db_connect().cursor()
        query_info = "SELECT sentence FROM english_sentences"
        self.cursor.execute(query_info)
        sentences_df = pd.DataFrame(self.cursor.fetchall(), columns=['Sentences'])
        return sentences_df

    def clusteringData(self):
        """Same query as database(): English sentences as a one-column DataFrame."""
        self.store_data = StoreData(db_config['user'], db_config['password'], db_config['host'], db_config['database'])
        self.cursor = self.store_data.db_connect().cursor()
        query_info = "SELECT sentence FROM english_sentences"
        self.cursor.execute(query_info)
        sentences_dataframe = pd.DataFrame(self.cursor.fetchall(), columns=['Sentences'])
        return sentences_dataframe

    def cluster_sentences(self, language_name: str, save_path: str, sentences: List[str], n_clusters: int) -> List[str]:
        """
        cluster sentences to get examples
        :param language_name:
        :param save_path: the saved path for our cluster udpipemodel trained well
        :param sentences: candidate sentences to cluster
        :param n_clusters: requested cluster count; '' means "auto" (defaults to 2)
        :return: (examples, recommend_sentences) — one sentence per cluster plus
                 the recommendation from the auto-selected cluster count; returns
                 None early on invalid parameters
        """
        no_n_input = False
        if n_clusters == '':
            # no explicit count supplied: fall back to 2 and prefer the
            # auto-recommended clustering at the end
            n_clusters, no_n_input = 2, True
        n_clusters = int(n_clusters)
        if n_clusters <= 0:
            print("Parameter is Invalid")
            return
        if n_clusters > len(sentences):
            # TODO add log
            print('number of cluster bigger than sentences count')
            return
        if len(self.sel_result) <= 0:
            print('no sentence')
            return
        # first loading model
        # first loading udpipemodel
        word2vec_model = load_model(save_path)
        # second getting vectors for one sentence
        sent_vectors = []
        default_dimn = 100
        # iterate over sentences
        for sent in sentences:
            words = self.udt_pre_model.word_segmentation(sent)
            word_vectors = []
            # iterate over words: only those in a window of 5 around the keyword
            window_words = get_keyword_window(self.sel_result[0][0], words, 5)
            for word in window_words:
                if word in word2vec_model.wv:
                    word_vectors.append(word2vec_model.wv[word])
                # else:
                # not in dict, fill 0
                # word_vectors.append([0] * default_dimn)
            # NOTE(review): if no window word is in the vocabulary, the mean of
            # an empty array is NaN — confirm upstream guarantees coverage.
            to_array = np.array(word_vectors)
            sent_vectors.append(to_array.mean(axis=0).tolist())
        # third using kmeans to cluster: compare kmeans vs agglomerative by score
        best_score, best_labels = -1, None
        evaluator = Evaluator(sent_vectors)
        labels1 = evaluator.kmeans_strategy(n_clusters)
        score1 = evaluator.higher_better_score(labels1)
        labels2 = evaluator.agglomerative_strategy(n_clusters)
        score2 = evaluator.higher_better_score(labels2)
        if score1 < score2:
            best_score = score2
            best_labels = labels2
            print('agglomerative is better than kmeans')
        else:
            best_score = score1
            best_labels = labels1
            print('kmeans is better than agglomerative')
        labels3, n_clusters = evaluator.get_best_n_clusters()
        score3 = evaluator.higher_better_score(labels3)
        if best_score < score3:
            best_labels, best_score = labels3, score3
            # NOTE(review): the next three statements immediately overwrite the
            # just-selected auto result with the kmeans result — this looks like
            # a merge/paste artifact; confirm intent before relying on it.
            best_score = score1
            best_labels = labels1
            print('kmeans is better than agglomerative')
        # fourth select one sentence with each label
        examples = self._get_examples(sentences, best_labels, n_clusters)
        # NOTE(review): get_best_n_clusters() is called a second time here —
        # likely another duplication from a merge; verify against history.
        labels3, recommend_clusters = evaluator.get_best_n_clusters()
        score3 = evaluator.higher_better_score(labels3)
        if best_score < score3:
            print('recommend %d sentences' % (recommend_clusters, ))
        recommend_sentences = self._get_examples(sentences, labels3, recommend_clusters)
        if no_n_input:
            # caller gave no count: hand back the auto-recommended selection
            examples = recommend_sentences
        return examples, recommend_sentences

    def _get_examples(self, sentences: List[str], best_labels, n_clusters: int):
        """Pick one representative sentence per cluster label, padding with
        unused sentences until n_clusters examples are collected."""
        tmp_labels, examples = [], []
        for sent, label in zip(sentences, best_labels):
            if label not in tmp_labels:
                tmp_labels.append(label)
                examples.append(sent)
                if len(examples) == n_clusters:
                    break
        # add bottom logic for cluster: top up with any not-yet-used sentences
        if len(examples) < n_clusters:
            for sent in sentences:
                if sent not in examples:
                    examples.append(sent)
                    if len(examples) >= n_clusters:
                        break
        return examples
class AppService(object):
    """Application service (earlier variant): word/POS lookups from the database
    plus a simple KMeans-based sentence clustering."""

    def __init__(self):
        # POS -> [sentence, ...] mapping filled in by find_service
        self.pos_dict = None
        # raw DB rows; find_service reshapes this into (word, pos, [sentences]) tuples
        self.sel_result = None
        # udpipe word segmenter installed by config_udpipe
        self.udt_pre_model = None

    def config_udpipe(self, language_name):
        """Install the udpipe segmenter; returns self for chaining."""
        # first loading udpipe to segment words for each sentence
        # TODO: once getting language_name, then to find the related udpipe and corpus
        # all these need to be at preprocessed level
        self.udt_pre_model = UdpipeTrain(language_name, '/home/zglg/SLU/psd/pre-model/english-ewt-ud-2.5-191206.udpipe', '/home/zglg/SLU/psd/corpus/english/wiki_en.txt')
        return self

    def find_service(self, language_name: str, sel_word: str):
        """This method gets results from database by specified language_name and
        input word, and assigns values to self.pos_dict and self.sel_result.

        :param language_name: table-name prefix of the language tables to query
        :param sel_word: the word to look up
        :return: None
        """
        # select
        # NOTE(review): table names are interpolated into the SQL string — safe
        # only if language_name comes from a trusted fixed list; confirm.
        sql_str = "select * from " + language_name + "_wordpos as w left join " + language_name + "_sentences as s on w.sentence = s.id where w.word = %s "
        try:
            cursor.execute(sql_str, (sel_word,))
            self.sel_result = cursor.fetchall()
            cnx.commit()
        except Exception as e:
            print(e)
        # convert to data structure following
        # sel_result = (("sink", "NOUN", ["Don't just leave your dirty plates in the sink!"]),
        #               ("sink", "VERB", ["The wheels, started to sink into the mud.", "How could you sink so low?"]))
        self.pos_dict = defaultdict(list)
        for row in self.sel_result:
            pos_sentences = self.pos_dict[row[POS_COLUMN_INDEX]]
            # de-duplicate sentences per POS
            if row[SENTENCE_COLUMN_INDEX] not in pos_sentences:
                pos_sentences.append(row[SENTENCE_COLUMN_INDEX])
        self.sel_result = [(sel_word, k, self.pos_dict[k]) for k in self.pos_dict]

    def cluster_sentences(self, language_name: str, sentences: List[str], n_clusters: int) -> List[str]:
        """
        cluster sentences to get examples
        :param language_name: passed to load_model to locate the word2vec model
        :param sentences: candidate sentences to cluster
        :param n_clusters: number of clusters (and of returned examples)
        :return: one example sentence per cluster
        """
        # first loading model
        word2vec_model = load_model(language_name)
        # second getting vectors for one sentence
        sent_vectors = []
        default_dimn = 100
        # iterate over sentences
        for sent in sentences:
            words = self.udt_pre_model.word_segmentation(sent)
            word_vectors = []
            # iterate over words
            for word in words:
                if word in word2vec_model.wv:
                    word_vectors.append(word2vec_model.wv[word])
                else:
                    # not in dict, fill 0
                    word_vectors.append([0] * default_dimn)
            # sentence vector = mean of its word vectors
            to_array = np.array(word_vectors)
            sent_vectors.append(to_array.mean(axis=0).tolist())
        # third using kmeans to cluster
        kmeans = KMeans(n_clusters=int(n_clusters), random_state=0).fit(sent_vectors)
        labels = kmeans.labels_
        # fourth select one sentence with each label
        tmp_labels, examples = [], []
        for sent, label in zip(sentences, labels):
            if label not in tmp_labels:
                tmp_labels.append(label)
                examples.append(sent)
                # NOTE(review): if n_clusters arrives as a str this equality
                # never holds; the visible caller passes an int, so it works.
                if len(examples) == n_clusters:
                    break
        return examples
# NOTE(review): a duplicated tail of cluster_sentences (KMeans fit, per-label
# selection, ending in a bare `return`) previously sat here at module level —
# a `return` outside a function is a SyntaxError, and the code repeated L10's
# method body verbatim. It was dead paste residue and has been removed.

if __name__ == "__main__":
    # cluster a small batch of example sentences for one word ("great")
    language_name = 'English'
    sentences = [
        'Tohru shows great loyalty to whoever he stands by, even back to the time when he was an Enforcer for the Dark Hand.',
        'The Earth Demon, Dai Gui resembles a large minotaur(with the face of a guardian lion) with great strength.',
        'Al Mulock was the great-grandson of Sir William Mulock(1843–1944), the former Canadian Postmaster - General.',
        'Though his surviving images are scarce, his importance to the early history of photography in Asia is great.']
    # BUG FIX: the standalone `udt_english = UdpipeTrain(...)` built here was
    # never used — config_udpipe() constructs its own segmenter — so the
    # redundant (and expensive) model load has been dropped.
    cluster_result = AppService().config_udpipe(language_name).cluster_sentences(language_name, sentences, 2)
    print("two examples sentences: \n")
    print(cluster_result)