Example #1
0
 def clusteringData(self):
     """Load every English sentence from the database into a DataFrame.

     Opens a fresh StoreData connection, selects all rows of the
     english_sentences table and wraps them in a one-column DataFrame.

     :return: pandas DataFrame with the single column 'Sentences'
     """
     self.store_data = StoreData(db_config['user'], db_config['password'],
                                 db_config['host'], db_config['database'])
     connection = self.store_data.db_connect()
     self.cursor = connection.cursor()
     self.cursor.execute("SELECT sentence FROM english_sentences")
     return pd.DataFrame(self.cursor.fetchall(), columns=['Sentences'])
Example #2
0
 def __init__(self, language_name, pre_model_name, our_corpus_name):
     """Remember the model/corpus configuration and open a DB cursor.

     The language of pre_model_name and our_corpus_name should be identical!
     :param language_name: name of the language being processed
     :param pre_model_name: it's from udpipe
     :param our_corpus_name: it's our found
     """
     self.language_name = language_name
     self.pre_model_name = pre_model_name
     self.our_corpus_name = our_corpus_name
     try:
         store = StoreData(db_config['user'],
                           db_config['password'],
                           db_host=db_config['db_host'],
                           db_name=db_config['db_name'])
         self.store_data = store
         self.cursor = store.db_connect().cursor()
     except Exception as ex:
         print('logging in database error %s' % ex)
Example #3
0
 def __init__(self, language_name, pre_model_name, our_corpus_name):
     """
     Store the training configuration, connect to the database and load
     the pre-trained udpipe model.

     The language of pre_model_name and our_corpus_name should be identical!
     :param language_name: name of the language being trained
     :param pre_model_name: it's from udpipe
     :param our_corpus_name: it's our found
     """
     self.language_name = language_name
     self.pre_model_name = pre_model_name
     self.our_corpus_name = our_corpus_name
     try:
         # first: open a connection/cursor to the results database
         self.store_data = StoreData(db_config['user'],
                                     db_config['password'],
                                     db_config['host'],
                                     db_config['database'])
         self.cursor = self.store_data.db_connect().cursor()
         # second loading udpipe pre-train model
         self.model = Model(self.pre_model_name)
         # presumably a running counter plus a cap of 500000 processed
         # words -- their use is not visible in this snippet, confirm
         self._word_count, self.MAX_WORD_COUNT = 0, 500000
         print('\n logging will start in database \n')
     except Exception as ex:
         # any failure above (DB or model loading) is reported, not re-raised
         print('logging in database error %s' % ex)
Example #4
0
class UdpipeTrain(ITrain):
    """Annotate a corpus with a pre-trained udpipe model and store the
    resulting (word, POS, sentence) triples in the database.

    Pre-trained udpipe models can be downloaded here:
    https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3131
    """

    def __init__(self, language_name, pre_model_name, our_corpus_name):
        """
        The language of pre_model_name and our_corpus_name should be identical!
        :param language_name: language being processed
        :param pre_model_name: it's from udpipe
        :param our_corpus_name: it's our found
        """
        self.language_name = language_name
        self.pre_model_name = pre_model_name
        self.our_corpus_name = our_corpus_name
        try:
            # first: connect to the database that stores the results
            self.store_data = StoreData(db_config['user'],
                                        db_config['password'],
                                        db_host=db_config['db_host'],
                                        db_name=db_config['db_name'])
            self.cursor = self.store_data.db_connect().cursor()

            # second loading udpipe pre-train model
            self.model = Model(self.pre_model_name)

        except Exception as ex:
            # failures are reported only; attributes such as self.model may
            # then be missing on later calls
            print('logging in database error %s' % ex)

    def load_data(self):
        """Yield the corpus one raw line (sentence) at a time.

        NOTE(review): previously annotated ``-> str`` although this is a
        generator; the wrong annotation was dropped.  ``open`` is given no
        encoding, so the platform default applies -- confirm the corpus
        encoding and pass it explicitly if needed.
        """
        with open(self.our_corpus_name, 'r') as f:
            for sen in f:
                print('loading one sentence: %s' % (sen,))
                yield sen

        print('loading done for our corpus')

    def clean_data(self, data: str) -> str:
        """
        data is one or several sentence(s) we expect

        Newlines and tabs are stripped out.

        :param data: raw data
        :return: data after cleaning
        """
        return re.sub(r'[\n\t]+', '', data)

    def do_train(self) -> None:
        """
        By pre-trained modules of udpipe get the results for our corpus and
        write each sentence's (word, POS, sentence) rows to the database.

        NOTE(review): previously annotated ``-> List[TResult]`` but nothing
        was ever returned; corrected to ``-> None``.
        """
        # train our corpus to get POS for each word
        line_no = 1
        for sen in self.load_data():
            sen_clean = self.clean_data(sen)
            if not sen_clean:
                continue
            word_pos = list(self.model.process(sen_clean))
            for i, one_sentence in enumerate(word_pos):
                sentence_text = self.extract_one_sentence(one_sentence)
                results = self.extract_one_word(one_sentence, sentence_text)
                self.store_data.insert_data(self.cursor, results, self.language_name)
                print('line %d, batch %d for %s written succeed' % (line_no, i, self.language_name))
            line_no += 1
        print(' all written succeed for corpus of %s' % self.our_corpus_name)

    def extract_one_sentence(self, sentence) -> str:
        """Extract the plain sentence text from a udpipe Sentence.

        udpipe keeps the original text inside the sentence comments, e.g.::

            # sent_id = 3
            # text = 黄土高原严寒而漫长的冬天看来就要过去,...

        :param sentence: udpipe Sentence
        :return: the text following ``text = `` in the comments, or ''
                 when no such comment exists
        """
        comment = ''.join(sentence.comments)
        try:
            # findall(...)[0] raises IndexError when no 'text = ' comment
            # is present; narrowed from the previous bare Exception catch
            return re.findall(r'text = (.*)', comment)[0]
        except IndexError as e:
            # TODO: need to write warning log
            print('error: not find a sentence', e)
            return ''

    def extract_one_word(self, sentence, sentence_text: str) -> List[TResult]:
        """Collect (lemma, POS, sentence) triples for the words of a sentence.

        NOTE(review): the old annotation ``[TResult]`` was not a valid type
        (a list literal); it is now ``List[TResult]``.  A duplicated
        ``word.lemma`` truthiness check was also removed.

        :param sentence: udpipe Sentence
        :param sentence_text: plain text of that sentence
        :return: list of TResult
        """
        r = []
        if not sentence_text:
            # without the sentence text no result row can be built
            return r
        for word in sentence.words:
            # require lemma and POS tag; drop tokens listed in FILTER_WORD
            if (word.lemma and word.upostag
                    and word.lemma not in ITrain.FILTER_WORD):
                r.append(TResult(word.lemma, word.upostag, sentence_text))
        return r

    def word_segmentation(self, sentence) -> List[str]:
        """Segment a sentence into words via the udpipe model.

        :param sentence: raw sentence text
        :return: word list (lemma of every accepted word)
        """
        sen_clean = self.clean_data(sentence)
        if not sen_clean:
            return []
        words = []
        # iterate the processed sentences directly; no need to materialize
        for one_sentence in self.model.process(sen_clean):
            sentence_text = self.extract_one_sentence(one_sentence)
            results = self.extract_one_word(one_sentence, sentence_text)
            words.extend(res.word for res in results)
        return words
Example #5
0
class AppService(object):
    """Facade exposing word lookup and sentence-clustering services."""

    def __init__(self):
        # POS -> sentence-list mapping built by find_service
        self.pos_dict = None
        # (word, POS, sentences) tuples from the last find_service call
        self.sel_result = None
        # UdpipeTrain instance created by config_udpipe
        self.udt_pre_model = None

    def config_udpipe(self, language_name):
        """Load the udpipe model and corpus configured for language_name.

        :param language_name: key into udpipe_language / corpus_language
        :return: self, to allow call chaining
        """
        # first loading udpipe to segment word for each sentence
        # all these need to be at preprocessed level
        self.udt_pre_model = UdpipeTrain(language_name,
                                         udpipe_language[language_name],
                                         corpus_language[language_name])
        return self

    def find_service(self, language_name: str, sel_word: str):
        """Query the database for a word and group its sentences by POS.

        Assigns self.pos_dict (POS -> list of sentences) and
        self.sel_result ([(word, POS, sentences), ...]).

        :param language_name: selects the <language>_wordpos and
            <language>_sentences tables
        :param sel_word: the word to look up (parameterized in the query)
        :return: None
        """
        # SECURITY NOTE(review): language_name is concatenated into table
        # names and cannot be bound as a parameter -- callers must only pass
        # values from a fixed, trusted language list.
        sql_str = ("select * from " + language_name + "_wordpos as w left join "
                   + language_name + "_sentences as s on "
                   "w.sentence = s.id "
                   "where w.word = %s ")
        try:
            cursor.execute(sql_str, (sel_word, ))
            self.sel_result = cursor.fetchall()
            cnx.commit()
        except Exception as e:
            print(e)
            # BUGFIX: on a failed query the code used to fall through and
            # iterate a possibly-None/stale sel_result below
            self.sel_result = []

        # convert to data structure following
        # sel_result = (("sink", "NOUN", ["Don't just leave your dirty plates in the sink!"]),
        #                ("sink", "VERB", ["The wheels, started to sink into the mud.", "How could you sink so low?"]))
        self.pos_dict = defaultdict(list)
        for row in self.sel_result:
            pos_sentences = self.pos_dict[row[POS_COLUMN_INDEX]]
            if row[SENTENCE_COLUMN_INDEX] not in pos_sentences:
                pos_sentences.append(row[SENTENCE_COLUMN_INDEX])
        self.sel_result = [(sel_word, k, self.pos_dict[k])
                           for k in self.pos_dict]

    def database(self):
        """Fetch all English sentences as a one-column DataFrame.

        :return: DataFrame with the single column 'Sentences'
        """
        self.store_data = StoreData(db_config['user'], db_config['password'],
                                    db_config['host'], db_config['database'])
        self.cursor = self.store_data.db_connect().cursor()
        query_info = "SELECT sentence FROM english_sentences"
        self.cursor.execute(query_info)
        sentences_df = pd.DataFrame(self.cursor.fetchall(),
                                    columns=['Sentences'])
        return sentences_df

    def clusteringData(self):
        """Alias of database(), kept for backward compatibility.

        The original body was a line-for-line duplicate of database().
        """
        return self.database()

    def cluster_sentences(self, language_name: str, save_path: str,
                          sentences: List[str], n_clusters: int):
        """
        Cluster sentences to pick representative examples.

        NOTE(review): the previous ``-> List[str]`` annotation was wrong --
        the method returns a 2-tuple (or None on invalid input) -- so it
        was removed.

        :param language_name: (unused here, kept for interface stability)
        :param save_path: the saved path of the trained word2vec model
        :param sentences: candidate sentences
        :param n_clusters: requested cluster count, or '' to auto-pick
        :return: (examples, recommend_sentences) or None on invalid input
        """
        no_n_input = False
        if n_clusters == '':
            # no explicit count: start from 2 and prefer the auto pick below
            n_clusters, no_n_input = 2, True
        n_clusters = int(n_clusters)
        if n_clusters <= 0:
            print("Parameter is Invalid")
            return
        if n_clusters > len(sentences):
            # TODO add log
            print('number of cluster bigger than sentences count')
            return
        if len(self.sel_result) <= 0:
            print('no sentence')
            return
        # first loading the trained word2vec model
        word2vec_model = load_model(save_path)
        # second: build one vector per sentence
        sent_vectors = []
        default_dimn = 100  # assumes the word2vec dimension is 100 -- confirm
        for sent in sentences:
            words = self.udt_pre_model.word_segmentation(sent)
            word_vectors = []
            # only words in a window around the looked-up keyword contribute
            window_words = get_keyword_window(self.sel_result[0][0], words, 5)
            for word in window_words:
                if word in word2vec_model.wv:
                    word_vectors.append(word2vec_model.wv[word])
            if word_vectors:
                sent_vectors.append(np.array(word_vectors).mean(axis=0).tolist())
            else:
                # BUGFIX: mean of an empty array produced NaN and a ragged
                # vector list; fall back to an all-zero vector instead
                sent_vectors.append([0.0] * default_dimn)

        # third: cluster with two strategies and keep the better scoring one
        evaluator = Evaluator(sent_vectors)
        labels1 = evaluator.kmeans_strategy(n_clusters)
        score1 = evaluator.higher_better_score(labels1)
        labels2 = evaluator.agglomerative_strategy(n_clusters)
        score2 = evaluator.higher_better_score(labels2)
        if score1 < score2:
            best_score, best_labels = score2, labels2
            print('agglomerative is better than kmeans')
        else:
            best_score, best_labels = score1, labels1
            print('kmeans is better than agglomerative')

        # let the evaluator propose its own cluster count as a third option
        # NOTE(review): this overwrites the caller's n_clusters, so the
        # examples below use the recommended count -- confirm this is intended
        labels3, n_clusters = evaluator.get_best_n_clusters()
        score3 = evaluator.higher_better_score(labels3)
        if best_score < score3:
            # BUGFIX: stray copy-pasted lines here used to immediately
            # overwrite best_labels/best_score with the kmeans result again
            best_labels, best_score = labels3, score3

        # fourth select one sentence with each label
        examples = self._get_examples(sentences, best_labels, n_clusters)

        labels3, recommend_clusters = evaluator.get_best_n_clusters()
        score3 = evaluator.higher_better_score(labels3)
        if best_score < score3:
            print('recommend %d sentences' % (recommend_clusters, ))
        recommend_sentences = self._get_examples(sentences, labels3,
                                                 recommend_clusters)

        if no_n_input:
            examples = recommend_sentences

        return examples, recommend_sentences

    def _get_examples(self, sentences: List[str], best_labels,
                      n_clusters: int):
        """Pick the first sentence of each cluster label, up to n_clusters.

        If fewer than n_clusters distinct labels occur, pad with the
        remaining sentences in their original order.

        :param sentences: sentences aligned index-wise with best_labels
        :param best_labels: cluster label per sentence
        :param n_clusters: number of examples wanted
        :return: list of example sentences
        """
        tmp_labels, examples = [], []
        for sent, label in zip(sentences, best_labels):
            if label not in tmp_labels:
                tmp_labels.append(label)
                examples.append(sent)
            if len(examples) == n_clusters:
                break
        # add bottom logic for cluster
        if len(examples) < n_clusters:
            for sent in sentences:
                if sent not in examples:
                    examples.append(sent)
                if len(examples) >= n_clusters:
                    break
        return examples
Example #6
0
import pandas as pd
import numpy as np
from typing import List
from sklearn.cluster import KMeans
from collections import defaultdict
import pymysql

from src.train.result_model import TResult
from src.train.store import StoreData
from src.util import *
from src.train.train_cluster import load_model
from src.train.train_model import UdpipeTrain
from src.train.cluster import Evaluator

# Module-level DB bootstrap: one shared connection/cursor for the queries
# issued by the service code below.
try:
    store_data = StoreData(db_config['user'], db_config['password'],
                           db_config['host'], db_config['database'])
    cnx = store_data.db_connect()
    cursor = cnx.cursor()
except Exception as ex:
    # connection problems are only reported; any later use of `cursor`
    # will then fail with a NameError
    print('logging in database error %s' % ex)

# Column positions of the POS tag and the sentence text in rows returned by
# the wordpos/sentences join query -- confirm against the table schemas.
POS_COLUMN_INDEX, SENTENCE_COLUMN_INDEX = 2, 6


class AppService(object):
    def __init__(self):
        """Create an empty service; state is filled in by later calls."""
        # POS -> sentence-list mapping built by find_service
        self.pos_dict = None
        # result rows/tuples of the last find_service query
        self.sel_result = None
        # UdpipeTrain instance set by config_udpipe
        self.udt_pre_model = None

    def config_udpipe(self, language_name):
Example #7
0
import json
from typing import List
import sys
import os
import numpy as np
from sklearn.cluster import KMeans

from src.train.result_model import TResult
from src.train.store import StoreData
from src.util import language_dict, language_list, db_config
from src.train.train_cluster import load_model
from src.train.train_model import UdpipeTrain

# Module-level DB bootstrap: one shared connection/cursor for the queries
# issued by the service code below.
try:
    store_data = StoreData(db_config['user'],
                           db_config['password'],
                           db_host=db_config['db_host'],
                           db_name=db_config['db_name'])
    cnx = store_data.db_connect()
    cursor = cnx.cursor()
except Exception as ex:
    # connection problems are only reported; any later use of `cursor`
    # will then fail with a NameError
    print('logging in database error %s' % ex)

# Column positions of the POS tag and the sentence text in rows returned by
# the wordpos/sentences join query -- confirm against the table schemas.
POS_COLUMN_INDEX, SENTENCE_COLUMN_INDEX = 2, 6


class AppService(object):
    def __init__(self):
        """Create an empty service; state is filled in by later calls."""
        # POS -> sentence-list mapping built by the lookup service
        self.pos_dict = None
        # result rows/tuples of the last lookup query
        self.sel_result = None
        # UdpipeTrain instance, configured later
        self.udt_pre_model = None