class SpeechRecognizer:
    """Capture speech from the microphone and transcribe it with Google's recognizer."""

    def __init__(self):
        # Services
        self.__recognizer = sr.Recognizer()
        self.__logger = Logger()
        self._exceptions_handler = ExceptionsHandler()

        self.__logger.info('SpeechRecognizer was successfully initialized.', __name__)

    def recognize_speech(self):
        """Listen on the microphone and return the recognized phrase.

        Returns:
            The recognized text, lower-cased and stripped. When capturing or
            recognition fails, the message produced by the exceptions handler
            for that failure is returned instead. A ``sr.WaitTimeoutError``
            raised during recognition is only logged and listening restarts.
        """
        while True:
            # ``Exception`` rather than ``BaseException``: KeyboardInterrupt
            # and SystemExit must propagate instead of being returned as text.
            try:
                with sr.Microphone() as source:
                    speech = self.__recognizer.listen(source)

            except Exception as exception:
                error_message = self._exceptions_handler.get_error_message(exception)

                self.__logger.error(error_message, __name__)
                return error_message

            try:
                recognized = self.__recognizer.recognize_google(speech, language="ru-RU")
                return recognized.lower().strip()

            except Exception as exception:
                error_message = self._exceptions_handler.get_error_message(exception)

                if not isinstance(exception, sr.WaitTimeoutError):
                    self.__logger.error(error_message, __name__)
                    return error_message

                # Timeout: warn and go around the loop to listen again.
                self.__logger.warning(error_message, __name__)
class DocumentPreparer:
    """Split a text into unigrams, bigrams and trigrams."""

    def __init__(self):
        self.__logger = Logger()

        self.__logger.info('DocumentPreparer was successfully initialized.',
                           __name__)

    def split_into_unigrams(self, text):
        """Return every ``\\w+`` token of *text*, in order.

        Returns:
            A list of tokens, or ``None`` (after logging a warning) when
            *text* is empty.
        """
        if not text:
            self.__logger.warning('Got empty text.', __name__)
            return None

        return re.findall(r'\w+', text)

    def split_into_bigrams(self, text):
        """Return the bigrams of *text*.

        Each bigram is two neighbouring tokens sorted alphabetically and
        joined with a space.

        Returns:
            A list of bigrams, or ``None`` when *text* is empty or has fewer
            than two tokens.
        """
        return self._split_into_ngrams(text, 2)

    def split_into_trigrams(self, text):
        """Return the trigrams of *text*.

        Each trigram is three neighbouring tokens sorted alphabetically and
        joined with a space.

        Returns:
            A list of trigrams, or ``None`` when *text* is empty or has fewer
            than three tokens.
        """
        return self._split_into_ngrams(text, 3)

    def _split_into_ngrams(self, text, size):
        """Build the sorted, space-joined n-grams of *size* tokens from *text*."""
        if not text:
            self.__logger.warning('Got empty text.', __name__)
            return None

        unigrams = self.split_into_unigrams(text)

        if len(unigrams) < size:
            self.__logger.info("Text doesn't contain enough words.", __name__)
            return None

        # Zipping the token list against its own shifted copies yields every
        # window of `size` neighbouring tokens.
        windows = zip(*(unigrams[offset:] for offset in range(size)))

        return [' '.join(sorted(window)) for window in windows]
Пример #3
0
class DatabaseCursor:
    def __init__(self):
        # Services
        self.__logger = Logger()
        self._path_service = PathService()
        self._configurator = Configurator()
        self._exceptions_handler = ExceptionsHandler()

        # Data
        self._wd = os.getcwd()
        self._request_url = None
        self.databases_public_keys = None

        self.__logger.info('DatabaseCursor was successfully initialized.',
                           __name__)

    def _load_config(self):
        path_to_config = os.path.join(self._path_service.path_to_configs,
                                      'database_cursor.json')

        if os.path.exists(path_to_config):
            with open(path_to_config, 'r', encoding='utf-8') as file:
                config = json.load(file)

            self._request_url = config['request_url']
            self.databases_public_keys = config['database_public_keys']
        else:
            self.__logger.error(
                "Can't load config for DatabaseCursor (doesn't exist).",
                __name__)

    def __update_connection(self, ngram):
        path_to_db = None

        if ngram.count(' ') == 0:
            path_to_db = self._path_service.get_path_to_database('unigrams.db')

        elif ngram.count(' ') == 1:
            path_to_db = self._path_service.get_path_to_database('bigrams.db')

        elif ngram.count(' ') == 2:
            path_to_db = self._path_service.get_path_to_database('trigrams.db')

        if path_to_db and os.path.exists(path_to_db):
            self.__logger.info(f'Connected to database: {path_to_db}',
                               __name__)

            return sqlite3.connect(path_to_db)

        else:
            self.__logger.warning(f'Database lost: {path_to_db}', __name__)
            self.__logger.info('Trying to download database from cloud...',
                               __name__)

            self._configurator.download_database(path_to_db)

            self.__logger.info(f'Connected to database: {path_to_db}',
                               __name__)

            if os.path.exists(path_to_db):
                return sqlite3.connect(path_to_db)
            else:
                self.__logger.fatal("Database doesn't exist.", __name__)

    def get_entry(self, ngram):
        connection = self.__update_connection(ngram)
        cursor = connection.cursor()

        request = ("""
        SELECT * FROM 'Data' WHERE Ngram='%s'
        """) % ngram

        self.__logger.info(f'Request to DB: {request.strip()}', __name__)

        try:
            cursor.execute(request)
            self.__logger.info('Request is OK.', __name__)

        except BaseException as exception:
            connection.close()

            self.__logger.error(
                self._exceptions_handler.get_error_message(exception),
                __name__)
            return

        result = cursor.fetchone()
        self.__logger.info(f'Received data: {str(result)}', __name__)

        if result:
            connection.close()

            return result[1], result[2]

        else:
            connection.close()

    def entry_exists(self, ngram):
        connection = self.__update_connection(ngram)
        cursor = connection.cursor()

        request = ("""
        SELECT * FROM 'Data' WHERE Ngram='%s'
        """) % ngram

        self.__logger.info(f'Request to DB: {request.strip()}', __name__)

        try:
            cursor.execute(request)
            self.__logger.info('Request is OK.', __name__)

        except BaseException as exception:
            connection.close()

            self.__logger.error(
                self._exceptions_handler.get_error_message(exception),
                __name__)
            return

        if cursor.fetchone():
            connection.close()

            self.__logger.info('Entry exists.', __name__)
            return True

        else:
            connection.close()

            self.__logger.info("Entry doesn't exist.", __name__)
            return False
Пример #4
0
class MainWindow(QWidget):
    """Main window of the sentiment-analysis demo.

    Text is typed, dictated or loaded from a file into the line edit; the
    answer label then shows the tonal (and probability, when there is one)
    computed by the TextTonalAnalyzer. Sizes, coordinates, fonts and colors
    of the widgets come from ``demo.json``.
    """

    # Stylesheet shared by the push buttons; the placeholders receive the
    # hover, !hover and pressed background colors, in that order.
    _BUTTON_STYLE = """
                             QPushButton:hover { background-color: %s }
                             QPushButton:!hover { background-color: %s }
                             QPushButton:pressed { background-color: %s; }
                         """

    _ANSWER_LABEL_STYLES = {
        'positive': 'QLabel {color:rgba(0, 200, 100, 255)}',
        'negative': 'QLabel {color:rgba(255, 56, 20, 255)}',
    }

    # Answer-label position per OS and tonal. Qt's move() only takes
    # integers, so the former 193.5 is stored as 193.
    _ANSWER_LABEL_POSITIONS = {
        'windows': {'positive': (193, 180), 'negative': (180, 180)},
        'darwin': {'positive': (230, 210), 'negative': (225, 210)},
    }

    def __init__(self):
        super().__init__()
        self.os = platform.system().lower()

        # Services
        self._speech_recognizer = SpeechRecognizer()
        self._file_reader = FileReader()
        self.__logger = Logger()
        self._path_service = PathService()
        self._text_tonal_analyzer = TextTonalAnalyzer('NBC')

        self._config = None
        self._load_config()

        # GUI Elements
        self.line_edit = QLineEdit(self)
        self.answer_label = QLabel(self)
        self.voice_button = QPushButton(self)
        self.answer_button = QPushButton(self)
        self.file_dialog_button = QPushButton(self)
        self.delete_button = QPushButton(self)
        self.message_box = QMessageBox()

    def _load_config(self):
        """Load the section of ``demo.json`` for this OS.

        The ``windows`` section is used on Windows and the ``darwin`` section
        everywhere else. The process exits when the file is missing.
        """
        path_to_config = os.path.join(self._path_service.path_to_configs, 'demo.json')

        if not os.path.exists(path_to_config):
            self.__logger.fatal("Config for GUI doesn't exist.", __name__)
            exit(-1)

        with open(path_to_config, 'r', encoding='utf-8') as file:
            config = json.load(file)

        self._config = config['windows' if self.os == 'windows' else 'darwin']

    def _configure_main_window(self):
        """Apply the configuration to the window and all of its widgets."""
        self._set_base_params()

        self._configure_line_edit()
        self._configure_answer_button()
        self.configure_voice_button()
        self._configure_delete_button()
        self._configure_file_dialog_button()
        self._configure_answer_label()

        self.__logger.info('Main window was successfully configured.', __name__)

    def _set_base_params(self):
        """Set the fixed window size and its background color."""
        self.setFixedSize(*self._config['size'])
        self.setStyleSheet('QWidget { background-color: %s }' % self._config['background-color'])

    def _configure_line_edit(self):
        """Configure the text input; pressing Return starts the analysis."""
        settings = self._config['line-edit']

        self.line_edit.setToolTip('Enter the text here')
        self.line_edit.returnPressed.connect(self._answer_button_clicked)

        self.line_edit.resize(*settings['size'])
        self.line_edit.setStyleSheet('QWidget { background-color: %s }' % settings['background-color'])
        self.line_edit.move(*settings['coordinates'])
        self.line_edit.setFont(QFont(*settings['font']))

    def _configure_button(self, button, config_key, text, tooltip, handler):
        """Apply caption, tooltip, click handler, geometry, font and colors to *button*.

        *config_key* names the section of the loaded config that describes
        the button.
        """
        settings = self._config[config_key]
        colors = settings['background-color']

        button.setText(text)
        button.setToolTip(tooltip)
        button.clicked.connect(handler)

        button.resize(*settings['size'])
        button.move(*settings['coordinates'])
        button.setFont(QFont(*settings['font']))
        button.setStyleSheet(self._BUTTON_STYLE % (colors['hover'], colors['!hover'], colors['pressed']))

    def _configure_answer_button(self):
        """Configure the button that starts the analysis."""
        self._configure_button(self.answer_button, 'answer-button', 'Start',
                               'Push to count tonal', self._answer_button_clicked)

    def configure_voice_button(self):
        """Configure the button that starts speech input."""
        self._configure_button(self.voice_button, 'voice-button', '🎙',
                               'Push to enter the text by speech', self._voice_button_clicked)

    def _configure_delete_button(self):
        """Configure the button that clears the input and the answer."""
        self._configure_button(self.delete_button, 'delete-button', '✗',
                               'Push to clear text box', self._delete_button_clicked)

    def _configure_file_dialog_button(self):
        """Configure the button that loads the text from a file."""
        self._configure_button(self.file_dialog_button, 'file-dialog-button', '📂',
                               'Push to open file', self._file_dialog_button_clicked)

    def _configure_answer_label(self):
        """Place and size the label that shows the result."""
        settings = self._config['answer-label']

        self.answer_label.move(*settings['coordinates'])
        self.answer_label.setFont(QFont(*settings['font']))
        self.answer_label.resize(*settings['size'])

    def launch(self):
        """Set icon and title, configure the widgets and show the window."""
        self.setWindowIcon(QIcon('icon.ico'))
        self.setWindowTitle('Sentiment Analyser')

        self._configure_main_window()
        self.show()

        self.__logger.info('Main window was successfully launched.', __name__)

    def _delete_button_clicked(self):
        """Clear both the input text and the displayed answer."""
        self.line_edit.clear()
        self.answer_label.clear()

    def _voice_button_clicked(self):
        """Recognize speech and put the recognized text into the line edit.

        An unrecognized phrase offers a retry. Connection and microphone
        failures are reported in a dialog on every attempt, so a failure
        during a retry is no longer written into the text box as if it
        were recognized speech.
        """
        self.message_box.question(self, 'Speak', 'You can start speaking.', QMessageBox.Ok)

        while True:
            speech_text = self._speech_recognizer.recognize_speech()

            if speech_text == 'Internet connection lost':
                self.message_box.question(self, 'Error', 'Internet connection lost', QMessageBox.Ok)
                return

            if speech_text == 'No microphone':
                self.message_box.question(self, 'Error', 'Microphone was disconnected', QMessageBox.Ok)
                return

            if speech_text != 'Unknown value':
                self.line_edit.setText(speech_text)
                return

            try_again = self.message_box.question(self, 'Error', 'Unknown value\n Try again?',
                                                  QMessageBox.Yes | QMessageBox.No)
            if try_again != QMessageBox.Yes:
                return

    def _file_dialog_button_clicked(self):
        """Put the content returned by the file reader into the line edit."""
        file_content = self._file_reader.get_file_content()

        if file_content:
            self.line_edit.setText(file_content)
        else:
            self.__logger.warning('Empty file.', __name__)

    def _answer_button_clicked(self):
        """Analyze the entered text and show tonal and probability."""
        analyzer = self._text_tonal_analyzer
        analyzer.detect_tonal(self.line_edit.text())

        # A missing tonal is shown as 'Unknown' instead of crashing on
        # None.capitalize().
        tonal = analyzer.tonal or 'Unknown'

        # Color and position exist only for positive/negative results on
        # Windows and macOS; otherwise the label keeps its current look.
        position = self._ANSWER_LABEL_POSITIONS.get(self.os, {}).get(tonal)

        if position is not None:
            self.answer_label.setStyleSheet(self._ANSWER_LABEL_STYLES[tonal])
            self.answer_label.move(*position)

        self.answer_label.setToolTip('Tonal and probability')

        answer = tonal.capitalize()

        if analyzer.probability:
            answer += '\n' + str(round(analyzer.probability * 100, 3)) + '%'

        self.answer_label.setText(answer)
Пример #5
0
class PathService(metaclass=Singleton):
    """Locate the project's configs, databases, models, datasets and reports.

    The main directory is found by walking up from the current working
    directory until a path ending in ``Python`` is reached. Names passed to
    the getters are validated against the lists in ``path_service.json``.
    """

    def __init__(self):
        # Services
        self.__logger = Logger()

        # Data
        self._wd = os.getcwd()
        self.path_to_databases = None
        self.path_to_configs = None
        self._valid_classifiers = None
        self._valid_model_types = None
        self._valid_databases = None
        self._valid_test_results_modes = None
        self._valid_datasets = None
        self.path_to_stop_words = None
        self._path_to_main_directory = None
        self.path_to_vector_model = None
        self._path_to_classifier_models = None
        self._path_to_test_results = None

        self.configure()
        self.__logger.info('PathService was successfully configured.', __name__)

    def _find_main_directory(self):
        """Find the main directory and derive the config and database paths.

        The search temporarily changes the working directory; it is restored
        afterwards, also when the search fails. The process exits when the
        directory is not found within the maximum nesting level.
        """
        max_nesting_level = 5
        nesting_level = 0

        try:
            while not os.getcwd().endswith('Python'):
                if os.getcwd().endswith('Databases'):
                    os.chdir(os.path.join('..', 'Python'))
                    break

                os.chdir('..')
                nesting_level += 1

                if nesting_level > max_nesting_level:
                    self.__logger.fatal("Can't find main directory (exceeded maximum nesting level).", __name__)
                    exit(-1)

            self._path_to_main_directory = os.getcwd()
            self.path_to_configs = os.path.join(self._path_to_main_directory, 'Services', 'Configs')
            # Still relative to the main directory, which is the cwd here.
            self.path_to_databases = os.path.abspath(os.path.join('..', 'Databases'))

        finally:
            os.chdir(self._wd)

    def _check_paths_existing(self):
        """Verify the configured paths.

        The process exits when a required directory is missing. A missing
        optional path is logged and its attribute is reset to ``None``.
        """
        required_directories = (
            (self.path_to_configs, "Directory with config files doesn't exist."),
            (self.path_to_databases, "Directory with databases doesn't exist."),
            (self._path_to_classifier_models, "Directory with classifier models doesn't exist."),
        )

        for directory, message in required_directories:
            if not os.path.exists(directory):
                self.__logger.fatal(message, __name__)
                exit(-1)

        if not os.path.exists(self.path_to_vector_model):
            self.path_to_vector_model = None
            self.__logger.error("Vector model doesn't exist.", __name__)

        if not os.path.exists(self.path_to_stop_words):
            self.path_to_stop_words = None
            self.__logger.error("File with stop-words doesn't exist.", __name__)

        if not os.path.exists(self._path_to_test_results):
            self._path_to_test_results = None
            self.__logger.warning("Directory with tests reports doesn't exist.", __name__)

    def _load_config(self):
        """Load the lists of valid names from ``path_service.json``.

        The process exits when the file is missing: every getter depends on
        these lists, and the file used to be opened regardless of the check.
        """
        path_to_config = os.path.join(self.path_to_configs, 'path_service.json')

        if not os.path.exists(path_to_config):
            self.__logger.fatal("Can't find config-file for PathService.", __name__)
            exit(-1)

        with open(path_to_config, 'r', encoding='utf-8') as file:
            config = json.load(file)

        self._valid_classifiers = config['valid_classifiers']
        self._valid_databases = config['valid_databases']
        self._valid_datasets = config['valid_datasets']
        self._valid_test_results_modes = config['valid_test_results_modes']
        self._valid_model_types = config['valid_model_types']

    def configure(self):
        """Resolve every path and check that the resolved paths exist."""
        self._find_main_directory()
        self._load_config()

        self.path_to_vector_model = os.path.join(self.path_to_databases, 'ruscorpora_upos_skipgram_300_10_2017.bin.gz')
        self.path_to_stop_words = os.path.join(self._path_to_main_directory, 'Services', 'Lemmatizer',
                                               'stop_words.json')
        self._path_to_classifier_models = os.path.join(self.path_to_databases, 'Models')
        self._path_to_test_results = os.path.join(self._path_to_main_directory, 'Tests', 'System', 'Reports')

        self._check_paths_existing()

    def get_path_to_test_results(self, mode='classifier', classifier_name='NBC'):
        """Return the directory for test reports of the given *mode*.

        Args:
            mode: ``'vec_model'``, ``'classifier_main'`` or ``'classifier'``;
                case and surrounding whitespace are ignored.
            classifier_name: Classifier whose reports are requested; an
                unknown name falls back to ``'NBC'``.

        Returns:
            The path for the mode. The base reports directory is returned
            for an unknown mode, and ``None`` when that directory does not
            exist.
        """
        if classifier_name not in self._valid_classifiers:
            self.__logger.warning('Got incorrect classifier name.', __name__)
            classifier_name = 'NBC'

        mode = mode.lower().strip()

        # The mode is what has to be validated here; the classifier name
        # used to be checked against the list of modes instead.
        if mode not in self._valid_test_results_modes:
            self.__logger.warning('Got incorrect mode.', __name__)
            return self._path_to_test_results

        if self._path_to_test_results is None:
            # Reset by _check_paths_existing(): there is nothing to join with.
            return None

        if mode == 'vec_model':
            return os.path.join(self._path_to_test_results, 'VectorModel')

        elif mode == 'classifier_main':
            return os.path.join(self._path_to_test_results, '..', '..', 'MainReports', 'Classifier', classifier_name)

        elif mode == 'classifier':
            return self._path_to_test_results

    def get_path_to_model(self, model='unigrams', classifier_name='NBC'):
        """Return the path of the pickled model of a classifier.

        Unknown names fall back to ``'NBC'`` and ``'unigrams'``.

        Returns:
            The path ``model_<model>.pkl`` inside the classifier's
            directory, or ``None`` when that directory does not exist.
        """
        if classifier_name not in self._valid_classifiers:
            self.__logger.warning('Got incorrect classifier name.', __name__)
            classifier_name = 'NBC'

        if model not in self._valid_model_types:
            self.__logger.warning('Got incorrect model type.', __name__)
            model = 'unigrams'

        path_to_models = os.path.join(self._path_to_classifier_models, classifier_name)

        if not os.path.exists(path_to_models):
            self.__logger.error("Required model wasn't found.", __name__)
            return None

        return os.path.join(path_to_models, f'model_{model}.pkl')

    def get_path_to_database(self, database_name='unigrams.db'):
        """Return the path of a database; unknown names fall back to ``unigrams.db``."""
        if database_name not in self._valid_databases:
            self.__logger.warning('Got incorrect database name.', __name__)
            database_name = 'unigrams.db'

        return os.path.join(self.path_to_databases, database_name)

    def get_path_to_dataset(self, dataset):
        """Return the path of a dataset; unknown names fall back to ``dataset_with_unigrams.csv``."""
        if dataset not in self._valid_datasets:
            self.__logger.warning('Got incorrect dataset name.', __name__)
            dataset = 'dataset_with_unigrams.csv'

        return os.path.join(self.path_to_databases, dataset)

    def set_path_to_vector_model(self, path_to_vector_model):
        """Override the path of the vector model."""
        self.path_to_vector_model = path_to_vector_model
class TextTonalAnalyzer:
    """Detect the tonal of a text.

    The lemmatized text is first looked up in the unigram dataset. When it
    is not there, n-gram weights are counted in parallel and passed to the
    classifier. Results are exposed as ``tonal`` and ``probability``.
    """

    def __init__(self, classifier_name='NBC'):
        # Services
        self._configurator = Configurator()
        self._configurator.configure_system()

        self._database_cursor = DatabaseCursor()
        self._document_preparer = DocumentPreparer()
        self._text_weight_counter = TextWeightCounter()
        self._classifier = Classifier()
        self.__logger = Logger()
        self._lemmatizer = Lemmatizer()
        self._path_service = PathService()

        # Data
        self._classifier_name = classifier_name

        self._text = None
        self.tonal = None
        self.probability = 0

        self._unigrams = None
        self._bigrams = None
        self._trigrams = None

        self._unigrams_weight = None
        self._bigrams_weight = None
        self._trigrams_weight = None

        self.__logger.info('TextTonalAnalyzer was successfully initialized.',
                           __name__)

    def _reset_data(self):
        """Forget the text, n-grams, weights and result of the previous run."""
        self._text = None
        self.tonal = None
        self.probability = 0

        self._unigrams = None
        self._bigrams = None
        self._trigrams = None

        self._unigrams_weight = None
        self._bigrams_weight = None
        self._trigrams_weight = None

        self.__logger.info('Data was successfully reset.', __name__)

    def _document_prepare(self):
        """Split the current text into unigrams, bigrams and trigrams."""
        preparer = self._document_preparer

        self._unigrams = preparer.split_into_unigrams(self._text)
        self._bigrams = preparer.split_into_bigrams(self._text)
        self._trigrams = preparer.split_into_trigrams(self._text)

    def _text_in_dataset(self):
        """Look the current text up in ``dataset_with_unigrams.csv``.

        On a match the row's tonal is taken with probability 1.

        Returns:
            ``True`` when the text was found, ``False`` otherwise.
        """
        path_to_dataset = self._path_service.get_path_to_dataset(
            'dataset_with_unigrams.csv')

        with open(path_to_dataset, 'r', encoding='utf-8') as file:
            for row in csv.reader(file):
                # Kept as before: the comma-split fields are glued back
                # together and the line is split on ';' instead.
                fields = ''.join(row).split(';')

                # A row without a tonal column cannot answer the lookup.
                if len(fields) > 1 and fields[0] == self._text:
                    self.tonal = fields[1]
                    self.probability = 1

                    self.__logger.info('Document is in dataset.', __name__)
                    return True

        return False

    def _count_weight_by_unigrams(self):
        """Store the weight of the text counted from its unigrams."""
        self._unigrams_weight = self._text_weight_counter.count_weight_by_unigrams(
            self._unigrams)

    def _count_weight_by_bigrams(self):
        """Store the weight of the text counted from its bigrams."""
        self._bigrams_weight = self._text_weight_counter.count_weight_by_bigrams(
            self._bigrams)

    def _count_weight_by_trigrams(self):
        """Store the weight of the text counted from its trigrams."""
        self._trigrams_weight = self._text_weight_counter.count_weight_by_trigrams(
            self._trigrams)

    def detect_tonal(self, text):
        """Detect the tonal of *text*; results go to ``tonal`` and ``probability``.

        An empty lemmatized text yields the tonal ``'Unknown'``.

        Returns:
            Always ``None``.
        """
        self._reset_data()

        self._text = self._lemmatizer.get_text_initial_form(text)

        if not self._text:
            self.tonal = 'Unknown'

            self.__logger.warning('Text is empty.', __name__)
            return None

        self._document_prepare()

        if self._text_in_dataset():
            return None

        workers = [
            Thread(target=self._count_weight_by_unigrams),
            Thread(target=self._count_weight_by_bigrams),
            Thread(target=self._count_weight_by_trigrams),
        ]

        for worker in workers:
            worker.start()

        # join() already blocks until the worker has finished; no polling
        # loop with sleeps is needed in front of it.
        for worker in workers:
            worker.join()

        self._classifier.customize(self._unigrams_weight,
                                   self._bigrams_weight,
                                   self._trigrams_weight,
                                   self._classifier_name)

        self.tonal, self.probability = self._classifier.predict_tonal()

        self.__logger.page_break()
        return None
Пример #7
0
class NgramAnalyzer:
    """Find a synonym of an n-gram that has an entry in the n-gram database.

    Synonym candidates come from a word2vec model whose keys have the form
    ``<word>_<part of speech>``.
    """

    def __init__(self):
        # Services
        self._database_cursor = DatabaseCursor()
        self.__logger = Logger()
        # NOTE(review): the attribute name is misspelled ("hanlder"); it is
        # kept because code outside this class may refer to it.
        self._exceptions_hanlder = ExceptionsHandler()
        self._lemmatizer = Lemmatizer()
        self._path_service = PathService()
        self._configurator = Configurator()
        self._morph_analyzer = pymorphy2.MorphAnalyzer()

        # Data
        self._vec_model = None

        self._load_vec_model()

        self.__logger.info('NgramAnalyzer was successfully initialized.', __name__)

    def _load_vec_model(self):
        """Load the word2vec model, asking for a download first when its path is unknown."""
        if not self._path_service.path_to_vector_model:
            self.__logger.warning("Vector model doesn't exist.", __name__)

            self._configurator.download_vector_model()
            self._path_service.set_path_to_vector_model(os.path.join(self._path_service.path_to_databases,
                                                                     'ruscorpora_upos_skipgram_300_10_2017.bin.gz'))
            self.__logger.info('Vector model was successfully downloaded.', __name__)

        path_to_model = self._path_service.path_to_vector_model

        if not path_to_model:
            self.__logger.error("Vector model doesn't exist.", __name__)
            return

        self._vec_model = gensim.models.KeyedVectors.load_word2vec_format(path_to_model, binary=True)

    def _part_of_speech_detect(self, word):
        """Return the part-of-speech suffix used in the vector model for *word*.

        The tag of the first morphological parse is mapped as follows:
        ``ADJ*`` -> ``ADJ``, ``PRT*`` -> ``PRT``, ``INFN`` -> ``VERB``,
        ``ADVB``/``PRED`` -> ``ADV``, ``PRCL`` -> ``PART``. Any other tag is
        returned unchanged.

        Returns:
            The mapped tag, or ``None`` for an empty word or a parse without
            a part of speech.
        """
        if not word:
            return None

        part_of_speech = self._morph_analyzer.parse(word)[0].tag.POS

        if not part_of_speech:
            return part_of_speech

        # Prefix tests replace the former start-anchored re.match() calls.
        if part_of_speech.startswith('ADJ'):
            return 'ADJ'

        if part_of_speech.startswith('PRT'):
            return 'PRT'

        if part_of_speech == 'INFN':
            return 'VERB'

        if part_of_speech in ('ADVB', 'PRED'):
            return 'ADV'

        if part_of_speech == 'PRCL':
            return 'PART'

        return part_of_speech

    @staticmethod
    def _detect_ngram_type(ngram):
        """Classify *ngram* by its number of spaces.

        Returns:
            ``'unigram'``, ``'bigram'`` or ``'trigram'``; ``None`` for an
            empty n-gram or one with more than two spaces.
        """
        if not ngram:
            return None

        return {0: 'unigram', 1: 'bigram', 2: 'trigram'}.get(ngram.count(' '))

    def _nearest_synonyms_find(self, word, topn):
        """Return up to *topn* synonyms of *word* from the vector model.

        ``topn * 10`` candidates are requested, because candidates whose
        lemmatized form is empty or has a different n-gram type than *word*
        are dropped.

        Returns:
            A list of ``{'word': ..., 'cosine proximity': ...}`` dicts, or
            ``None`` when there is no model, the arguments are unusable or
            the model lookup fails.
        """
        if self._vec_model is None or not word or topn <= 0:
            return None

        ngram_type = self._detect_ngram_type(word)

        # Detected once; the tag is appended to form the model key.
        part_of_speech = self._part_of_speech_detect(word)
        query = f'{word}_{part_of_speech}' if part_of_speech else word

        nearest_synonyms = []

        try:
            for candidate, proximity in self._vec_model.most_similar(positive=[query], topn=topn * 10):
                found_synonym = self._lemmatizer.get_text_initial_form(candidate.split('_')[0])

                if found_synonym and self._detect_ngram_type(found_synonym) == ngram_type:
                    nearest_synonyms.append({'word': found_synonym,
                                             'cosine proximity': proximity})

                if len(nearest_synonyms) == topn:
                    break

        # ``Exception`` rather than ``BaseException`` so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        except Exception as exception:
            self.__logger.warning(self._exceptions_hanlder.get_error_message(exception), __name__)
            return None

        return nearest_synonyms

    def relevant_ngram_find(self, ngram):
        """Find the nearest synonym of *ngram* that has a database entry.

        Only unigrams are searched; any other n-gram yields the
        "not found" response.

        Returns:
            ``None`` for an empty n-gram, otherwise a dict with
            ``'synonym_found'`` and ``'content'``. When a synonym was found,
            ``'content'`` holds ``'synonym'``, ``'pos_docs'`` and
            ``'neg_docs'``.
        """
        if not ngram:
            return None

        self.__logger.info(f'Start ngram: {ngram}', __name__)

        response = {'synonym_found': False, 'content': {}}

        if self._detect_ngram_type(ngram) != 'unigram':
            return response

        synonyms_count = 10
        nearest_synonyms = self._nearest_synonyms_find(ngram, synonyms_count)

        for nearest_synonym in nearest_synonyms or ():
            data = self._database_cursor.get_entry(nearest_synonym['word'])

            if data and data[0]:
                self.__logger.info(f'Relevant ngram: {nearest_synonym["word"]}', __name__)

                response['synonym_found'] = True
                response['content'] = {
                    'synonym': nearest_synonym['word'],
                    'pos_docs': data[0],
                    'neg_docs': data[1],
                }
                break

        return response
Пример #8
0
class Configurator(metaclass=Singleton):
    """Make sure the ngram databases and the vector model are available.

    Missing files are downloaded using public keys read from
    ``configurator.json``; the status of every file is collected in
    ``self._config`` and dumped to ``Logs/config.json``.
    """

    # File name of the vector model; also used as its key in the status report.
    _VECTOR_MODEL_FILENAME = 'ruscorpora_upos_skipgram_300_10_2017.bin.gz'

    # Databases checked by ``configure_system``.
    _DATABASE_NAMES = ('unigrams.db', 'bigrams.db', 'trigrams.db')

    def __init__(self):
        # Services
        self.__logger = Logger()
        self._path_service = PathService()
        self._exceptions_handler = ExceptionsHandler()

        # Data
        self._config = dict()
        self._wd = os.getcwd()
        self._path_to_databases = None
        self._request_url = None
        self._vector_model_public_key = None
        self._databases_public_keys = None

        self._load_public_keys()

        self.__logger.info('Configurator was successfully initialized.',
                           __name__)

    def _load_public_keys(self):
        """Read the request URL and the public keys from ``configurator.json``.

        If the file does not exist an error is logged and the attributes keep
        their ``None`` defaults, so later downloads fail and are reported as
        ``'error'``.
        """
        path_to_config = os.path.join(self._path_service.path_to_configs,
                                      'configurator.json')

        if not os.path.exists(path_to_config):
            self.__logger.error(
                "Can't load config for Configurator (doesn't exist).",
                __name__)
            return

        with open(path_to_config, 'r', encoding='utf-8') as file:
            config = json.load(file)

        self._request_url = config['request_url']
        self._vector_model_public_key = config['vector_model_public_key']
        self._databases_public_keys = config['databases_public_keys']

    def _download_file(self, public_key, destination):
        """Download the resource behind ``public_key`` into ``destination``.

        :param public_key: key sent as the ``public_key`` query parameter to
            ``self._request_url``; the JSON answer must contain ``href``.
        :param destination: path of the file to write.
        :raises Exception: any network, HTTP status, JSON or file error is
            propagated so the caller can record the failure.
        """
        link_response = requests.get(self._request_url,
                                     params={'public_key': public_key})
        link_response.raise_for_status()
        download_url = link_response.json()['href']

        file_response = requests.get(download_url)
        # Do not store an HTTP error page as if it were the requested file.
        file_response.raise_for_status()

        # The destination is opened only after the payload has arrived, so a
        # failed download cannot leave behind an empty file that would later
        # be mistaken for an existing database or model.
        with open(destination, 'wb') as downloaded_file:
            downloaded_file.write(file_response.content)

    def download_database(self, path_to_db):
        """Download one database and record the result in ``self._config``.

        :param path_to_db: full path where the database must be stored; its
            last component selects the public key to use.
        """
        database_name = os.path.split(path_to_db)[1]

        if not database_name:
            self.__logger.warning(
                'Got path without database name: %s' % str(path_to_db),
                __name__)
            return

        # NOTE(review): the status is stored under the full path, while
        # configure_system() uses the bare file name for 'exists' - confirm
        # which key readers of config.json expect before unifying.
        try:
            self._download_file(self._databases_public_keys[database_name],
                                path_to_db)
            self._config[path_to_db] = 'downloaded'

        # Exception (not BaseException): Ctrl-C and SystemExit must still be
        # able to stop the program during a download.
        except Exception as exception:
            self.__logger.error(
                self._exceptions_handler.get_error_message(exception),
                __name__)
            self._config[path_to_db] = 'error'

    def download_vector_model(self):
        """Download the vector model next to the databases and record the result."""
        self._path_service.set_path_to_vector_model(
            os.path.join(self._path_service.path_to_databases,
                         self._VECTOR_MODEL_FILENAME))

        try:
            self._download_file(self._vector_model_public_key,
                                self._path_service.path_to_vector_model)
            self._config[self._VECTOR_MODEL_FILENAME] = 'downloaded'

        except Exception as exception:
            self.__logger.error(
                self._exceptions_handler.get_error_message(exception),
                __name__)
            self._config[self._VECTOR_MODEL_FILENAME] = 'error'

    def configure_system(self):
        """Check every required file, download the missing ones, save the report."""
        self._config['datetime'] = str(datetime.now())

        for database in self._DATABASE_NAMES:
            path_to_database = self._path_service.get_path_to_database(
                database)

            if path_to_database and os.path.exists(path_to_database):
                self._config[database] = 'exists'
                continue

            self.__logger.warning('Database not found: %s' % str(database),
                                  __name__)
            self.download_database(
                os.path.join(self._path_service.path_to_databases, database))

        path_to_vector_model = self._path_service.path_to_vector_model

        if path_to_vector_model and os.path.exists(path_to_vector_model):
            self._config[self._VECTOR_MODEL_FILENAME] = 'exists'
        else:
            self.__logger.warning('Vector model not found.', __name__)
            self.download_vector_model()

        self._create_config()

    def _create_config(self):
        """Dump the collected statuses to ``Logs/config.json``.

        The path is relative to the current working directory.
        """
        # Create the directory if needed so the report is not lost to a
        # FileNotFoundError.
        os.makedirs('Logs', exist_ok=True)

        with open(os.path.join('Logs', 'config.json'), 'w',
                  encoding='utf-8') as config:
            json.dump(self._config, config, indent=4)
Пример #9
0
class Lemmatizer:
    """Bring a text to its initial form.

    The public entry point is ``get_text_initial_form``; it spell-checks the
    text, drops tokens with latin letters or non-letters, lemmatizes the rest
    and removes stop words and selected parts of speech.
    """

    # Tokenizer shared by every method that splits text into words.
    _WORD_PATTERN = re.compile(r'\w+')

    def __init__(self):
        # Services
        self._spell_checker = SpellChecker()
        self.__logger = Logger()
        self._path_service = PathService()
        self._morph_analyzer = pymorphy2.MorphAnalyzer()

        # Data
        self._stop_words = self._read_stop_words()
        # POS tags (as returned by the morph analyzer) that are dropped from the text.
        self._parts_of_speech_to_remove = ['NUMR', 'NPRO', 'PREP']

        self.__logger.info('Lemmatizer was successfully initialized.',
                           __name__)

    @staticmethod
    def _contains_latin_letter(word):
        """Return ``True`` if at least one character of ``word`` is an ASCII letter.

        :param word: token to check; an empty value yields ``False``.
        """
        if not word:
            return False

        # any(), not all(): a mixed-script token also "contains" a latin
        # letter and must be reported.
        return any(char in ascii_letters for char in word)

    def _detect_part_of_speech(self, word):
        """Return the POS tag of the first parse of ``word`` (``None`` if empty)."""
        if word:
            return self._morph_analyzer.parse(word)[0].tag.POS

    def _is_stop_word(self, word):
        """Tell whether ``word`` occurs in any group of stop words.

        :param word: token to check.
        :return: ``True``/``False``; ``None`` (after a warning) for an empty word.
        """
        if not word:
            self.__logger.warning('Got empty word.', __name__)
            return

        # The word is padded with spaces and tested with ``in`` against each
        # group - presumably every group is one space-delimited string;
        # verify against the stop words file.
        padded_word = f' {word} '

        # The stop words may be missing (file not found); treat that as
        # "no stop words" instead of failing on ``None.values()``.
        stop_word_groups = (self._stop_words or {}).values()

        return any(padded_word in group for group in stop_word_groups)

    def _remove_words_without_emotions(self, text):
        """Drop stop words and words whose part of speech is filtered out.

        :param text: text to clean.
        :return: remaining words joined by single spaces (possibly an empty
            string); ``None`` (after a warning) for an empty text.
        """
        if not text:
            self.__logger.warning('Got empty text.', __name__)
            return

        kept_words = [
            word for word in self._WORD_PATTERN.findall(text)
            if self._detect_part_of_speech(word) not in self._parts_of_speech_to_remove
            and not self._is_stop_word(word)
        ]

        return ' '.join(kept_words)

    def _read_stop_words(self):
        """Load the stop words from the JSON file known to the path service.

        :return: parsed JSON content, or an empty dict (after a warning) when
            the file does not exist.
        """
        path_to_stop_words = self._path_service.path_to_stop_words

        if not os.path.exists(path_to_stop_words):
            self.__logger.warning("Can't load stop words (file doesn't exist).",
                                  __name__)
            return {}

        with open(path_to_stop_words, 'r', encoding='utf-8') as file:
            return json.load(file)

    def _delete_words_contains_latin_letters(self, text):
        """Spell-check the lowercased text and keep only purely alphabetic,
        latin-free words.

        :param text: non-empty text.
        :return: kept words joined by single spaces, or ``None`` (after a
            warning) when nothing is left.
        """
        checked_text = self._spell_checker.check_spelling(text.lower())

        text = ' '.join(
            word for word in self._WORD_PATTERN.findall(checked_text)
            if not self._contains_latin_letter(word) and word.isalpha())

        if text:
            return text

        self.__logger.warning(
            'All words in document contain latin letters or all words are digits.',
            __name__)

    def _get_text_normal_form(self, text):
        """Return the words of ``text`` in normal form, separated by single spaces."""
        # Join with exactly one space; appending ' ' to each word as well
        # would produce double spaces between words.
        return ' '.join(
            self._morph_analyzer.parse(word)[0].normal_form
            for word in self._WORD_PATTERN.findall(text))

    def get_text_initial_form(self, text):
        """Run the whole lemmatization pipeline on ``text``.

        :param text: text to lemmatize.
        :return: lemmatized text, or ``None`` when the input is empty or any
            step leaves no words.
        """
        if not text:
            self.__logger.warning('Got empty text.', __name__)
            return

        self.__logger.info(f'Start text: {text}', __name__)

        # Order matters: filter tokens first, then lemmatize, then remove
        # stop words (which are matched against lemmas).
        transformations = (
            self._delete_words_contains_latin_letters,
            self._get_text_normal_form,
            self._remove_words_without_emotions,
        )

        for transformation in transformations:
            text = transformation(text)

            if not text:
                return

        self.__logger.info(f'Lemmatized text: {text}', __name__)
        return text