# Reconstructed import block for this listing (an assumption: in the original
# repository each class lives in its own module, so some of these imports may
# be split across files). Project-internal services (Logger, Singleton,
# ClassificationDataContainer, ...) come from the project's own packages and
# their import paths are omitted here.
import csv
import json
import math
import os
import platform
import re
import sqlite3
from datetime import datetime
from string import ascii_letters
from threading import Thread

import chardet
import gensim
import joblib  # older scikit-learn versions: from sklearn.externals import joblib
import pymorphy2
import requests
import speech_recognition as sr
from PyQt5.QtGui import QFont, QIcon
from PyQt5.QtWidgets import (QFileDialog, QLabel, QLineEdit, QMessageBox,
                             QPushButton, QWidget)


class SpeechRecognizer:
    def __init__(self):
        # Services
        self.__recognizer = sr.Recognizer()
        self.__logger = Logger()
        self._exceptions_handler = ExceptionsHandler()

        self.__logger.info('SpeechRecognizer was successfully initialized.', __name__)

    def recognize_speech(self):
        # Retry listening until recognition succeeds or a non-timeout error occurs.
        while True:
            try:
                with sr.Microphone() as source:
                    speech = self.__recognizer.listen(source)
            except BaseException as exception:
                error_message = self._exceptions_handler.get_error_message(exception)
                self.__logger.error(error_message, __name__)
                return error_message

            try:
                text = self.__recognizer.recognize_google(speech, language='ru-RU').lower().strip()
                return text
            except BaseException as exception:
                error_message = self._exceptions_handler.get_error_message(exception)

                if isinstance(exception, sr.WaitTimeoutError):
                    # Timeouts are recoverable: warn and listen again.
                    self.__logger.warning(error_message, __name__)
                else:
                    self.__logger.error(error_message, __name__)
                    return error_message
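
# Usage sketch (not in the original source; assumes a working microphone and
# the SpeechRecognition package's default Google Web Speech API backend):
#
#     recognizer = SpeechRecognizer()
#     result = recognizer.recognize_speech()  # recognized text, or an error message string
#     print(result)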
class FileReader(QWidget):
    def __init__(self):
        super().__init__()

        self.__logger = Logger()
        self.__file_dialog = QFileDialog()

        self.__logger.info('FileReader was successfully initialized.', __name__)

    def _detect_encoding(self, filename):
        with open(filename, 'rb') as byte_file:
            byte_string = byte_file.read()

        encoding = chardet.detect(byte_string)['encoding']
        self.__logger.info(f"File's encoding: {encoding}", __name__)

        return encoding

    def get_file_content(self):
        try:
            filename = self.__file_dialog.getOpenFileName(self, 'Open file', '/home')[0]
            self.__logger.info(f'Filename: {filename}', __name__)

            if filename:
                with open(filename, 'r', encoding=self._detect_encoding(filename)) as file:
                    return file.read()
        except BaseException as exception:
            self.__logger.error(str(exception), __name__)
class SpellChecker:
    def __init__(self):
        # Services
        self.__logger = Logger()
        self._exceptions_handler = ExceptionsHandler()

        self.__logger.info('SpellChecker was successfully initialized.', __name__)

    def check_spelling(self, text):
        self.__logger.info(f'Start text: {text}', __name__)

        try:
            response = requests.get(
                'https://speller.yandex.net/services/spellservice.json/checkText',
                params={'text': text}).json()
        except BaseException as exception:
            self.__logger.error(self._exceptions_handler.get_error_message(exception), __name__)
            return text

        for word in response:
            # Guard against entries without suggestions (word['s'] may be empty).
            if word['s']:
                text = text.replace(word['word'], word['s'][0])

        self.__logger.info(f'Checked text: {text}', __name__)

        return text
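
# The Yandex.Speller checkText endpoint returns a JSON list of misspelling
# records; the fields used above are 'word' (the misspelled token) and 's'
# (the suggestion list). An illustrative response element (not from the
# original source):
#
#     {"word": "синхрафазатрон", "s": ["синхрофазотрон"], "pos": 0, ...}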
class DocumentPreparer:
    def __init__(self):
        self.__logger = Logger()

        self.__logger.info('DocumentPreparer was successfully initialized.', __name__)

    def split_into_unigrams(self, text):
        if text:
            return re.findall(r'\w+', text)
        else:
            self.__logger.warning('Got empty text.', __name__)

    def split_into_bigrams(self, text):
        if not text:
            self.__logger.warning('Got empty text.', __name__)
            return

        unigrams = self.split_into_unigrams(text)
        bigrams = list()

        if len(unigrams) >= 2:
            for unigram_index in range(len(unigrams) - 1):
                bigram = ' '.join(sorted(
                    [unigrams[unigram_index], unigrams[unigram_index + 1]])).strip()
                bigrams.append(bigram)

            return bigrams
        else:
            self.__logger.info("Text doesn't contain enough words.", __name__)

    def split_into_trigrams(self, text):
        if not text:
            self.__logger.warning('Got empty text.', __name__)
            return

        unigrams = self.split_into_unigrams(text)
        trigrams = list()

        if len(unigrams) >= 3:
            for unigram_index in range(len(unigrams) - 2):
                trigram = ' '.join(sorted([
                    unigrams[unigram_index],
                    unigrams[unigram_index + 1],
                    unigrams[unigram_index + 2]
                ])).strip()
                trigrams.append(trigram)

            return trigrams
        else:
            self.__logger.info("Text doesn't contain enough words.", __name__)
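
# Illustrative behaviour (not in the original source). Note that the words of
# each bigram/trigram are sorted alphabetically before joining, so word order
# inside an ngram is normalized:
#
#     preparer.split_into_unigrams('мама мыла раму')
#     # -> ['мама', 'мыла', 'раму']
#     preparer.split_into_bigrams('мама мыла раму')
#     # -> ['мама мыла', 'мыла раму']
#     preparer.split_into_trigrams('мама мыла раму')
#     # -> ['мама мыла раму']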
class DatabaseCursor:
    def __init__(self):
        # Services
        self.__logger = Logger()
        self._path_service = PathService()
        self._configurator = Configurator()
        self._exceptions_handler = ExceptionsHandler()

        # Data
        self._wd = os.getcwd()
        self._request_url = None
        self.databases_public_keys = None

        self.__logger.info('DatabaseCursor was successfully initialized.', __name__)

    def _load_config(self):
        path_to_config = os.path.join(self._path_service.path_to_configs, 'database_cursor.json')

        if os.path.exists(path_to_config):
            with open(path_to_config, 'r', encoding='utf-8') as file:
                config = json.load(file)

            self._request_url = config['request_url']
            self.databases_public_keys = config['database_public_keys']
        else:
            self.__logger.error("Can't load config for DatabaseCursor (doesn't exist).", __name__)

    def __update_connection(self, ngram):
        path_to_db = None

        if ngram.count(' ') == 0:
            path_to_db = self._path_service.get_path_to_database('unigrams.db')
        elif ngram.count(' ') == 1:
            path_to_db = self._path_service.get_path_to_database('bigrams.db')
        elif ngram.count(' ') == 2:
            path_to_db = self._path_service.get_path_to_database('trigrams.db')

        if path_to_db and os.path.exists(path_to_db):
            self.__logger.info(f'Connected to database: {path_to_db}', __name__)
            return sqlite3.connect(path_to_db)
        else:
            self.__logger.warning(f'Database lost: {path_to_db}', __name__)
            self.__logger.info('Trying to download database from cloud...', __name__)

            self._configurator.download_database(path_to_db)

            if path_to_db and os.path.exists(path_to_db):
                self.__logger.info(f'Connected to database: {path_to_db}', __name__)
                return sqlite3.connect(path_to_db)
            else:
                self.__logger.fatal("Database doesn't exist.", __name__)

    def get_entry(self, ngram):
        connection = self.__update_connection(ngram)
        cursor = connection.cursor()

        # Parameterized query instead of string interpolation, which avoids
        # SQL-injection and quoting problems with arbitrary ngrams.
        request = 'SELECT * FROM Data WHERE Ngram = ?'
        self.__logger.info(f'Request to DB: {request} [{ngram}]', __name__)

        try:
            cursor.execute(request, (ngram,))
            self.__logger.info('Request is OK.', __name__)
        except BaseException as exception:
            connection.close()
            self.__logger.error(self._exceptions_handler.get_error_message(exception), __name__)
            return

        result = cursor.fetchone()
        self.__logger.info(f'Received data: {str(result)}', __name__)

        connection.close()

        if result:
            return result[1], result[2]

    def entry_exists(self, ngram):
        connection = self.__update_connection(ngram)
        cursor = connection.cursor()

        request = 'SELECT * FROM Data WHERE Ngram = ?'
        self.__logger.info(f'Request to DB: {request} [{ngram}]', __name__)

        try:
            cursor.execute(request, (ngram,))
            self.__logger.info('Request is OK.', __name__)
        except BaseException as exception:
            connection.close()
            self.__logger.error(self._exceptions_handler.get_error_message(exception), __name__)
            return

        entry = cursor.fetchone()
        connection.close()

        if entry:
            self.__logger.info('Entry exists.', __name__)
            return True
        else:
            self.__logger.info("Entry doesn't exist.", __name__)
            return False
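
# Expected layout of each ngram database, inferred from get_entry above
# (columns 1 and 2 are read as positive/negative document counts). The exact
# DDL and column names are an assumption, not part of this source:
#
#     CREATE TABLE Data (
#         Ngram   TEXT PRIMARY KEY,  -- the unigram/bigram/trigram itself
#         PosDocs INTEGER,           -- documents with positive tonality
#         NegDocs INTEGER            -- documents with negative tonality
#     );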
class MainWindow(QWidget):
    def __init__(self):
        super().__init__()

        self.os = platform.system().lower()

        # Services
        self._speech_recognizer = SpeechRecognizer()
        self._file_reader = FileReader()
        self.__logger = Logger()
        self._path_service = PathService()
        self._text_tonal_analyzer = TextTonalAnalyzer('NBC')

        self._config = None
        self._load_config()

        # GUI Elements
        self.line_edit = QLineEdit(self)
        self.answer_label = QLabel(self)
        self.voice_button = QPushButton(self)
        self.answer_button = QPushButton(self)
        self.file_dialog_button = QPushButton(self)
        self.delete_button = QPushButton(self)
        self.message_box = QMessageBox()

    def _load_config(self):
        path_to_config = os.path.join(self._path_service.path_to_configs, 'demo.json')

        if os.path.exists(path_to_config):
            with open(path_to_config, 'r', encoding='utf-8') as file:
                self._config = json.load(file)

            if self.os == 'windows':
                self._config = self._config['windows']
            else:
                self._config = self._config['darwin']
        else:
            self.__logger.fatal("Config for GUI doesn't exist.", __name__)
            exit(-1)

    def _configure_main_window(self):
        self._set_base_params()
        self._configure_line_edit()
        self._configure_answer_button()
        self._configure_voice_button()
        self._configure_delete_button()
        self._configure_file_dialog_button()
        self._configure_answer_label()

        self.__logger.info('Main window was successfully configured.', __name__)

    def _set_base_params(self):
        self.setFixedSize(*self._config['size'])
        self.setStyleSheet('QWidget { background-color: %s }' % self._config['background-color'])

    def _configure_line_edit(self):
        self.line_edit.setToolTip('Enter the text here')
        self.line_edit.returnPressed.connect(self._answer_button_clicked)
        self.line_edit.resize(*self._config['line-edit']['size'])
        self.line_edit.setStyleSheet('QWidget { background-color: %s }' %
                                     self._config['line-edit']['background-color'])
        self.line_edit.move(*self._config['line-edit']['coordinates'])
        self.line_edit.setFont(QFont(*self._config['line-edit']['font']))

    def _configure_answer_button(self):
        self.answer_button.clicked.connect(self._answer_button_clicked)
        self.answer_button.setText('Start')
        self.answer_button.setToolTip('Push to count tonal')
        self.answer_button.setStyleSheet("""
            QPushButton:hover { background-color: %s }
            QPushButton:!hover { background-color: %s }
            QPushButton:pressed { background-color: %s; }
        """ % (self._config['answer-button']['background-color']['hover'],
               self._config['answer-button']['background-color']['!hover'],
               self._config['answer-button']['background-color']['pressed']))
        self.answer_button.resize(*self._config['answer-button']['size'])
        self.answer_button.move(*self._config['answer-button']['coordinates'])
        self.answer_button.setFont(QFont(*self._config['answer-button']['font']))

    def _configure_voice_button(self):
        self.voice_button.setText('🎙')
        self.voice_button.setToolTip('Push to enter the text by speech')
        self.voice_button.clicked.connect(self._voice_button_clicked)
        self.voice_button.resize(*self._config['voice-button']['size'])
        self.voice_button.setFont(QFont(*self._config['voice-button']['font']))
        self.voice_button.move(*self._config['voice-button']['coordinates'])
        self.voice_button.setStyleSheet("""
            QPushButton:hover { background-color: %s }
            QPushButton:!hover { background-color: %s }
            QPushButton:pressed { background-color: %s; }
        """ % (self._config['voice-button']['background-color']['hover'],
               self._config['voice-button']['background-color']['!hover'],
               self._config['voice-button']['background-color']['pressed']))

    def _configure_delete_button(self):
        self.delete_button.setText('✗')
        self.delete_button.setToolTip('Push to clear text box')
        self.delete_button.clicked.connect(self._delete_button_clicked)
        self.delete_button.resize(*self._config['delete-button']['size'])
        self.delete_button.setFont(QFont(*self._config['delete-button']['font']))
        self.delete_button.move(*self._config['delete-button']['coordinates'])
        self.delete_button.setStyleSheet("""
            QPushButton:hover { background-color: %s }
            QPushButton:!hover { background-color: %s }
            QPushButton:pressed { background-color: %s; }
        """ % (self._config['delete-button']['background-color']['hover'],
               self._config['delete-button']['background-color']['!hover'],
               self._config['delete-button']['background-color']['pressed']))

    def _configure_file_dialog_button(self):
        self.file_dialog_button.setText('📂')
        self.file_dialog_button.setToolTip('Push to open file')
        self.file_dialog_button.clicked.connect(self._file_dialog_button_clicked)
        self.file_dialog_button.resize(*self._config['file-dialog-button']['size'])
        self.file_dialog_button.setFont(QFont(*self._config['file-dialog-button']['font']))
        self.file_dialog_button.move(*self._config['file-dialog-button']['coordinates'])
        self.file_dialog_button.setStyleSheet("""
            QPushButton:hover { background-color: %s }
            QPushButton:!hover { background-color: %s }
            QPushButton:pressed { background-color: %s; }
        """ % (self._config['file-dialog-button']['background-color']['hover'],
               self._config['file-dialog-button']['background-color']['!hover'],
               self._config['file-dialog-button']['background-color']['pressed']))

    def _configure_answer_label(self):
        self.answer_label.move(*self._config['answer-label']['coordinates'])
        self.answer_label.setFont(QFont(*self._config['answer-label']['font']))
        self.answer_label.resize(*self._config['answer-label']['size'])

    def launch(self):
        self.setWindowIcon(QIcon('icon.ico'))
        self.setWindowTitle('Sentiment Analyser')
        self._configure_main_window()
        self.show()

        self.__logger.info('Main window was successfully launched.', __name__)

    def _delete_button_clicked(self):
        self.line_edit.clear()
        self.answer_label.clear()

    def _voice_button_clicked(self):
        self.message_box.question(self, 'Speak', 'You can start speaking.', QMessageBox.Ok)

        speech_text = self._speech_recognizer.recognize_speech()

        if speech_text == 'Unknown value':
            try_again = QMessageBox.Yes

            while try_again == QMessageBox.Yes and speech_text == 'Unknown value':
                try_again = self.message_box.question(
                    self, 'Error', 'Unknown value\nTry again?',
                    QMessageBox.Yes | QMessageBox.No)

                if try_again == QMessageBox.No:
                    break

                speech_text = self._speech_recognizer.recognize_speech()
        elif speech_text == 'Internet connection lost':
            self.message_box.question(self, 'Error', 'Internet connection lost', QMessageBox.Ok)
            return
        elif speech_text == 'No microphone':
            self.message_box.question(self, 'Error', 'Microphone was disconnected', QMessageBox.Ok)
            return

        if speech_text != 'Unknown value':
            self.line_edit.setText(speech_text)

    def _file_dialog_button_clicked(self):
        file_content = self._file_reader.get_file_content()

        if file_content:
            self.line_edit.setText(file_content)
        else:
            self.__logger.warning('Empty file.', __name__)

    def _answer_button_clicked(self):
        self._text_tonal_analyzer.detect_tonal(self.line_edit.text())

        if self.os == 'windows':
            if self._text_tonal_analyzer.tonal == 'positive':
                self.answer_label.setStyleSheet('QLabel {color:rgba(0, 200, 100, 255)}')
                self.answer_label.move(194, 180)  # move() requires int coordinates
            elif self._text_tonal_analyzer.tonal == 'negative':
                self.answer_label.setStyleSheet('QLabel {color:rgba(255, 56, 20, 255)}')
                self.answer_label.move(180, 180)
        elif self.os == 'darwin':
            if self._text_tonal_analyzer.tonal == 'positive':
                self.answer_label.setStyleSheet('QLabel {color:rgba(0, 200, 100, 255)}')
                self.answer_label.move(230, 210)
            elif self._text_tonal_analyzer.tonal == 'negative':
                self.answer_label.setStyleSheet('QLabel {color:rgba(255, 56, 20, 255)}')
                self.answer_label.move(225, 210)

        self.answer_label.setToolTip('Tonal and probability')

        if self._text_tonal_analyzer.probability:
            self.answer_label.setText(
                self._text_tonal_analyzer.tonal.capitalize() + '\n' +
                str(round(self._text_tonal_analyzer.probability * 100, 3)) + '%')
        else:
            self.answer_label.setText(self._text_tonal_analyzer.tonal.capitalize())
class PathService(metaclass=Singleton):
    def __init__(self):
        # Services
        self.__logger = Logger()

        # Data
        self._wd = os.getcwd()

        self.path_to_databases = None
        self.path_to_configs = None

        self._valid_classifiers = None
        self._valid_model_types = None
        self._valid_databases = None
        self._valid_test_results_modes = None
        self._valid_datasets = None

        self.path_to_stop_words = None
        self._path_to_main_directory = None
        self.path_to_vector_model = None
        self._path_to_classifier_models = None
        self._path_to_test_results = None

        self.configure()

        self.__logger.info('PathService was successfully configured.', __name__)

    def _find_main_directory(self):
        max_nesting_level = 5
        nesting_level = 0

        while not os.getcwd().endswith('Python'):
            if os.getcwd().endswith('Databases'):
                os.chdir(os.path.join('..', 'Python'))
                break
            else:
                os.chdir('..')

            nesting_level += 1
            if nesting_level > max_nesting_level:
                self.__logger.fatal("Can't find main directory (exceeded maximum nesting level).",
                                    __name__)
                exit(-1)

        self._path_to_main_directory = os.getcwd()
        self.path_to_configs = os.path.join(self._path_to_main_directory, 'Services', 'Configs')
        self.path_to_databases = os.path.abspath(os.path.join('..', 'Databases'))

        os.chdir(self._wd)

    def _check_paths_existing(self):
        if not os.path.exists(self.path_to_configs):
            self.__logger.fatal("Directory with config files doesn't exist.", __name__)
            exit(-1)
        elif not os.path.exists(self.path_to_databases):
            self.__logger.fatal("Directory with databases doesn't exist.", __name__)
            exit(-1)
        elif not os.path.exists(self._path_to_classifier_models):
            self.__logger.fatal("Directory with classifier models doesn't exist.", __name__)
            exit(-1)

        if not os.path.exists(self.path_to_vector_model):
            self.path_to_vector_model = None
            self.__logger.error("Vector model doesn't exist.", __name__)

        if not os.path.exists(self.path_to_stop_words):
            self.path_to_stop_words = None
            self.__logger.error("File with stop-words doesn't exist.", __name__)

        if not os.path.exists(self._path_to_test_results):
            self._path_to_test_results = None
            self.__logger.warning("Directory with tests reports doesn't exist.", __name__)

    def _load_config(self):
        path_to_config = os.path.join(self.path_to_configs, 'path_service.json')

        if not os.path.exists(path_to_config):
            self.__logger.error("Can't find config-file for PathService.", __name__)
            return

        with open(path_to_config, 'r', encoding='utf-8') as file:
            config = json.load(file)

        self._valid_classifiers = config['valid_classifiers']
        self._valid_databases = config['valid_databases']
        self._valid_datasets = config['valid_datasets']
        self._valid_test_results_modes = config['valid_test_results_modes']
        self._valid_model_types = config['valid_model_types']

    def configure(self):
        self._find_main_directory()
        self._load_config()

        self.path_to_vector_model = os.path.join(self.path_to_databases,
                                                 'ruscorpora_upos_skipgram_300_10_2017.bin.gz')
        self.path_to_stop_words = os.path.join(self._path_to_main_directory,
                                               'Services', 'Lemmatizer', 'stop_words.json')
        self._path_to_classifier_models = os.path.join(self.path_to_databases, 'Models')
        self._path_to_test_results = os.path.join(self._path_to_main_directory,
                                                  'Tests', 'System', 'Reports')

        self._check_paths_existing()

    def get_path_to_test_results(self, mode='classifier', classifier_name='NBC'):
        if classifier_name not in self._valid_classifiers:
            self.__logger.warning('Got incorrect classifier name.', __name__)
            classifier_name = 'NBC'

        # The flattened original checked classifier_name here; the warning text
        # makes clear the mode was meant to be validated.
        if mode not in self._valid_test_results_modes:
            self.__logger.warning('Got incorrect mode.', __name__)
            return self._path_to_test_results

        if mode.lower().strip() == 'vec_model':
            return os.path.join(self._path_to_test_results, 'VectorModel')
        elif mode.lower().strip() == 'classifier_main':
            return os.path.join(self._path_to_test_results, '..', '..',
                                'MainReports', 'Classifier', classifier_name)
        elif mode.lower().strip() == 'classifier':
            return self._path_to_test_results

    def get_path_to_model(self, model='unigrams', classifier_name='NBC'):
        if classifier_name not in self._valid_classifiers:
            self.__logger.warning('Got incorrect classifier name.', __name__)
            classifier_name = 'NBC'

        if model not in self._valid_model_types:
            self.__logger.warning('Got incorrect model type.', __name__)
            model = 'unigrams'

        path_to_models = os.path.join(self._path_to_classifier_models, classifier_name)

        if os.path.exists(path_to_models):
            return os.path.join(path_to_models, f'model_{model}.pkl')
        else:
            self.__logger.error("Required model wasn't found.", __name__)

    def get_path_to_database(self, database_name='unigrams.db'):
        if database_name not in self._valid_databases:
            self.__logger.warning('Got incorrect database name.', __name__)
            database_name = 'unigrams.db'

        return os.path.join(self.path_to_databases, database_name)

    def get_path_to_dataset(self, dataset):
        if dataset not in self._valid_datasets:
            self.__logger.warning('Got incorrect dataset name.', __name__)
            dataset = 'dataset_with_unigrams.csv'

        return os.path.join(self.path_to_databases, dataset)

    def set_path_to_vector_model(self, path_to_vector_model):
        self.path_to_vector_model = path_to_vector_model
class ExceptionsHandler:
    def __init__(self):
        self.__logger = Logger()

        # Duplicated entries (TooManyRedirects, RequestError) removed.
        self._request_exceptions = [type(item) for item in [
            requests.ConnectionError(), requests.HTTPError(),
            requests.TooManyRedirects(), requests.Timeout(),
            requests.RequestException(), requests.ConnectTimeout(),
            requests.ReadTimeout()
        ]]

        self._system_errors = [type(item) for item in [
            KeyError(), AttributeError(), IndexError(), ZeroDivisionError(),
            SystemError(), ValueError(), AssertionError()
        ]]

        self._file_errors = [type(item) for item in [
            FileExistsError(), FileNotFoundError()
        ]]

        self._database_errors = [type(item) for item in [
            sqlite3.Error(), sqlite3.DataError(), sqlite3.ProgrammingError(),
            sqlite3.DatabaseError(), sqlite3.NotSupportedError(),
            sqlite3.IntegrityError(), sqlite3.InterfaceError(),
            sqlite3.InternalError(), sqlite3.OperationalError()
        ]]

        self._speech_recognizer_errors = [type(item) for item in [
            sr.RequestError(), sr.UnknownValueError(), sr.WaitTimeoutError()
        ]]

        self.__logger.info('ExceptionsHandler was successfully initialized.', __name__)

    @staticmethod
    def _handle_system_exception(exception):
        if isinstance(exception, KeyError):
            return 'KeyError occurred.'
        elif isinstance(exception, AttributeError):
            return 'AttributeError occurred.'
        elif isinstance(exception, IndexError):
            return 'IndexError occurred.'
        elif isinstance(exception, ZeroDivisionError):
            return 'ZeroDivisionError occurred.'
        elif isinstance(exception, SystemError):
            return 'SystemError occurred.'
        elif isinstance(exception, ValueError):
            return 'ValueError occurred.'
        elif isinstance(exception, AssertionError):
            return 'AssertionError occurred.'

    @staticmethod
    def _handle_file_exception(exception):
        if isinstance(exception, FileNotFoundError):
            return f'FileNotFoundError occurred ({str(exception.filename)}).'
        elif isinstance(exception, FileExistsError):
            return f'FileExistsError occurred ({str(exception.filename)}).'

    @staticmethod
    def _handle_database_exception(exception):
        if isinstance(exception, sqlite3.OperationalError):
            return 'sqlite3.OperationalError occurred.'
        elif isinstance(exception, sqlite3.ProgrammingError):
            return 'sqlite3.ProgrammingError occurred.'
        elif isinstance(exception, sqlite3.InternalError):
            return 'sqlite3.InternalError occurred.'
        elif isinstance(exception, sqlite3.InterfaceError):
            return 'sqlite3.InterfaceError occurred.'
        elif isinstance(exception, sqlite3.IntegrityError):
            return 'sqlite3.IntegrityError occurred.'
        elif isinstance(exception, sqlite3.NotSupportedError):
            return 'sqlite3.NotSupportedError occurred.'
        elif isinstance(exception, sqlite3.DatabaseError):
            return 'sqlite3.DatabaseError occurred.'
        elif isinstance(exception, sqlite3.DataError):
            return 'sqlite3.DataError occurred.'
        elif isinstance(exception, sqlite3.Error):
            return 'sqlite3.Error occurred.'

    @staticmethod
    def _handle_request_exception(exception):
        # Check the timeout subclasses first: requests.ConnectTimeout is both
        # a ConnectionError and a Timeout, so it must precede them.
        if isinstance(exception, requests.ConnectTimeout):
            return 'ConnectTimeout (requests.ConnectTimeout).'
        elif isinstance(exception, requests.ReadTimeout):
            return 'ReadTimeout (requests.ReadTimeout).'
        elif isinstance(exception, requests.ConnectionError):
            return 'Problems with connection (requests.ConnectionError).'
        elif isinstance(exception, requests.HTTPError):
            return 'HTTP request returned an unsuccessful status code (requests.HTTPError).'
        elif isinstance(exception, requests.Timeout):
            return 'Request timed out (requests.Timeout).'
        elif isinstance(exception, requests.TooManyRedirects):
            return 'Request exceeded the configured number of maximum redirections (requests.TooManyRedirects).'
        else:
            return 'Request exception (requests.RequestException).'

    @staticmethod
    def _handle_speech_recognizer_exception(exception):
        if isinstance(exception, sr.WaitTimeoutError):
            return 'speech_recognition.WaitTimeoutError occurred.'
        elif isinstance(exception, sr.UnknownValueError):
            return 'Unknown value (speech_recognition.UnknownValueError).'
        elif isinstance(exception, sr.RequestError):
            return 'speech_recognition.RequestError occurred.'

    def get_error_message(self, exception):
        if type(exception) in self._system_errors:
            return self._handle_system_exception(exception)
        elif type(exception) in self._file_errors:
            return self._handle_file_exception(exception)
        elif type(exception) in self._database_errors:
            return self._handle_database_exception(exception)
        elif type(exception) in self._request_exceptions:
            return ExceptionsHandler._handle_request_exception(exception)
        elif type(exception) in self._speech_recognizer_errors:
            return self._handle_speech_recognizer_exception(exception)

        return str(exception)
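
# Illustrative dispatch (not in the original source): get_error_message maps
# a caught exception to a readable string by exact type, falling back to
# str(exception) for unknown types.
#
#     handler = ExceptionsHandler()
#     handler.get_error_message(KeyError())            # -> 'KeyError occurred.'
#     handler.get_error_message(RuntimeError('boom'))  # -> 'boom'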
class TextTonalAnalyzer:
    def __init__(self, classifier_name='NBC'):
        # Services
        self._configurator = Configurator()
        self._configurator.configure_system()

        self._database_cursor = DatabaseCursor()
        self._document_preparer = DocumentPreparer()
        self._text_weight_counter = TextWeightCounter()
        self._classifier = Classifier()
        self.__logger = Logger()
        self._lemmatizer = Lemmatizer()
        self._path_service = PathService()

        # Data
        self._classifier_name = classifier_name
        self._text = None
        self.tonal = None
        self.probability = 0
        self._unigrams = None
        self._bigrams = None
        self._trigrams = None
        self._unigrams_weight = None
        self._bigrams_weight = None
        self._trigrams_weight = None

        self.__logger.info('TextTonalAnalyzer was successfully initialized.', __name__)

    def _reset_data(self):
        self._text = None
        self.tonal = None
        self.probability = 0
        self._unigrams = None
        self._bigrams = None
        self._trigrams = None
        self._unigrams_weight = None
        self._bigrams_weight = None
        self._trigrams_weight = None

        self.__logger.info('Data was successfully reset.', __name__)

    def _document_prepare(self):
        self._unigrams = self._document_preparer.split_into_unigrams(self._text)
        self._bigrams = self._document_preparer.split_into_bigrams(self._text)
        self._trigrams = self._document_preparer.split_into_trigrams(self._text)

    def _text_in_dataset(self):
        path_to_dataset = self._path_service.get_path_to_dataset('dataset_with_unigrams.csv')

        with open(path_to_dataset, 'r', encoding='utf-8') as file:
            dataset = csv.reader(file)

            for doc in dataset:
                doc = ''.join(doc).split(';')

                if doc[0] == self._text:
                    self.tonal = doc[1]
                    self.probability = 1
                    self.__logger.info('Document is in dataset.', __name__)
                    return True

        return False

    def _count_weight_by_unigrams(self):
        self._unigrams_weight = self._text_weight_counter.count_weight_by_unigrams(self._unigrams)

    def _count_weight_by_bigrams(self):
        self._bigrams_weight = self._text_weight_counter.count_weight_by_bigrams(self._bigrams)

    def _count_weight_by_trigrams(self):
        self._trigrams_weight = self._text_weight_counter.count_weight_by_trigrams(self._trigrams)

    def detect_tonal(self, text):
        self._reset_data()
        self._text = self._lemmatizer.get_text_initial_form(text)

        if not self._text:
            self.tonal = 'Unknown'
            self.__logger.warning('Text is empty.', __name__)
            return None

        self._document_prepare()

        if not self._text_in_dataset():
            # Count ngram weights in parallel; join() already blocks until a
            # thread finishes, so no polling loop is needed.
            threads = [
                Thread(target=self._count_weight_by_unigrams),
                Thread(target=self._count_weight_by_bigrams),
                Thread(target=self._count_weight_by_trigrams)
            ]

            for thread in threads:
                thread.start()

            for thread in threads:
                thread.join()

            self._classifier.customize(self._unigrams_weight, self._bigrams_weight,
                                       self._trigrams_weight, self._classifier_name)
            self.tonal, self.probability = self._classifier.predict_tonal()

        self.__logger.page_break()
class NgramAnalyzer:
    def __init__(self):
        # Services
        self._database_cursor = DatabaseCursor()
        self.__logger = Logger()
        self._exceptions_handler = ExceptionsHandler()
        self._lemmatizer = Lemmatizer()
        self._path_service = PathService()
        self._configurator = Configurator()

        self._morph_analyzer = pymorphy2.MorphAnalyzer()

        # Data
        self._vec_model = None
        self._load_vec_model()

        self.__logger.info('NgramAnalyzer was successfully initialized.', __name__)

    def _load_vec_model(self):
        if not self._path_service.path_to_vector_model:
            self.__logger.warning("Vector model doesn't exist.", __name__)

            self._configurator.download_vector_model()
            self._path_service.set_path_to_vector_model(
                os.path.join(self._path_service.path_to_databases,
                             'ruscorpora_upos_skipgram_300_10_2017.bin.gz'))
            self.__logger.info('Vector model was successfully downloaded.', __name__)

        if self._path_service.path_to_vector_model:
            self._vec_model = gensim.models.KeyedVectors.load_word2vec_format(
                self._path_service.path_to_vector_model, binary=True)
        else:
            self.__logger.error("Vector model doesn't exist.", __name__)

    def _part_of_speech_detect(self, word):
        if not word:
            return

        part_of_speech = self._morph_analyzer.parse(word)[0].tag.POS

        if part_of_speech:
            if re.match(r'ADJ', part_of_speech):
                return 'ADJ'
            elif re.match(r'PRT', part_of_speech):
                return 'PRT'
            elif part_of_speech == 'INFN':
                return 'VERB'
            elif part_of_speech == 'ADVB' or part_of_speech == 'PRED':
                return 'ADV'
            elif part_of_speech == 'PRCL':
                return 'PART'

            return part_of_speech

    @staticmethod
    def _detect_ngram_type(ngram):
        if not ngram:
            return

        if ngram.count(' ') == 0:
            return 'unigram'
        elif ngram.count(' ') == 1:
            return 'bigram'
        elif ngram.count(' ') == 2:
            return 'trigram'

    def _nearest_synonyms_find(self, word, topn):
        if not self._vec_model or not word or topn <= 0:
            return

        nearest_synonyms = list()
        part_of_speech = self._part_of_speech_detect(word)
        ngram_type = self._detect_ngram_type(word)

        if part_of_speech:
            # The vector model stores tokens as "word_POS" (UPoS-tagged ruscorpora model).
            word = f'{word}_{part_of_speech}'

        try:
            for synonym in self._vec_model.most_similar(positive=[word], topn=topn * 10):
                found_synonym = self._lemmatizer.get_text_initial_form(synonym[0].split('_')[0])

                if found_synonym and self._detect_ngram_type(found_synonym) == ngram_type:
                    nearest_synonyms.append({'word': found_synonym,
                                             'cosine proximity': synonym[1]})

                if len(nearest_synonyms) == topn:
                    break
        except BaseException as exception:
            self.__logger.warning(self._exceptions_handler.get_error_message(exception), __name__)
            return

        return nearest_synonyms

    def relevant_ngram_find(self, ngram):
        if not ngram:
            return

        self.__logger.info(f'Start ngram: {ngram}', __name__)

        response = {'synonym_found': False, 'content': dict()}

        if self._detect_ngram_type(ngram) == 'unigram':
            synonyms_count = 10
            nearest_synonyms = self._nearest_synonyms_find(ngram, synonyms_count)

            if not nearest_synonyms:
                return response

            for nearest_synonym in nearest_synonyms:
                data = self._database_cursor.get_entry(nearest_synonym['word'])

                if data and data[0]:
                    self.__logger.info(f'Relevant ngram: {nearest_synonym["word"]}', __name__)

                    response['synonym_found'] = True
                    response['content']['synonym'] = nearest_synonym['word']
                    response['content']['pos_docs'] = data[0]
                    response['content']['neg_docs'] = data[1]

                    return response

        return response
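
# Shape of the value returned by relevant_ngram_find, as built above (shown
# for reference; the word and counts are illustrative):
#
#     {'synonym_found': True,
#      'content': {'synonym': 'хороший',
#                  'pos_docs': 120,
#                  'neg_docs': 40}}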
class TextWeightCounter:
    def __init__(self):
        # Services
        self._database_cursor = DatabaseCursor()
        self._ngram_analyzer = NgramAnalyzer()
        self.__logger = Logger()
        self._path_service = PathService()

        # Data
        self._docs_count = dict()
        self._count_all_docs()

        self.__logger.info('TextWeightCounter was successfully initialized.', __name__)

    def _count_docs_in_dataset(self, mode):
        path_to_dataset = self._path_service.get_path_to_dataset(f'dataset_with_{mode}.csv')

        with open(path_to_dataset, 'r', encoding='utf-8') as file:
            # The negative counter starts from a shift of 10000; the stored
            # negative-docs figure keeps this offset, while the first returned
            # value removes it to give the true document total.
            negative_docs_shift = 10000
            positive_docs = 0
            negative_docs = negative_docs_shift

            for row in csv.reader(file):
                if ''.join(row).split(';')[1] == 'positive':
                    positive_docs += 1
                else:
                    negative_docs += 1

        return positive_docs + negative_docs - negative_docs_shift, positive_docs, negative_docs

    def _count_all_docs(self):
        for mode in ['unigrams', 'bigrams', 'trigrams']:
            self._docs_count[mode] = dict()
            (self._docs_count[mode]['all_docs'],
             self._docs_count[mode]['positive_docs'],
             self._docs_count[mode]['negative_docs']) = self._count_docs_in_dataset(mode)

    @staticmethod
    def _detect_ngram_type(ngram):
        if ngram.count(' ') == 0:
            return 'unigram'
        elif ngram.count(' ') == 1:
            return 'bigram'
        elif ngram.count(' ') == 2:
            return 'trigram'

    def _count_ngram_weight(self, ngram):
        self.__logger.info(f'Ngram: {ngram}', __name__)

        ngram_type = self._detect_ngram_type(ngram)
        delta_tf_idf = 0

        self.__logger.info(f'Ngram_type: {ngram_type}', __name__)

        if self._database_cursor.entry_exists(ngram):
            pos_docs_word, neg_docs_word = self._database_cursor.get_entry(ngram)

            # Guard against log10(0) and division by zero.
            if pos_docs_word and neg_docs_word:
                delta_tf_idf = math.log10(
                    (self._docs_count[ngram_type + 's']['negative_docs'] * pos_docs_word) /
                    (self._docs_count[ngram_type + 's']['positive_docs'] * neg_docs_word))
        else:
            response = self._ngram_analyzer.relevant_ngram_find(ngram)

            if response['synonym_found'] and ngram_type == 'unigram':
                pos_docs_word = response['content']['pos_docs']
                neg_docs_word = response['content']['neg_docs']

                if (not (pos_docs_word and neg_docs_word)) or \
                        (pos_docs_word == 1 and neg_docs_word == 1):
                    return 0

                delta_tf_idf = math.log10(
                    (self._docs_count[ngram_type + 's']['negative_docs'] * pos_docs_word) /
                    (self._docs_count[ngram_type + 's']['positive_docs'] * neg_docs_word))

        return delta_tf_idf

    def count_weight_by_unigrams(self, unigrams):
        checked_unigrams = list()
        important_unigrams = list()
        unigrams_weight = 0

        for unigram in unigrams:
            if unigram not in checked_unigrams:
                this_doc_unigram = unigrams.count(unigram)
                unigram_weight = this_doc_unigram * self._count_ngram_weight(unigram)
                unigrams_weight += unigram_weight

                checked_unigrams.append(unigram)

                if unigram_weight:
                    important_unigrams.append(unigram)

        if len(important_unigrams) >= round(len(unigrams) * 0.6) and important_unigrams:
            unigrams_weight = unigrams_weight / len(important_unigrams)

        self.__logger.info(f'Unigrams weight: {unigrams_weight}', __name__)

        return unigrams_weight

    def count_weight_by_bigrams(self, bigrams):
        if not bigrams:
            return None

        checked_bigrams = list()
        important_bigrams = list()
        bigrams_weight = 0

        for bigram in bigrams:
            if bigram not in checked_bigrams:
                this_doc_bigram = bigrams.count(bigram)
                bigram_weight = this_doc_bigram * self._count_ngram_weight(bigram)
                bigrams_weight += bigram_weight

                checked_bigrams.append(bigram)

                if bigram_weight:
                    important_bigrams.append(bigram)

        if len(important_bigrams) >= len(bigrams) // 2 and important_bigrams:
            bigrams_weight = bigrams_weight / len(important_bigrams)

        self.__logger.info(f'Bigrams weight: {bigrams_weight}', __name__)

        return bigrams_weight

    def count_weight_by_trigrams(self, trigrams):
        if not trigrams:
            return None

        checked_trigrams = list()
        important_trigrams = list()
        trigrams_weight = 0

        for trigram in trigrams:
            if trigram not in checked_trigrams:
                this_doc_trigram = trigrams.count(trigram)
                trigram_weight = this_doc_trigram * self._count_ngram_weight(trigram)
                trigrams_weight += trigram_weight

                checked_trigrams.append(trigram)

                if trigram_weight:
                    important_trigrams.append(trigram)

        if len(important_trigrams) >= round(len(trigrams) * 0.4) and important_trigrams:
            trigrams_weight = trigrams_weight / len(important_trigrams)

        self.__logger.info(f'Trigrams weight: {trigrams_weight}', __name__)

        return trigrams_weight
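
# A minimal worked example of the delta TF-IDF weight computed in
# _count_ngram_weight above (not in the original source; all counts are
# hypothetical, and the negative total reflects the +10000 shift applied in
# _count_docs_in_dataset):
#
#     import math
#     positive_docs, negative_docs = 4000, 14000  # corpus-level counts
#     pos_docs_word, neg_docs_word = 30, 10       # documents containing the ngram
#     math.log10((negative_docs * pos_docs_word) /
#                (positive_docs * neg_docs_word))  # = log10(10.5) ~ 1.02
#
# Positive weights mark ngrams skewed toward positive documents, negative
# weights toward negative ones; zero means no usable evidence.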
class Configurator(metaclass=Singleton):
    def __init__(self):
        # Services
        self.__logger = Logger()
        self._path_service = PathService()
        self._exceptions_handler = ExceptionsHandler()

        # Data
        self._config = dict()
        self._wd = os.getcwd()
        self._path_to_databases = None
        self._request_url = None
        self._vector_model_public_key = None
        self._databases_public_keys = None

        self._load_public_keys()

        self.__logger.info('Configurator was successfully initialized.', __name__)

    def _load_public_keys(self):
        path_to_config = os.path.join(self._path_service.path_to_configs, 'configurator.json')

        if os.path.exists(path_to_config):
            with open(path_to_config, 'r', encoding='utf-8') as file:
                config = json.load(file)

            self._request_url = config['request_url']
            self._vector_model_public_key = config['vector_model_public_key']
            self._databases_public_keys = config['databases_public_keys']
        else:
            self.__logger.error("Can't load config for Configurator (doesn't exist).", __name__)

    def download_database(self, path_to_db):
        database_name = os.path.split(path_to_db)[1]

        if database_name:
            try:
                download_url = requests.get(self._request_url, params={
                    'public_key': self._databases_public_keys[database_name]
                }).json()['href']

                with open(path_to_db, 'wb') as database_file:
                    database_file.write(requests.get(download_url).content)

                self._config[path_to_db] = 'downloaded'
            except BaseException as exception:
                self.__logger.error(self._exceptions_handler.get_error_message(exception), __name__)
                self._config[path_to_db] = 'error'

    def download_vector_model(self):
        self._path_service.set_path_to_vector_model(
            os.path.join(self._path_service.path_to_databases,
                         'ruscorpora_upos_skipgram_300_10_2017.bin.gz'))

        try:
            download_url = requests.get(self._request_url, params={
                'public_key': self._vector_model_public_key
            }).json()['href']

            with open(self._path_service.path_to_vector_model, 'wb') as vec_model:
                vec_model.write(requests.get(download_url).content)

            self._config['ruscorpora_upos_skipgram_300_10_2017.bin.gz'] = 'downloaded'
        except BaseException as exception:
            self.__logger.error(self._exceptions_handler.get_error_message(exception), __name__)
            self._config['ruscorpora_upos_skipgram_300_10_2017.bin.gz'] = 'error'

    def configure_system(self):
        self._config['datetime'] = str(datetime.now())

        for database in ['unigrams.db', 'bigrams.db', 'trigrams.db']:
            path_to_database = self._path_service.get_path_to_database(database)

            if not path_to_database or not os.path.exists(path_to_database):
                self.__logger.warning('Database not found: %s' % str(database), __name__)
                self.download_database(
                    os.path.join(self._path_service.path_to_databases, database))
            else:
                self._config[database] = 'exists'

        if not self._path_service.path_to_vector_model or \
                not os.path.exists(self._path_service.path_to_vector_model):
            self.__logger.warning('Vector model not found.', __name__)
            self.download_vector_model()
        else:
            self._config['ruscorpora_upos_skipgram_300_10_2017.bin.gz'] = 'exists'

        self._create_config()

    def _create_config(self):
        with open(os.path.join('Logs', 'config.json'), 'w', encoding='utf-8') as config:
            json.dump(self._config, config, indent=4)
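
# Expected shape of Services/Configs/configurator.json (keys taken from
# _load_public_keys above; the values are placeholders, not real keys). The
# request_url is an endpoint that resolves a public key to a direct download
# 'href', likely the Yandex.Disk public-resources API given the response
# format used above:
#
#     {
#         "request_url": "<share-hosting download API endpoint>",
#         "vector_model_public_key": "<public key>",
#         "databases_public_keys": {
#             "unigrams.db": "<public key>",
#             "bigrams.db": "<public key>",
#             "trigrams.db": "<public key>"
#         }
#     }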
class Classifier:
    def __init__(self):
        # Services
        self.__logger = Logger()
        self._path_service = PathService()
        self._exceptions_handler = ExceptionsHandler()

        # Data
        self._container = ClassificationDataContainer()
        self._possible_classifiers = ['NBC', 'LogisticRegression', 'KNN']

        self.__logger.info('Classifier was successfully initialized.', __name__)

    def _load_config(self):
        path_to_config = os.path.join(self._path_service.path_to_configs, 'classifier.json')

        if os.path.exists(path_to_config):
            with open(path_to_config, 'r', encoding='utf-8') as file:
                config = json.load(file)

            self._possible_classifiers = config['possible_classifiers']
        else:
            self.__logger.error("Can't load Classifier configuration.", __name__)

    def customize(self, unigrams_weight, bigrams_weight, trigrams_weight, classifier_name='NBC'):
        self._container.clear()

        if classifier_name in self._possible_classifiers:
            self._container.classifiers['name'] = classifier_name
        else:
            self._container.classifiers['name'] = 'NBC'
            self.__logger.error('Got unknown classifier, set default (NBC).', __name__)

        self._container.weights['unigrams'] = unigrams_weight
        self._container.weights['bigrams'] = bigrams_weight
        self._container.weights['trigrams'] = trigrams_weight

        try:
            if self._container.weights['unigrams']:
                self._container.classifiers['unigrams'] = joblib.load(
                    self._path_service.get_path_to_model(
                        'unigrams', self._container.classifiers['name']))

            if self._container.weights['bigrams']:
                self._container.classifiers['bigrams'] = joblib.load(
                    self._path_service.get_path_to_model(
                        'bigrams', self._container.classifiers['name']))

            if self._container.weights['trigrams']:
                self._container.classifiers['trigrams'] = joblib.load(
                    self._path_service.get_path_to_model(
                        'trigrams', self._container.classifiers['name']))

            self.__logger.info('Models were successfully loaded.', __name__)
            self.__logger.info('Classifier was successfully configured.', __name__)
        except BaseException as exception:
            self.__logger.fatal(self._exceptions_handler.get_error_message(exception), __name__)

    def _predict_tonal_by_unigrams(self):
        # scikit-learn predictors expect a 2D sample array, hence [[...]]
        # (matching the bigram/trigram predictors below).
        self._container.tonalities['unigrams'] = self._container.classifiers['unigrams'].predict(
            [[self._container.weights['unigrams']]])[0]
        self._container.probabilities['unigrams'] = max(
            self._container.classifiers['unigrams'].predict_proba(
                [[self._container.weights['unigrams']]])[0])

        self.__logger.info(f'Unigrams tonal: {self._container.tonalities["unigrams"]}', __name__)
        self.__logger.info(f'Unigrams probability: {self._container.probabilities["unigrams"]}',
                           __name__)

    def _predict_tonal_by_unigrams_bigrams(self):
        self._container.tonalities['bigrams'] = self._container.classifiers['bigrams'].predict(
            [[self._container.weights['unigrams'], self._container.weights['bigrams']]])[0]
        self._container.probabilities['bigrams'] = max(
            self._container.classifiers['bigrams'].predict_proba(
                [[self._container.weights['unigrams'], self._container.weights['bigrams']]])[0])

        self.__logger.info(f'Bigrams tonal: {self._container.tonalities["bigrams"]}', __name__)
        self.__logger.info(f'Bigrams probability: {self._container.probabilities["bigrams"]}',
                           __name__)

    def _predict_tonal_by_unigrams_bigrams_trigrams(self):
        self._container.tonalities['trigrams'] = self._container.classifiers['trigrams'].predict(
            [[self._container.weights['unigrams'],
              self._container.weights['bigrams'],
              self._container.weights['trigrams']]])[0]
        self._container.probabilities['trigrams'] = max(
            self._container.classifiers['trigrams'].predict_proba(
                [[self._container.weights['unigrams'],
                  self._container.weights['bigrams'],
                  self._container.weights['trigrams']]])[0])

        self.__logger.info(f'Trigrams tonal: {self._container.tonalities["trigrams"]}', __name__)
        self.__logger.info(f'Trigrams probability: {self._container.probabilities["trigrams"]}',
                           __name__)

    def _predict_intermediate_tonalities(self):
        threads = list()

        if self._container.weights['unigrams']:
            threads.append(Thread(target=self._predict_tonal_by_unigrams))

        if self._container.weights['bigrams']:
            threads.append(Thread(target=self._predict_tonal_by_unigrams_bigrams))

        if self._container.weights['trigrams']:
            threads.append(Thread(target=self._predict_tonal_by_unigrams_bigrams_trigrams))

        for thread in threads:
            thread.start()

        for thread in threads:
            thread.join()

    def _select_final_tonal(self):
        # Local aliases: these reference (and mutate) the container's dicts.
        tonalities = self._container.tonalities
        probabilities = self._container.probabilities

        if tonalities['unigrams'] and tonalities['bigrams'] and tonalities['trigrams']:
            # All three predictions available: majority vote, taking the
            # higher probability of the two agreeing predictors.
            if tonalities['unigrams'] == tonalities['bigrams']:
                tonalities['final'] = tonalities['unigrams']
                probabilities['final'] = max(probabilities['unigrams'], probabilities['bigrams'])
            elif tonalities['unigrams'] == tonalities['trigrams']:
                tonalities['final'] = tonalities['unigrams']
                probabilities['final'] = max(probabilities['unigrams'], probabilities['trigrams'])
            elif tonalities['bigrams'] == tonalities['trigrams']:
                tonalities['final'] = tonalities['bigrams']
                probabilities['final'] = max(probabilities['bigrams'], probabilities['trigrams'])
        elif tonalities['unigrams'] and tonalities['bigrams']:
            if tonalities['unigrams'] != tonalities['bigrams']:
                # Disagreement: take the more confident prediction.
                if probabilities['unigrams'] >= probabilities['bigrams']:
                    tonalities['final'] = tonalities['unigrams']
                    probabilities['final'] = probabilities['unigrams']
                else:
                    tonalities['final'] = tonalities['bigrams']
                    probabilities['final'] = probabilities['bigrams']
            else:
                tonalities['final'] = tonalities['unigrams']
                probabilities['final'] = max(probabilities['bigrams'], probabilities['unigrams'])
        elif tonalities['unigrams']:
            tonalities['final'] = tonalities['unigrams']
            probabilities['final'] = probabilities['unigrams']

    def predict_tonal(self):
        self._predict_intermediate_tonalities()
        self._select_final_tonal()

        self.__logger.info(f'Final tonal: {self._container.tonalities["final"]}', __name__)
        self.__logger.info(f'Final probability: {self._container.probabilities["final"]}',
                           __name__)

        return self._container.tonalities['final'], self._container.probabilities['final']
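
# Usage sketch (not in the original source; assumes trained model files exist
# under Databases/Models/<classifier>/model_<ngrams>.pkl, as resolved by
# PathService.get_path_to_model, and that the weight values are illustrative):
#
#     classifier = Classifier()
#     classifier.customize(unigrams_weight=0.8, bigrams_weight=0.3,
#                          trigrams_weight=0, classifier_name='NBC')
#     tonal, probability = classifier.predict_tonal()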
class Lemmatizer:
    def __init__(self):
        # Services
        self._spell_checker = SpellChecker()
        self.__logger = Logger()
        self._path_service = PathService()

        self._morph_analyzer = pymorphy2.MorphAnalyzer()

        # Data
        self._stop_words = self._read_stop_words()
        self._parts_of_speech_to_remove = ['NUMR', 'NPRO', 'PREP']

        self.__logger.info('Lemmatizer was successfully initialized.', __name__)

    @staticmethod
    def _contains_latin_letter(word):
        # True if the word consists entirely of Latin letters.
        if word:
            return all(map(lambda character: character in ascii_letters, word))

    def _detect_part_of_speech(self, word):
        if word:
            return self._morph_analyzer.parse(word)[0].tag.POS

    def _is_stop_word(self, word):
        if not word:
            self.__logger.warning('Got empty word.', __name__)
            return

        word = f' {word} '

        for stop_words in self._stop_words.values():
            if word in stop_words:
                return True

        return False

    def _remove_words_without_emotions(self, text):
        if not text:
            self.__logger.warning('Got empty text.', __name__)
            return

        cleaned_text = list()

        for word in re.findall(r'\w+', text):
            if self._detect_part_of_speech(word) not in self._parts_of_speech_to_remove and \
                    not self._is_stop_word(word):
                cleaned_text.append(word)

        return ' '.join(cleaned_text).strip()

    def _read_stop_words(self):
        if self._path_service.path_to_stop_words and \
                os.path.exists(self._path_service.path_to_stop_words):
            with open(self._path_service.path_to_stop_words, 'r', encoding='utf-8') as file:
                return json.load(file)

    def _delete_words_contains_latin_letters(self, text):
        text = ' '.join([
            word for word in re.findall(r'\w+', self._spell_checker.check_spelling(text.lower()))
            if not self._contains_latin_letter(word) and word.isalpha()
        ]).strip()

        if text:
            return text
        else:
            self.__logger.warning(
                'All words in document contain latin letters or all words are digits.', __name__)

    def _get_text_normal_form(self, text):
        # Single-space join of each word's normal form.
        return ' '.join(self._morph_analyzer.parse(word)[0].normal_form
                        for word in re.findall(r'\w+', text)).strip()

    def get_text_initial_form(self, text):
        if not text:
            self.__logger.warning('Got empty text.', __name__)
            return

        self.__logger.info(f'Start text: {text}', __name__)

        transformations = [
            self._delete_words_contains_latin_letters,
            self._get_text_normal_form,
            self._remove_words_without_emotions
        ]

        for transformation in transformations:
            text = transformation(text)

            if not text:
                return

        self.__logger.info(f'Lemmatized text: {text}', __name__)

        return text
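
# Illustrative pipeline (not in the original source; the exact output depends
# on the pymorphy2 dictionaries and the stop-words list actually installed):
#
#     lemmatizer = Lemmatizer()
#     lemmatizer.get_text_initial_form('Мне очень понравился этот фильм!')
#     # spell-check -> drop Latin/non-alphabetic tokens -> normal forms
#     # -> drop numerals, pronouns, prepositions and stop-words
#     # might yield something like 'очень понравиться фильм'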