示例#1
0
def _load_vector_space_mapper(model_1_path, model_2_path, bilingual_path):
    """
    Create a :class:`VectorSpaceMapper` from files on disk and map it.

    Both models are loaded with ``Word2Vec.load`` and the bilingual
    dictionary with ``bg.load_bilingual_dictionary``. The mapper is built
    from the three objects, ``map_vector_spaces()`` is called on it, and
    the mapper is returned.
    """
    # Arguments are evaluated left to right: model 1, model 2, dictionary.
    mapper = VectorSpaceMapper(
        Word2Vec.load(model_1_path),
        Word2Vec.load(model_2_path),
        bg.load_bilingual_dictionary(bilingual_path),
    )
    mapper.map_vector_spaces()
    return mapper
def _load_vector_space_mapper(model_1_path, model_2_path, bilingual_path):
    """
    Load two Word2Vec models plus a bilingual dictionary and map them.

    Returns the :class:`VectorSpaceMapper` built from the loaded objects,
    after ``map_vector_spaces()`` has been called on it.
    """
    first_model = Word2Vec.load(model_1_path)
    second_model = Word2Vec.load(model_2_path)
    word_pairs = bg.load_bilingual_dictionary(bilingual_path)

    mapper = VectorSpaceMapper(first_model, second_model, word_pairs)
    mapper.map_vector_spaces()
    return mapper
    def obtain_mean_square_error_from_dataset(self, dataset_path):
        """
        Obtain Mean Square Error from bilingual dataset.

        For every entry of the bilingual dictionary the first word is looked
        up in ``model_1`` (both directly and through
        ``_predict_vec_from_word``) and the second word in ``model_2``.
        Entries for which any of these lookups raises :class:`KeyError` are
        skipped. The mean squared error against the ``model_2`` vectors is
        then computed once for the raw ``model_1`` vectors and once for the
        predicted vectors.

        API Documentation:
            :param dataset_path: Path for the test bilingual dictionary.
            :type dataset_path: :class:`String`
            :return: %% of reduction of Mean Square Error after transformation.
            :rtype: :class:`Float`
            :raises ValueError: If no entry of the dataset could be looked
                up, so there is nothing to compute an error on.
        """
        self.logger.info(
            'Obtain mean square error from dataset: %s', dataset_path
        )
        bilingual_dictionary = bg.load_bilingual_dictionary(dataset_path)
        raw_vectors = []
        predicted_vectors = []
        # A single target list serves both comparisons; the original kept
        # two lists that always held the same vectors.
        target_vectors = []
        skipped = 0
        for tup in bilingual_dictionary:
            word_1 = tup[0]
            word_2 = tup[1]
            try:
                pr_vector_1 = self._predict_vec_from_word(word_1)
                vector_1 = self.model_1[word_1]
                vector_2 = self.model_2[word_2]
            except KeyError:
                # Skip the pair, but keep count so the omission is visible.
                skipped += 1
                continue
            raw_vectors.append(vector_1)
            predicted_vectors.append(pr_vector_1)
            target_vectors.append(vector_2)
        if skipped:
            self.logger.debug(
                'Skipped %s dataset entries because of a failed lookup',
                skipped
            )
        if not target_vectors:
            # Fail early with a clear message instead of passing empty
            # lists on to the metric function.
            raise ValueError(
                'No usable word pairs found in dataset: %s' % dataset_path
            )
        score = metrics.mean_squared_error(raw_vectors, target_vectors)
        score_with_tr = metrics.mean_squared_error(
            predicted_vectors, target_vectors
        )
        self.logger.info(
            'Mean Square Error for Dataset without transformation: %s', score
        )
        self.logger.info(
            'Mean Square Error for Dataset'
            ' with transformation: %s', score_with_tr
        )
        # NOTE(review): if ``score`` is 0 this division is undefined;
        # confirm how callers want that case handled.
        error_reduction = ((score - score_with_tr) / score) * 100
        self.logger.info(
            'Reduction in Mean Square Error'
            ' with transformation: %s %%', error_reduction
        )
        return error_reduction
    def obtain_mean_square_error_from_dataset(
        self,
        dataset_path,
    ):
        """
        Obtain Mean Square Error from bilingual dataset.

        Each dictionary entry supplies a word for ``model_1`` and a word for
        ``model_2``. Entries whose lookup (or prediction through
        ``_predict_vec_from_word``) raises :class:`KeyError` are skipped.
        The error against the ``model_2`` vectors is measured for the raw
        ``model_1`` vectors and for the predicted vectors, and the relative
        improvement is returned as a percentage.

        API Documentation:
            :param dataset_path: Path for the test bilingual dictionary.
            :type dataset_path: :class:`String`
            :return: %% of reduction of Mean Square Error after transformation.
            :rtype: :class:`Float`
            :raises ValueError: If none of the dataset entries could be
                looked up.
        """
        self.logger.info('Obtain mean square error from dataset: %s',
                         dataset_path)
        bilingual_dictionary = bg.load_bilingual_dictionary(dataset_path)
        expected = []
        expected_with_tr = []
        # One list of target vectors is enough: both comparisons use the
        # same ``model_2`` vectors.
        actual = []
        skipped = 0
        for tup in bilingual_dictionary:
            word_1 = tup[0]
            word_2 = tup[1]
            try:
                pr_vector_1 = self._predict_vec_from_word(word_1)
                vector_1 = self.model_1[word_1]
                vector_2 = self.model_2[word_2]
            except KeyError:
                # Count the skipped pair instead of ignoring it silently.
                skipped += 1
            else:
                expected.append(vector_1)
                expected_with_tr.append(pr_vector_1)
                actual.append(vector_2)
        if skipped:
            self.logger.debug(
                'Skipped %s dataset entries because of a failed lookup',
                skipped)
        if not actual:
            # Nothing to score: report it explicitly rather than handing
            # empty lists to the metric function.
            raise ValueError(
                'No usable word pairs found in dataset: %s' % dataset_path)
        score = metrics.mean_squared_error(expected, actual)
        score_with_tr = metrics.mean_squared_error(expected_with_tr, actual)
        self.logger.info(
            'Mean Square Error for Dataset without transformation: %s', score)
        self.logger.info(
            'Mean Square Error for Dataset'
            ' with transformation: %s', score_with_tr)
        # NOTE(review): a ``score`` of 0 makes this division undefined;
        # confirm the desired behaviour with callers.
        error_reduction = ((score - score_with_tr) / score) * 100
        self.logger.info(
            'Reduction in Mean Square Error'
            ' with transformation: %s %%', error_reduction)
        return error_reduction
from tvecs.bilingual_generator import bilingual_generator as bg
from tvecs.vector_space_mapper import vector_space_mapper as vm
from gensim.models import Word2Vec
import os
from gensim.models import KeyedVectors

# train_bilingual_corpus = 'data/bilingual_dictionary/en-fr.txt'
# test_bilingual_corpus = 'data/bilingual_dictionary/en-fr.txt'
#
# Word pairs handed to the mapper (file name suggests English/Bengali).
bilingual_dict = bg.load_bilingual_dictionary(
    'data/bilingual_dictionary/en-bn.txt')

# Load both models up front, in the same order as they are passed below.
english_model = Word2Vec.load(
    os.path.join('data', 'models', 't-vex-english-model'))
bengali_model = KeyedVectors.load_word2vec_format(
    'data/models/t-vex-bengali-model')

vector_space_mapper = vm.VectorSpaceMapper(
    model_1=english_model,
    model_2=bengali_model,
    bilingual_dict=bilingual_dict)
vector_space_mapper.map_vector_spaces()

# print("Training MSE: {} %".format(vector_space_mapper.obtain_mean_square_error_from_dataset(
#     dataset_path=train_bilingual_corpus
# )))
#
# print("Testing MSE: {} %".format(vector_space_mapper.obtain_mean_square_error_from_dataset(
#     dataset_path=test_bilingual_corpus
# )))

# Prompt for the first query word.
word = raw_input("Please input the english word :")
from tvecs.bilingual_generator import bilingual_generator as bg
from tvecs.vector_space_mapper import vector_space_mapper as vm
from gensim.models import KeyedVectors

import os

# Training and testing currently point at the same dictionary file.
train_bilingual_corpus = 'data/bilingual_dictionary/en-fr.txt'
test_bilingual_corpus = 'data/bilingual_dictionary/en-fr.txt'

bilingual_dict = bg.load_bilingual_dictionary(train_bilingual_corpus)

# Load both models up front, in the same order as they are passed below.
english_model = KeyedVectors.load_word2vec_format(
    'data/models/t-vex-english-fb-model')
# NOTE: the original author tagged this model path with "change".
french_model = KeyedVectors.load_word2vec_format(
    'data/models/t-vex-french-model')

vector_space_mapper = vm.VectorSpaceMapper(
    model_1=english_model,
    model_2=french_model,
    bilingual_dict=bilingual_dict)
vector_space_mapper.map_vector_spaces()

# Evaluate and report the training set first, then the testing set.
training_mse = vector_space_mapper.obtain_mean_square_error_from_dataset(
    dataset_path=train_bilingual_corpus)
print("Training MSE: {} %".format(training_mse))

testing_mse = vector_space_mapper.obtain_mean_square_error_from_dataset(
    dataset_path=test_bilingual_corpus)
print("Testing MSE: {} %".format(testing_mse))

# Prompt for the two query words.
word1 = raw_input("Please input the english word :")
word2 = raw_input("Please input another english word :")
from tvecs.bilingual_generator import bilingual_generator as bg
from tvecs.vector_space_mapper import vector_space_mapper as vm
from gensim.models import Word2Vec
import os

# train_bilingual_corpus = 'data/bilingual_dictionary/en-fr.txt'
# test_bilingual_corpus = 'data/bilingual_dictionary/en-fr.txt'
#
# Word pairs handed to the mapper (file name suggests English/Hindi).
bilingual_dict = bg.load_bilingual_dictionary(
    'data/bilingual_dictionary/english_hindi_bd')

# Load both models up front, in the same order as they are passed below.
english_model = Word2Vec.load(
    os.path.join('data', 'models', 't-vex-english-model'))
hindi_model = Word2Vec.load(
    os.path.join('data', 'models', 't-vex-hindi-model'))

vector_space_mapper = vm.VectorSpaceMapper(
    model_1=english_model,
    model_2=hindi_model,
    bilingual_dict=bilingual_dict)
vector_space_mapper.map_vector_spaces()

# print("Training MSE: {} %".format(vector_space_mapper.obtain_mean_square_error_from_dataset(
#     dataset_path=train_bilingual_corpus
# )))
#
# print("Testing MSE: {} %".format(vector_space_mapper.obtain_mean_square_error_from_dataset(
#     dataset_path=test_bilingual_corpus
# )))

# Prompt for the first query word.
word = raw_input("Please input the english word :")

while word != '///':
    print vector_space_mapper.get_recommendations_from_word(word.lower(),
        score_with_tr = metrics.mean_squared_error(expected_with_tr,
                                                   actual_with_tr)
        self.logger.info(
            'Mean Square Error for Dataset without transformation: %s', score)
        self.logger.info(
            'Mean Square Error for Dataset'
            ' with transformation: %s', score_with_tr)
        error_reduction = ((score - score_with_tr) / score) * 100
        self.logger.info(
            'Reduction in Mean Square Error'
            ' with transformation: %s %%', error_reduction)
        return error_reduction


if __name__ == '__main__':
    # Script entry point: build a mapper from the models and training
    # dictionary, then log the MSE reduction for the testing and
    # training dictionaries.
    log.set_logger_normal(LOGGER)

    models_dir = os.path.join('data', 'models')
    dictionary_dir = os.path.join('data', 'bilingual_dictionary')
    train_path = os.path.join(dictionary_dir, 'english_hindi_train_bd')
    test_path = os.path.join(dictionary_dir, 'english_hindi_test_bd')

    model_1 = Word2Vec.load(os.path.join(models_dir, 't-vex-english-model'))
    model_2 = Word2Vec.load(os.path.join(models_dir, 't-vex-hindi-model'))
    bilingual_dict = bg.load_bilingual_dictionary(train_path)

    vm = VectorSpaceMapper(model_1, model_2, bilingual_dict)
    vm.map_vector_spaces()

    # The testing dataset is evaluated before the training dataset.
    LOGGER.info('Evaluation of Testing Dataset')
    vm.obtain_mean_square_error_from_dataset(dataset_path=test_path)
    LOGGER.info('Evaluation of Training Dataset')
    vm.obtain_mean_square_error_from_dataset(dataset_path=train_path)
            ' with transformation: %s %%', error_reduction
        )
        return error_reduction


if __name__ == '__main__':
    log.set_logger_normal(LOGGER)
    model_1 = Word2Vec.load(
        os.path.join('data', 'models', 't-vex-english-model')
    )
    model_2 = Word2Vec.load(
        os.path.join('data', 'models', 't-vex-hindi-model')
    )
    bilingual_dict = bg.load_bilingual_dictionary(
        os.path.join(
            'data', 'bilingual_dictionary', 'english_hindi_train_bd'
        )
    )
    vm = VectorSpaceMapper(model_1, model_2, bilingual_dict)
    vm.map_vector_spaces()
    LOGGER.info(
        'Evaluation of Testing Dataset'
    )
    vm.obtain_mean_square_error_from_dataset(dataset_path=os.path.join(
        'data', 'bilingual_dictionary', 'english_hindi_test_bd'
    ))
    LOGGER.info(
        'Evaluation of Training Dataset'
    )
    vm.obtain_mean_square_error_from_dataset(dataset_path=os.path.join(
        'data', 'bilingual_dictionary', 'english_hindi_train_bd'
示例#10
0
def bilingual_generator(lang1, lang2, bilingual_dict):
    """
    Load and return a previously generated bilingual dictionary.

    ``bilingual_dict`` is forwarded unchanged to
    ``bg.load_bilingual_dictionary``. ``lang1`` and ``lang2`` are accepted
    but not used by this function.
    """
    return bg.load_bilingual_dictionary(bilingual_dict)
    def setup_class(cls):
        """
        Setup Unit Testing for :class:`VectorSpaceMapper`.

        | *Test Suite ID* : V
        |
        | *Test Case Number* : 01
        |
        | *Description* : Create an instance of
        |                 :mod:`tvecs.vector_space_mapper.vector_space_mapper`.
        |
        | *Preconditions* : Corpus data for both languages
        |                   and bilingual dictionary exists.
        |
        | *Test Parameters* : model_1, model_2, bilingual_dict
        |
        | *Test Data* : model_1 = English, model_2 = Hindi, bilingual_dict =
        |               'data/bilingual_dictionary/english_hindi_train_bd'
        |
        | *Expected Result* : Vector Space Mapping created
        |
        | *Actual Result* : Vector Space Mapping created
        |
        | **Status : Pass**
        |

        - Learns transformation between two models
            - :mod:`tvecs.model_generator.model_generator`
            - :mod:`tvecs.model_generator.model_generator`

        The mapped :class:`VectorSpaceMapper` is stored on the class as
        ``cls.testing_obj``. A failure in any of the three stages (model
        generation, dictionary loading, mapping) is reported through
        ``pytest.fail`` with the text of the underlying exception.
        """
        try:
            model_1 = mod.generate_model(
                preprocessor_type='HcCorpusPreprocessor',
                language='english',
                corpus_fname='test_english',
                corpus_dir_path=os.path.join('tests', 'resources'),
                output_dir_path=os.path.join('tests', 'resources'),
                need_preprocessing=True,
                output_fname='model_1'
            )
            model_2 = mod.generate_model(
                preprocessor_type='HcCorpusPreprocessor',
                language='hindi',
                corpus_fname='test_hindi',
                corpus_dir_path=os.path.join('tests', 'resources'),
                output_dir_path=os.path.join('tests', 'resources'),
                need_preprocessing=True,
                output_fname='model_2'
            )
        except Exception as err:
            # Format the exception itself: ``err.message`` is deprecated in
            # Python 2.6+ and missing in Python 3.
            pytest.fail(
                'Model construction failed: %s' % err
            )
        try:
            bilingual_dict = bg.load_bilingual_dictionary(
                os.path.join(
                    'data', 'bilingual_dictionary', 'english_hindi_train_bd'
                )
            )
        except Exception as err:
            pytest.fail(
                'Bilingual Dictionary Construction failed: %s' % err
            )
        try:
            cls.testing_obj = VectorSpaceMapper(
                model_1, model_2, bilingual_dict
            )
            cls.testing_obj.map_vector_spaces()
        except Exception as err:
            # ``Exception`` rather than ``BaseException`` so that
            # KeyboardInterrupt / SystemExit are not turned into a failure.
            pytest.fail(
                'Vector Space Mapping failed : %s' % err
            )