Exemplo n.º 1
0
 def __init__(self, corpus_fname, corpus_dir_path='.', encoding='utf-8',
              language='english', need_preprocessing=False, limit=None):
     """
     Prepare EMILLE-specific state, then run the parent constructor.

     :param corpus_fname: Corpus filename, forwarded to the parent.
     :param corpus_dir_path: Corpus directory, forwarded to the parent.
     :param encoding: Corpus encoding, forwarded to the parent.
     :param language: Stored on ``self.language``; it is NOT forwarded
         to the parent constructor.
     :param need_preprocessing: Forwarded to the parent unchanged.
     :param limit: Forwarded to the parent unchanged.
     """
     # Reuse the module-level logger when one exists, otherwise build one.
     try:
         active_logger = LOGGER
     except NameError:
         active_logger = log.initialise('T-Vecs.Preprocessor')
     self.logger = active_logger
     self.language = language
     # Sentence-split regex per language; any language without an entry
     # resolves to the empty pattern u''.
     self.lang_split_sent = defaultdict(lambda: u'')
     self.lang_split_sent.update({'hindi': u'[।]'})
     self.logger.info('EmilleCorpusPreprocessor utilised')
     parent_kwargs = {
         'corpus_dir_path': corpus_dir_path,
         'encoding': encoding,
         'need_preprocessing': need_preprocessing,
         'limit': limit,
     }
     super(EmilleCorpusPreprocessor, self).__init__(
         corpus_fname, **parent_kwargs)
 def __init__(self, model_1, model_2, bilingual_dict, encoding='utf-8'):
     """
     Build aligned vector/word lists for two models from a bilingual dict.

     :param model_1: Model for language 1; looked up with the dictionary's
         source words.
     :param model_2: Model for language 2; looked up with the dictionary's
         target words.
     :param bilingual_dict: Iterable of (word_1, word_2) pairs, or anything
         else accepted by ``dict()``. Stored unmodified on
         ``self.bilingual_dict``.
     :param encoding: Stored on ``self.encoding``.
     :raises ValueError: If no dictionary entry has a vector in both
         models, so no aligned training pair is left.
     """
     try:
         self.logger = LOGGER
     except NameError:
         self.logger = log.initialise('T-Vecs.VectorSpaceMapper')
     self.model_1 = model_1
     self.model_2 = model_2
     self.encoding = encoding
     self.lt = None
     self.bilingual_dict = bilingual_dict
     # dict() keeps a single translation per source word, so keys() and
     # values() below are position-aligned and of equal length.
     bilingual_dict = dict(bilingual_dict)
     self.logger.debug('Extracting vocabulary and vector list from model 1')
     self.vector_1_list, self.word_1_list = self._extract_vectors_and_words(
         self.model_1, bilingual_dict.keys())
     self.logger.debug('Extracting vocabulary and vector list from model 2')
     (self.vector_2_list,
      self.word_2_list) = VectorSpaceMapper._extract_vectors_and_words(
          self.model_2, bilingual_dict.values())
     # Keep only the entries whose vector exists in BOTH models.
     # NOTE(review): assumes the helper returns one slot per requested word,
     # with None marking a missing vector -- confirm against the helper.
     aligned = [
         entry for entry in zip(self.vector_1_list, self.word_1_list,
                                self.vector_2_list, self.word_2_list)
         if entry[0] is not None and entry[2] is not None
     ]
     if not aligned:
         # zip(*[]) would otherwise fail with an opaque unpacking error.
         raise ValueError(
             'No bilingual dictionary entry has a vector in both models; '
             'cannot align the vector spaces.')
     (self.vector_1_list, self.word_1_list, self.vector_2_list,
      self.word_2_list) = zip(*aligned)
Exemplo n.º 3
0
 def __init__(self, corpus_fname, corpus_dir_path='.', encoding='utf-8',
              need_preprocessing=False, language='english', limit=None):
     """
     Record HC-Corpus specific settings and delegate to the parent class.

     :param corpus_fname: Corpus filename, passed on to the parent.
     :param corpus_dir_path: Corpus directory, passed on to the parent.
     :param encoding: Corpus encoding, passed on to the parent.
     :param need_preprocessing: Passed on to the parent unchanged.
     :param language: Kept on ``self.language`` only; the parent
         constructor does not receive it.
     :param limit: Passed on to the parent unchanged.
     """
     # Prefer the module-level logger; create one if the name is missing.
     try:
         self.logger = LOGGER
     except NameError:
         self.logger = log.initialise('T-Vecs.Preprocessor')
     self.language = language
     # Language -> sentence-split regex. Unknown languages yield u''.
     known_patterns = {'hindi': u'[।]'}
     self.lang_split_sent = defaultdict(lambda: u'', known_patterns)
     self.logger.info('HcCorpusPreprocessor utilised')
     parent = super(HcCorpusPreprocessor, self)
     parent.__init__(
         corpus_fname,
         corpus_dir_path=corpus_dir_path,
         encoding=encoding,
         need_preprocessing=need_preprocessing,
         limit=limit,
     )
Exemplo n.º 4
0
 def __init__(self, model_1, model_2, bilingual_dict, encoding='utf-8'):
     """
     Collect position-aligned vectors and words from both language models.

     :param model_1: Model for language 1; queried with the source words
         of the bilingual dictionary.
     :param model_2: Model for language 2; queried with the target words
         of the bilingual dictionary.
     :param bilingual_dict: (word_1, word_2) pairs, or any other input
         accepted by ``dict()``. Kept as passed on ``self.bilingual_dict``.
     :param encoding: Kept on ``self.encoding``.
     :raises ValueError: If every dictionary entry is missing a vector in
         at least one of the two models.
     """
     try:
         self.logger = LOGGER
     except NameError:
         self.logger = log.initialise('T-Vecs.VectorSpaceMapper')
     self.model_1 = model_1
     self.model_2 = model_2
     self.encoding = encoding
     self.lt = None
     self.bilingual_dict = bilingual_dict
     # Converting to a dict leaves one translation per source word, which
     # keeps keys() and values() the same length and in matching order.
     bilingual_dict = dict(bilingual_dict)
     self.logger.debug('Extracting vocabulary and vector list from model 1')
     self.vector_1_list, self.word_1_list = self._extract_vectors_and_words(
         self.model_1, bilingual_dict.keys()
     )
     self.logger.debug('Extracting vocabulary and vector list from model 2')
     (
         self.vector_2_list, self.word_2_list
     ) = VectorSpaceMapper._extract_vectors_and_words(
         self.model_2, bilingual_dict.values()
     )
     # Drop every entry that lacks a vector in either language.
     # NOTE(review): relies on the helper returning None for words it
     # could not find -- confirm against _extract_vectors_and_words.
     rows = zip(
         self.vector_1_list, self.word_1_list,
         self.vector_2_list, self.word_2_list
     )
     usable_rows = [
         row for row in rows
         if row[0] is not None and row[2] is not None
     ]
     if not usable_rows:
         # Unpacking zip(*[]) would fail with an unhelpful message.
         raise ValueError(
             'No bilingual dictionary entry has a vector in both models; '
             'cannot align the vector spaces.'
         )
     (
         self.vector_1_list, self.word_1_list,
         self.vector_2_list, self.word_2_list
     ) = zip(*usable_rows)
Exemplo n.º 5
0
 def __init__(self, corpus_fname, corpus_dir_path='.', encoding='utf-8',
              need_preprocessing=False, limit=None):
     """
     Resolve corpus paths and, when asked, create the preprocessed corpus.

     :param corpus_fname: Name of the corpus file.
     :param corpus_dir_path: Directory that holds the corpus file.
     :param encoding: Encoding used to read the corpus.
     :param need_preprocessing: Only the literal ``True`` triggers
         preprocessing (identity check); anything else treats the corpus
         file itself as already preprocessed.
     :param limit: Stored on ``self.limit``.
     """
     try:
         self.logger = LOGGER
     except NameError:
         self.logger = log.initialise('T-Vecs.Preprocessor')
     self.limit = limit
     self.corpus_fname = corpus_fname
     self.corpus_path = os.path.join(corpus_dir_path, self.corpus_fname)
     self.encoding = encoding

     if need_preprocessing is not True:
         # The corpus is used as-is: both "preprocessed" attributes simply
         # point back at the original file.
         self.logger.info(
             'Utilising Preprocessed Corpus: %s' % (self.corpus_fname))
         self.preprocessed_corpus_fname = self.corpus_fname
         self.preprocessed_corpus_path = os.path.join(
             corpus_dir_path, self.preprocessed_corpus_fname)
         return

     intermediate_fname = '%s.preprocessed' % corpus_fname
     self.preprocessed_corpus_path = os.path.join(
         corpus_dir_path, intermediate_fname)
     if os.path.exists(self.preprocessed_corpus_path):
         # An earlier run already produced the intermediate file.
         self.logger.info(
             'Preprocessed Corpus found: %s.preprocessed', corpus_fname)
     else:
         with codecs.open(self.corpus_path, 'r',
                          encoding=self.encoding) as corpus_handle:
             self.logger.debug('Extracting Corpus Data')
             extracted = self._extract_corpus_data(
                 data=corpus_handle.read())
             self._save_preprocessed_data(
                 data=extracted,
                 output_fpath=self.preprocessed_corpus_path)
             self.logger.debug('Saved Intermediate Preprocessed File')
     self.preprocessed_corpus_fname = intermediate_fname
Exemplo n.º 6
0
    def __init__(self, corpus_fname, corpus_dir_path='.', encoding='utf-8',
                 need_preprocessing=False, language='english', limit=None):
        """
        Preprocess the Leipzig corpus if needed, then run the parent init.

        :param corpus_fname: Name of the raw Leipzig corpus file.
        :param corpus_dir_path: Directory that holds the corpus.
        :param encoding: Corpus encoding.
        :param need_preprocessing: Accepted but not consulted here; the
            parent is always called with ``need_preprocessing=False``.
        :param language: Stored on ``self.language``.
        :param limit: Forwarded to the parent constructor.
        """
        try:
            self.logger = LOGGER
        except NameError:
            self.logger = log.initialise('T-Vecs.Preprocessor')
        self.language = language
        # Sentence-split regex per language; missing languages map to u''.
        self.lang_split_sent = defaultdict(lambda: u'', hindi=u'[।]')
        self.logger.info('LeipzigPreprocessor utilised')

        preprocessed_name = "%s.preprocessed" % corpus_fname
        preprocessed_path = os.path.join(corpus_dir_path, preprocessed_name)
        already_preprocessed = os.path.exists(preprocessed_path)
        if not already_preprocessed:
            # Produce "<corpus_fname>.preprocessed" from the raw corpus.
            self._leipzig_corpus_preprocess(
                corpus_fname, corpus_dir_path, encoding)

        # The parent works on the preprocessed file, so it must not
        # preprocess again.
        super(LeipzigPreprocessor, self).__init__(
            corpus_fname=preprocessed_name,
            corpus_dir_path=corpus_dir_path,
            encoding=encoding,
            need_preprocessing=False,
            limit=limit)
Exemplo n.º 7
0
 def __init__(
     self,
     corpus_fname,
     corpus_dir_path='.',
     encoding='utf-8',
     need_preprocessing=False,
     limit=None
 ):
     """
     Set up corpus locations and build the intermediate file on demand.

     :param corpus_fname: Name of the corpus file.
     :param corpus_dir_path: Directory containing the corpus file.
     :param encoding: Encoding used when reading the corpus.
     :param need_preprocessing: Preprocessing runs only when this is the
         literal ``True``; otherwise the corpus file is used directly.
     :param limit: Stored on ``self.limit``.
     """
     try:
         self.logger = LOGGER
     except NameError:
         self.logger = log.initialise('T-Vecs.Preprocessor')
     self.limit = limit
     self.corpus_fname = corpus_fname
     self.corpus_path = os.path.join(corpus_dir_path, self.corpus_fname)
     self.encoding = encoding
     if need_preprocessing is True:
         target_fname = '%s.preprocessed' % corpus_fname
         self.preprocessed_corpus_path = os.path.join(
             corpus_dir_path, target_fname
         )
         if not os.path.exists(self.preprocessed_corpus_path):
             # No intermediate file yet: read the raw corpus, extract its
             # content and persist the result.
             with codecs.open(
                 self.corpus_path, 'r', encoding=self.encoding
             ) as source:
                 self.logger.debug('Extracting Corpus Data')
                 corpus_text = source.read()
                 self._save_preprocessed_data(
                     data=self._extract_corpus_data(data=corpus_text),
                     output_fpath=self.preprocessed_corpus_path
                 )
                 self.logger.debug('Saved Intermediate Preprocessed File')
         else:
             self.logger.info(
                 'Preprocessed Corpus found: %s.preprocessed', corpus_fname
             )
         self.preprocessed_corpus_fname = target_fname
     else:
         # The corpus file itself serves as the preprocessed corpus.
         self.logger.info(
             'Utilising Preprocessed Corpus: %s' % (self.corpus_fname)
         )
         self.preprocessed_corpus_fname = self.corpus_fname
         self.preprocessed_corpus_path = os.path.join(
             corpus_dir_path, self.preprocessed_corpus_fname
         )
# -*- coding: utf-8 -*-
"""Module to map two Vector Spaces using a bilingual dictionary."""
import os
import codecs
import logging
from gensim.models import Word2Vec
import scipy.spatial.distance as dist
from sklearn.linear_model import RidgeCV
from sklearn import metrics
from nltk.corpus import wordnet as wn
from tvecs.bilingual_generator import bilingual_generator as bg
from tvecs.logger import init_logger as log
from itertools import chain
from sklearn.linear_model import LinearRegression

LOGGER = log.initialise('TVecs.VectorSpaceMapper')


class VectorSpaceMapper(object):
    """
    Class to map two vector spaces together.

    - Vector spaces obtained using the two Word2Vec models.
    - Bilingual Dict used to map semantic embeddings between vector spaces.
    - Linear Regression utilised for the mapping from
        :mod:`sklearn.linear_model`

    API Documentation:
        :param model_1: Model constructed from Language 1 built using
            :mod:`tvecs.model_generator.model_generator`.
        :param model_2: Model constructed from Language 2 built using
Exemplo n.º 9
0
    * Correlation Coefficient
    * P Value
"""

import os
import csv
import time
import codecs

from tvecs.evaluation import evaluation
from tvecs.logger import init_logger as log
from tvecs.model_generator import model_generator
from tvecs.preprocessor.hccorpus_preprocessor import HcCorpusPreprocessor
from tvecs.vector_space_mapper.vector_space_mapper import VectorSpaceMapper

LOGGER = log.initialise('TVecs.Multivariate')


def evaluate(vsm, wordsim_dataset_path):
    """
    Score a vector space mapper against a word-similarity dataset.

    :param vsm: Vector space mapper handed to the evaluation module.
    :param wordsim_dataset_path: Path passed on as ``score_data_path``.
    :return: Whatever ``evaluation.extract_correlation_coefficient``
        returns for these arguments.
    """
    correlation_result = evaluation.extract_correlation_coefficient(
        vsm=vsm,
        score_data_path=wordsim_dataset_path,
    )
    return correlation_result


def multivariate_analyse():
    """Perform multivariate analysis."""
    corpus_size = [54708929, 82063393, 109417858, 136772323]
    bilingual_size = [4516, 6774, 9032, 11291]
    dir_path = os.path.join('data', 'evaluate')
    wordsim_datasets = [('wordsim_relatedness_goldstandard.txt_translate',
                         dir_path),
Exemplo n.º 10
0
- Preprocessing Corpus - Implementation of BasePreprocessor module
    - HcCorpusPreprocessor

- Word2Vec Model Building
    - Gensim Word2Vec SkipGram implementation
"""
import os
import gensim

from tvecs.logger import init_logger as log
from tvecs.preprocessor.hccorpus_preprocessor import HcCorpusPreprocessor
from tvecs.preprocessor.emille_preprocessor import EmilleCorpusPreprocessor
from tvecs.preprocessor.leipzig_preprocessor import LeipzigPreprocessor

LOGGER = log.initialise('TVecs.ModelGeneration')


def generate_model(preprocessor_type,
                   language,
                   corpus_fname,
                   corpus_dir_path='.',
                   output_fname=None,
                   output_dir_path=os.path.join('data', 'models'),
                   need_preprocessing=True,
                   iterations=5):
    """
    Function used to preprocess and generate models.

    API Documentation
        :param preprocessor_type: Class Name for preprocessor.
Exemplo n.º 11
0
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
"""Module to map two Vector Spaces using a bilingual dictionary."""
import os
import codecs
import logging
from gensim.models import Word2Vec
import scipy.spatial.distance as dist
from sklearn.linear_model import RidgeCV
from sklearn import metrics

from tvecs.bilingual_generator import bilingual_generator as bg
from tvecs.logger import init_logger as log

LOGGER = log.initialise('TVecs.VectorSpaceMapper')


class VectorSpaceMapper(object):
    """
    Class to map two vector spaces together.

    - Vector spaces obtained using the two Word2Vec models.
    - Bilingual Dict used to map semantic embeddings between vector spaces.
    - Linear Regression utilised for the mapping from
        :mod:`sklearn.linear_model`

    API Documentation:
        :param model_1: Model constructed from Language 1 built using
            :mod:`tvecs.model_generator.model_generator`.
        :param model_2: Model constructed from Language 2 built using
            :mod:`tvecs.model_generator.model_generator`.
Exemplo n.º 12
0
def args_parser():
    """
    Parse command line arguments and drive a complete T-Vecs run.

    Flow:
        1. Parse the command line. If ``--config`` is given, the values
           returned by ``parse_config`` replace the command line values.
        2. Either load two precomputed Word2Vec models (needs both models,
           both languages and a bilingual dictionary) or, when a config
           supplied ``corpus1``/``corpus2``, schedule the full pipeline.
        3. Run ``evaluate`` and report the mean square error on the
           training dictionary and, if a sibling ``test`` file exists, on
           the testing dictionary.
        4. Optionally enter an interactive recommendation loop.

    Returns ``None``. When the required arguments are missing an error is
    logged and the function returns early.
    """
    global order_of_tvex_calls, order_of_evaluation
    parser = argparse.ArgumentParser(
        description='Script used to generate models'
    )
    parser.add_argument(
        "-v",
        "--verbose",
        help="increase output verbosity",
        action="store_true"
    )
    parser.add_argument(
        "-s",
        "--silent",
        help="silence all logging",
        action="store_true"
    )
    parser.add_argument(
        "-i",
        "--iter",
        help="number of Word2Vec iterations",
        default=5,
        # Without a type the command line value would stay a string.
        type=int,
        action="store"
    )
    parser.add_argument(
        "-m1",
        "--model1",
        dest="model1",
        help="pre-computed model file path",
        action="store"
    )
    parser.add_argument(
        "-m2",
        "--model2",
        dest="model2",
        help="pre-computed model file path",
        action="store"
    )
    parser.add_argument(
        "-l1",
        "--language1",
        dest="language1",
        help="language name of model 1/ text 1",
        action="store"
    )
    parser.add_argument(
        "-l2",
        "--l2",
        dest="language2",
        help="language name of model 2/ text 2",
        action="store"
    )
    parser.add_argument(
        "-c",
        "--config",
        dest="config",
        help="config file path",
        action="store"
    )
    parser.add_argument(
        "-b",
        "--bilingual",
        dest="bilingual_dict",
        help="bilingual dictionary path",
        action="store"
    )
    parser.add_argument(
        "-r",
        "--recommendations",
        dest="recommendations",
        help="provide recommendations",
        action="store_true"
    )
    args = parser.parse_args()
    logger = log.initialise('TVecs')
    log.set_logger_normal(logger)
    parse_success = False
    try:
        # The config file has higher priority: when it is given, the
        # command line values are overridden.
        if args.config:
            (
                args.language1, args.language2,
                args.model1, args.model2,
                args.corpus1, args.corpus2,
                args.iter, args.silent, args.verbose,
                args.bilingual_dict
            ) = parse_config(args.config)

        if args.verbose is True:
            log.set_logger_verbose(logger)

        elif args.silent is True:
            log.set_logger_silent(logger)

        valid_model = args.model1 and args.model2
        valid_lang = args.language1 and args.language2
        # Precomputed models: load them and skip the first two pipeline
        # stages.
        if valid_model and valid_lang and args.bilingual_dict:
            logger.info(
                'Loading Model of %s :%s', args.language1, args.model1
            )
            model_1 = Word2Vec.load(args.model1)
            logger.info(
                'Loading Model of %s :%s', args.language2, args.model2
            )
            model_2 = Word2Vec.load(args.model2)
            order_of_evaluation = order_of_tvex_calls[2:]
            tvex_calls['model_generator']['result'] = (
                model_1,
                model_2
            )
            parse_success = True

        # Corpora supplied through the config: run the whole pipeline.
        # corpus1/corpus2 are not command line options, so without a
        # config this lookup raises AttributeError, handled below.
        elif args.corpus1 and args.corpus2:
            order_of_evaluation = order_of_tvex_calls[:]
            parse_success = True

    except AttributeError:
        parse_success = False

    # Insufficient arguments were passed.
    if parse_success is False:
        logger.error(
            "Required arguments not passed, run --help for more details"
        )
        return

    old_time = time.time()
    evaluate(logger, args)
    tvecs_vm = tvex_calls['vector_space_mapper']['result']
    logger.info(
        'Evaluation of Training Dataset'
    )
    tvecs_vm.obtain_mean_square_error_from_dataset(
        dataset_path=args.bilingual_dict
    )
    fpath, fname = ntpath.split(args.bilingual_dict)
    test_fname = fname.replace('train', 'test')
    test_path = os.path.join(fpath, test_fname)
    # Evaluate the sibling *test* file. The name check avoids scoring the
    # training file a second time when its name contains no 'train'.
    if test_fname != fname and os.path.exists(test_path):
        logger.info(
            'Evaluation of Testing Dataset'
        )
        tvecs_vm.obtain_mean_square_error_from_dataset(
            dataset_path=test_path
        )
    new_time = time.time()
    loading_time = new_time - old_time
    logger.info("Execution Time: %s", loading_time)
    if args.recommendations is True:
        logger.info(
            "Recommendation Engine: %s => %s",
            args.language1, args.language2
        )
        while True:
            choice = raw_input(
                '\nEnter your Choice:\n1> Recommendation\n2> Exit\n\nChoice: '
            )
            try:
                wants_recommendation = int(choice) == 1
            except ValueError:
                # Non-numeric input leaves the loop instead of crashing.
                wants_recommendation = False
            if not wants_recommendation:
                break
            word = raw_input(
                "\nEnter word in Language %s: " % args.language1
            )
            tvecs_vm.get_recommendations_from_word(
                word,
                pretty_print=True
            )
Exemplo n.º 13
0
# -*- coding: utf-8 -*-
"""
Utilise Yandex Translation Service.

- Obtain bilingual semantic human score.
"""

import os
import json
import codecs
import requests

from tvecs.logger import init_logger as log


LOGGER = log.initialise('TVecs.Yandex')


def get_valid_translation(word, from_to):
    """
    Ensure the translation is valid.

    Return only single word translations.
    If multiple words translations, return None.

    API Documentation
        :param word: word to be translated
        :param from_to: language codes pair representing the src/target lang
        :type from_to: String
        :type word: String
        :return: translated word
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
"""Module to Evaluate T-Vecs model against Human Semantic Similarity Score."""
import os
import codecs
from scipy.stats import pearsonr
from gensim.models import Word2Vec

from tvecs.logger import init_logger as log
from tvecs.bilingual_generator import bilingual_generator as bg
from tvecs.vector_space_mapper.vector_space_mapper import VectorSpaceMapper

LOGGER = log.initialise('TVecs.Evaluation')


def extract_correlation_coefficient(score_data_path, vsm):
    """
    Extract Human Score, Word1, Word2. Compute T-Vecs Score.

    API Documentation
        :param score_data_path: File generated by preprocessor/yandex
        :param vsm: Vector spaces mapped using 2 models.
        :type score_data_path: :class:`String`
        :type vsm: :mod:`tvecs.vector_space_mapper.vector_space_mapper`
        :return: Returns (Correlation coefficient, P-Value)
        :rtype: :class:`Tuple(Float, Float)`
    """
    LOGGER.info('Extracting Human Score from score data path: %s',
                score_data_path)
    with codecs.open(score_data_path, 'r', encoding='utf-8') as score_file:
        human_score, calculated_score = zip(*[[
Exemplo n.º 15
0
    * Correlation Coefficient
    * P Value
"""

import os
import csv
import time
import codecs

from tvecs.evaluation import evaluation
from tvecs.logger import init_logger as log
from tvecs.model_generator import model_generator
from tvecs.preprocessor.hccorpus_preprocessor import HcCorpusPreprocessor
from tvecs.vector_space_mapper.vector_space_mapper import VectorSpaceMapper

LOGGER = log.initialise('TVecs.Multivariate')


def evaluate(vsm, wordsim_dataset_path):
    """
    Compute the evaluation result of ``vsm`` on a word-similarity dataset.

    :param vsm: Vector space mapper to evaluate.
    :param wordsim_dataset_path: Dataset path, forwarded as
        ``score_data_path``.
    :return: The value produced by
        ``evaluation.extract_correlation_coefficient``.
    """
    scorer = evaluation.extract_correlation_coefficient
    return scorer(score_data_path=wordsim_dataset_path, vsm=vsm)


def multivariate_analyse():
    """Perform multivariate analysis."""
    corpus_size = [54708929, 82063393, 109417858, 136772323]
    bilingual_size = [4516, 6774, 9032, 11291]
    dir_path = os.path.join(
Exemplo n.º 16
0
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
"""
Utilise Yandex Translation Service.

- Obtain bilingual semantic human score.
"""

import os
import json
import codecs
import requests

from tvecs.logger import init_logger as log

LOGGER = log.initialise('TVecs.Yandex')


def get_valid_translation(word, from_to):
    """
    Ensure the translation is valid.

    Return only single word translations.
    If multiple words translations, return None.

    API Documentation
        :param word: word to be translated
        :param from_to: language codes pair representing the src/target lang
        :type from_to: String
        :type word: String
        :return: translated word
Exemplo n.º 17
0
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
"""EMILLE Corpus Preprocessor which inherits from BasePreprocessor."""
import regex as re
from bs4 import BeautifulSoup
from collections import defaultdict
from nltk.tokenize import sent_tokenize

from tvecs.preprocessor.base_preprocessor import BasePreprocessor
from tvecs.logger import init_logger as log

LOGGER = log.initialise('TVecs.Preprocessor')


class EmilleCorpusPreprocessor(BasePreprocessor):
    """
    Emille Corpus Preprocessor which preprocesses the EMILLE Corpus.

    API Documentation:
        :param corpus_fname: Corpus Filename to be preprocessed
        :param corpus_dir_path: Corpus Directory Path
                                [ Default Current Directory ]
        :param encoding: Encoding format of the corpus
                                [ Default utf-8 ]
        :param language: Language of the model constructed
                                [ Default English ]
        :param limit: Number of tokenized words to be limited to
                                [ Default None ]
        :param need_preprocessing: Preprocess corpus to obtain
            only the valid content from the file to an intermediate file
            [ False - Corpus has each sentence in separate lines ]
Exemplo n.º 18
0
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
"""Module used to generate bilingual dictionary."""

import os
import random
import codecs
from gensim.models import Word2Vec

from tvecs.bilingual_generator import cluster as cl
from tvecs.logger import init_logger as log


LOGGER = log.initialise('TVecs.BilingualDictionary')


def load_bilingual_dictionary(bilingual_dictionary_path, encoding='utf-8'):
    """
    Load bilingual dictionary from the specified bilingual_dictionary_path.

    API Documentation
        :param bilingual_dictionary_path: Path for Bilingual Dictionary.
        :param encoding: Encoding of the bilingual dictionary.
        :type bilingual_dictionary_path: :class:`String`
        :type encoding: :class:`String`
        :return: Bilingual Dictionary loaded.
        :rtype: :class:`List`
    """
    LOGGER.info(
        'Loading Bilingual Dictionary: %s', bilingual_dictionary_path
    )
Exemplo n.º 19
0
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
"""HC Corpus Preprocessor which inherits from BasePreprocessor."""
import regex as re
from collections import defaultdict
from nltk.tokenize import sent_tokenize

from tvecs.preprocessor.base_preprocessor import BasePreprocessor
from tvecs.logger import init_logger as log

LOGGER = log.initialise('TVecs.Preprocessor')


class HcCorpusPreprocessor(BasePreprocessor):
    """
    Hc-Corpus Preprocessor which preprocesses the Hc-Corpus.

    API Documentation:
        :param corpus_fname: Corpus Filename to be preprocessed
        :param corpus_dir_path: Corpus Directory Path
                                [ Default Current Directory ]
        :param encoding: Encoding format of the corpus
                                [ Default utf-8 ]
        :param language: Language of the model constructed
                                [ Default English ]
        :param limit: Number of tokenized words to be limited to
                                [ Default None ]
        :param need_preprocessing: Preprocess corpus to obtain
            only the valid content from the file to an intermediate file
            [ False - Corpus has each sentence in separate lines ]
        :type corpus_fname: :class:`String`
Exemplo n.º 20
0
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
"""Module to Evaluate T-Vecs model against Human Semantic Similarity Score."""
import os
import codecs
from scipy.stats import pearsonr
from gensim.models import Word2Vec

from tvecs.logger import init_logger as log
from tvecs.bilingual_generator import bilingual_generator as bg
from tvecs.vector_space_mapper.vector_space_mapper import VectorSpaceMapper

LOGGER = log.initialise('TVecs.Evaluation')


def extract_correlation_coefficient(score_data_path, vsm):
    """
    Extract Human Score, Word1, Word2. Compute T-Vecs Score.

    API Documentation
        :param score_data_path: File generated by preprocessor/yandex
        :param vsm: Vector spaces mapped using 2 models.
        :type score_data_path: :class:`String`
        :type vsm: :mod:`tvecs.vector_space_mapper.vector_space_mapper`
        :return: Returns (Correlation coefficient, P-Value)
        :rtype: :class:`Tuple(Float, Float)`
    """
    LOGGER.info(
        'Extracting Human Score from score data path: %s', score_data_path
    )
    with codecs.open(score_data_path, 'r', encoding='utf-8') as score_file:
Exemplo n.º 21
0
- Preprocessing Corpus - Implementation of BasePreprocessor module
    - HcCorpusPreprocessor

- Word2Vec Model Building
    - Gensim Word2Vec SkipGram implementation
"""
import os
import gensim

from tvecs.logger import init_logger as log
from tvecs.preprocessor.hccorpus_preprocessor import HcCorpusPreprocessor
from tvecs.preprocessor.emille_preprocessor import EmilleCorpusPreprocessor
from tvecs.preprocessor.leipzig_preprocessor import LeipzigPreprocessor

LOGGER = log.initialise('TVecs.ModelGeneration')


def generate_model(
        preprocessor_type,
        language,
        corpus_fname,
        corpus_dir_path='.',
        output_fname=None,
        output_dir_path=os.path.join('data', 'models'),
        need_preprocessing=True,
        iterations=5
):
    """
    Function used to preprocess and generate models.
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
"""Module used to generate bilingual dictionary."""
import re
import os
import random
import codecs
from gensim.models import Word2Vec

from tvecs.bilingual_generator import cluster as cl
from tvecs.logger import init_logger as log


LOGGER = log.initialise('TVecs.BilingualDictionary')


def load_bilingual_dictionary(bilingual_dictionary_path, encoding='utf-8'):
    """
    Load bilingual dictionary from the specified bilingual_dictionary_path.

    API Documentation
        :param bilingual_dictionary_path: Path for Bilingual Dictionary.
        :param encoding: Encoding of the bilingual dictionary.
        :type bilingual_dictionary_path: :class:`String`
        :type encoding: :class:`String`
        :return: Bilingual Dictionary loaded.
        :rtype: :class:`List`
    """
    LOGGER.info(
        'Loading Bilingual Dictionary: %s', bilingual_dictionary_path
    )
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
"""Preprocess Evaluation Dataset by translating 1 column."""

import os
import codecs
from tvecs.logger import init_logger as log
from tvecs.preprocessor import yandex_api as yp

LOGGER = log.initialise('TVecs.Evaluation.Preprocess')


def preprocess_dataset(dataset_path, delimiter='', encoding='utf-8'):
    """Preprocess Evaluation dataset by preprocessing 1 column."""
    with codecs.open(dataset_path, 'r', encoding=encoding) as dataset_handle:
        LOGGER.info('Opening Evaluation Dataset for Preprocessing.')
        data = dataset_handle.read().split()
        output_data = []
        for line in data:
            word_1, word_2, score = line.split(delimiter)
            t_word = yp.get_translation(word_2, 'en-hi').split()
            if len(t_word) > 0:
                t_word = t_word[0]
                processed_data = "\t".join([word_1, t_word, score])
                LOGGER.debug(
                    '%s : %s' %
                    ('Preprocessed Evaluation Dataset', preprocess_dataset))
                output_data.append(processed_data)
    with codecs.open('%s_%s' % (dataset_path, 'translate'),
                     'w',
                     encoding=encoding) as output_handle:
"""Test."""
import os
import json
import codecs
from sklearn.cluster import AffinityPropagation

from tvecs.logger import init_logger as log

LOGGER = log.initialise('TVecs.Clustering')


def build_clusters(entire_word_list, model, damping_factor=0.5):
    """
    Cluster word_list using Affinity Propagation.

    - Clustering based on the vectors from the Word2Vec model.

    API Documentation:
        :param entire_word_list: Word List provided to cluster.
        :param model:  Model to obtain the vectors for the word_list.
        :param damping_factor: Damping factor for the affinity propagation.
        :type entire_word_list: :class:`List`
        :type model: :mod:`gensim.models.Word2Vec`
        :type damping_factor: :class:`Float`
    """
    vocab = set(entire_word_list)
    vocab_dict = {}
    for word in vocab:
        try:
            vocab_dict[word] = model[word]
        except KeyError: