示例#1
0
    def create_mat(self, list_, row2word=None, col2word=None):
        """
        Build an IndexMatrix from raw matrix data.

        Falls back to this instance's row/column word mappings when no
        explicit mappings are supplied.
        """
        # Use the instance-level mappings unless the caller overrides them.
        rows = self.row2word if row2word is None else row2word
        cols = self.col2word if col2word is None else col2word
        return IndexMatrix(sp.coo_matrix(list_), rows, cols)
示例#2
0
 def create_unknown(self):
     """
     Build a normalized index vector for out-of-vocabulary words.

     The returned vector is (near-)orthogonal to all other index
     vectors, so unknown words contribute no systematic similarity.
     """
     # Seed the RNG deterministically from the token name so the same
     # unknown vector is produced on every run.
     digest = hashlib.md5()
     digest.update("$UNKNOWN$".encode())
     # Clamp the digest into the range accepted by np.random.seed.
     np.random.seed(int(digest.hexdigest(), 16) % 4294967295)
     # Choose which dimensions receive a nonzero entry.
     chosen = np.random.permutation(
         self.config['dimensionality'])[:self.config['num_indices']]
     half = chosen.size // 2
     vec = np.zeros((1, self.config['dimensionality']))
     vec[0, chosen[half:]] = 1    # second half of the indices -> +1
     vec[0, chosen[:half]] = -1   # first half of the indices  -> -1
     matrix = IndexMatrix(vec,
                          ['$UNKNOWN$'],
                          list(range(self.config['dimensionality'])))
     return matrix / matrix.norm()
示例#3
0
 def setUp(self):
     """Create a small 2x3 fixture matrix shared by the tests."""
     self.row2word = ['a', 'b']
     self.col2word = ['furiously', 'makes', 'sense']
     self.spmat = sp.coo_matrix([[2, 5, 3], [0, 1, 9]])
     self.mat = IndexMatrix(self.spmat, self.row2word, self.col2word)
示例#4
0
 def __init__(self, config=None):
     """
     Initialize the model.

     :param config: optional configuration mapping forwarded to the
         parent class; create_unknown() reads 'dimensionality' and
         'num_indices' from self.config, so config must be populated
         by the parent before create_unknown() runs.
     """
     super().__init__(config=config)
     # Start with an empty language space; presumably one row vector
     # per language is merged in during training.
     self.langvectors = IndexMatrix({})
     # Deterministic fallback vector for out-of-vocabulary words.
     self.unknown_vec = self.create_unknown()
示例#5
0
class Eigenvectors(RILangID):

    """
    The idea behind Eigenvectors is that we create a language space by creating random indexing matrices for
    each language. Each RI matrix is then collapsed into a vector by summing all columns. This vector will
    be an approximation to the first eigenvector, according to the power iteration method.
    http://en.wikipedia.org/wiki/Power_iteration

    By doing this for each language, we get a language space of one language vector per language, being the
    first eigenvectors. When identifying a sentence, we score each word of the sentence against the
    eigenvector of every language and pick the language whose word vectors are, in sum, most similar
    (highest accumulated absolute cosine similarity) to its eigenvector.

    This doesn't really seem to work very well.
    """

    def __init__(self, config=None):
        """
        Initialize the model.

        :param config: optional configuration mapping forwarded to the
            parent class; create_unknown() reads 'dimensionality' and
            'num_indices' from self.config.
        """
        super().__init__(config=config)
        # Language space: one row vector per trained language (filled by train()).
        self.langvectors = IndexMatrix({})
        # Deterministic fallback vector for out-of-vocabulary words.
        self.unknown_vec = self.create_unknown()

    def create_unknown(self):
        """
        Build a normalized index vector for out-of-vocabulary words.

        The vector is (near-)orthogonal to all other index vectors, so
        unknown words contribute no systematic similarity.
        """
        # Seed the RNG deterministically from the token name so the same
        # unknown vector is produced on every run.
        hsh = hashlib.md5()
        hsh.update("$UNKNOWN$".encode())
        # Clamp the digest into the range accepted by np.random.seed.
        seed = int(hsh.hexdigest(), 16) % 4294967295
        np.random.seed(seed)
        rand_indices = np.random.permutation(
            self.config['dimensionality'])[:self.config['num_indices']]
        # Second half of the chosen dimensions get +1, first half -1.
        pos_indices = rand_indices[rand_indices.size // 2:]
        neg_indices = rand_indices[:rand_indices.size // 2]
        unknown_vec = np.zeros((1, self.config['dimensionality']))
        unknown_vec[0, pos_indices] = 1
        unknown_vec[0, neg_indices] = -1
        unknown_vec = IndexMatrix(unknown_vec,
                                  ['$UNKNOWN$'],
                                  list(range(self.config['dimensionality'])))
        return unknown_vec / unknown_vec.norm()

    def identify(self, sentence):
        """
        Return the most likely language for *sentence*, or None when no
        language scores above zero.

        Words unknown to a language's model are simply skipped for that
        language (they add nothing to its score).
        """
        words = sentence.split(" ")
        best_lang = None
        best_score = 0
        assure_consistency = self.config.get('assure_consistency', False)
        for language, mat in self.matrix.items():
            # Hoisted out of the word loop: the language vector is
            # invariant per language.
            langvec = self.langvectors[language]
            # NOTE: despite the name, this accumulates cosine
            # *similarity* — higher means a better match.
            distance = 0
            for w in words:
                if w in mat.row2word:
                    wordvec = mat[w]
                    distance += abs(pydsm.similarity.cos(
                        wordvec,
                        langvec,
                        assure_consistency=assure_consistency)[0, 0])

            if distance > best_score:
                best_lang = language
                best_score = distance

        return best_lang

    def train(self, corpora):
        """
        Train the model according to the class documentation.

        :param corpora: mapping of language name -> corpus; builds one
            RI matrix per language and merges its column sum (the
            approximate first eigenvector) into self.langvectors.
        """
        self.matrix = {}
        for language, corpus in corpora.items():
            print("Reading {}...".format(language))
            self.matrix[language] = self.build(corpus)
            langmodel = self.matrix[language].sum(axis=0)
            # Label the summed row with the language name before merging.
            langmodel.row2word = [language]
            self.langvectors = self.langvectors.merge(langmodel)

    def build(self, text):
        """
        Create a random indexing space for the given text.
        """
        model = RandomIndexing(corpus=text, config=self.config).matrix
        return model