Example #1
from collections import Counter

from nltk import word_tokenize
from sklearn.base import BaseEstimator, TransformerMixin

# WordNetSimilarity, WordVecSimilarity and lemmatization are assumed to come
# from the sematch project; adjust these import paths to your installation.
from sematch.nlp import lemmatization
from sematch.semantic.similarity import WordNetSimilarity, WordVecSimilarity


class TextPreprocessor(BaseEstimator, TransformerMixin):
    """
    Transform input text into a feature representation.
    """
    def __init__(self,
                 corpus,
                 feature_num=10,
                 model='onehot',
                 wn_method='path',
                 vec_file='models/GoogleNews-vectors-negative300.bin',
                 binary=True):
        """
        :param corpus: iterable of (sentence, category) pairs used to pick feature words
        :param feature_num: number of feature words kept per category
        :param model: 'onehot', 'wordnet', 'word2vec' or 'both'
        """
        self._model = model
        self._wn_method = wn_method
        self._features = self.extract_features(corpus, feature_num)
        # Only load the similarity back ends that the chosen model needs.
        self._wns = WordNetSimilarity() if model in ('wordnet', 'both') else None
        self._wvs = WordVecSimilarity(vec_file, binary) if model in ('word2vec', 'both') else None

    def fit(self, X, y=None):
        # Stateless transformer: the feature words are already extracted in __init__.
        return self

    def inverse_transform(self, X):
        return X

    def extract_features(self, corpus, feature_num=10):
        """Collect the most frequent lemmas of each category as feature words."""
        cat_word = {}
        for sent, cat in corpus:
            cat_word.setdefault(cat, []).extend(lemmatization(word_tokenize(sent)))
        features = {cat: Counter(words) for cat, words in cat_word.items()}
        feature_words = set()
        for counter in features.values():
            words, _ = zip(*counter.most_common(feature_num))
            feature_words.update(words)
        return feature_words

    def similarity(self, tokens, feature, method='wordnet'):
        """Return the highest similarity between the feature word and any token."""
        if method == 'wordnet':
            sim = lambda x: self._wns.word_similarity(feature, x, self._wn_method)
        else:
            sim = lambda x: self._wvs.word_similarity(feature, x)
        # Appending 0.0 keeps scores non-negative and handles empty token sets;
        # list() is needed because map() returns an iterator in Python 3.
        return max(list(map(sim, tokens)) + [0.0])

    def unigram_features(self, tokens):
        """Binary indicator features: does the text contain each feature word?"""
        words = set(tokens)
        return {'contains({})'.format(f): (f in words) for f in self._features}

    def wordnet_features(self, tokens):
        """WordNet-based similarity between the text and each feature word."""
        words = set(tokens)
        return {'wns({})'.format(f): self.similarity(words, f) for f in self._features}

    def word2vec_features(self, tokens):
        """word2vec-based similarity between the text and each feature word."""
        words = set(tokens)
        return {'w2v({})'.format(f): self.similarity(words, f, method='word2vec')
                for f in self._features}

    def semantic_features(self, tokens):
        """Both WordNet and word2vec similarity features for each feature word."""
        words = set(tokens)
        features = {}
        for f in self._features:
            features['wns({})'.format(f)] = self.similarity(words, f)
            features['w2v({})'.format(f)] = self.similarity(words, f, method='word2vec')
        return features

    def transform(self, X):
        X_tokens = [lemmatization(word_tokenize(x)) for x in X]
        if self._model == 'onehot':
            return [self.unigram_features(t) for t in X_tokens]
        elif self._model == 'wordnet':
            return [self.wordnet_features(t) for t in X_tokens]
        elif self._model == 'word2vec':
            return [self.word2vec_features(t) for t in X_tokens]
        elif self._model == 'both':
            return [self.semantic_features(t) for t in X_tokens]
        raise ValueError('unknown model: {}'.format(self._model))
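
A minimal usage sketch, assuming a toy (sentence, category) corpus and scikit-learn's DictVectorizer and LinearSVC; the corpus, labels and pipeline names below are illustrative, not part of the example above. The 'onehot' model is used so that no WordNet or word2vec resources need to be loaded:

from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Hypothetical toy corpus of (sentence, category) pairs.
corpus = [
    ('the striker scored a late goal', 'sports'),
    ('parliament passed the new budget', 'politics'),
    ('the team won the championship match', 'sports'),
    ('the senate debated the tax bill', 'politics'),
]
texts = [sent for sent, _ in corpus]
labels = [cat for _, cat in corpus]

pipeline = Pipeline([
    ('preprocess', TextPreprocessor(corpus, feature_num=5, model='onehot')),
    ('vectorize', DictVectorizer()),  # dicts of features -> sparse matrix
    ('classify', LinearSVC()),
])
pipeline.fit(texts, labels)
print(pipeline.predict(['a thrilling goal in the final match']))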