Пример #1
0
 def train(cls, corpus, sim_metric, feature_num=5, sim_model='weighted'):
     '''
     Build a classifier from a labelled corpus.

     The tokens of every sentence are lemmatized and pooled per category.
     For each category the ``feature_num`` most frequent tokens become its
     features, and each feature is weighted as
     token_count / total_count_of_the_selected_features.

     :param corpus: iterable of (sentence, category) pairs
     :param sim_metric: passed through unchanged to the constructor
     :param feature_num: number of most frequent tokens kept per category
     :param sim_model: passed through unchanged to the constructor
     :return: cls(labels, cat_features, feature_weights, sim_metric,
              sim_model)
     '''
     cat_word = {}
     for sent, cat in corpus:
         cat_word.setdefault(cat, []).extend(
             lemmatization(word_tokenize(sent)))
     features = {cat: Counter(tokens) for cat, tokens in cat_word.items()}
     # A real list on both Python 2 and 3 (keys() is only a view on 3).
     labels = list(features)
     cat_features = {}
     feature_weights = {}
     # items() works on Python 2 and 3; iteritems() exists only on 2.
     for c, f in features.items():
         w_c_pairs = f.most_common(feature_num)
         if not w_c_pairs:
             # The category produced no tokens (or feature_num is 0):
             # zip(*[]) cannot be unpacked and there is nothing to weight.
             cat_features[c] = ()
             feature_weights[c] = []
             continue
         words, counts = zip(*w_c_pairs)
         cat_features[c] = words
         total_count = float(sum(counts))
         feature_weights[c] = [(w, count / total_count)
                               for w, count in w_c_pairs]
     return cls(labels, cat_features, feature_weights, sim_metric,
                sim_model)
Пример #2
0
 def transform(self, X):
     """
     Turn raw texts into feature representations.

     Every text is tokenized and lemmatized, then handed to the feature
     extractor selected by ``self._model``:
     'onehot' -> unigram_features, 'wordnet' -> wordnet_features,
     'word2vec' -> word2vec_features, 'both' -> semantic_features.

     :param X: iterable of texts
     :return: list with one extractor result per text, or None when
              ``self._model`` is not one of the names above
     """
     extractor_names = {
         'onehot': 'unigram_features',
         'wordnet': 'wordnet_features',
         'word2vec': 'word2vec_features',
         'both': 'semantic_features',
     }
     # Materialise lists explicitly: map() is a lazy, one-shot iterator on
     # Python 3, whereas callers get a reusable sequence on Python 2.
     X_tokens = [lemmatization(word_tokenize(x)) for x in X]
     name = extractor_names.get(self._model)
     if name is None:
         return None
     # Resolve only the selected extractor, as the if/elif chain did.
     extract = getattr(self, name)
     return [extract(tokens) for tokens in X_tokens]
Пример #3
0
 def extract_features(self, corpus, feature_num=10):
     """
     Collect the most frequent lemmatized tokens of every category.

     :param corpus: iterable of (sentence, category) pairs
     :param feature_num: number of most frequent tokens taken per category
     :return: set with the union of the top tokens of all categories
     """
     cat_word = {}
     for sent, cat in corpus:
         cat_word.setdefault(cat, []).extend(
             lemmatization(word_tokenize(sent)))
     feature_words = set()
     # values() works on Python 2 and 3 (iteritems() is Python 2 only).
     # Reading the words straight from the (word, count) pairs also copes
     # with a category that has no tokens, where zip(*[]) cannot be unpacked.
     for tokens in cat_word.values():
         feature_words.update(
             word for word, _ in Counter(tokens).most_common(feature_num))
     return feature_words
Пример #4
0
def test_sim_graph():
    """Smoke test: page-rank the words of a DBpedia abstract with SimGraph.

    Fetches the features of the Tom_Cruise resource, builds a similarity
    graph over the distinct lemmatized words of its abstract and checks
    that the ten best ranked words can be extracted.
    """
    from sematch.semantic.graph import SimGraph
    from sematch.semantic.similarity import WordNetSimilarity
    from sematch.nlp import Extraction, lemmatization
    from sematch.sparql import EntityFeatures
    from collections import Counter
    resource = 'http://dbpedia.org/resource/Tom_Cruise'
    entity = EntityFeatures().features(resource)
    abstract_words = Extraction().extract_words_sent(entity['abstract'])
    # Deduplicate so every word is a single node of the graph.
    vocabulary = list(set(lemmatization(abstract_words)))
    similarity = WordNetSimilarity()
    graph = SimGraph(vocabulary, similarity.word_similarity)
    ranking = Counter(graph.page_rank())
    # Unpacking into two names also requires a non-empty ranking.
    top_words, _top_scores = zip(*ranking.most_common(10))
    assert top_words is not None
Пример #5
0
def test_sim_graph():
    """Smoke test for SimGraph.page_rank on the words of a DBpedia abstract.

    The abstract of the Tom_Cruise resource is split into words, lemmatized
    and deduplicated; the words are ranked over a WordNet word-similarity
    graph and the ten highest ranked ones are extracted.
    """
    from sematch.semantic.graph import SimGraph
    from sematch.semantic.similarity import WordNetSimilarity
    from sematch.nlp import Extraction, lemmatization
    from sematch.sparql import EntityFeatures
    from collections import Counter
    entity_features = EntityFeatures().features(
        'http://dbpedia.org/resource/Tom_Cruise')
    extracted = Extraction().extract_words_sent(entity_features['abstract'])
    unique_words = list(set(lemmatization(extracted)))
    wordnet_sim = WordNetSimilarity()
    sim_graph = SimGraph(unique_words, wordnet_sim.word_similarity)
    rank_by_word = sim_graph.page_rank()
    best = Counter(rank_by_word).most_common(10)
    # Two-name unpacking fails on an empty ranking, as in a plain zip(*...).
    ranked_words, _ranked_scores = zip(*best)
    assert ranked_words is not None
Пример #6
0
 def train(cls, X, y, classifier=LinearSVC, model='bow'):
     """
     Fit a text classifier and wrap it together with its encoders.

     :param X: iterable of texts; each one is tokenized and lemmatized by
         the vectorizer
     :param y: labels for X, encoded with a LabelEncoder
     :param classifier: estimator class or ready-made instance; a class is
         instantiated without arguments
     :param model: 'bow' selects CountVectorizer, any other value selects
         TfidfVectorizer
     :return: cls(labels, vectorizer, classifier) with all three fitted
     """
     def tokenize(text):
         return lemmatization(word_tokenize(text))

     labels = LabelEncoder()
     y_train = labels.fit_transform(y)
     if model == 'bow':
         vectorizer = CountVectorizer(tokenizer=tokenize)
     else:
         vectorizer = TfidfVectorizer(tokenizer=tokenize)
     X_train = vectorizer.fit_transform(X)
     if isinstance(classifier, type):
         classifier = classifier()
     # Classifiers are trained with fit(); fit_transform() is a transformer
     # method that current LinearSVC does not provide, and its result was
     # never used here.
     classifier.fit(X_train, y_train)
     return cls(labels, vectorizer, classifier)
Пример #7
0
    def classify_single(self, sent, feature_model='max'):
        """
        Pick the category that is most similar to the words of a sentence.

        The sentence is tokenized and lemmatized and duplicate words are
        dropped. Every remaining word is scored against each category with
        ``self.category_similarity``; per category the word scores are
        combined into one value. A 0.0 is always part of the combination,
        so a sentence without words scores 0.0 for every category.

        :param sent: the sentence to classify
        :param feature_model: 'max' keeps the best word score of a category,
            any other value sums the word scores
        :return: the category label with the highest combined score
        """
        feature_words = list(set(lemmatization(word_tokenize(sent))))
        # Both branches of the original differ only in the aggregator.
        combine = max if feature_model == 'max' else sum
        score = {}
        for category in self._categories:
            similarities = [self.category_similarity(word, category)
                            for word in feature_words]
            score[category] = combine(similarities + [0.0])
        best_label, _best_score = Counter(score).most_common(1)[0]
        return best_label
Пример #8
0
 def extract_words(self, text):
     """Tokenize ``text`` and return the lemmatized tokens."""
     tokens = word_tokenize(text)
     return lemmatization(tokens)
Пример #9
0
 def gloss_overlap(self, c1, c2):
     """
     Count the distinct word stems shared by the definitions of c1 and c2.

     Both definitions are tokenized, lemmatized and stemmed with ``porter``
     before the two stem sets are intersected.

     :param c1: object exposing ``definition()``
     :param c2: object exposing ``definition()``
     :return: number of stems that occur in both definitions
     """
     first_tokens = lemmatization(word_tokenize(c1.definition()))
     second_tokens = lemmatization(word_tokenize(c2.definition()))
     first_stems = {porter.stem(token) for token in first_tokens}
     second_stems = {porter.stem(token) for token in second_tokens}
     return len(first_stems & second_stems)
Пример #10
0
 def extract_words(self, text):
     """Return the lemmatized word tokens of ``text``."""
     raw_tokens = word_tokenize(text)
     lemmas = lemmatization(raw_tokens)
     return lemmas
Пример #11
0
 def gloss_overlap(self, c1, c2):
     """
     Measure how many stemmed gloss words two definitions have in common.

     The text returned by ``definition()`` of each argument is tokenized and
     lemmatized, every token is reduced to its ``porter`` stem, and the size
     of the intersection of the two stem sets is returned.

     :param c1: object exposing ``definition()``
     :param c2: object exposing ``definition()``
     :return: size of the intersection of both stem sets
     """
     tokens_a = lemmatization(word_tokenize(c1.definition()))
     tokens_b = lemmatization(word_tokenize(c2.definition()))
     stems_a = set()
     for token in tokens_a:
         stems_a.add(porter.stem(token))
     stems_b = set()
     for token in tokens_b:
         stems_b.add(porter.stem(token))
     shared = stems_a & stems_b
     return len(shared)
Пример #12
0
 def context2words(self, sent):
     """
     Turn a sentence into lemmatized context words.

     The sentence is lower-cased and tokenized, tokens of one or two
     characters are discarded, and the remaining tokens are lemmatized.

     :param sent: the sentence to process
     :return: result of ``lemmatization`` on the kept tokens
     """
     kept = []
     for token in word_tokenize(sent.lower()):
         if len(token) > 2:
             kept.append(token)
     return lemmatization(kept)