Example #1
from sklearn.feature_extraction.text import CountVectorizer


def build_text_processor(
    tokenize=True,
    lowercase=True,
    strip_accents='unicode',
    **kwargs,
):
    """ Generates a text preprocessor from sklearn CountVectorizer tools

    It is based on sklearn CountVectorizer functionalities.
    tokenize means that the input string will be tokenized as words before
    being glued back with single spaces. Its purpose is to handle
    whitespaces (newlines, tabs, multiple spaces, ...) and punctuation.
    kwargs are directly passed to CountVectorizer constructor, and will
    serve to process the texts. Most useful args are 'strip_accent' and
    'lowercase'.
    """
    preprocessor_countvect = CountVectorizer(
        lowercase=lowercase,
        strip_accents=strip_accents,
        **kwargs,
    )
    preprocessor = preprocessor_countvect.build_preprocessor()
    tokenizer = preprocessor_countvect.build_tokenizer()
    if tokenize:

        def transformer(x):
            return (' '.join(tokenizer(preprocessor(x))))
    else:
        transformer = preprocessor
    return (transformer)
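
# A quick usage sketch (not part of the original snippet): the returned callable
# normalizes whitespace and strips punctuation. With strip_accents='unicode' and
# sklearn's default token pattern, one-character tokens such as "a" are dropped,
# so this should print roughly "hello world test".
normalize = build_text_processor(tokenize=True)
print(normalize("Héllo,\n\tWorld!  A   test"))
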
def dump_sentences():
    corpus = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
    docs = corpus.data
    labels = corpus.target
    label_names = corpus.target_names
    vectorizer = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
    preprocess = vectorizer.build_preprocessor()
    tokenize = vectorizer.build_tokenizer()
    
    def words(doc):
        p = preprocess(doc)
        return ' '.join(t.encode('ascii', 'replace') for t in tokenize(p))
    
    doccount = 0
    vocab = set()
    with open('20news.txt', 'w') as f:
        for doc, lbl in zip(docs, labels):
            w = words(doc)
            print >> f, label_names[lbl]
            print >> f, w
            doccount += 1
            vocab.update(w.split(' '))
    
    print 'Number of documents:', doccount
    print 'Number of unique words:', len(vocab)
Example #3
 def nlp(self, model):
     if model == "default":
         cv = CountVectorizer(lowercase=self.lower_case)
         sk_word_tokenize = cv.build_tokenizer()
         sk_preprocesser = cv.build_preprocessor()
         self._nlp = lambda doc: sk_word_tokenize(sk_preprocesser(doc))
     else:
         self._nlp = model
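
# Rough standalone illustration of what the "default" branch above builds
# (hypothetical names, not from the original class; assumes sklearn is installed):
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(lowercase=True)
sk_preprocesser = cv.build_preprocessor()
sk_word_tokenize = cv.build_tokenizer()

def default_nlp(doc):
    # lowercase first, then split on sklearn's default token pattern
    return sk_word_tokenize(sk_preprocesser(doc))

print(default_nlp("Tokenize THIS sentence, please."))
# expected: ['tokenize', 'this', 'sentence', 'please']
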
Example #4
    def Common_Vectorizer_usage():
        from sklearn.feature_extraction.text import CountVectorizer
        vectorizer = CountVectorizer(min_df=1)
        corpus = [
            'This is the first document.',
            'This is the second second document.',
            'And the third one.',
            'Is this the first document?',
        ]

        analyze = vectorizer.build_analyzer()
        print analyze("This is a text document to analyze.")
        print analyze("This is a text document to analyze.") == ['this', 'is', 'text', 'document', 'to', 'analyze']
        
        X=vectorizer.fit_transform(corpus)
        print vectorizer.get_feature_names()
        print vectorizer.vocabulary_    #.get('document')
        print vectorizer.transform(['Something completely new.']).toarray()
        print list(X) 
        
        #bigram========================================================
        bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),token_pattern=r'\b\w+\b', min_df=1)
        analyze = bigram_vectorizer.build_analyzer()
        print analyze('Bi-grams are cool!')
        X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
        print X_2

        feature_index = bigram_vectorizer.vocabulary_.get('is this')
        print X_2[:, feature_index] 
        
        #marui test
        print '\n\nmarui test====================='
        def t_preprocessor(s):
            return ','.join([x.lower() for x in s.split(' ')])

        stop_words1=['is','a','this']           #is ok: frozenset(['a', 'this', 'is'])
        stop_words2={'is':0,'a':1,'this':2}     #is ok: convert to frozenset(['a', 'this', 'is'])    
            
        cv = CountVectorizer(preprocessor=t_preprocessor,stop_words=stop_words2)
        params=cv.get_params()
        print 'get_params()',type(params),'---------------'
        for k in params:
            print k,'\t',params[k]
        print 'get_params end--------------'
        print '\nget_stop_words=',cv.get_stop_words()
        
        cv.fit(corpus)
        print cv.get_feature_names()
        print cv.transform(corpus).toarray()
        print '\ntest preprocessor, result:\t',cv.build_preprocessor()('this is a document')
        print '\ntest tokenizer, result',cv.build_tokenizer()('this is a document')
        print '\ntest tokenizer2, result',cv.build_tokenizer()('th-is is a document')
        print '\ntest tokenizer2, result',cv.build_tokenizer()('th_is is a document')
        print '\ntest tokenizer2, result',cv.build_tokenizer()('th&is is a document')

        """
Example #5
 def __init__(self, mask_dates, max_length=MAX_LENGTH):
     # Steal the default preprocessor and tokenizer from sklearn
     v = CountVectorizer()
     self.max_length = max_length
     self.dat = re.compile(r'\b\d{1,2}\-?[a-z]{3}\-?\d{2,4}\b')
     if mask_dates:
         self.preprocess = lambda x: self.dat.sub('<DATE>', str(x).lower())
     else:
         self.preprocess = v.build_preprocessor()
     self.tokenize = v.build_tokenizer()
     self.is_num = re.compile(r'\b\d+\b')  # isolated numbers
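
# Illustrative, standalone check of the date-masking preprocessor built above
# (not part of the original class; assumes re and sklearn are importable):
import re
from sklearn.feature_extraction.text import CountVectorizer

dat = re.compile(r'\b\d{1,2}\-?[a-z]{3}\-?\d{2,4}\b')

def mask_dates(x):
    # lowercase first, then replace date-like spans with a <DATE> marker
    return dat.sub('<DATE>', str(x).lower())

tokenize = CountVectorizer().build_tokenizer()
print(mask_dates("Invoice dated 12-Jan-2021, paid 3-feb-21"))
# -> invoice dated <DATE>, paid <DATE>
print(tokenize(mask_dates("Invoice dated 12-Jan-2021")))
# -> ['invoice', 'dated', 'date']  (the default token pattern drops '<' and '>')
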
    def create_feature_matrix_token_counts(self):
        '''
        Create a n by m matrix of n twitter messages with m features representing
        count of preprocessed, stemmed, tokenized words
        :return: n by m feature matrix of n twitter messages and m features (i.e. word tokens)
        '''

        #Create the basic count vectorizer so that we can copy its preprocessor and tokenizer
        basic_vectorizer = CountVectorizer(stop_words='english')
        preprocessor = basic_vectorizer.build_preprocessor()
        tokenizer = basic_vectorizer.build_tokenizer()

        #Create a stemmer for additional processing after preprocessing and tokenizing
        stemmer = EnglishStemmer()

        #Custom analyzer for Count Vectorizer which stems tokens after preprocessing
        def stemming_analyzer(document):

            if self.filter_numbers:
                return [token for token in map(stemmer.stem, tokenizer(preprocessor(document))) if not vec_tools.number_pattern().search(token)]
            else:
                return map(stemmer.stem, tokenizer(preprocessor(document)))

        if self.uni_bi_gram:
            vectorizer = CountVectorizer(stop_words='english', min_df=2, analyzer="char_wb", ngram_range=(1,2))
        else:
            vectorizer = CountVectorizer(stop_words='english', min_df=self.min_df, analyzer=stemming_analyzer)


        all_twitter_msg_text = [t.msg_text for t in self.twitter_messages]
        all_twitter_msg_polarity = [t.polarity for t in self.twitter_messages]

        if self.filter_url_hashtag_username:
            vec_tools.filter_url_username_hashtag(all_twitter_msg_text)

        self.feature_matrix_token_counts = vectorizer.fit_transform(all_twitter_msg_text)

        if self.select_k_best:
            self.feature_matrix_token_counts = SelectKBest(chi2,self.k).fit_transform(self.feature_matrix_token_counts, all_twitter_msg_polarity)
            self.token_feature_names = [i for i in range(self.feature_matrix_token_counts.shape[1])]
            self.amount_of_token_features = self.feature_matrix_token_counts.shape[1]
        else:
            self.token_feature_names = vectorizer.get_feature_names()
            self.amount_of_token_features = len(self.token_feature_names)

        return self.feature_matrix_token_counts
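
# A minimal, self-contained sketch of the stemming-analyzer pattern used above
# (hypothetical names; assumes nltk and sklearn are installed):
from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction.text import CountVectorizer

_base = CountVectorizer()
_preprocess = _base.build_preprocessor()
_tokenize = _base.build_tokenizer()
_stem = EnglishStemmer().stem

def stemming_analyzer(document):
    # stop words are normally applied inside build_analyzer(), which this
    # custom analyzer replaces, so they are not filtered here
    return [_stem(token) for token in _tokenize(_preprocess(document))]

vectorizer = CountVectorizer(analyzer=stemming_analyzer)
vectorizer.fit(["Running runners ran quickly", "The runner runs"])
print(sorted(vectorizer.vocabulary_))
# expected along the lines of: ['quick', 'ran', 'run', 'runner', 'the']
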
def run():
    '''Create a product dictionary based on all tokens in the Best Buy product corpus'''

    soup = BeautifulSoup(open(constants.BESTBUY_PRODUCT_CORPUS_FILE, 'rb'), 'html.parser')
    vectorizer = CountVectorizer(strip_accents='ascii')
    
    tokenizer = vectorizer.build_tokenizer()
    preprocessor = vectorizer.build_preprocessor()

    tokens = set()

    for item in tokenizer(soup.get_text()):
        tokens.add(preprocessor(item))

    with codecs.open(constants.PERSONAL_WORD_DICTIONARY_FILE, mode='wb', encoding='utf-8') as f:
        for token in tokens:
            f.write(token + '\n')
def dump_reviews():
    download()
    print 'making dataset'
    vectorizer = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
    preprocess = vectorizer.build_preprocessor()
    tokenize = vectorizer.build_tokenizer()
    
    def dumbascii(thing):
        try:
            thing.encode('ascii', 'replace')
            return True
        except UnicodeDecodeError:
            return False

    def words(doc):
        p = preprocess(doc)
        return ' '.join(t.encode('ascii', 'replace') for t in tokenize(p) if dumbascii(t))
    
    doccount = 0
    vocab = set()
   
    with open('reviews.txt', 'w') as fout:
        for topicdir in DIRS:
            with open(os.path.join(topicdir, POSREV), 'r') as f:
                text = f.read()
            for doc in REVREGEX.findall(text):
                w = words(doc)
                print >> fout, 'positive'
                print >> fout, w
                doccount += 1
                vocab.update(w.split(' '))

            with open(os.path.join(topicdir, NEGREV), 'r') as f:
                text = f.read()
            for doc in REVREGEX.findall(text):
                w = words(doc)
                print >> fout, 'negative'
                print >> fout, w
                doccount += 1
                vocab.update(w.split(' '))
    
    print 'Number of documents:', doccount
    print 'Number of unique words:', len(vocab)
Example #9
def get_sparse_repr(docs, V, sort_data):
    from sklearn.feature_extraction.text import CountVectorizer

    vectorizer = CountVectorizer(stop_words="english", max_features=V)
    default_preproc = vectorizer.build_preprocessor()

    def preproc(s):
        return re.sub(r' \d+ ', 'anumber ', default_preproc(s))

    vectorizer.preprocessor = preproc

    counts = vectorizer.fit_transform(docs).astype(np.uint32)
    words = vectorizer.get_feature_names()
    if sort_data:
        counts, words = sort_vocab(counts, words)
        assert is_column_sorted(counts)

    print('loaded {} documents with a size {} vocabulary'.format(*counts.shape))
    print('with {} words per document on average'.format(np.mean(counts.sum(1))))
    print()

    return counts, words
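
# The attribute assignment above works because CountVectorizer only reads
# self.preprocessor when the analyzer is built at fit time. A minimal check of
# that mechanism with a hypothetical preprocessor (not the one used above):
from sklearn.feature_extraction.text import CountVectorizer

def hyphen_preproc(s):
    # lowercase and break hyphenated words apart
    return s.lower().replace('-', ' ')

vec = CountVectorizer()
vec.preprocessor = hyphen_preproc
vec.fit(["State-of-the-art text mining"])
print(sorted(vec.vocabulary_))  # ['art', 'mining', 'of', 'state', 'text', 'the']
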
from re import sub


class TextTransformer(object):

    def __init__(self):

        #from nltk.stem.lancaster import LancasterStemmer
        from sklearn.feature_extraction.text import CountVectorizer

        import enchant

        #self.stemmer = LancasterStemmer()
        self._vectorizer = CountVectorizer(strip_accents='ascii')
        self.tokenizer = self._vectorizer.build_tokenizer()
        self.preprocessor = self._vectorizer.build_preprocessor()
        self.spellchecker = enchant.DictWithPWL("en_US",
            pwl=constants.PERSONAL_WORD_DICTIONARY_FILE)


    def transform_text(self, raw_text):
    
        tokens = []
        for token in self.tokenizer(raw_text):
            clean_token = self.preprocessor(token)
            if not self.spellchecker.check(clean_token):
                corrections = self.spellchecker.suggest(clean_token)
                if len(corrections) > 0:
                    clean_token = corrections[0]

            tokens.append(clean_token)

        return ' '.join(tokens)


    def sub_numbers(self, text):
        return sub("[0-9]+", " numbr ", text)
Example #12
def test_vectorizer():
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert_equal(counts_train[0, v1.vocabulary_["pizza"]], 2)

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary_)

    # check that the two vectorizers give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()

        vocabulary = v.vocabulary_
        assert_equal(counts_test[0, vocabulary["salad"]], 1)
        assert_equal(counts_test[0, vocabulary["tomato"]], 1)
        assert_equal(counts_test[0, vocabulary["water"]], 1)

        # stop word from the fixed list
        assert_false("the" in vocabulary)

        # stop word found automatically by the vectorizer DF thresholding
        # words that are highly frequent across the complete corpus are likely
        # to be uninformative (either real stop words or extraction
        # artifacts)
        assert_false("copyright" in vocabulary)

        # not present in the sample
        assert_equal(counts_test[0, vocabulary["coke"]], 0)
        assert_equal(counts_test[0, vocabulary["burger"]], 0)
        assert_equal(counts_test[0, vocabulary["beer"]], 0)
        assert_equal(counts_test[0, vocabulary["pizza"]], 0)

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = t1.fit(counts_train).transform(counts_train).toarray()
    assert_equal(len(t1.idf_), len(v1.vocabulary_))
    assert_equal(tfidf.shape, (n_train, len(v1.vocabulary_)))

    # test tf-idf with new data
    tfidf_test = t1.transform(counts_test).toarray()
    assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary_)))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = t2.fit(counts_train).transform(counts_train).toarray()
    assert_equal(t2.idf_, None)

    # test idf transform with unlearned idf vector
    t3 = TfidfTransformer(use_idf=True)
    assert_raises(ValueError, t3.transform, counts_train)

    # test idf transform with incompatible n_features
    X = [[1, 1, 5],
         [1, 1, 0]]
    t3.fit(X)
    X_incompt = [[1, 3],
                 [1, 3]]
    assert_raises(ValueError, t3.transform, X_incompt)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = TfidfVectorizer(norm='l1')
    assert_false(tv.fixed_vocabulary)

    tv.max_df = v1.max_df
    tfidf2 = tv.fit_transform(train_data).toarray()
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = tv.transform(test_data).toarray()
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test transform on unfitted vectorizer with empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    assert_raises(ValueError, v3.transform, train_data)

    # ascii preprocessor?
    v3.set_params(strip_accents='ascii', lowercase=False)
    assert_equal(v3.build_preprocessor(), strip_accents_ascii)

    # error on bad strip_accents param
    v3.set_params(strip_accents='_gabbledegook_', preprocessor=None)
    assert_raises(ValueError, v3.build_preprocessor)

    # error with bad analyzer type
    v3.set_params(analyzer='_invalid_analyzer_type_')
    assert_raises(ValueError, v3.build_analyzer)
Example #13
def test_vectorizer():
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert counts_train[0, v1.vocabulary_["pizza"]] == 2

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary_)

    # check that the two vectorizers give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()

        vocabulary = v.vocabulary_
        assert counts_test[0, vocabulary["salad"]] == 1
        assert counts_test[0, vocabulary["tomato"]] == 1
        assert counts_test[0, vocabulary["water"]] == 1

        # stop word from the fixed list
        assert "the" not in vocabulary

        # stop word found automatically by the vectorizer DF thresholding
        # words that are highly frequent across the complete corpus are likely
        # to be uninformative (either real stop words or extraction
        # artifacts)
        assert "copyright" not in vocabulary

        # not present in the sample
        assert counts_test[0, vocabulary["coke"]] == 0
        assert counts_test[0, vocabulary["burger"]] == 0
        assert counts_test[0, vocabulary["beer"]] == 0
        assert counts_test[0, vocabulary["pizza"]] == 0

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = t1.fit(counts_train).transform(counts_train).toarray()
    assert len(t1.idf_) == len(v1.vocabulary_)
    assert tfidf.shape == (n_train, len(v1.vocabulary_))

    # test tf-idf with new data
    tfidf_test = t1.transform(counts_test).toarray()
    assert tfidf_test.shape == (len(test_data), len(v1.vocabulary_))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = t2.fit(counts_train).transform(counts_train).toarray()
    assert not hasattr(t2, "idf_")

    # test idf transform with unlearned idf vector
    t3 = TfidfTransformer(use_idf=True)
    with pytest.raises(ValueError):
        t3.transform(counts_train)

    # test idf transform with incompatible n_features
    X = [[1, 1, 5],
         [1, 1, 0]]
    t3.fit(X)
    X_incompt = [[1, 3],
                 [1, 3]]
    with pytest.raises(ValueError):
        t3.transform(X_incompt)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = TfidfVectorizer(norm='l1')

    tv.max_df = v1.max_df
    tfidf2 = tv.fit_transform(train_data).toarray()
    assert not tv.fixed_vocabulary_
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = tv.transform(test_data).toarray()
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test transform on unfitted vectorizer with empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    with pytest.raises(ValueError):
        v3.transform(train_data)

    # ascii preprocessor?
    v3.set_params(strip_accents='ascii', lowercase=False)
    processor = v3.build_preprocessor()
    text = ("J'ai mangé du kangourou  ce midi, "
            "c'était pas très bon.")
    expected = strip_accents_ascii(text)
    result = processor(text)
    assert expected == result

    # error on bad strip_accents param
    v3.set_params(strip_accents='_gabbledegook_', preprocessor=None)
    with pytest.raises(ValueError):
        v3.build_preprocessor()

    # error with bad analyzer type
    v3.set_params(analyzer='_invalid_analyzer_type_')
    with pytest.raises(ValueError):
        v3.build_analyzer()
Example #14
class sentMod:
    def sequence_setup(self, X_train):

        self.vectorizer = CountVectorizer(
            binary=True,
            stop_words=stopwords.words('english'),
            min_df=3,
            max_df=0.9,
            max_features=None)

        X_train_onehot = self.vectorizer.fit_transform(X_train)

        #They take word-ids as input, so we first have to transform the input into a series of word ids
        self.word2idx = {
            word: idx
            for idx, word in enumerate(self.vectorizer.get_feature_names())
        }
        self.tokenize = self.vectorizer.build_tokenizer()
        self.preprocess = self.vectorizer.build_preprocessor()

        X_train_sequences = [
            to_sequence(self.tokenize, self.preprocess, self.word2idx, x)
            for x in X_train
        ]

        self.MAX_SEQ_LENGHT = len(max(X_train_sequences, key=len))
        self.N_FEATURES = len(self.vectorizer.get_feature_names())

        X_train_sequences = pad_sequences(X_train_sequences,
                                          maxlen=self.MAX_SEQ_LENGHT,
                                          value=self.N_FEATURES)

        return X_train_sequences

    def create_model(self):

        # load training data
        X_train, X_test, y_train, y_test = fill_set()
        # setup preprocessing tools for embeddings
        X_train_sequences = self.sequence_setup(X_train)

        #Prepare model
        self.model = Sequential()

        self.model.add(
            Embedding(len(self.vectorizer.get_feature_names()) + 1,
                      64,
                      input_length=self.MAX_SEQ_LENGHT))
        self.model.add(Conv1D(64, 5, activation='relu'))
        self.model.add(MaxPooling1D(5))
        self.model.add(Flatten())
        self.model.add(
            Dense(units=500,
                  activation='relu',
                  input_dim=len(self.vectorizer.get_feature_names())))
        self.model.add(Dense(units=1, activation='sigmoid'))

        self.model.compile(loss='binary_crossentropy',
                           optimizer='adam',
                           metrics=['accuracy'])
        self.model.summary()

        self.model.fit(X_train_sequences[:-100],
                       y_train[:-100],
                       epochs=3,
                       batch_size=512,
                       verbose=1,
                       validation_data=(X_train_sequences[-100:],
                                        y_train[-100:]))

        # Test the out accuracy
        print("Accuracy:", self.get_accuracy())

        # Save the model to the disk
        self.model.save(f'sentimentModel')
        print('Sentiment Model Saved to Disk!')

    def __init__(self,
                 training="data/mc_training.csv",
                 testing="data/mc_testing.csv"):

        if os.path.exists("sentimentModel/") == False:
            self.create_model()
        else:
            X_train = fill_set(training, testing)[0]
            self.sequence_setup(X_train)
            self.model = load_model("sentimentModel/")
            self.get_accuracy()

    def format_predict(self, data):
        temp_sequences = [
            to_sequence(self.tokenize, self.preprocess, self.word2idx, x)
            for x in data
        ]
        temp_sequences = pad_sequences(temp_sequences,
                                       maxlen=self.MAX_SEQ_LENGHT,
                                       value=self.N_FEATURES)
        return temp_sequences

    def get_accuracy(self):
        x, X_test, y, y_test = fill_set()
        X_test_sequences = self.format_predict(X_test)
        scores = self.model.evaluate(X_test_sequences, y_test, verbose=1)
        self.accuracy = scores[1]
        return scores[1]

    def get_results(self):
        x, X_test, y, y_test = fill_set()
        predictions = self.model.predict(self.format_predict(X_test))
        result = []
        for pred in predictions:
            result.append(pred[0])
        return result

    def predict(self, tests, pretty=False):
        if not pretty:
            return self.model.predict(self.format_predict(tests))
        else:
            predictions = self.model.predict(self.format_predict(tests))
            i = 0
            #print(len(predictions))
            for pred in predictions:
                print(tests[i] + ": " + str(pred[0]))
                i += 1
Example #15
model.fit(X_train_onehot[:-100],
          y_train[:-100],
          epochs=2,
          batch_size=128,
          verbose=1,
          validation_data=(X_train_onehot[-100:], y_train[-100:]))

scores = model.evaluate(vectorizer.transform(X_test), y_test, verbose=1)
print("Accuracy:", scores[1])  # Accuracy: 0.875

word2idx = {
    word: idx
    for idx, word in enumerate(vectorizer.get_feature_names())
}
tokenize = vectorizer.build_tokenizer()
preprocess = vectorizer.build_preprocessor()


def to_sequence(tokenizer, preprocessor, index, text):
    words = tokenizer(preprocessor(text))
    indexes = [index[word] for word in words if word in index]
    return indexes


print(to_sequence(tokenize, preprocess, word2idx,
                  "This is an important test!"))  # [2269, 4453]
X_train_sequences = [
    to_sequence(tokenize, preprocess, word2idx, x) for x in X_train
]
print(X_train_sequences[0])
Example #16
 clf_7 = Pipeline([
     ('vect', TfidfVectorizer(
                 stop_words=stop_words,
                 token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",         
     )),
     ('clf', MultinomialNB(alpha=0.01)),
 ]) 
 
 evaluate_cross_validation(clf_7, news.data, news.target, 5)
 '''
 
 
 
 from sklearn.feature_extraction.text import TfidfTransformer
 transformer = TfidfTransformer()
 
 def my_tokenizer(s):
     return s.split()
 vectorizer = CountVectorizer(tokenizer=my_tokenizer)
 str = 'I am sure some bashers of Pens fans are pretty confused about the lack'
 print vectorizer.build_analyzer()(str)
 print vectorizer.build_tokenizer()(str)
 print vectorizer.build_preprocessor()(str)
 
 s1 = 'rạng sáng nay theo giờ hà_nội danh_hiệu cầu_thủ giá_trị mvp giải mls năm được công_bố tiền_đạo gốc việt_lee_nguyễn ứng_viên sáng_giá không kém đôi ngôi_sao đá giải ngoại_hạng robbie_keane los_angeles_galaxy obafemi_martins seattle_sounders bình_chọn dựa số phiếu clb dự mls giới truyền_thông cầu_thủ robbie_keane người số phiếu trận chung_kết mls cup robbie_keane los_angeles_galaxy giành danh_hiệu cầu_thủ giá_trị mls lee_nguyễn được đánh_giá cao bình_chọn ảnh espn lee_nguyễn xếp thứ_ba bình_chọn đạt tổng_số phiếu mùa lee_nguyễn ghi bàn năm pha kiến_tạo cuối giải thi_đấu ấn_tượng vai_trò cầm_trịch lối chơi ghi_bàn cho new_england_revolution vòng play off mls cup tiền vệ_sinh năm ghi thêm hai bàn ba pha kiến_tạo đưa revolution đoạt vô_địch mls khu_vực miền đông giành vé dự chung_kết mls cup đối_đầu đội bóng keane la galaxy tháng lee_nguyễn được hlv jurgen_klinsmann triệu_tập trở_lại tuyển mỹ nhờ phong_độ ấn_tượng mls cựu inter_milan newcastle_utd obafemi_martins đứng thứ_hai số phiếu bầu cầu_thủ clb phiếu bầu clb phiếu bầu truyền thông phiếu bầu cầu thủ tổng robbie_keane la galaxy obafemi_martins seattle_sounders lee_nguyễn new england rev bradley_wright phillips ny  red_bulls tuấn'
 s2 = 'lee_nguyễn trải một năm thi_đấu hoàn_hảo ảnh usa today kết_quả được công_bố trang thông_tin chính_thức ban tổ_chức giải mls phần bình_luận tiền_vệ công lee_nguyễn đoạn lọt danh_sách bầu_chọn cuối_cùng cho danh_hiệu cầu_thủ giá_trị mls cho thấy lee_nguyễn một bước đột_phá sự_nghiệp nơi đanh ghi bàn đứng thứ_tư danh_sách vua_phá_lưới mùa vừa_qua tiền_vệ ghi_bàn cao lịch_sử mls chân chuyền đứng thứ_hai new_england năm pha kiến_tạo thành_công lee_nguyễn hoàn_toàn xứng_đáng lần đầu_tiên được lọt vào đội_hình tiêu_biểu mùa pha lập_công kiến_tạo lối chơi sáng_tạo ổn_định lee_nguyễn góp_phần quan_trọng làm_nên mùa giải thành_công rực_rỡ new_england_revolution họ nhì mls miền đông khi đăng_quang mls cup khu_vực đồng_nghĩa một suất vào chung_kết mls cup toàn_quốc nhờ lọt vào danh_sách rút_gọn cuối_cùng cho đua cầu_thủ giá_trị mvp robbie_keane los_angeles_galaxy obafemi_martins seattle_sounders bàn thắng gỡ hòa 1-1 vào lưới houston_dynamo tuần ngôi_sao sinh năm lọt danh_sách bốn bàn thắng đẹp mls sau bảy năm được gọi trở_lại đội_tuyển mỹ đội_hình tiêu_biểu mùa vừa_qua los_angles_galaxy đóng_góp nhiều ba cá_nhân chia đều hàng thủ đến hàng công đội bóng đối_thủ cạnh_tranh vô_địch mls cup lee_nguyễn revolution sân stubhub_center california ngày tới đội_hình tiêu_biểu mls mùa thủ_môn bill_hamid dc united hậu_vệ bobby_boswell dc united omar_gonzalez los_angeles_galaxy chad_marshall seattle_sounders tiền_vệ landon_donovan los_angeles_galaxy thierry_henry new_york_red_bulls lee_nguyễn new_england_revolution diego_valeri portland_timbers tiền_đạo robbie_keane los_angeles_galaxy obafemi_martins seattle_sounders fc bradley_wright phillips new_york_red_bulls đông_anh'
 s3 = 'thành_lương đỏ làm_nên tuyệt_phẩm trận đấu cuối_cùng bảng philippines ảnh giang_huy malaysia tập_trung hôm_qua để chuẩn_bị cho trận đấu tuyển việt_nam ngày sân_nhà shah_alam sau khi lách khe cửa hẹp để giành vị_trí thứ_hai bảng tay đội singapore thầy_trò salleh háo_hức muốn được kết_quả thật tốt một lời xin_lỗi để cđv nhà thất_vọng thời_gian gì phát_biểu có_thể thấy salleh nghiên_cứu kỹ báo_cáo hlv_u2 ong_kim_swee người được liên_đoàn bóng_đá malaysia fam cử sang hà_nội theo_dõi đối_thủ bảng trọng_tâm tuyển việt_nam đá giao_hữu tuyển việt_nam giải đấu nên phần_nào biết làm gì để kiềm_chế sức_mạnh họ salleh tiết_lộ báo_giới malaysia chúng tô đặc_biệt cẩn_trọng số nguyễn_văn_quyết số phạm_thành_lương cầu_thủ nguy_hiểm ong_kim_swee cho biết như_thế cầu_thủ văn_quyết đỏ chưa ghi_bàn được đối_thủ đánh_giá cao lối chơi ảnh giang_huy cá_nhân ong_kim_swee đưa nhận_xét tuyển việt_nam sau một thời_gian do_thám đội bóng xây_dựng được một phong_cách hoàn_toàn khác_biệt thời hlv người nhật_bản_toshiya_miura họ cầm bóng tốt không_bao_giờ chuyền bóng ngược sau luôn hướng lên phía miura sở_hữu cầu_thủ kỹ_thuật cá_nhân tốt malaysia cảnh_giác mỗi khi đối_phương bóng sát vòng cấm_địa việt_nam ghi hai bàn vào lưới philippines cú sút xa khi được hỏi điểm yếu tuyển việt_nam ong_kim_swee người giúp u23 malaysia vô_địch sea games tỏ bí_hiểm gì thấy một tập_thể gắn_kết mỗi vị_trí đều điểm yếu họ để thủng lưới ba lần điểm yếu có_thể tận_dụng khai_thác hlv salleh đen âm_thầm chuẩn_bị kế_hoạch gây bất_ngờ tuyển việt_nam sân_nhà ảnh ts bên_cạnh việc tìm cách phong_tỏa hai ngòi_nổ tuyển việt_nam salleh cố_gắng giải_quyết khoảng_trống shukor_adan mohd_amri_yahya để hai cầu_thủ trụ_cột đều vắng_mặt trận lượt_đi án treo_giò indra_putra_mahyuddin kunanlan manaf_mamat đều có_thể được tung vào sân_sau khi minh_chứng được khả_năng buổi tập safiq_rahim mohd_muslim có_thể đá vị_trí tiền_vệ trụ thay_thế cho shukor_adan salleh tiết_lộ ít_nhiều khung đội_hình thi_đấu cuối tuần người thay_thế amri_yahya trận đấu kulanan hoặc manaf_mamat tuấn'
 corpus = [s1, s2, s3]
 
 
 print 'Done'
Example #17
#print "testdata"
#print len(test_data)
#test_data = df1.iloc[:,1]
vctr =  CountVectorizer(stop_words='english',min_df = 1)
vctr2 = HashingVectorizer(stop_words='english')
vctr1 = TfidfVectorizer(stop_words='english')
count_pos = 0
count_neg = 0

######################################################################################################
train = []
test = []
for i in range(len(train_data)):
    string = train_data[i,0]
    #print string,i
    string = vctr.build_preprocessor()(string.lower()) 
    string = vctr.build_tokenizer()(string.lower())
    train.append(' '.join(string))

for i in range(len(test_data)):
    string = test_data[i,0]
    string = vctr.build_preprocessor()(string.lower()) 
    string = vctr.build_tokenizer()(string.lower())
    test.append(' '.join(string)) 
#print "len of the normalized test data obtained"    
#print len(test)  
######################################################################################################
train_data = vctr.fit_transform(train).toarray()
#print vctr1.inverse_transform(train_data)
y_train = np.asarray(label_train, dtype="|S6")
clf1 =   GradientBoostingClassifier(n_estimators = 660)
Example #18
class IngredientExtractor(object):
    """Estimator that identifies the most 'ingredient like' block from a list
    """
    def __init__(self):
        """Constructor method of ingredient extractor"""
        pass

    def fit(self, X, y=None):
        """Fitter method of ingredient extractor

        X is an iterable of ingredient lists in the form of strings
        y is just here for compatibility in sklearn pipeline usage
        """
        self._count_vect = CountVectorizer()
        self.vectorized_texts_ = self._count_vect.fit_transform(X)
        self.vocabulary_ = self._count_vect.vocabulary_
        self.mean_corpus_ = self.vectorized_texts_.mean(axis=0)
        return (self)

    def predict(self, X):
        """Predicter method of ingredient extractor

        X is a list of text blocks.
        This method returns the index of the text block that is most likely
        to hold the ingredient list"""
        X_against_ingred_voc = self._count_vect.transform(X)
        X_norms = sparse_norm(CountVectorizer().fit_transform(X), axis=1)
        X_dot_ingred = np.array(X_against_ingred_voc.sum(axis=1)).squeeze()
        pseudo_cosine_sim = np.divide(X_dot_ingred,
                                      X_norms,
                                      out=np.zeros(X_norms.shape),
                                      where=X_norms != 0)
        self.similarity_ = pseudo_cosine_sim
        return (np.argmax(pseudo_cosine_sim))

    def show_emphasize(self, X):
        """Method that prints strings with words from vocabulary emphasized
        """
        for text in self.emphasize_texts(X):
            print(text)

    def emphasize_texts(self, X):
        """Method that returns strings with words from vocabulary emphasized

        This method shows how some candidates texts are projected on the
        vocabulary that has been provided or gotten from fitting.
        It is useful to see how different blocks compare.
        X argument is an iterable of block candidates.
        """
        check_is_fitted(self)
        preprocessor = self._count_vect.build_preprocessor()
        tokenizer = self._count_vect.build_tokenizer()
        vocabulary = self._count_vect.vocabulary_
        emphasized_texts = []
        for block in X:
            text = self.emphasize_words(
                block,
                preprocessor=preprocessor,
                tokenizer=tokenizer,
                vocabulary=vocabulary,
            )
            emphasized_texts.append(text)
        return (emphasized_texts)

    def emphasize_words(
            self,
            text,
            preprocessor=None,
            tokenizer=None,
            vocabulary=None,
            ansi_color='\033[92m',  # green by default
    ):
        """Method that returns a string with words emhasized

        This methods takes a string and returns a similar string with the words
        emphasized (with color markers)
        """
        check_is_fitted(self)
        ansi_end_block = '\033[0m'
        if not preprocessor:
            preprocessor = self._count_vect.build_preprocessor()
        if not tokenizer:
            tokenizer = self._count_vect.build_tokenizer()
        if not vocabulary:
            vocabulary = self._count_vect.vocabulary_
        preprocessed_text = preprocessor(text)
        tokenized_text = tokenizer(preprocessed_text)
        idx = 0
        emphasized_text = ''
        for token in tokenized_text:
            if token in vocabulary:
                while preprocessed_text[idx:idx + len(token)] != token:
                    emphasized_text += text[idx]
                    idx += 1
                emphasized_text += (ansi_color + text[idx:idx + len(token)] +
                                    ansi_end_block)
                idx += len(token)
        emphasized_text += text[idx:]
        return (emphasized_text)

    def score(self, X, y):
        """Scorer method of ingredient extractor estimator

        X is an iterable of ingredient lists in the form of string
        y is the target as the index of the correct block.
        """
        pass
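
# Hypothetical usage sketch for IngredientExtractor (toy data, not from the
# original project; assumes the class's sklearn/scipy/numpy imports are in scope):
extractor = IngredientExtractor()
extractor.fit(["flour sugar eggs butter", "salt pepper olive oil garlic"])

blocks = [
    "Subscribe to our newsletter for weekly recipes",
    "Ingredients: 200g flour, 2 eggs, 100g butter and a pinch of salt",
]
print(extractor.predict(blocks))  # expected: 1, the block that looks most like an ingredient list
extractor.show_emphasize(blocks)  # prints the blocks with vocabulary words colored
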
Example #19
labels = ['No Default', 'Default']
plt.figure(figsize=(8,6))
sns.heatmap(cm,xticklabels=labels, yticklabels=labels, annot=True, fmt='d', cmap="Blues", vmin = 0.2);
plt.title('Confusion Matrix')
plt.ylabel('True Class')
plt.xlabel('Predicted Class')
plt.show()


# In[126]:


#Trying a Convolutional Neural Network with word sequences
word2idx = {word: idx for idx, word in enumerate(vect.get_feature_names())}
tokenize = vect.build_tokenizer()
preprocess = vect.build_preprocessor()
def to_sequence(tokenizer, preprocessor, index, text):
    words = tokenizer(preprocessor(text))
    indexes = [index[word] for word in words if word in index]
    return indexes
X_train_sequences = [to_sequence(tokenize, preprocess, word2idx, x) for x in df_train.TEXT]
MAX_SEQ_LENGTH = len(max(X_train_sequences, key=len))
print("MAX_SEQ_LENGTH=", MAX_SEQ_LENGTH)
from keras.preprocessing.sequence import pad_sequences
N_FEATURES = len(vect.get_feature_names())
X_train_sequences = pad_sequences(X_train_sequences, maxlen=MAX_SEQ_LENGTH, value=N_FEATURES)
print(X_train_sequences[0])
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Embedding
model = Sequential()
model.add(Embedding(len(vect.get_feature_names()) + 1,
Example #20
## Based on https://github.com/jc-healy/EmbedAllTheThings/commit/da9fd638af573e3cfdd41d7f7fdd3dfe02f1e7cd#diff-a1268b7d09e1e7b148cb6028dda26bff

from collections import defaultdict
import numpy as np
import numba
import scipy.sparse

# Just steal CountVectorizer for now; fix later
from sklearn.feature_extraction.text import CountVectorizer

_CV_INSTANCE = CountVectorizer()

_tokenizer = _CV_INSTANCE.build_tokenizer()
_preprocessor = _CV_INSTANCE.build_preprocessor()

# End stealing CountVectorizer

# Use nltk for sentencizing for now
import nltk
nltk.download('punkt')


def nltk_sentencizer(text):
    return nltk.sent_tokenize(text)


# End nltk stealing


def regex_tokenizer(text):
    return _tokenizer(text)
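

# Quick sanity check of the borrowed pieces above (illustrative only; relies on
# the punkt model downloaded by the snippet itself):
text = "Dr. Smith wrote two papers. Both use CountVectorizer!"
for sentence in nltk_sentencizer(text):
    print(regex_tokenizer(_preprocessor(sentence)))
# expected roughly: ['dr', 'smith', 'wrote', 'two', 'papers'] and
#                   ['both', 'use', 'countvectorizer']
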
fastTextModelDir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                '../fastText_demo_model/')
rawTextFile = os.path.join(fastTextModelDir, 'arxiv-untagged-data-2020.txt')
preprocessedTextFile = os.path.join(fastTextModelDir,
                                    'arxiv-2020-preprocessed.txt')

if __name__ == "__main__":

    # Construct vectorizer object accepting only lowercase letters
    cv = CountVectorizer(input='file',
                         stop_words=stopwords,
                         token_pattern=r"(?u)\b[a-z][a-z]+\b")

    # Function to lowercase the text (strip_accents is not set on this
    # vectorizer, so accents are left untouched)
    preprocess = cv.build_preprocessor()
    # Function to split the text into tokens with the regex above; note that
    # stop_words are applied by the analyzer, not by the tokenizer itself
    tokenize = cv.build_tokenizer()

    inp = open(rawTextFile, "r")
    outp = open(preprocessedTextFile, "w")

    while True:
        line = inp.readline()
        if not line: break
        preprocessedLine = preprocess(line)
        tokenizedLine = tokenize(preprocessedLine)
        outp.write(" ".join(tokenizedLine))
        outp.write("\n")

    inp.close()
    def process_files(self, *filenames, stop_after_rows=None, overwrite_output_files=True, output_files_prefix=''):
        '''Preprocess the post and label data from the given files.
        If stop_after_rows is given, this process stops after that many file rows (even if not all of the files are reached, as such).'''
        self.max_post_tokens = 0
        self.max_resp_tokens = 0
        random.seed(RANDOM_SEED)
        data = pd.read_csv(filenames[0]).values
        for filename in filenames[1:]: 
            data = np.append(data, pd.read_csv(filename).values, axis=0)
        posts = data[:stop_after_rows,1]
        r = data[:stop_after_rows,3]
     
        
        responses = []
        # print(responses[0])
        post_vectorizer = CountVectorizer()
        resp_vectorizer = CountVectorizer(token_pattern=r'(?u)\b\w+\b') # want to keep 1-char words in the responses when tokenizing them
        post_preprocessor = post_vectorizer.build_preprocessor()
        self.post_tokenizer = post_vectorizer.build_tokenizer() # seq2seq also uses this
        self.resp_tokenizer = resp_vectorizer.build_tokenizer() # seq2seq also uses this

        list_of_all_posts = np.empty(0)
        Y = np.empty(0)
        print("Preprocessing progress (by rows of original data):")
        for i in range(posts.shape[0]):
            if i % 100 == 0: print("%.0f%%" % (i*100/posts.shape[0]))
            row_posts_string = post_preprocessor(posts[i]) # preprocess the posts in this row (including making them lowercase)
            row_posts_list = re.split(r'\n\d+\.', row_posts_string) # split up all the posts in a given row
            j = 1
            for post in row_posts_list:
                post = post.strip("1.").strip() # remove any prepended "1." (that's the only case the regex split doesn't take care of), and then any prepended space/tab characters and any appended newline(s)
                post = re.sub(r'\.|,|;|:|\?|!|\(|\)|"|\u201C|\u201D', '', post) # remove certain punctuation

                # remove stopwords 
                post = re.sub(r'\u2018|\u2019', "'", post) # replace smart (curly) apostrophes with ASCII apostrophes, since that's what nltk uses
                post_words = post.split()
                post_words = list(filter(lambda word: word not in STOPWORDS, post_words))
                post = " ".join(post_words)
                
                # get rid of URLs
                post = re.sub( r'http\S+', '', post )
                
                # TODO: potential further preprocessing ideas:
                    # emojis -- not sure, might want to leave them (although we've already gotten rid of some punctuation and therefore punctuation-emojis, currently)
                    # address misspelling of significant words
                if len(self.post_tokenizer(post)) > self.max_post_tokens: self.max_post_tokens = len(self.post_tokenizer(post))
                list_of_all_posts = np.append(list_of_all_posts, post) # add it to our 1D numpy array of all posts
                
                # Check if there's no response
                if type(data[i,2]) != float: # it's a string representation of a list
                    # Remove brackets from idx entries
                    temp = data[i,2].replace('[', '')
                    temp = temp.replace(']', '')
                    # Convert the string representation to an actual list of ints
                    temp_arr = list(map(lambda a: int(a), temp.split(',')))
                    #If post matches hate_speech_idx, add 1 to Y


                    if j in temp_arr: # the jth post in this row is marked as hate speech
                        Y = np.append(Y, 1)
                        row_resps = ast.literal_eval(data[i,3])
                        row_max_resp_tokens = max(map(lambda resp: len(self.resp_tokenizer(resp)), row_resps))
                        if row_max_resp_tokens > self.max_resp_tokens: self.max_resp_tokens = row_max_resp_tokens
                        responses.append(random.choice(row_resps).lower())
                    else: # the jth post in this row is marked as not hate speech
                        Y = np.append(Y, 0)
                else: # it's 'n/a', which gets parsed as nan apparently. So none of these posts are marked as hate
                    Y = np.append(Y, 0)
                j += 1
        print("100%")
        process_responses(responses)
        # print(responses[0])
        # print(responses[1])
        # print(responses[2])
        # print(responses[3])
        counts = post_vectorizer.fit_transform(list_of_all_posts) # counts in a 2D matrix
        counts_np = np.array(counts.toarray()) # convert to normal numpy format

        feature_names = post_vectorizer.get_feature_names() # the 1D python list of features (i.e. words) that correspond to the columns of counts_np
        feature_names_np = np.array(feature_names) # convert to numpy

        resp_vectorizer.fit(responses)
        resp_tokens = resp_vectorizer.get_feature_names() # a 1D python list of all the tokens (probably words) used in the processed responses
        resp_tokens_np = np.array(resp_tokens)
        
        responses=np.array(responses)

        # remove unique features/columns (i.e. words that appear only in one post throughout the corpus)
        non_unique_indices = np.nonzero(np.count_nonzero(counts_np,axis=0)>1)[0] # the column indices of the features that appear in more than one document throughout the corpus
        non_unique_counts_np = counts_np[:,non_unique_indices] # select only the columns at those indices
        non_unique_feature_names_np = feature_names_np[non_unique_indices] # select only the feature names at those indices

        if overwrite_output_files:
            np.savez_compressed('data/' + output_files_prefix + 'preprocessed_data.npz', post_word_counts=non_unique_counts_np, post_feature_names=non_unique_feature_names_np, post_labels=Y, post_texts=list_of_all_posts, post_tokens=feature_names_np, response_texts=responses, resp_tokens=resp_tokens_np)
            with open('data/' + output_files_prefix + 'preprocessor.pkl', 'wb') as obj_file:
                pickle.dump(self, obj_file, pickle.HIGHEST_PROTOCOL)
        
        return {'post_word_counts': non_unique_counts_np, 'post_feature_names': non_unique_feature_names_np, 'post_labels': Y, 'post_texts': list_of_all_posts, 'post_tokens': feature_names_np, 'response_texts': responses, 'resp_tokens': resp_tokens_np}
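
# Sketch of reading back the arrays saved above (assumes the method ran with the
# default output_files_prefix='' and overwrite_output_files=True):
import numpy as np

saved = np.load('data/preprocessed_data.npz', allow_pickle=True)
print(saved['post_word_counts'].shape)  # (n_posts, n_non_unique_features)
print(saved['post_labels'][:10])        # 1.0 = marked as hate speech, 0.0 = not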