Python TfidfTransformer 예제들, sklearn.feature_extraction.text.TfidfTransformer Python 예제들

예제 #1

0

파일 보기

파일: pipelines.py 프로젝트: Ravi5ingh/disaster-response

def create_disaster_pipeline(disaster_csv_path, category_name):
    """Fit and evaluate a scikit-learn text pipeline for one category.

    The CSV at ``disaster_csv_path`` is loaded, its ``message`` column is
    used as input and the ``category_name`` column as target. 30% of the
    rows are held out; a CountVectorizer -> TfidfTransformer ->
    RandomForestClassifier pipeline is fitted on the rest and the held-out
    predictions are handed to ``display_results``.

    :param disaster_csv_path: path of the CSV file to read
    :param category_name: name of the target column
    :return: None
    """

    def tokenize(text):
        # Build the pt.pipe chain of text helpers and apply it to one message.
        chain = (pt.pipe
                 | __normalize_text__
                 | __tokenize_text__
                 | __remove_stopwords__
                 | __lemmatize_text__)
        return chain(text)

    frame = ut.read_csv(disaster_csv_path)

    print('Getting data...')
    messages = frame['message'].values
    targets = frame[category_name].values
    x_train, x_test, y_train, y_test = ms.train_test_split(
        messages, targets, test_size=0.3)

    print('Creating pipeline...')
    steps = [
        ('vect', st.CountVectorizer(tokenizer=tokenize)),
        ('tfidf', st.TfidfTransformer()),
        ('clf', en.RandomForestClassifier()),
    ]
    pipeline = pi.Pipeline(steps)

    print('Fitting pipeline...')
    pipeline.fit(x_train, y_train)

    print('Predicting with pipeline...')
    predictions = pipeline.predict(x_test)

    print('Displaying results...')
    display_results(y_test, predictions)

예제 #2

0

파일 보기

def get_data():
    """Load the SMS spam collection and return TF-IDF train/test features.

    Reads the tab-separated file ``SMSSpamCollection`` from the working
    directory, maps the labels ``ham``/``spam`` to 0/1, holds out 10% of the
    rows (``random_state=1``) and vectorizes the messages with character
    n-grams (``char_wb``, lengths 1 to 4) followed by TF-IDF weighting. Both
    transformers are fitted on the training split only.

    :return: ``(training_data, y_train), (testing_data, y_test),
        count_vector, transformer``
    """
    df = pd.read_table('SMSSpamCollection',
                       sep='\t',
                       header=None,
                       names=['label', 'sms_message'])

    df['label'] = df.label.map({'ham': 0, 'spam': 1})

    X_train, X_test, y_train, y_test = train_test_split(df['sms_message'],
                                                        df['label'],
                                                        random_state=1,
                                                        test_size=0.1)

    # ngram_range must be a tuple: scikit-learn documents it as a tuple and
    # recent releases reject a list during parameter validation.
    count_vector = text.CountVectorizer(ngram_range=(1, 4), analyzer='char_wb')

    # Learn the vocabulary on the training split, then reuse it for the test
    # split so both matrices share the same columns.
    training_data = count_vector.fit_transform(X_train)
    testing_data = count_vector.transform(X_test)

    # Re-weight the raw counts with TF-IDF (IDF learned on the training split).
    transformer = text.TfidfTransformer()
    training_data = transformer.fit_transform(training_data)
    testing_data = transformer.transform(testing_data)

    return (training_data, y_train), (testing_data,
                                      y_test), count_vector, transformer

예제 #3

0

파일 보기

파일: satire_main.py 프로젝트: arpitakumarsingh/Fake-News-Challenge-NLP-Project

def build_model(X_train, X_test, y_train, y_test):
    """Train a linear SVM on TF-IDF features and score it on the test set.

    The count vectorizer and TF-IDF transformer are fitted on ``X_train``
    and applied to ``X_test``; both matrices then go through
    ``feature_selection(..., 'all')`` before an ``SVC(C=10, kernel='linear')``
    is fitted.

    :return: ``(classifier, df_out)`` where ``classifier`` is a dict with the
        keys ``model``, ``counter`` and ``transformer`` and ``df_out`` is a
        DataFrame built from ``y_test`` with the extra columns ``pred``,
        ``target`` and ``match``.
    """
    count_vect = sktext.CountVectorizer()
    tfidf_transformer = sktext.TfidfTransformer()

    train_tfidf = tfidf_transformer.fit_transform(
        count_vect.fit_transform(X_train))
    test_tfidf = tfidf_transformer.transform(count_vect.transform(X_test))

    train_features, test_features = feature_selection(
        train_tfidf, y_train, test_tfidf, 'all')

    model = svm.SVC(C=10, kernel='linear')
    model.fit(train_features, y_train)
    LOGGER.info("Model trained")

    # Collect predictions next to the ground truth for inspection.
    df_out = pd.DataFrame(y_test)
    df_out['pred'] = model.predict(test_features)
    df_out['target'] = y_test
    df_out['match'] = df_out['pred'] == df_out['target']

    classifier = dict(model=model,
                      counter=count_vect,
                      transformer=tfidf_transformer)
    return (classifier, df_out)

예제 #4

0

파일 보기

파일: input_parser.py 프로젝트: ZaydH/cmps242

def count_vectorizer(df, col_name, vocab=None):
    """
    Vectorize a text column into un-normalized TF-IDF scores.

    The column is tokenized by a lower-casing CountVectorizer that removes
    NLTK's English stop words. When ``vocab`` is given it is used as the
    fixed vocabulary; otherwise the vocabulary is learned from the data.

    :param df: Source data frame to vectorize
    :type df: pd.DataFrame
    :param col_name: Name of the text column in ``df``
    :type col_name: string
    :param vocab: Optional mapping of word to column index
    :type vocab: dict
    :return: Dense TF-IDF matrix (``norm=None``) and the vocabulary used.
    :rtype: Tuple(numpy.ndarray, dict)
    """
    vectorizer = sklearntext.CountVectorizer(
        lowercase=True,
        stop_words=nltk.corpus.stopwords.words('english'),
        vocabulary=vocab)
    counts = vectorizer.fit_transform(df[col_name])

    transformer = sklearntext.TfidfTransformer(norm=None)
    scores = transformer.fit_transform(counts).toarray()

    # Report the learned vocabulary only when the caller did not supply one.
    used_vocab = vectorizer.vocabulary_ if vocab is None else vocab
    return scores, used_vocab

예제 #5

0

파일 보기

파일: classnews_test.py 프로젝트: coder-leo-bian/NLP

def testmodel(test):
    """Classify one raw text with the pickled vectorizer and model.

    The text is segmented with jieba, passed through ``stopwords`` and
    re-joined with spaces, then converted to counts with the loaded
    vectorizer and to L2-normalized term frequencies (``use_idf=False``).

    :param test: raw text to classify
    :return: the model's prediction for the single document
    """
    # The second unpickled object is not needed for prediction.
    cv, _, model = load_pickle()

    tokens = stopwords(jieba.lcut(test))
    tests = [' '.join(tokens)]
    print(tests)

    counts = cv.transform(tests)
    # With use_idf=False nothing is learned in fit, so fitting here is safe.
    term_freqs = ft.TfidfTransformer(use_idf=False).fit_transform(counts)
    return model.predict(term_freqs)


# x_test = '众所周知，中国人在世界上有着最出色的生意头脑，他们有着细腻的商业思维和精准的商业预判。' \
#              '但有的时候却聪明反被聪明误。很多人看到一个赚钱的机会，大家都会一窝蜂似的挤进来都想分到一块“大蛋糕”。' \
#              '而很多人又为了各自利益而不停模仿，导致最后都赚不到钱。中国的女鞋行业就是个典型例子'
# print(testmodel(x_test))

예제 #6

0

파일 보기

 def testmodel(self, model, tfidf_train, cv):
     """Print the model's predictions for two hard-coded sample articles.

     Each sample is segmented with jieba, filtered by ``self.stopwords``,
     re-joined with spaces, vectorized with ``cv`` and converted to
     term frequencies (``use_idf=False``) before prediction.

     :param model: fitted classifier exposing ``predict``
     :param tfidf_train: not used by this method
     :param cv: fitted count vectorizer
     """
     first_sample = (
         '首页|体育新闻欧足联启用新分析体系|欧洲杯数据狂魔遗憾出局2016-07-04 09:10:00随着比利时被威尔士淘汰，'
         '欧洲杯一夜之间送别了两位在身价榜单上位列前十的球员：阿扎尔、德布劳内。值得一提的是，'
         '在欧足联官方的金足奖数据分析体系里，截至发稿，这两位球星都在前三之列，是夺取赛事官方MVP的热门人选。'
         '德布劳内的发挥再次证明：比利时成也靠他、败也因他。欧洲杯激战至今，德布劳内是大赛的数据狂魔之一。助攻榜单上，'
         '他3次助攻,位列第3，仅次于4次助攻的拉姆塞和阿扎尔。威胁传球次数，他有23次，领先19次的帕耶和17次的厄齐尔排名第一。'
         '射门次数榜单，他21次与贝尔并列第二，仅次于C罗一人。由于本次欧洲杯，官方MVP的评选，是欧足联启用了一套全新的数据分析体系，'
         '代入数据进行演算而直接得出排名，因此直到本战之前，德布劳内都是MVP即时榜单上的第一名，只是在本战后，被贝尔超越，沦为第二'
     )
     second_sample = (
         '首页|财经中心|财经频道6月中国公路物流运价指数降幅收窄2016-07-04 19:51:00北京7月4日电 (记者 刘长忠)记者4日'
         '从中国物流与采购联合会获悉，6月中国公路物流运价指数为101.3点，比上月回落1.5%，但比年初回升2.8%。数据显示，进入6月，'
         '公路物流需求较前期小幅回升。一方面，工业物流需求保持平稳增长，其中采矿业、高耗能行业等传统行业增速虽有所回落，但原油、'
         '橡胶等进口量较前期明显回升；另一方面，消费品物流需求继续保持平稳较快增长，特别是农副产品、食品、纺织品等物流需求加快增长。'
         '分品种来看，钢材、有色金属等大宗商品物流需求趋弱；农副产品、食品、纺织品等物流需求上升较快。中国物流与采购联合会分析人士称，'
         '总体来看，未来整车及零担公路物流需求将延续小幅回暖的走势，运量也有望继续回升。公路物流运价指数较前期可能延续回升走势，'
         '回升幅度难有较大提升，预计总体将与上年同期水平基本持平。(完)'
     )

     first_tokens = self.stopwords(jieba.lcut(first_sample))
     second_tokens = self.stopwords(jieba.lcut(second_sample))
     tests = [' '.join(first_tokens), ' '.join(second_tokens)]

     counts = cv.transform(tests)
     term_freqs = ft.TfidfTransformer(use_idf=False).fit_transform(counts)
     print(model.predict(term_freqs))

예제 #7

0

파일 보기

def train():
    """
    Builds the SVM based on training data.

    Loads the ``'train'`` split via ``__init__.load_data``, fits a
    CountVectorizer -> TfidfTransformer -> SGDClassifier (hinge loss, L2
    penalty) pipeline on it, prints the training time and passes the fitted
    pipeline with the same training data to ``__init__.evaluate``.

    :return: the fitted pipeline
    """
    features, labels = __init__.load_data('train')

    vectorizer = text.CountVectorizer(decode_error='ignore',
                                      stop_words='english')
    transformer = text.TfidfTransformer()

    classifier = linear_model.SGDClassifier(loss='hinge',
                                            penalty='l2',
                                            alpha=1e-3,
                                            tol=1e-3,
                                            random_state=42)

    # Serializes the processing steps that would be required of the above.
    text_clf = pipeline.Pipeline(
        steps=[('vect', vectorizer), ('tfidf',
                                      transformer), ('clf-sgdc', classifier)])

    start = time.time()
    text_clf.fit(features, labels)
    # Parenthesised single-argument form prints identically on Python 2 and
    # is valid syntax on Python 3 (the bare print statement is not).
    print('Training time:\t%1.4f seconds' % (time.time() - start))

    # NOTE(review): evaluation runs on the training data itself.
    __init__.evaluate(text_clf, features, labels)

    return text_clf

예제 #8

0

파일 보기

파일: pipelines.py 프로젝트: Ravi5ingh/disaster-response

def create_disaster_sequence(disaster_csv_path, category_name):
    """Run the vectorize / TF-IDF / random-forest steps one at a time.

    Same data handling as a pipeline would do, but each stage is invoked
    explicitly: the vectorizer and TF-IDF transformer are fitted on the
    70% training split and only applied to the 30% test split. The test
    predictions are handed to ``display_results``.

    :param disaster_csv_path: path of the CSV file to read
    :param category_name: name of the target column
    :return: None
    """

    def tokenize(message):
        # Build the pt.pipe chain of text helpers and apply it to one
        # message (__stem_text__ is not part of the chain).
        chain = (pt.pipe
                 | __normalize_text__
                 | __tokenize_text__
                 | __remove_stopwords__
                 | __lemmatize_text__)
        return chain(message)

    frame = ut.read_csv(disaster_csv_path)

    print('Getting Data...')
    messages = frame['message'].values
    targets = frame[category_name].values
    x_train, x_test, y_train, y_test = ms.train_test_split(
        messages, targets, test_size=0.3)

    print('Tokenizing and count vectorizing...')
    vect = st.CountVectorizer(tokenizer=tokenize)

    print('Tfidf transforming...')
    tfidf = st.TfidfTransformer()
    classifier = en.RandomForestClassifier()

    print('Fitting classifier on train...')
    train_tfidf = tfidf.fit_transform(vect.fit_transform(x_train))
    classifier.fit(train_tfidf, y_train)

    print('Running classifier on test...')
    test_tfidf = tfidf.transform(vect.transform(x_test))
    predictions = classifier.predict(test_tfidf)

    print('Displaying results...')
    display_results(y_test, predictions)

예제 #9

0

파일 보기

def parseLogs(inputFile, outputFile):
    """Rank the lines of a log file by their mean non-zero TF-IDF weight.

    Every line of ``inputFile`` is treated as one document. A line's score
    is the sum of its TF-IDF weights divided by the number of non-zero
    weights; lines without any token score 0.0. The lines are written to
    ``outputFile`` from highest to lowest score, each prefixed with its
    1-based line number zero-padded to three digits.

    :param inputFile: path of the log file to read
    :param outputFile: path of the ranking file to write
    """
    vectorizer = ext.CountVectorizer(tokenizer=get_tokens,
                                     stop_words='english')
    with open(inputFile) as file:
        lines = [line.rstrip() for line in file]
    doc_matrix = vectorizer.fit_transform(lines)

    weights = ext.TfidfTransformer().fit_transform(doc_matrix).toarray()

    perLineScore = []
    for row in weights:
        nonzero = len(row.nonzero()[0])
        # A line with no tokens has no non-zero weight; score it 0.0 rather
        # than dividing by zero and producing NaN.
        perLineScore.append(row.sum() / nonzero if nonzero else 0.0)

    # Line numbers run 1..len(lines) inclusive. The previous
    # range(1, len(lines)) was one short and dropped the last line.
    df = pd.DataFrame({'d1': lines, 'd2': perLineScore},
                      index=range(1, len(lines) + 1))
    df = df.sort_values(by=['d2'], ascending=False)

    with open(outputFile, 'w') as outFile:
        for index, row in df.iterrows():
            line = "{0:0=3d}  {1}\n"
            outFile.write(line.format(index, row['d1']))

예제 #10

0

파일 보기

파일: text_processing.py 프로젝트: maulikkamdar/PRISM

def fit_insights(insightids, response, c):
    """Create a naive Bayes model from the given insight ids and responses.

    The insights are fetched with ``get_insights_by_id(insightids, c)`` and
    used as training documents for a CountVectorizer -> TfidfTransformer ->
    MultinomialNB pipeline.

    :param insightids: ids passed to ``get_insights_by_id``
    :param response: target vector, one entry per insight
    :param c: passed through to ``get_insights_by_id``
    :return: the fitted pipeline
    """
    data = get_insights_by_id(insightids, c)
    nlp_pl = pipeline.Pipeline([('vect', text.CountVectorizer()),
                                ('tfidf', text.TfidfTransformer(use_idf=True)),
                                ('clf', naive_bayes.MultinomialNB())])
    # Fit on the fetched insights. The previous code referenced an undefined
    # name ``insights`` and never used ``data``.
    # NOTE(review): assumes get_insights_by_id returns an iterable of text
    # documents aligned with ``response`` - confirm against the helper.
    text_clf = nlp_pl.fit(data, response)
    return text_clf

예제 #11

0

파일 보기

    def compute_freq_mat(self, input_texts):
        """Return the TF or TF-IDF matrix for ``input_texts``.

        An existing ``self.count_vect`` is reused to count terms; otherwise
        a new CountVectorizer with ``self.ngram_range`` is created, fitted on
        the input and stored on the instance. IDF weighting is applied
        unless ``self.feature_type`` equals ``"tf"``. The weighting
        transformer is always fitted on the matrix it transforms.
        """
        if not self.count_vect:
            self.count_vect = sk.CountVectorizer(ngram_range=self.ngram_range)
            counts = self.count_vect.fit_transform(input_texts)
        else:
            counts = self.count_vect.transform(input_texts)

        apply_idf = self.feature_type != "tf"
        weighting = sk.TfidfTransformer(use_idf=apply_idf).fit(counts)
        return weighting.transform(counts)

예제 #12

0

파일 보기

 def wordbow(self):
     """Vectorize ``train`` and pass the matrices on to ``self.trainmodel``.

     Builds a bag-of-words count matrix and its term-frequency version
     (``use_idf=False``), then calls ``self.trainmodel`` with the counts,
     the term frequencies and the fitted vectorizer.
     """
     # Convert the corpus to a bag-of-words model.
     vectorizer = ft.CountVectorizer()
     counts = vectorizer.fit_transform(train)
     term_freqs = ft.TfidfTransformer(use_idf=False).fit_transform(counts)
     self.trainmodel(counts, term_freqs, vectorizer)

예제 #13

0

파일 보기

def calculate_features(tr_tweets,
                       te_tweets,
                       targets_tr,
                       targets_te,
                       use_tfidf=False,
                       w_sentiment=True,
                       w_handcrafted=True,
                       **bow_kwargs):
    """
    Calculate all features, combine together into two arrays: one for tr and one for te
    :param tr_tweets: pandas Series of strings, raw texts to convert (from train set)
    :param te_tweets: pandas Series of strings, raw texts to convert (from test set)
    :param targets_tr: pandas Series of strings, target classes (from train set)
    :param targets_te: pandas Series of strings, target classes (from test set)
    :param use_tfidf: bool, whether to convert BoW to TF-IDF
    :param w_sentiment: bool, whether to include sentiment analysis inferences as features
    :param w_handcrafted: bool, whether to include handcrafted features
    :param bow_kwargs: extra keyword arguments forwarded to ``bag_of_words``
    :return: tuple: training data, test data, list of feature names
    """

    def _as_dense(matrix):
        # TfidfTransformer returns a scipy sparse matrix, which np.hstack
        # cannot stack with dense arrays; densify only when that is needed.
        return matrix.toarray() if hasattr(matrix, 'toarray') else matrix

    # Preprocess tweets (tokenise, stem, remove stop words)
    tr_tokens = preprocessing.preprocess_tweets(tr_tweets)
    te_tokens = preprocessing.preprocess_tweets(te_tweets)

    # Now join preprocessed tokenised tweets into single strings,
    # necessary for input to CountVectorizer
    tr_tweet_proc = tr_tokens.apply(' '.join)
    te_tweet_proc = te_tokens.apply(' '.join)

    # Calculate bag-of-words representation
    x_tr, x_te, feature_names = bag_of_words(tr_tweet_proc, te_tweet_proc,
                                             targets_tr, targets_te,
                                             **bow_kwargs)

    if use_tfidf:
        # Convert BoW to TF-IDF (IDF learned on the training set only)
        tfidfer = text_sk.TfidfTransformer()
        x_tr = tfidfer.fit_transform(x_tr)
        x_te = tfidfer.transform(x_te)

    if w_handcrafted:
        # Add handcrafted features
        x_tr_hc, feature_names_hc = hand_crafted_features(tr_tweets,
                                                          get_names=True)
        x_tr = np.hstack((_as_dense(x_tr), x_tr_hc))
        x_te = np.hstack((_as_dense(x_te), hand_crafted_features(te_tweets)))
        feature_names.extend(feature_names_hc)

    if w_sentiment:
        # Add inferred sentiment features. The training features computed
        # together with the names are reused instead of running
        # infer_sentiment on the training tweets a second time.
        # NOTE(review): assumes get_names=True only adds the names and
        # leaves the feature array unchanged, as in the handcrafted branch.
        x_tr_sent, sent_feat_names = infer_sentiment(tr_tweets, get_names=True)
        x_tr = np.hstack((_as_dense(x_tr), x_tr_sent))
        x_te = np.hstack((_as_dense(x_te), infer_sentiment(te_tweets)))
        feature_names.extend(sent_feat_names)
    return x_tr, x_te, feature_names

예제 #14

0

파일 보기

def tfidf_transform(text,feature_words):
    """Return the TF-IDF matrix of ``text`` or the vocabulary terms.

    :param text: iterable of raw documents
    :param feature_words: when truthy, return the list of feature names
        (in column order) instead of the matrix
    :return: list of feature names, or the sparse TF-IDF matrix
    """
    count_vector = txt.CountVectorizer()
    # fit_transform yields the count matrix; plain fit() returns the
    # vectorizer itself, which TfidfTransformer cannot consume.
    counts = count_vector.fit_transform(text)

    if feature_words:
        # Feature names belong to the vectorizer, not the TF-IDF
        # transformer. Prefer the newer accessor and fall back to the
        # legacy one on older scikit-learn releases.
        if hasattr(count_vector, 'get_feature_names_out'):
            return list(count_vector.get_feature_names_out())
        return count_vector.get_feature_names()

    # transforms into tfidf
    return txt.TfidfTransformer().fit_transform(counts)

예제 #15

0

파일 보기

def tfIDfVectorizer(df):
    """Split off the ``class`` column and TF-IDF weight the remaining columns.

    Note that ``df`` is modified in place: the ``class`` column (and a
    column named ``''`` if present) is dropped from the caller's frame.

    :param df: DataFrame of term counts with a ``class`` column
    :return: ``(matrix, y)`` - dense TF-IDF array and the ``class`` column
    """
    y = df['class']

    unwanted = ['class', ''] if '' in df.columns else ['class']
    # axis must be passed by keyword: the positional form was removed in
    # pandas 2.0.
    df.drop(unwanted, axis=1, inplace=True)

    trfm = txt.TfidfTransformer()
    matrx = trfm.fit_transform(df)
    return matrx.toarray(), y

예제 #16

0

파일 보기

파일: classnews_score_train.py 프로젝트: coder-leo-bian/NLP

def wordbow():
    """Split the corpus, build term-frequency features and train a model.

    ``train``/``classes`` are split 70/30 (``random_state=7``). The training
    texts are converted to bag-of-words counts and then to term frequencies
    (``use_idf=False``); the result of ``train_model`` is returned.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        train, classes, test_size=0.3, random_state=7)

    # Bag of words first, then the TF weighting on top of it.
    cv = ft.CountVectorizer()
    train_counts = cv.fit_transform(x_train)
    train_tf = ft.TfidfTransformer(use_idf=False).fit_transform(train_counts)

    return train_model(train_tf, y_train, x_test, y_test, cv, train_counts)

예제 #17

0

파일 보기

파일: create_baselines.py 프로젝트: PLBMR/701-Project

def make_BOW_features(X):
    '''
    Turn a list of sentences into TF-IDF feature vectors.

    Term counts come from a CountVectorizer (min_df=1); the IDF weights are
    computed over the whole input.
    X: list of sentences
    Returns the sparse TF-IDF matrix, one row per sentence.
    '''
    counts = sktext.CountVectorizer(min_df=1).fit_transform(X)
    return sktext.TfidfTransformer().fit_transform(counts)

예제 #18

0

파일 보기

파일: vector_utils.py 프로젝트: l294265421/AC-MIMLLN

def to_tfidf_vectors(texts: list, tokenizer: tokenizers.BaseTokenizer):
    """
    Convert texts to dense TF-IDF vectors.

    Each text is tokenized with ``tokenizer`` and re-joined with spaces so
    that CountVectorizer can split it again; counts are then TF-IDF weighted.

    :param texts: list of str
    :param tokenizer: callable mapping a text to a sequence of tokens
        (annotated with the class itself; the previous annotation
        instantiated it at definition time)
    :return: 2-D array with one row per text
    """
    texts_tokenized = [' '.join(tokenizer(text)) for text in texts]
    vectorizer = sklearn_text.CountVectorizer()
    freq_word_matrix = vectorizer.fit_transform(texts_tokenized)

    transformer = sklearn_text.TfidfTransformer()
    tfidf_matrix = transformer.fit_transform(freq_word_matrix)

    return tfidf_matrix.toarray()

예제 #19

0

파일 보기

def getTfidf(data, is_train=True):
    """Return TF-IDF features for ``data[0]``, fitting or reloading the models.

    In training mode a uni/bi-gram CountVectorizer and a TfidfTransformer
    are fitted on ``data[0]`` and pickled to ``bow.model`` / ``tfidf.model``
    in the working directory. Otherwise both models are loaded from those
    files and only applied.

    :param data: indexable; ``data[0]`` holds the texts and, in training
        mode, ``data[1]`` is passed as ``y`` to the vectorizer
    :param is_train: fit and persist the models when True, reload otherwise
    :return: sparse TF-IDF matrix
    """
    if is_train:
        bow = sk_txt.CountVectorizer(ngram_range=(1, 2),
                                     stop_words='english',
                                     lowercase=True)
        tfidf = sk_txt.TfidfTransformer()
        bow_t = bow.fit_transform(data[0], data[1])
        tfidf_t = tfidf.fit_transform(bow_t)
        # Context managers make sure the model files are flushed and closed.
        with open('bow.model', 'wb') as bow_file:
            pickle.dump(bow, bow_file)
        with open('tfidf.model', 'wb') as tfidf_file:
            pickle.dump(tfidf, tfidf_file)
    else:
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # model files that this program wrote itself.
        with open('bow.model', 'rb') as bow_file:
            bow = pickle.load(bow_file)
        with open('tfidf.model', 'rb') as tfidf_file:
            tfidf = pickle.load(tfidf_file)
        bow_t = bow.transform(data[0])
        tfidf_t = tfidf.transform(bow_t)
    return tfidf_t

예제 #20

0

파일 보기

파일: actionModel.py 프로젝트: sock223/JubenAI_V2

    def go2trainAction(self):
        """Train the action classifier from the files in ``resources/Action/``.

        Documents are loaded with ``sd.load_files`` (shuffled,
        ``random_state=8``), segmented with jieba and vectorized with a
        unigram CountVectorizer followed by TF-IDF. The category names,
        the fitted vectorizer, the fitted TF-IDF transformer and the fitted
        MultinomialNB model are stored on ``self.categoriesAction``,
        ``self.cvAction``, ``self.ttAction`` and ``self.Actionmodel``.
        """
        trainAction = sd.load_files('resources/Action/',
                                    encoding='utf8',
                                    shuffle=True,
                                    random_state=8)

        # Join the jieba segments with spaces so the vectorizer can split
        # the text on whitespace.
        trainAction_data = [" ".join(jieba.cut(doc))
                            for doc in trainAction.data]
        trainAction_y = trainAction.target

        self.categoriesAction = np.array(trainAction.target_names)

        # Build the TF-IDF matrix from 1-grams.
        self.cvAction = ft.CountVectorizer(ngram_range=(1, 1))
        bowAction = self.cvAction.fit_transform(trainAction_data)

        self.ttAction = ft.TfidfTransformer()
        tfidfAction = self.ttAction.fit_transform(bowAction)

        # Model training: MultinomialNB was chosen by the original author
        # on the grounds that the TF-IDF samples fit a multinomial
        # distribution better.
        self.Actionmodel = nb.MultinomialNB()
        self.Actionmodel.fit(tfidfAction, trainAction_y)

예제 #21

0

파일 보기

파일: vector_utils.py 프로젝트: l294265421/AC-MIMLLN

def get_trained_count_and_tfidf_model(texts: list,
                                      tokenizer: tokenizers.BaseTokenizer):
    """
    Fit a CountVectorizer and a TfidfTransformer on the given texts.

    :param texts: list of str used to fit both models
    :param tokenizer: callable mapping a text to a sequence of tokens
        (annotated with the class itself; the previous annotation
        instantiated it at definition time)
    :return: ``(count_model, tfidf_model)``, both fitted
    """
    texts_tokenized = [' '.join(tokenizer(text)) for text in texts]
    count_model = sklearn_text.CountVectorizer()
    freq_word_matrix = count_model.fit_transform(texts_tokenized)

    tfidf_model = sklearn_text.TfidfTransformer()
    tfidf_model.fit(freq_word_matrix)

    return count_model, tfidf_model

예제 #22

0

파일 보기

def to_tfidf_vectors(texts: list, tokenizer: tokenizers.BaseTokenizer):
    """
    Convert texts to dense TF-IDF vectors.

    :param texts: list of str
    :param tokenizer: word segmenter; a callable mapping a text to a
        sequence of tokens (annotated with the class itself; the previous
        annotation instantiated it at definition time)
    :return: array of vectors, one row per text
    """
    texts_tokenized = [' '.join(tokenizer(text)) for text in texts]
    # Term-frequency matrix: element a[i][j] is the count of word j in text i.
    vectorizer = sklearn_text.CountVectorizer()
    freq_word_matrix = vectorizer.fit_transform(texts_tokenized)

    # Compute the TF-IDF weight of every term.
    transformer = sklearn_text.TfidfTransformer()
    tfidf_matrix = transformer.fit_transform(freq_word_matrix)

    return tfidf_matrix.toarray()

예제 #23

0

파일 보기

def get_trained_count_and_tfidf_model(texts: list,
                                      tokenizer: tokenizers.BaseTokenizer):
    """
    Fit a CountVectorizer and a TfidfTransformer on the given texts.

    :param texts: list of str used to fit both models
    :param tokenizer: word segmenter; a callable mapping a text to a
        sequence of tokens (annotated with the class itself; the previous
        annotation instantiated it at definition time)
    :return: ``(count_model, tfidf_model)``, both fitted
    """
    texts_tokenized = [' '.join(tokenizer(text)) for text in texts]
    # Term-frequency matrix: element a[i][j] is the count of word j in text i.
    count_model = sklearn_text.CountVectorizer()
    freq_word_matrix = count_model.fit_transform(texts_tokenized)

    # Learn the IDF weight of every term.
    tfidf_model = sklearn_text.TfidfTransformer()
    tfidf_model.fit(freq_word_matrix)

    return count_model, tfidf_model

예제 #24

0

파일 보기

파일: data_helper.py 프로젝트: StanleyLeiSun/PlayGround

def build_vectors(sentences, vacabulary_size):
    """Print count, TF-IDF and hashed vectorizations of ``sentences``.

    Demonstration helper: nothing is returned, every intermediate result is
    printed.

    :param sentences: iterable of raw text documents
    :param vacabulary_size: unused; kept so existing callers keep working
    """
    vectorizer = skyfe.CountVectorizer()
    trans = vectorizer.fit_transform(sentences)
    # Prefer the newer accessor and fall back to the legacy one on older
    # scikit-learn releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        fname = list(vectorizer.get_feature_names_out())
    else:
        fname = vectorizer.get_feature_names()
    print(trans)
    print(trans.toarray())
    print(fname)

    # enable tf-idf
    transformer = skyfe.TfidfTransformer()
    tfidf = transformer.fit_transform(trans)
    print(tfidf.toarray())
    # The TF-IDF matrix keeps the vectorizer's columns, so its feature names
    # are the vectorizer's. The sparse matrix itself has no
    # get_feature_names(), which raised AttributeError before.
    print(fname)

    # hashed
    vectorizer2 = skyfe.HashingVectorizer(n_features=6, norm=None)
    trans = vectorizer2.fit_transform(sentences)
    print(trans.toarray())
예제 #25

0

파일 보기

파일: classnews_score_train.py 프로젝트: coder-leo-bian/NLP

def predict_self(model, x_test, y_test, cv, score):
    """Evaluate ``model`` on the test split and summarize the outcome.

    The test texts are vectorized with ``cv`` and converted to term
    frequencies (``use_idf=False``) before prediction.

    :return: dict with ``score`` (passed through unchanged),
        ``true_count`` (number of correct predictions), ``all_count``
        (number of predictions) and ``true_score`` (accuracy in percent)
    """
    # Test the model.
    test_counts = cv.transform(x_test)
    test_tf = ft.TfidfTransformer(use_idf=False).fit_transform(test_counts)
    pred_y = model.predict(test_tf)

    # Count the positions where the prediction equals the expected label.
    true_count = sum(1 for hit in (pred_y == y_test) if hit)
    all_count = pred_y.size

    summary = {}
    summary['score'] = score
    summary['true_count'] = true_count
    summary['all_count'] = all_count
    summary['true_score'] = (true_count / pred_y.size) * 100
    return summary

예제 #26

0

파일 보기

파일: question_j_svm_alternative.py 프로젝트: yanxiao0201/EE219

	def __init__(self, train_dict, cat1, cat2):
		"""Train a two-class linear SVM on the documents of ``cat1`` and ``cat2``.

		The documents of both categories are concatenated, vectorized
		(counts -> TF-IDF -> 50-component truncated SVD) and used to fit a
		balanced ``LinearSVC``. ``cat1`` documents are labelled 1 and ``cat2``
		documents -1. All fitted stages are kept on the instance.
		"""
		self.cat1, self.cat2 = cat1, cat2

		first, second = train_dict[cat1], train_dict[cat2]

		vectorizer = text.CountVectorizer(min_df=1, stop_words='english', analyzer = 'word', tokenizer=util.my_tokenizer)
		weighting = text.TfidfTransformer()
		reducer = TruncatedSVD(n_components=50, algorithm='arpack')

		counts = vectorizer.fit_transform(first.data + second.data)
		reduced = reducer.fit_transform(weighting.fit_transform(counts))

		# One label per file: +1 for the first category, -1 for the second.
		labels = [1] * len(first.filenames) + [-1] * len(second.filenames)

		self.count_vect = vectorizer
		self.tfidf_transformer = weighting
		self.svd = reducer

		self.clf = svm.LinearSVC(random_state=42, class_weight='balanced')
		self.clf.fit(reduced, labels)

예제 #27

0

파일 보기

def train():
    """Builds the random forest based on training data.

    Loads the ``'train'`` split via ``__init__.load_data``, fits a
    CountVectorizer -> TfidfTransformer -> RandomForestClassifier
    (10 trees) pipeline on it, prints the training time and passes the
    fitted pipeline with the same training data to ``__init__.evaluate``.

    :return: the fitted pipeline
    """
    features, labels = __init__.load_data('train')

    vectorizer = text.CountVectorizer(decode_error='ignore',
                                      stop_words='english')
    transformer = text.TfidfTransformer()
    classifier = ensemble.RandomForestClassifier(n_estimators=10)

    text_clf = pipeline.Pipeline(
        steps=[('vect', vectorizer), ('tfidf',
                                      transformer), ('clf-rf', classifier)])

    start = time.time()
    text_clf.fit(features, labels)
    # Parenthesised single-argument form prints identically on Python 2 and
    # is valid syntax on Python 3 (the bare print statement is not).
    print('Training time:\t%1.4f seconds' % (time.time() - start))

    # NOTE(review): evaluation runs on the training data itself.
    __init__.evaluate(text_clf, features, labels)

    return text_clf

예제 #28

0

파일 보기

def gen_key_word_dict_list(string_list, max_key_words=10, stop_word_list=[]):
    """Return the top-weighted key words of every document.

    :param string_list: iterable of raw text documents
    :param max_key_words: maximum number of key words kept per document
    :param stop_word_list: stop words to ignore; the default empty list
        selects scikit-learn's English stop words
    :return: list with one dict per document mapping key word -> weight,
        containing only words with a positive weight
    """
    from sklearn.feature_extraction import text
    import numpy as np
    if stop_word_list == []:
        stop_word_list = text.ENGLISH_STOP_WORDS
    # ENGLISH_STOP_WORDS is a frozenset; recent scikit-learn only accepts
    # 'english', a list or None for stop_words.
    if isinstance(stop_word_list, (set, frozenset)):
        stop_word_list = sorted(stop_word_list)

    vectorizer = text.TfidfVectorizer(stop_words=stop_word_list)
    # NOTE(review): TfidfVectorizer already applies TF-IDF, so the extra
    # TfidfTransformer weights the values a second time. Kept because it
    # determines the returned weights - confirm whether this is intended.
    transformer = text.TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(string_list))

    # Prefer the newer accessor and fall back to the legacy one on older
    # scikit-learn releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        words = vectorizer.get_feature_names_out()
    else:
        words = vectorizer.get_feature_names()

    limit = max(max_key_words, 0)
    result_list = []
    for w in tfidf.toarray():
        temp_dict = {}
        # Slicing caps the candidates at the vocabulary size; indexing
        # loc[i] for i up to max_key_words raised IndexError on small
        # vocabularies.
        for idx in np.argsort(-w)[:limit]:
            if w[idx] <= 0:
                break
            temp_dict[words[idx]] = w[idx]
        result_list.append(temp_dict)
    return result_list

예제 #29

0

파일 보기

    def init_fit_eval(self):
        """
        Fit and evaluate one random-forest text pipeline per category column.

        For every entry of ``self.__categories_columns__`` the messages are
        split 75/25, a CountVectorizer -> TfidfTransformer ->
        RandomForestClassifier pipeline is stored in ``self.__pipelines__``
        and fitted, and its test predictions are passed to
        ``self.__add_to_summary__``. Progress and the mean of
        ``self.__accuracies__`` are printed.
        """

        print('Creating Sparse Vector Pipeline...')

        X = self.__disaster__['message'].values
        num_cats = len(self.__categories_columns__)

        for i, category in enumerate(self.__categories_columns__, start=1):
            Y = self.__disaster__[category]
            x_train, x_test, y_train, y_test = ms.train_test_split(
                X, Y, test_size=0.25)

            steps = [
                ('vect',
                 te.CountVectorizer(tokenizer=self.__tokenize_tweet__)),
                ('tfidf', te.TfidfTransformer()),
                ('clf', en.RandomForestClassifier()),
            ]
            current_pipeline = pi.Pipeline(steps)

            # Register the pipeline before fitting it.
            self.__pipelines__[category] = current_pipeline
            current_pipeline.fit(x_train, y_train)

            y_pred = current_pipeline.predict(x_test)
            self.__add_to_summary__(category, y_test, y_pred)

            progress = ('Fitted ' + str(i) + ' out of ' + str(num_cats) +
                        ' models')
            ut.printover(progress)

        overall = st.mean(self.__accuracies__.values())
        print('\nDone fitting. Overall Accuracy: ' + str(overall))

예제 #30

0

파일 보기

def train():
    """
    Builds the classifier based on training data.

    Loads the ``'train'`` split via ``__init__.load_data``, fits a
    CountVectorizer -> TfidfTransformer -> LogisticRegression (lbfgs
    solver) pipeline on it, prints the training time and passes the fitted
    pipeline with the same training data to ``__init__.evaluate``.

    :return: the fitted pipeline
    """
    features, labels = __init__.load_data('train')

    vectorizer = text.CountVectorizer(decode_error='ignore', stop_words='english')
    transformer = text.TfidfTransformer()

    classifier = linear_model.LogisticRegression(solver='lbfgs')

    # Serializes the processing steps that would be required of the above.
    text_clf = pipeline.Pipeline(steps=[('vect', vectorizer),
                                       ('tfidf', transformer),
                                       ('clf-lr', classifier)])

    start = time.time()
    text_clf.fit(features, labels)
    # Parenthesised single-argument form prints identically on Python 2 and
    # is valid syntax on Python 3 (the bare print statement is not).
    print('Training time:\t%1.4f seconds' % (time.time() - start))

    # NOTE(review): evaluation runs on the training data itself.
    __init__.evaluate(text_clf, features, labels)

    return text_clf