from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


def main():
    target_names = ['alt.atheism',
                    'comp.graphics',
                    'rec.autos',
                    'sci.med',
                    'talk.politics.guns']
    news_train = fetch_20newsgroups(subset='train', categories=target_names)
    train_data = news_train.data
    print(len(train_data))
    print(type(train_data), type(train_data[0]), "\n")  # <class 'list'> <class 'str'>

    processed_data = [" ".join(pre_processing(data)) for data in train_data]
    stopwords = load_stopwords()

    max_df = 0.3
    min_df = 0.005

    # #### Learn Bag-of-words (BoW)
    # count_vec = CountVectorizer(stop_words='english')
    count_vec = CountVectorizer(stop_words=stopwords, max_df=max_df, min_df=min_df,
                                max_features=FEATURE_NUM)
    count_vec.fit(processed_data)
    data_bow = count_vec.transform(processed_data)
    # get_feature_names() was removed in scikit-learn 1.2;
    # use get_feature_names_out() on newer releases
    feature_names_bow = count_vec.get_feature_names()
    print(len(processed_data), data_bow.shape, type(data_bow))

    # #### Learn TF-IDF model
    # tfidf_vec = TfidfVectorizer(stop_words='english')
    tfidf_vec = TfidfVectorizer(stop_words=stopwords, max_df=max_df, min_df=min_df,
                                max_features=FEATURE_NUM)
    tfidf_vec.fit(processed_data)
    data_tfidf = tfidf_vec.transform(processed_data)
    feature_names_tfidf = tfidf_vec.get_feature_names()
    print(len(processed_data), data_tfidf.shape, type(data_tfidf))

    # #### Train LDA models for BoW
    num_topics = 5
    num_topic_word = 20
    lda_bow = LatentDirichletAllocation(n_components=num_topics, max_iter=5,
                                        learning_method='online',
                                        learning_offset=50., random_state=0)
    lda_bow.fit(data_bow)
    display_topics(lda_bow, feature_names_bow, num_topic_word, "BoW_word_cloud")
    print('\n\n\n')

    # #### Train LDA models for TF-IDF
    lda_tfidf = LatentDirichletAllocation(n_components=num_topics, max_iter=5,
                                          learning_method='online',
                                          learning_offset=50., random_state=0)
    lda_tfidf.fit(data_tfidf)
    display_topics(lda_tfidf, feature_names_tfidf, num_topic_word, "TF-IDF_word_cloud")
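# The scripts in this section call a few project helpers (pre_processing,
# load_stopwords, display_topics) and a FEATURE_NUM constant whose definitions
# are not shown here. Below is a minimal, hypothetical sketch of what they
# might look like, assuming pre_processing returns a token list and the
# stopword file holds one word per line; the real implementations (which also
# seem to render word-cloud images, given the "BoW_word_cloud" tag) may differ.
import re

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

FEATURE_NUM = 1000  # assumed vocabulary cap, not taken from the original


def pre_processing(raw):
    """Lowercase, strip non-letters, tokenize, and stem one document."""
    stemmer = PorterStemmer()
    text = re.sub(r'[^a-zA-Z]+', ' ', raw.lower())
    return [stemmer.stem(tok) for tok in word_tokenize(text) if len(tok) > 2]


def load_stopwords(path='stopwords.txt'):  # path is an assumption
    """Read one stopword per line into a list."""
    with open(path, encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]


def display_topics(model, feature_names, num_top_words, tag):
    """Print the top words of each fitted LDA topic, labelled with `tag`."""
    for topic_idx, topic in enumerate(model.components_):
        top = topic.argsort()[:-num_top_words - 1:-1]
        print("%s topic %d: %s"
              % (tag, topic_idx, " ".join(feature_names[i] for i in top)))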
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE


def main():
    target_names = ['alt.atheism',
                    'comp.graphics',
                    'rec.autos',
                    'sci.med',
                    'talk.politics.guns']
    news_train = fetch_20newsgroups(subset='train', categories=target_names,
                                    remove=('headers', 'footers', 'quotes'))
    # news_train = fetch_20newsgroups(subset='train')
    train_data = news_train.data
    print(len(train_data))
    print(type(train_data), type(train_data[0]), "\n")  # <class 'list'> <class 'str'>

    # join the token lists back into strings, as the vectorizer expects raw text
    processed_data = [" ".join(pre_processing(data)) for data in train_data]
    stopwords = load_stopwords()

    # #######################################################################
    # #### Learn TF-IDF model
    tfidf_vec = TfidfVectorizer(stop_words=stopwords)
    tfidf_vec.fit(processed_data)
    data_tfidf = tfidf_vec.transform(processed_data)
    feature_names_tfidf = tfidf_vec.get_feature_names()
    print(len(processed_data), data_tfidf.shape, type(data_tfidf))

    print('Start K-means for TF-IDF:')
    weight_tfidf = data_tfidf.toarray()
    clf_tfidf = KMeans(n_clusters=5)
    s = clf_tfidf.fit(weight_tfidf)
    print(s)

    # The 5 cluster centers
    print(clf_tfidf.cluster_centers_)

    # # The cluster each sample belongs to
    # print(clf.labels_)
    # i = 1
    # while i <= len(clf.labels_):
    #     print(i, clf.labels_[i - 1])
    #     i = i + 1

    # Used to judge whether the number of clusters fits: the smaller the
    # inertia, the tighter the clusters; pick the cluster count at the elbow.
    # Sum of distances of samples to their closest cluster center
    print("Inertia of TF-IDF", clf_tfidf.inertia_)

    # Reduce the weight matrix to 2-D with t-SNE; it separates clusters
    # better than PCA but is much slower.
    tsne = TSNE(n_components=2)
    decomposition_data = tsne.fit_transform(weight_tfidf)

    x, y = [], []
    for i in decomposition_data:
        x.append(i[0])
        y.append(i[1])

    plt.figure(figsize=(10, 10))
    plt.scatter(x, y, c=clf_tfidf.labels_, marker="x")
    plt.xticks(())
    plt.yticks(())
    # save before show(): show() hands the figure to the GUI, and a later
    # savefig() would write an empty canvas
    plt.savefig('./Chart/kMeans_sample_TFIDF.png')
    plt.show()
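# The inertia printed above is only meaningful relative to other cluster
# counts. A small sketch of the elbow method the comments hint at, sweeping k
# and plotting inertia (the k range and output path are illustrative
# assumptions):
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans


def elbow_curve(weight, k_values=range(2, 11)):
    """Fit KMeans for each k and plot inertia so the elbow can be read off."""
    inertias = [KMeans(n_clusters=k).fit(weight).inertia_ for k in k_values]
    plt.plot(list(k_values), inertias, marker='o')
    plt.xlabel('number of clusters k')
    plt.ylabel('inertia')
    plt.savefig('./Chart/kMeans_elbow_TFIDF.png')
    plt.close()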
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE


def main():
    target_names = ['alt.atheism',
                    'comp.graphics',
                    'rec.autos',
                    'sci.med',
                    'talk.politics.guns']
    news_train = fetch_20newsgroups(subset='train', categories=target_names,
                                    remove=('headers', 'footers', 'quotes'))
    # news_train = fetch_20newsgroups(subset='train')
    train_data = news_train.data
    print(len(train_data))
    print(type(train_data), type(train_data[0]), "\n")  # <class 'list'> <class 'str'>

    # join the token lists back into strings, as the vectorizer expects raw text
    processed_data = [" ".join(pre_processing(data)) for data in train_data]
    stopwords = load_stopwords()

    # #######################################################################
    # #### Learn Bag-of-words (BoW)
    count_vec = CountVectorizer(stop_words=stopwords)
    count_vec.fit(processed_data)
    data_bow = count_vec.transform(processed_data)
    feature_names_bow = count_vec.get_feature_names()
    print(len(processed_data), data_bow.shape, type(data_bow))
    # pprint(count_vec.get_feature_names())
    # pprint(count_vec.vocabulary_)

    print('Start K-means for BoW:')
    num_clusters = 5
    weight_bow = data_bow.toarray()
    clf_bow = KMeans(n_clusters=num_clusters)
    s = clf_bow.fit(weight_bow)
    print(s)

    # The 5 cluster centers
    print(clf_bow.cluster_centers_)

    # Sum of distances of samples to their closest cluster center
    print("Inertia of BoW", clf_bow.inertia_)

    # Reduce the weight matrix to 2-D with t-SNE; it separates clusters
    # better than PCA but is much slower.
    tsne = TSNE(n_components=2)
    decomposition_data = tsne.fit_transform(weight_bow)

    x, y = [], []
    for i in decomposition_data:
        x.append(i[0])
        y.append(i[1])

    plt.figure(figsize=(10, 10))
    plt.scatter(x, y, c=clf_bow.labels_, marker="x")
    plt.xticks(())
    plt.yticks(())
    # save before show(): show() hands the figure to the GUI, and a later
    # savefig() would write an empty canvas
    plt.savefig('./Chart/kMeans_sample_BoW.png')
    plt.show()
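# Because the true newsgroup labels of the training split are known, both
# clusterings can also be scored externally. A minimal sketch using standard
# scikit-learn metrics (not part of the original scripts):
from sklearn import metrics


def report_clustering(labels_true, labels_pred, tag):
    """Compare cluster assignments against the known category labels."""
    # Both scores are invariant to how KMeans numbers its clusters, so no
    # matching between cluster ids and category ids is needed.
    print(tag, "ARI:", metrics.adjusted_rand_score(labels_true, labels_pred))
    print(tag, "homogeneity:", metrics.homogeneity_score(labels_true, labels_pred))

# e.g. report_clustering(news_train.target, clf_bow.labels_, "BoW")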
from pprint import pprint

import pyLDAvis
import pyLDAvis.gensim  # pyLDAvis.gensim_models in pyLDAvis >= 3.0
from gensim import corpora
from gensim.models import LdaModel
from sklearn.datasets import fetch_20newsgroups


def main():
    target_names = [
        'alt.atheism',
        'comp.graphics',
        'rec.autos',
        'sci.med',
        'talk.politics.guns'
    ]
    remove = ('headers', 'footers', 'quotes')
    news_train = fetch_20newsgroups(subset='train', categories=target_names,
                                    remove=remove)
    train_data = news_train.data
    print(len(train_data))
    print(type(train_data), type(train_data[0]), "\n")  # <class 'list'> <class 'str'>

    # gensim works on token lists, so no " ".join() here
    processed_data = [pre_processing(data) for data in train_data]

    # Create Dictionary
    id2word = corpora.Dictionary(processed_data)
    # Create Corpus
    texts = processed_data
    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]

    lda_model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         num_topics=5,
                         random_state=100,
                         update_every=1,
                         chunksize=100,
                         passes=10,
                         alpha='auto',
                         per_word_topics=True)
    pprint(lda_model.print_topics())

    # Visualize the topics
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    pyLDAvis.save_html(vis, 'lda.html')
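# Once trained, the gensim model can score unseen text through the same
# dictionary. A short usage sketch, assuming `id2word` and `lda_model` from
# main() are in scope (the sample sentence is illustrative):
new_doc = pre_processing("The new graphics card renders 3d images quickly")
new_bow = id2word.doc2bow(new_doc)
# get_document_topics returns (topic_id, probability) pairs for the document
pprint(lda_model.get_document_topics(new_bow))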
import gensim
from sklearn.datasets import fetch_20newsgroups


def main():
    target_names = ['alt.atheism']
    news_train = fetch_20newsgroups(subset='train', categories=target_names)
    train_data = news_train.data
    print(len(train_data))
    print(type(train_data), type(train_data[0]), "\n")  # <class 'list'> <class 'str'>

    processed_data = [pre_processing(data) for data in train_data]
    # for i in range(5):
    #     print(processed_data[i])
    #     print("\n\n")
    # print("\n\n\n\n\n")

    max_epochs = 500
    vec_size = 20
    alpha = 0.025

    # Doc2Vec expects TaggedDocument objects rather than plain token lists
    tagged_data = [gensim.models.doc2vec.TaggedDocument(words=tokens, tags=[str(i)])
                   for i, tokens in enumerate(processed_data)]

    # dm=0 selects PV-DBOW. The original hard-coded size=20 and never used
    # max_epochs; the declared hyperparameters are wired in here (size/iter
    # were renamed vector_size/epochs in gensim 3.4+).
    model = gensim.models.Doc2Vec(tagged_data, dm=0, alpha=0.1,
                                  vector_size=vec_size, min_alpha=alpha,
                                  epochs=max_epochs)
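# A short usage sketch for the trained model: embed an unseen document and
# look up its nearest training documents (assumes `model` from main() is in
# scope; the query text is illustrative).
query_tokens = pre_processing("arguments about god and religion")
query_vec = model.infer_vector(query_tokens)
# model.dv in gensim >= 4 (model.docvecs on older releases)
print(model.dv.most_similar([query_vec], topn=5))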
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer


def main():
    # news_train = fetch_20newsgroups(subset='train')
    # print("Number of articles: " + str(len(news_train.data)))
    # print("Number of different categories: " + str(len(news_train.target_names)))
    # # Number of articles: 11314
    # # Number of different categories: 20
    # pprint(list(news_train.target_names))

    # target_names = ['alt.atheism',
    #                 'comp.graphics',
    #                 'comp.os.ms-windows.misc',
    #                 'comp.sys.ibm.pc.hardware',
    #                 'comp.sys.mac.hardware',
    #                 'comp.windows.x',
    #                 'misc.forsale',
    #                 'rec.autos',
    #                 'rec.motorcycles',
    #                 'rec.sport.baseball',
    #                 'rec.sport.hockey',
    #                 'sci.crypt',
    #                 'sci.electronics',
    #                 'sci.med',
    #                 'sci.space',
    #                 'soc.religion.christian',
    #                 'talk.politics.guns',
    #                 'talk.politics.mideast',
    #                 'talk.politics.misc',
    #                 'talk.religion.misc']
    # pie_chart = []
    # for i in range(len(target_names)):
    #     parts = [target_names[i]]
    #     part = fetch_20newsgroups(subset='train', categories=parts)
    #     # print(parts, part.filenames.shape)
    #     pie_chart.append(len(part.filenames))
    # x = pie_chart
    # plt.pie(x, labels=target_names, autopct='%.1f%%', startangle=90, counterclock=False)
    # plt.title("Pie Chart for 20 Newsgroup", fontsize='large', fontweight='bold')
    # plt.show()
    # plt.close()

    # parts = ['alt.atheism']  # 480
    # part = fetch_20newsgroups(subset='train', categories=parts)
    # # part.data is a list
    # raw = part.data[0]
    # print(raw, "--------------------------------------------------------\n\n\n\n")
    #
    # processed = pre_processing(raw)
    # print(processed)

    news_train = fetch_20newsgroups(subset='train')
    train_data = news_train.data
    print(len(train_data))
    print(type(train_data), type(train_data[0]), "\n")  # <class 'list'> <class 'str'>

    processed_data = [" ".join(pre_processing(data)) for data in train_data]
    stopwords = load_stopwords()

    # #### Learn Bag-of-words (BoW)
    max_df = 0.3
    min_df = 0.00005
    count_vec = CountVectorizer(strip_accents='unicode', stop_words=stopwords,
                                max_df=max_df, min_df=min_df)
    count_vec.fit(processed_data)
    data_bow = count_vec.transform(processed_data)
    feature_names_bow = count_vec.get_feature_names()
    print(len(processed_data), data_bow.shape, type(data_bow))

    freq_list = data_bow.toarray().sum(axis=0).tolist()
    print('Total number of features:', len(freq_list), type(freq_list))
    # Total number of features: 90517 <class 'list'>
    # first few counts: [3, 239, 75, 2, 1, 1, 1, 2, 2, 1]

    words = count_vec.vocabulary_
    index2words = dict(zip(words.values(), words.keys()))
    top_dict = {}
    for idx, freq in enumerate(freq_list):
        top_dict[index2words[idx]] = freq

    k = 20
    total = len(freq_list)  # renamed from `all`, which shadowed the builtin
    top_k_list = sorted(top_dict.items(), key=lambda l: l[1], reverse=True)[:k]
    # m1 = 100
    # m2 = 130
    # top_k_list = sorted(top_dict.items(), key=lambda l: l[1], reverse=True)[m1:m2]
    print(top_k_list)

    x = range(len(top_k_list))
    x_list = [t[1] for t in top_k_list]
    y_list = [t[0] for t in top_k_list]
    plt.bar(x, x_list)
    plt.xticks(x, y_list, rotation=45)
    plt.tick_params(labelsize=8)
    # %g keeps min_df = 0.00005 readable (the original %.3f printed it as 0.000)
    plt.title("Top %d / %d Words with max_df = %.2f and min_df = %g" % (k, total, max_df, min_df),
              fontsize='large', fontweight='bold')
    # plt.title("Top %d-%d / %d Words with max_df = %.2f and min_df = %g" % (m1, m2, total, max_df, min_df),
    #           fontsize='large', fontweight='bold')
    plt.show()
    plt.close()
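# The bar-chart title above varies with max_df and min_df. A quick sketch
# showing how these two knobs prune the vocabulary, which helps pick values
# before plotting (the grid of settings is an illustrative assumption):
from sklearn.feature_extraction.text import CountVectorizer


def vocab_sizes(docs, stopwords):
    """Print the vocabulary size for a few (max_df, min_df) combinations."""
    for max_df in (1.0, 0.5, 0.3):
        # min_df=1 keeps every term; the float values drop rare terms
        for min_df in (1, 0.00005, 0.005):
            vec = CountVectorizer(stop_words=stopwords, max_df=max_df, min_df=min_df)
            vec.fit(docs)
            print("max_df=%g min_df=%g -> %d terms"
                  % (max_df, min_df, len(vec.vocabulary_)))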