def main():
    logging.basicConfig(format='[%(levelname)s] : %(message)s', level=logging.INFO)

    # Extract the top keywords from the test article and print them
    test_data = getTestData('input/test.txt')
    tags = getSingleKeywords(test_data, 3)
    print("\nKeywords:\n")
    print(" ".join(tags))
    print("\n")

    # Segment the article and render a word cloud from the tokens
    init_stopword()
    seg_list = getSingleSegment(test_data)
    seg_string = " ".join(seg_list)
    wc = generateWordCloud(seg_string)
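# The script above relies on a generateWordCloud() helper that is not shown here.
# Below is a minimal sketch of such a helper, assuming it wraps the third-party
# `wordcloud` package; the font path and output file name are placeholders, and a
# CJK font is needed for the Chinese tokens to render correctly.
from wordcloud import WordCloud

def generate_word_cloud_sketch(seg_string,
                               font_path='fonts/NotoSansTC-Regular.otf',  # placeholder path
                               out_path='output/wordcloud.png'):          # placeholder path
    # `seg_string` is a whitespace-joined string of segmented tokens,
    # as produced by " ".join(getSingleSegment(...)) above.
    wc = WordCloud(font_path=font_path,
                   width=800,
                   height=600,
                   background_color='white').generate(seg_string)
    wc.to_file(out_path)
    return wc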
def main():
    logging.basicConfig(format='[%(levelname)s] : %(message)s', level=logging.INFO)
    init_stopword()

    # Fetch and segment every article in the fb_fetch_article table
    article_lists = getArticle("fb_fetch_article")
    seg_list = getSegment(article_lists)

    # Keep only the documents that mention at least one of the target keywords,
    # remembering each kept document's index in the original article list
    keywords = {
        'FGO', 'fgo', 'Fate', '少女前線', '白貓', '寫真', 'cosplay', 'cos',
        '東方', '演唱會', '百合', '艦娘', '血小板', '碧藍', '音樂', '工作細胞'
    }
    train_list = []
    train_id = []
    for idx, doc in enumerate(seg_list):
        if any(word in keywords for word in doc):
            train_list.append(doc)
            train_id.append(idx)

    # Within each kept document, keep only the keyword tokens themselves
    filt_list = [[y for y in x if y in keywords] for x in train_list]

    with open("output/fb.train_id", "wb") as fp:
        pickle.dump(train_id, fp)

    # Remove tokens that appear fewer than n times across the corpus
    n = 20
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in filt_list:
        for token in text:
            frequency[token] += 1
    # (debug) dump token frequencies:
    # sorted_frequency = sorted(frequency.items(), key=operator.itemgetter(1), reverse=True)
    # for word in sorted_frequency:
    #     if word[1] >= 30:
    #         print(word[0] + ":" + str(word[1]))
    texts = [[token for token in text if frequency[token] >= n] for text in filt_list]

    # Create the dictionary
    dictionary = corpora.Dictionary(texts)
    dictionary.compactify()
    dictionary.save("output/fb.dict")
    logging.info("Create dict success.")

    # Serialize the bag-of-words corpus
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize("output/fb.mm", corpus)
    logging.info("Create data flow success.")

    # TF-IDF model (saved for later use; the LDA below is trained on the raw counts)
    tfidf = models.TfidfModel(corpus)
    tfidf.save("output/fb.tfidf")
    corpus_tfidf = tfidf[corpus]
    logging.info("Create TF-IDF model success.")

    # Train the LDA topic model
    num_topic = 9
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topic,
                          iterations=100, passes=20)
    lda.save("output/fb.lda")
    logging.info("Create LDA model success.")

    for i in range(lda.num_topics):
        print(lda.print_topic(i))
        print('\n')
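# A minimal sketch (not part of the original script) of how the artifacts saved
# above could be reused to infer the topic mixture of a new post. It assumes the
# new text is tokenized with the same getSegment()/jieba pipeline; `new_tokens`
# below is a hypothetical example input.
from gensim import corpora, models

def infer_topics(new_tokens):
    dictionary = corpora.Dictionary.load("output/fb.dict")
    lda = models.LdaModel.load("output/fb.lda")
    bow = dictionary.doc2bow(new_tokens)   # bag-of-words vector for the new post
    return lda.get_document_topics(bow)    # list of (topic_id, probability)

# Example call (hypothetical tokens):
# print(infer_topics(['FGO', '音樂', 'cosplay']))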
def main():
    logging.basicConfig(format='[%(levelname)s] : %(message)s', level=logging.INFO)
    if len(sys.argv) < 2:
        logging.error("No argument")
        logging.info("Usage: $python train_ff.py [activity]")
        logging.info("Supported: FF31, FF32, CWT48, CWT49, PF26, PF27, PF28")
        sys.exit()

    # Date range (quoted for the SQL query) covered by each doujin event
    date_ranges = {
        "FF31": ("\"2017-12-01\"", "\"2018-04-14\""),
        "FF32": ("\"2018-04-15\"", "\"2018-11-11\""),
        "CWT49": ("\"2018-04-01\"", "\"2018-08-20\""),
        "CWT48": ("\"2017-12-15\"", "\"2018-03-31\""),
        "PF26": ("\"2016-11-15\"", "\"2017-04-29\""),
        "PF27": ("\"2017-04-30\"", "\"2018-10-30\""),
        "PF28": ("\"2017-10-31\"", "\"2018-05-28\""),
    }
    if sys.argv[1] not in date_ranges:
        logging.error("Unknown activity: %s", sys.argv[1])
        sys.exit()
    datefrom, dateto = date_ranges[sys.argv[1]]

    init_stopword()
    article_lists = getArticleByTime("fb_fetch_article", datefrom, dateto)
    seg_list = getSegment(article_lists)

    # Keep only the documents that mention at least one of the target keywords,
    # remembering each kept document's index in the original article list
    keywords = {
        'PF', 'FF', 'FGO', 'fgo', 'Fate', '少女前線', '白貓', '寫真', 'cosplay', 'cos',
        '東方', '演唱會', '百合', '艦娘', '血小板', '碧藍', '音樂', '工作細胞'
    }
    train_list = []
    train_id = []
    for idx, doc in enumerate(seg_list):
        if any(word in keywords for word in doc):
            train_list.append(doc)
            train_id.append(idx)

    # Within each kept document, keep only the keyword tokens themselves
    filt_list = [[y for y in x if y in keywords] for x in train_list]

    with open("output/fb_train_id", "wb") as fp:
        pickle.dump(train_id, fp)

    # Remove tokens that appear fewer than n times across the corpus
    n = 20
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in filt_list:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] >= n] for text in filt_list]

    # Create the dictionary
    dictionary = corpora.Dictionary(texts)
    dictionary.compactify()
    dictionary.save("output/fb_" + sys.argv[1] + ".dict")
    logging.info("Create dict success.")

    # Serialize the bag-of-words corpus
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize("output/fb_" + sys.argv[1] + ".mm", corpus)
    logging.info("Create data flow success.")

    tfidf = models.TfidfModel(corpus)
    tfidf.save("output/fb_" + sys.argv[1] + ".tfidf")
    corpus_tfidf = tfidf[corpus]
    logging.info("Create TF-IDF model success.")

    # Train the LDA topic model
    num_topic = 9
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topic,
                          iterations=100, passes=20)
    lda.save("output/fb_" + sys.argv[1] + ".lda")
    logging.info("Create LDA model success.")

    for i in range(lda.num_topics):
        print(lda.print_topic(i))
        print('\n')
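# A sketch (not part of the original script) showing how the pickled fb_train_id
# list and the serialized corpus written above could be combined to report each
# training document's dominant topic. `activity` is the same command-line value
# used when training (e.g. "FF32"); the document order of fb_<activity>.mm matches
# the order of the indices in fb_train_id, since both were written from train_list.
import pickle
from gensim import corpora, models

def dominant_topics(activity):
    lda = models.LdaModel.load("output/fb_" + activity + ".lda")
    corpus = corpora.MmCorpus("output/fb_" + activity + ".mm")
    with open("output/fb_train_id", "rb") as fp:
        train_id = pickle.load(fp)   # indices into the original article list
    for doc_idx, bow in zip(train_id, corpus):
        topics = lda.get_document_topics(bow)
        if topics:
            best = max(topics, key=lambda t: t[1])   # (topic_id, probability) with highest weight
            print("article #%d -> topic %d (p=%.2f)" % (doc_idx, best[0], best[1]))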
def main():
    logging.basicConfig(format='[%(levelname)s] : %(message)s', level=logging.INFO)

    # Load the dictionary and bag-of-words corpus built by the training script
    if os.path.exists("output/0814.dict"):
        dictionary = corpora.Dictionary.load("output/0814.dict")
        corpus = corpora.MmCorpus("output/0814.mm")
        logging.info("Load model success")
    else:
        logging.info("Please run train2.py to create the dict & data flow")
        return  # without the dict/corpus there is nothing to query

    # Create the TF-IDF model
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # Transform into LSI (latent semantic indexing) space
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=40)
    corpus_lsi = lsi[corpus_tfidf]
    lsi.save('output/0814.lsi')
    corpora.MmCorpus.serialize('output/0814_lsi.mm', corpus_lsi)
    # (debug) print the LSI topics:
    # for result in lsi.print_topics(5):
    #     print(result)

    # Segment the query article and project it into LSI space
    init_stopword()
    test_data = getTestData('input/test.txt')
    test_data_seg = getSingleSegment(test_data)
    vec_bow = dictionary.doc2bow(test_data_seg)
    vec_lsi = lsi[vec_bow]
    print("\nArticle:\n%s" % test_data)

    # Build and save the similarity index over the whole corpus
    index = similarities.MatrixSimilarity(lsi[corpus])
    index.save("output/0814.index")

    # Rank all documents by cosine similarity to the query
    sims = index[vec_lsi]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    print("result:")
    print(sims[:5])

    # Print the most similar articles
    articles = getArticle()
    for idx in sims[:3]:
        print("\nSimilar Article:\n", articles[idx[0]])
        print("\nSimilarity:", idx[1])
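# A small sketch (not part of the original script) showing that the LSI model and
# similarity index saved above can be reloaded later, so new queries do not have to
# rebuild them; `query_tokens` is a hypothetical, already-segmented input.
from gensim import corpora, models, similarities

def query_saved_index(query_tokens, topn=5):
    dictionary = corpora.Dictionary.load("output/0814.dict")
    lsi = models.LsiModel.load("output/0814.lsi")
    index = similarities.MatrixSimilarity.load("output/0814.index")
    vec_lsi = lsi[dictionary.doc2bow(query_tokens)]   # project the query into LSI space
    sims = sorted(enumerate(index[vec_lsi]), key=lambda item: -item[1])
    return sims[:topn]                                # [(doc_id, similarity), ...]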
def main():
    logging.basicConfig(format='[%(levelname)s] : %(message)s', level=logging.INFO)
    if len(sys.argv) < 2:
        logging.error("No argument")
        logging.info("Usage: $python train_news.py [month]")
        logging.info("Supported: Sep, Oct, Nov, Sep_e, Sep_l, Oct_e, Oct_l, Nov_e")
        sys.exit()

    # Date range (quoted for the SQL query) for each month or half-month
    date_ranges = {
        "Sep": ("\"2018-09-01\"", "\"2018-09-30\""),
        "Oct": ("\"2018-10-01\"", "\"2018-10-30\""),
        "Nov": ("\"2018-11-01\"", "\"2018-11-30\""),
        "Sep_e": ("\"2018-09-01\"", "\"2018-09-15\""),
        "Sep_l": ("\"2018-09-16\"", "\"2018-09-30\""),
        "Oct_e": ("\"2018-10-01\"", "\"2018-10-15\""),
        "Oct_l": ("\"2018-10-16\"", "\"2018-10-30\""),
        "Nov_e": ("\"2018-11-01\"", "\"2018-11-15\""),
    }
    if sys.argv[1] not in date_ranges:
        logging.error("Unknown month: %s", sys.argv[1])
        sys.exit()
    datefrom, dateto = date_ranges[sys.argv[1]]

    init_stopword()
    article_lists = getArticleByTime("media_fetch_news", datefrom, dateto)
    if not article_lists:
        logging.error("No news data in the month, please choose another")
        sys.exit()
    seg_list = getSegment(article_lists)

    # Keep only tokens whose corpus frequency lies strictly between minn and maxn
    minn = 30
    maxn = 200
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in seg_list:
        for token in text:
            frequency[token] += 1
    # (debug) dump token frequencies:
    # sorted_frequency = sorted(frequency.items(), key=operator.itemgetter(1), reverse=True)
    # for word in sorted_frequency:
    #     if word[1] > 30 and word[1] < 200:
    #         print(word[0] + ":" + str(word[1]))
    texts = [[token for token in text
              if frequency[token] > minn and frequency[token] < maxn]
             for text in seg_list]

    dictname = "news_" + sys.argv[1] + ".dict"
    mmname = "news_" + sys.argv[1] + ".mm"
    tfidfname = "news_" + sys.argv[1] + ".tfidf"
    ldaname = "news_" + sys.argv[1] + ".lda"

    # Create the dictionary
    dictionary = corpora.Dictionary(texts)
    dictionary.compactify()
    dictionary.save("output/" + dictname)
    logging.info("Create dict success.")

    # Serialize the bag-of-words corpus
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize("output/" + mmname, corpus)
    logging.info("Create data flow success.")

    tfidf = models.TfidfModel(corpus)
    tfidf.save("output/" + tfidfname)
    corpus_tfidf = tfidf[corpus]
    logging.info("Create TF-IDF model success.")

    # Train the LDA topic model
    num_topic = 6
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topic,
                          iterations=100, passes=20)
    lda.save("output/" + ldaname)
    logging.info("Create LDA model success.")

    for i in range(lda.num_topics):
        print(lda.print_topic(i))
        print('\n')
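# A sketch (not part of the original script) of how gensim's CoherenceModel could
# be used to compare different values of num_topic instead of fixing it at 6. It
# reuses the `texts`, `dictionary`, and `corpus` built above; the candidate topic
# counts are arbitrary example values.
from gensim import models
from gensim.models import CoherenceModel

def pick_num_topics(texts, dictionary, corpus, candidates=(4, 6, 8, 10)):
    scores = {}
    for k in candidates:
        lda = models.LdaModel(corpus, id2word=dictionary, num_topics=k,
                              iterations=100, passes=20)
        cm = CoherenceModel(model=lda, texts=texts, dictionary=dictionary,
                            coherence='c_v')
        scores[k] = cm.get_coherence()   # higher c_v coherence is generally better
    return scores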