Example #1
def add_word():
    # Register domain terms so jieba keeps them as single tokens,
    # and delete a word so it no longer comes out as one token.
    jieba.add_word("创新办")
    jieba.add_word("专家")
    jieba.add_word("云计算")
    jieba.del_word("大福")
    cuts = jieba.cut("周大福是创新办主任也是云计算方面的专家", cut_all=False)
    logger.info(",".join(cuts))
Example #2
def posseg():
    """
    Part-of-speech tagging
    :return:
    """
    # Requires `import jieba.posseg` at module level.
    words = jieba.posseg.cut("我爱北京天安门")
    for word, flag in words:
        logger.info('%s %s' % (word, flag))
Example #3
def tf_idf():
    """
    TF-IDF keyword extraction.
    tf = (occurrences of word w in document d) / (number of words in document d)
    idf = log(D / (number of documents containing w + 1)), where D is the total number of documents
    :return:
    """
    s = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"
    tags = jieba.analyse.extract_tags(s, topK=20, withWeight=True)
    for x, w in tags:
        logger.info('%s %s' % (x, w))
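
The formulas in the docstring are easy to check by hand on a toy corpus. A minimal pure-Python sketch (the corpus below is made up):

import math
from collections import Counter

# Toy corpus: each document is a list of tokens.
docs = [
    ["公司", "增资", "子公司"],
    ["公司", "置业", "零售"],
    ["营业", "收入", "净利润"],
]

def tf_idf_score(word, doc, docs):
    # tf = occurrences of word in doc / number of words in doc
    tf = Counter(doc)[word] / len(doc)
    # idf = log(D / (documents containing word + 1))
    df = sum(1 for d in docs if word in d)
    idf = math.log(len(docs) / (df + 1))
    return tf * idf

print(tf_idf_score("增资", docs[0], docs))  # rare word, non-zero score
print(tf_idf_score("公司", docs[0], docs))  # common word, score 0
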
Example #4
def dictionary():
    """
    Use a custom main dictionary
    :return:
    """
    jieba.set_dictionary(os.path.join(root_path, "data", "jieba", "dict.txt.big.txt"))
    cuts = jieba.cut("周大福是创新办主任也是云计算方面的专家", cut_all=False)
    logger.info(",".join(cuts))
Example #5
def load_userdict():
    """
    Load a user-defined dictionary on top of the main one
    :return:
    """
    jieba.load_userdict(os.path.join(root_path, "data", "jieba", "userdict.txt"))
    cuts = jieba.cut("周大福是创新办主任也是云计算方面的专家", cut_all=False)
    logger.info(",".join(cuts))
Example #6
def textrank():
    """
    TextRank keyword extraction
    :return:
    """
    s = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"
    tags = jieba.analyse.textrank(s, topK=20, withWeight=True)
    for x, w in tags:
        logger.info('%s %s' % (x, w))
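
textrank also takes an allowPOS filter that restricts keyword candidates by POS tag; its documented default is ('ns', 'n', 'vn', 'v'). A short sketch restricting it further:

import jieba.analyse

s = "周大福是创新办主任也是云计算方面的专家"
# Keep only plain nouns and verb-nouns as keyword candidates.
for word, weight in jieba.analyse.textrank(s, topK=10, withWeight=True,
                                           allowPOS=('n', 'vn')):
    print(word, weight)
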
Example #7
def tokenize():
    """
    Tokenization with (word, start, end) character offsets
    :return:
    """
    s = "周大福是创新办主任也是云计算方面的专家"
    result = jieba.tokenize(s)
    logger.info("Default mode")
    for tk in result:
        logger.info("word: {0} \t\t start: {1} \t\t end: {2}".format(tk[0], tk[1], tk[2]))

    logger.info("\nSearch mode")
    result = jieba.tokenize(s, mode='search')
    for tk in result:
        logger.info("word: {0} \t\t start: {1} \t\t end: {2}".format(tk[0], tk[1], tk[2]))
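
In default mode the (start, end) offsets tile the input exactly, so each token can be verified against its slice of the original string:

import jieba

s = "周大福是创新办主任也是云计算方面的专家"
for word, start, end in jieba.tokenize(s):
    # Every token equals its slice of the original text.
    assert s[start:end] == word
    print("{0}: [{1}, {2})".format(word, start, end))
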
Example #8
def keras_bostonhousing():
    (x_train, y_train), (x_test,
                         y_test) = keras.datasets.boston_housing.load_data(
                             os.path.join(root_path, "data", "boston_housing",
                                          "boston_housing.npz"))
    logger.info(x_train.shape)
    logger.info(y_train.shape)
    logger.info(x_test.shape)
    logger.info(y_test.shape)

    # x_train = keras.utils.np_utils.normalize(x_train, 1)
    # y_train = keras.utils.np_utils.normalize(y_train, 0)[0]
    # Crude rescaling: divide by 100 to bring features and targets near [0, 1].
    x_train = x_train / 100.
    y_train = y_train / 100.
    x_test = x_test / 100.
    y_test = y_test / 100.

    # Build the model
    model = keras.models.Sequential()
    model.add(
        keras.layers.Dense(32,
                           activation="relu",
                           input_shape=(x_train.shape[1], ),
                           use_bias=True))
    model.add(keras.layers.Dense(128, activation="relu"))
    model.add(keras.layers.Dense(1, activation="sigmoid"))  # sigmoid fits only because targets were scaled into (0, 1)

    model.summary()

    model.compile(optimizer=keras.optimizers.SGD(learning_rate=0.001), loss="mse")

    result = model.fit(x_train,
                       y_train,
                       batch_size=16,
                       epochs=50,
                       verbose=1,
                       validation_split=0.2)
    logger.info(result)

    predictions = model.predict(x_test, batch_size=64, verbose=1)
    for i in range(10):
        logger.info("real: {0}, predict: {1}".format(y_test[i],
                                                     predictions[i]))
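
Dividing everything by 100 is a crude rescaling; per-feature standardization is the more common preprocessing for this dataset. A minimal numpy sketch, reusing x_train and x_test from the example above:

import numpy as np

# Z-score each feature using statistics from the training split only,
# then apply the same transform to the test split.
mean = x_train.mean(axis=0)
std = x_train.std(axis=0)
x_train_std = (x_train - mean) / std
x_test_std = (x_test - mean) / std
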
Example #9
def keras_cifar_cnn():
    """
    CIFAR-10 image classification with a multi-layer convolutional network
    :return:
    """
    (x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data()

    # Image preprocessing: scale pixel values into [0, 1]
    x_train = x_train / 255
    x_test = x_test / 255

    num_classes = max(np.max(y_train), np.max(y_test)) + 1
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    logger.info(x_train.shape)
    logger.info(y_train.shape)
    logger.info(x_test.shape)
    logger.info(y_test.shape)

    model = keras.models.Sequential()
    model.add(
        keras.layers.Convolution2D(64, (3, 3),
                                   padding="same",
                                   data_format="channels_last",
                                   activation="relu",
                                   input_shape=(32, 32, 3)))
    model.add(
        keras.layers.Convolution2D(64, (3, 3),
                                   padding="same",
                                   activation="relu"))
    model.add(keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
    # model.add(keras.layers.Dropout(0.25))

    model.add(
        keras.layers.Convolution2D(128, (3, 3),
                                   padding="same",
                                   activation="relu"))
    model.add(
        keras.layers.Convolution2D(128, (3, 3),
                                   padding="same",
                                   activation="relu"))
    model.add(keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
    # model.add(keras.layers.Dropout(0.25))

    model.add(
        keras.layers.Convolution2D(256, (3, 3),
                                   padding="same",
                                   activation="relu"))
    model.add(
        keras.layers.Convolution2D(256, (3, 3),
                                   padding="same",
                                   activation="relu"))
    model.add(
        keras.layers.Convolution2D(256, (3, 3),
                                   padding="same",
                                   activation="relu"))
    model.add(keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
    # model.add(keras.layers.Dropout(0.25))

    model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(128, activation="relu"))
    model.add(keras.layers.Dense(64, activation="relu"))
    model.add(keras.layers.Dense(num_classes, activation="softmax"))

    model.summary()
    model.compile(optimizer="adam",
                  loss="categorical_crossentropy",
                  metrics=["accuracy"])

    model.fit(x_train,
              y_train,
              batch_size=64,
              epochs=10,
              verbose=1,
              validation_data=(x_test, y_test))

    test_loss, test_accuracy = model.evaluate(x_test,
                                              y_test,
                                              batch_size=64,
                                              verbose=0)
    logger.info("\ntest_loss:{0},test_accuracy:{1}".format(
        test_loss, test_accuracy))
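
Accuracy on CIFAR-10 usually improves with data augmentation. A hedged sketch using ImageDataGenerator, reusing model, x_train, y_train, x_test and y_test from the example above (recent Keras versions accept the generator directly in model.fit; older ones use fit_generator):

from keras.preprocessing.image import ImageDataGenerator

# Random shifts and horizontal flips applied on the fly.
datagen = ImageDataGenerator(width_shift_range=0.1,
                             height_shift_range=0.1,
                             horizontal_flip=True)
model.fit(datagen.flow(x_train, y_train, batch_size=64),
          epochs=10,
          validation_data=(x_test, y_test))
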
Example #10
def custom_idf():
    # Swap in a custom IDF table before TF-IDF keyword extraction.
    s = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"
    jieba.analyse.set_idf_path(os.path.join(root_path, "data", "jieba", "idf.txt.big.txt"))
    tags = jieba.analyse.extract_tags(s, topK=20, withWeight=True)
    for x, w in tags:
        logger.info('%s %s' % (x, w))
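
set_idf_path expects a plain-text table with one "word idf_value" pair per line, mirroring the bundled idf.txt. A minimal sketch with made-up weights and an illustrative file name:

import jieba.analyse

# One "word idf_value" pair per line; the weights here are made up.
with open("my_idf.txt", "w", encoding="utf-8") as f:
    f.write("云计算 11.74\n")
    f.write("专家 5.20\n")

jieba.analyse.set_idf_path("my_idf.txt")
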
Example #11
def stop_words():
    # Exclude the listed stop words from keyword extraction.
    s = "周大福是创新办主任也是云计算方面的专家"
    jieba.analyse.set_stop_words(os.path.join(root_path, "data", "jieba", "stopwords.txt"))
    tags = jieba.analyse.extract_tags(s, topK=5, withWeight=True)
    for x, w in tags:
        logger.info('%s %s' % (x, w))
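
The stop-words file is one word per line; anything listed is excluded from extract_tags results. A minimal sketch with an illustrative file name:

import jieba.analyse

# One stop word per line.
with open("stopwords.txt", "w", encoding="utf-8") as f:
    f.write("主任\n")
    f.write("方面\n")

jieba.analyse.set_stop_words("stopwords.txt")
# The listed words should no longer appear among the keywords.
print(jieba.analyse.extract_tags("周大福是创新办主任也是云计算方面的专家", topK=5))
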
Example #12
def cut_for_search():
    # Search-engine mode: long words are further split into sub-words
    # to improve recall when building a search index.
    cuts = jieba.cut_for_search("周大福是创新办主任也是云计算方面的专家")
    logger.info(",".join(cuts))
Example #13
def cut():
    # Precise mode (cut_all=False): the single most likely segmentation.
    cuts = jieba.cut("周大福是创新办主任也是云计算方面的专家", cut_all=False)
    logger.info(",".join(cuts))
Example #14
def keras_reuters_mlp(num_words=None,
                      maxlen=None,
                      num_categorical=None,
                      batch_size=32,
                      epochs=10,
                      mode=None):
    """
    Reuters newswire classification with an MLP, comparing a self-normalizing
    MLP (SELU + AlphaDropout) against a regular MLP (ReLU + BatchNormalization)
    :return:
    """
    (X_train, y_train), (X_test, y_test) = reuters.load_data(
        path=os.path.join(root_path, "data", "reuters", "reuters.npz"),
        num_words=num_words)

    if not num_words:
        num_words = max(max([max(x)
                             for x in X_train]), max([max(x)
                                                      for x in X_test])) + 1
    if not maxlen:
        maxlen = max(max([len(x)
                          for x in X_train]), max([len(x)
                                                   for x in X_test])) + 1
    if not num_categorical:
        num_categorical = max(max(y_train), max(y_test)) + 1

    # Binary bag-of-words vectors of length num_words for the inputs;
    # one-hot vectors for the labels.
    tokenizer = text.Tokenizer(num_words=num_words)
    X_train = tokenizer.sequences_to_matrix(X_train)
    y_train = keras.utils.to_categorical(y_train, num_categorical)
    X_test = tokenizer.sequences_to_matrix(X_test)
    y_test = keras.utils.to_categorical(y_test, num_categorical)

    inputs = keras.layers.Input(shape=(num_words, ))
    # Self-normalizing network: SELU activations, lecun_normal init and
    # AlphaDropout keep activations near zero mean / unit variance.
    if mode == "self-normalizing":
        x = keras.layers.Dense(512,
                               activation=keras.activations.selu,
                               kernel_initializer="lecun_normal")(input)
        x = keras.layers.AlphaDropout(0.1)(x)

        x = keras.layers.Dense(256,
                               activation="selu",
                               kernel_initializer="lecun_normal")(x)
        x = keras.layers.AlphaDropout(0.1)(x)

        x = keras.layers.Dense(128,
                               activation="selu",
                               kernel_initializer="lecun_normal")(x)
        x = keras.layers.AlphaDropout(0.1)(x)
    else:
        x = keras.layers.Dense(512,
                               activation="relu",
                               kernel_initializer="glorot_normal")(input)
        x = keras.layers.BatchNormalization()(x)
        # x = keras.layers.Dropout(0.4)(x)

        x = keras.layers.Dense(256,
                               activation="relu",
                               kernel_initializer="glorot_normal")(x)
        x = keras.layers.BatchNormalization()(x)
        # x = keras.layers.Dropout(0.4)(x)

        x = keras.layers.Dense(128,
                               activation="relu",
                               kernel_initializer="glorot_normal")(x)
        x = keras.layers.BatchNormalization()(x)
        # x = keras.layers.Dropout(0.4)(x)

    x = keras.layers.Dense(num_categorical, activation="softmax")(x)

    model = keras.models.Model(inputs=inputs, outputs=x)
    model.summary()

    model.compile(optimizer="adadelta",
                  loss="categorical_crossentropy",
                  metrics=["accuracy"])
    history = model.fit(X_train,
                        y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        validation_split=0.2)
    keras_history_plotcurve(history)

    score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
    logger.info('Test loss:{0}'.format(score[0]))
    logger.info('Test accuracy:{0}'.format(score[1]))
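
keras_history_plotcurve is a project helper that is not part of this listing; a plausible minimal sketch of such a curve plot (this body is an assumption, not the project's actual implementation):

import matplotlib.pyplot as plt

def keras_history_plotcurve(history):
    # Plot training vs. validation loss from a Keras History object.
    # Illustrative guess; the project's real helper is defined elsewhere.
    plt.plot(history.history["loss"], label="train loss")
    plt.plot(history.history["val_loss"], label="val loss")
    plt.xlabel("epoch")
    plt.ylabel("loss")
    plt.legend()
    plt.show()
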
Example #15
def keras_reuters_info():
    (X_train, y_train), (X_test, y_test) = reuters.load_data(path=os.path.join(
        root_path, "data", "reuters", "reuters.npz"),
                                                             skip_top=0,
                                                             maxlen=None,
                                                             test_split=0.2,
                                                             seed=113,
                                                             start_char=1,
                                                             oov_char=2,
                                                             index_from=3)
    logger.info(X_train.shape)
    logger.info(y_train.shape)
    logger.info(X_test.shape)
    logger.info(y_test.shape)

    word_index = reuters.get_word_index(
        os.path.join(root_path, "data", "reuters", "reuters_word_index.json"))
    logger.info(word_index)

    # Longest sequence length across both splits.
    max_seq_len = max(max([len(x)
                           for x in X_train]), max([len(x) for x in X_test])) + 1
    num_classify = max(max(y_train), max(y_test)) + 1
    # Highest word index across both splits, i.e. the vocabulary size.
    num_vocab = max(max([max(x)
                         for x in X_train]), max([max(x) for x in X_test])) + 1

    logger.info("max_seq_len {0}".format(max_seq_len))
    logger.info("num_classify {0}".format(num_classify))
    logger.info("num_vocab {0}".format(num_vocab))