def add_word():
    """
    Adjust the dictionary at runtime by adding and deleting single words.
    :return:
    """
    jieba.add_word("创新办")
    jieba.add_word("专家")
    jieba.add_word("云计算")
    jieba.del_word("大福")
    cuts = jieba.cut("周大福是创新办主任也是云计算方面的专家", cut_all=False)
    logger.info(",".join(cuts))

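# A minimal companion sketch (not part of the original examples; the function name is
# my own) showing jieba.suggest_freq, which tunes a word's frequency so that a span is
# kept together instead of being split; it complements add_word/del_word above.
def suggest_freq_demo():
    # Raise the frequency of "台中" so it is segmented as one token
    # (tune=True updates the dictionary in place).
    jieba.suggest_freq("台中", True)
    cuts = jieba.cut("「台中」正确应该不会被切开", HMM=False)
    logger.info(",".join(cuts))
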
def posseg():
    """
    Part-of-speech tagging.
    :return:
    """
    words = jieba.posseg.cut("我爱北京天安门")
    for word, flag in words:
        logger.info('%s %s' % (word, flag))

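# A hedged follow-up sketch (function name is my own) that filters the tagged output
# from jieba.posseg.cut, keeping only noun-like tokens (POS flags starting with "n").
def posseg_filter_nouns():
    import jieba.posseg
    pairs = jieba.posseg.cut("我爱北京天安门")
    nouns = [word for word, flag in pairs if flag.startswith("n")]
    logger.info(",".join(nouns))
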
def tf_idf():
    """
    TF-IDF keyword extraction.
    tf = (occurrences of word w in document d) / (number of words in d)
    idf = log(total number of documents D / (number of documents containing w + 1))
    :return:
    """
    s = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"
    tags = jieba.analyse.extract_tags(s, topK=20, withWeight=True)
    for x, w in tags:
        logger.info('%s %s' % (x, w))

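# A worked illustration in plain Python (the tiny corpus and helper name are my own)
# of the formula quoted in the docstring above, to give the weights returned by
# extract_tags a concrete reference point.
def tf_idf_by_hand():
    import math
    docs = [["吉林", "欧亚", "置业", "增资"],
            ["吉林", "欧亚", "城市", "商业", "综合体"],
            ["营业", "收入", "净利润"]]
    word, doc = "增资", docs[0]
    tf = doc.count(word) / len(doc)         # 1 / 4 = 0.25
    df = sum(1 for d in docs if word in d)  # word appears in 1 document
    idf = math.log(len(docs) / (df + 1))    # log(3 / 2)
    logger.info("tf=%s idf=%s tf-idf=%s" % (tf, idf, tf * idf))
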
def dictionary():
    """
    Use a custom main dictionary.
    :return:
    """
    jieba.set_dictionary(os.path.join(root_path, "data", "jieba", "dict.txt.big.txt"))
    cuts = jieba.cut("周大福是创新办主任也是云计算方面的专家", cut_all=False)
    logger.info(",".join(cuts))

def load_userdict():
    """
    Load a user-defined dictionary on top of the main dictionary.
    :return:
    """
    jieba.load_userdict(os.path.join(root_path, "data", "jieba", "userdict.txt"))
    cuts = jieba.cut("周大福是创新办主任也是云计算方面的专家", cut_all=False)
    logger.info(",".join(cuts))

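# A hedged sketch (file content and function name are my own) of the format
# load_userdict expects: one entry per line, "word [freq] [POS tag]" separated by
# spaces, with frequency and tag optional. A temporary dictionary is written and loaded.
def load_userdict_inline():
    import tempfile
    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as f:
        f.write("创新办 3 i\n云计算 5\n")
        path = f.name
    jieba.load_userdict(path)
    cuts = jieba.cut("周大福是创新办主任也是云计算方面的专家", cut_all=False)
    logger.info(",".join(cuts))
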
def textrank():
    """
    TextRank keyword extraction.
    :return:
    """
    s = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"
    tags = jieba.analyse.textrank(s, topK=20, withWeight=True)
    for x, w in tags:
        logger.info('%s %s' % (x, w))

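# A hedged variant (the allowPOS tuple mirrors textrank's documented default) showing
# the POS filter: only words whose tag is in allowPOS enter the TextRank graph, so
# widening or narrowing this tuple changes the candidate set.
def textrank_allowpos():
    s = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。"
    tags = jieba.analyse.textrank(s, topK=10, withWeight=True, allowPOS=('ns', 'n', 'vn', 'v'))
    for x, w in tags:
        logger.info('%s %s' % (x, w))
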
def tokenize():
    """
    Tokenize and return each word together with its start and end offsets.
    :return:
    """
    s = "周大福是创新办主任也是云计算方面的专家"
    result = jieba.tokenize(s)
    logger.info("default mode")
    for tk in result:
        logger.info("word: {0} \t\t start: {1} \t\t end: {2}".format(tk[0], tk[1], tk[2]))
    logger.info("\nsearch mode")
    result = jieba.tokenize(s, mode='search')
    for tk in result:
        logger.info("word: {0} \t\t start: {1} \t\t end: {2}".format(tk[0], tk[1], tk[2]))

def keras_bostonhousing():
    """
    Boston housing price regression with a small fully connected network.
    :return:
    """
    (x_train, y_train), (x_test, y_test) = keras.datasets.boston_housing.load_data(
        os.path.join(root_path, "data", "boston_housing", "boston_housing.npz"))
    logger.info(x_train.shape)
    logger.info(y_train.shape)
    logger.info(x_test.shape)
    logger.info(y_test.shape)
    # Data preprocessing: crude rescaling so features and targets fall roughly into [0, 1]
    # x_train = keras.utils.np_utils.normalize(x_train, 1)
    # y_train = keras.utils.np_utils.normalize(y_train, 0)[0]
    x_train = x_train / 100.
    y_train = y_train / 100.
    x_test = x_test / 100.
    y_test = y_test / 100.
    model = keras.models.Sequential()
    model.add(keras.layers.Dense(32, activation="relu", input_shape=(x_train.shape[1], ), use_bias=True))
    model.add(keras.layers.Dense(128, activation="relu"))
    # The sigmoid output only works because the targets were scaled into [0, 1] above
    model.add(keras.layers.Dense(1, activation="sigmoid"))
    model.summary()
    model.compile(optimizer=keras.optimizers.SGD(lr=0.001), loss="mse")
    result = model.fit(x_train, y_train, batch_size=16, epochs=50, verbose=1, validation_split=0.2)
    logger.info(result)
    predict_datas = model.predict(x_test, batch_size=64, verbose=1)
    for i in range(10):
        logger.info("real {0}, predict: {1}".format(y_test[i], predict_datas[i]))

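# A hedged alternative (helper name is my own) to the fixed /100 scaling above:
# standardize each feature to zero mean and unit variance using statistics computed
# on the training set only, the more usual preprocessing for this regression dataset.
def boston_standardize(x_train, x_test):
    mean = x_train.mean(axis=0)
    std = x_train.std(axis=0)
    std[std == 0] = 1.0  # guard against constant features
    return (x_train - mean) / std, (x_test - mean) / std
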
def keras_cifar_cnn():
    """
    CIFAR-10 image classification with a convolutional neural network.
    :return:
    """
    (x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data()
    # Image preprocessing: scale pixels to [0, 1] and one-hot encode the labels
    x_train = x_train / 255
    x_test = x_test / 255
    num_classes = max(np.max(y_train), np.max(y_test)) + 1
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)
    logger.info(x_train.shape)
    logger.info(y_train.shape)
    logger.info(x_test.shape)
    logger.info(y_test.shape)
    model = keras.models.Sequential()
    model.add(keras.layers.Convolution2D(64, (3, 3), padding="same", data_format="channels_last",
                                         activation="relu", input_shape=(32, 32, 3)))
    model.add(keras.layers.Convolution2D(64, (3, 3), padding="same", activation="relu"))
    model.add(keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
    # model.add(keras.layers.Dropout(0.25))
    model.add(keras.layers.Convolution2D(128, (3, 3), padding="same", activation="relu"))
    model.add(keras.layers.Convolution2D(128, (3, 3), padding="same", activation="relu"))
    model.add(keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
    # model.add(keras.layers.Dropout(0.25))
    model.add(keras.layers.Convolution2D(256, (3, 3), padding="same", activation="relu"))
    model.add(keras.layers.Convolution2D(256, (3, 3), padding="same", activation="relu"))
    model.add(keras.layers.Convolution2D(256, (3, 3), padding="same", activation="relu"))
    model.add(keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
    # model.add(keras.layers.Dropout(0.25))
    model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(128, activation="relu"))
    model.add(keras.layers.Dense(64, activation="relu"))
    model.add(keras.layers.Dense(num_classes, activation="softmax"))
    model.summary()
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    model.fit(x_train, y_train, batch_size=64, epochs=10, verbose=1, validation_data=(x_test, y_test))
    test_loss, test_accuracy = model.evaluate(x_test, y_test, batch_size=64, verbose=0)
    logger.info("\ntest_loss:{0}, test_accuracy:{1}".format(test_loss, test_accuracy))

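# A hedged sketch (augmentation values are illustrative, not from the original) of
# training the CIFAR-10 model above on augmented batches; ImageDataGenerator.flow
# yields randomly shifted/flipped images in place of the raw arrays.
def cifar_augmented_fit(model, x_train, y_train, x_test, y_test, batch_size=64, epochs=10):
    from keras.preprocessing.image import ImageDataGenerator
    datagen = ImageDataGenerator(width_shift_range=0.1,
                                 height_shift_range=0.1,
                                 horizontal_flip=True)
    model.fit_generator(datagen.flow(x_train, y_train, batch_size=batch_size),
                        steps_per_epoch=len(x_train) // batch_size,
                        epochs=epochs,
                        validation_data=(x_test, y_test))
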
def custom_idf():
    """
    TF-IDF keyword extraction with a custom IDF corpus file.
    :return:
    """
    s = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"
    jieba.analyse.set_idf_path(os.path.join(root_path, "data", "jieba", "idf.txt.big.txt"))
    tags = jieba.analyse.extract_tags(s, topK=20, withWeight=True)
    for x, w in tags:
        logger.info('%s %s' % (x, w))

def stop_words():
    """
    Keyword extraction with a custom stop-words file.
    :return:
    """
    s = "周大福是创新办主任也是云计算方面的专家"
    jieba.analyse.set_stop_words(os.path.join(root_path, "data", "jieba", "stopwords.txt"))
    tags = jieba.analyse.extract_tags(s, topK=5, withWeight=True)
    for x, w in tags:
        logger.info('%s %s' % (x, w))

def cut_for_search():
    """
    Search-engine mode segmentation.
    :return:
    """
    cuts = jieba.cut_for_search("周大福是创新办主任也是云计算方面的专家")
    logger.info(",".join(cuts))

def cut():
    """
    Accurate (default) mode segmentation.
    :return:
    """
    cuts = jieba.cut("周大福是创新办主任也是云计算方面的专家", cut_all=False)
    logger.info(",".join(cuts))

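# A short comparison sketch (function name is my own) of the three segmentation modes
# used throughout this file: full mode (cut_all=True), accurate mode (cut_all=False),
# and search-engine mode (cut_for_search).
def cut_modes():
    s = "周大福是创新办主任也是云计算方面的专家"
    logger.info("full: " + ",".join(jieba.cut(s, cut_all=True)))
    logger.info("accurate: " + ",".join(jieba.cut(s, cut_all=False)))
    logger.info("search: " + ",".join(jieba.cut_for_search(s)))
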
def keras_reuters_mlp(num_words=None, maxlen=None, num_categorical=None, batch_size=32, epochs=10, mode=None):
    """
    Reuters newswire topic classification with a multi-layer perceptron,
    comparing a self-normalizing MLP (SELU + AlphaDropout) against a standard MLP.
    :return:
    """
    (X_train, y_train), (X_test, y_test) = reuters.load_data(
        path=os.path.join(root_path, "data", "reuters", "reuters.npz"), num_words=num_words)
    if not num_words:
        num_words = max(max([max(x) for x in X_train]), max([max(x) for x in X_test])) + 1
    if not maxlen:
        maxlen = max(max([len(x) for x in X_train]), max([len(x) for x in X_test])) + 1
    if not num_categorical:
        num_categorical = max(max(y_train), max(y_test)) + 1
    tokenizer = text.Tokenizer(num_words=num_words)
    X_train = tokenizer.sequences_to_matrix(X_train)
    y_train = keras.utils.to_categorical(y_train, num_categorical)
    X_test = tokenizer.sequences_to_matrix(X_test)
    y_test = keras.utils.to_categorical(y_test, num_categorical)
    inputs = keras.layers.Input(shape=(num_words, ))
    if mode == "self-normalizing":
        # Self-normalizing network (SNN): SELU activations, lecun_normal init, AlphaDropout
        x = keras.layers.Dense(512, activation=keras.activations.selu, kernel_initializer="lecun_normal")(inputs)
        x = keras.layers.AlphaDropout(0.1)(x)
        x = keras.layers.Dense(256, activation="selu", kernel_initializer="lecun_normal")(x)
        x = keras.layers.AlphaDropout(0.1)(x)
        x = keras.layers.Dense(128, activation="selu", kernel_initializer="lecun_normal")(x)
        x = keras.layers.AlphaDropout(0.1)(x)
    else:
        # Standard MLP: ReLU activations, glorot_normal init, BatchNormalization
        x = keras.layers.Dense(512, activation="relu", kernel_initializer="glorot_normal")(inputs)
        x = keras.layers.BatchNormalization()(x)
        # x = keras.layers.Dropout(0.4)(x)
        x = keras.layers.Dense(256, activation="relu", kernel_initializer="glorot_normal")(x)
        x = keras.layers.BatchNormalization()(x)
        # x = keras.layers.Dropout(0.4)(x)
        x = keras.layers.Dense(128, activation="relu", kernel_initializer="glorot_normal")(x)
        x = keras.layers.BatchNormalization()(x)
        # x = keras.layers.Dropout(0.4)(x)
    x = keras.layers.Dense(num_categorical, activation="softmax")(x)
    model = keras.models.Model(inputs=inputs, outputs=x)
    model.summary()
    model.compile(optimizer="adadelta", loss="categorical_crossentropy", metrics=["accuracy"])
    history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)
    keras_history_plotcurve(history)
    score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
    logger.info('Test loss:{0}'.format(score[0]))
    logger.info('Test accuracy:{0}'.format(score[1]))

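# A usage sketch (argument values are illustrative) comparing the two branches of
# keras_reuters_mlp above under the same budget: the default ReLU/BatchNorm MLP and
# the self-normalizing SELU/AlphaDropout variant.
def compare_reuters_mlps():
    keras_reuters_mlp(batch_size=32, epochs=10)
    keras_reuters_mlp(batch_size=32, epochs=10, mode="self-normalizing")
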
def keras_reuters_info():
    """
    Inspect the Reuters dataset: shapes, word index, and basic statistics.
    :return:
    """
    (X_train, y_train), (X_test, y_test) = reuters.load_data(
        path=os.path.join(root_path, "data", "reuters", "reuters.npz"),
        skip_top=0,
        maxlen=None,
        test_split=0.2,
        seed=113,
        start_char=1,
        oov_char=2,
        index_from=3)
    logger.info(X_train.shape)
    logger.info(y_train.shape)
    logger.info(X_test.shape)
    logger.info(y_test.shape)
    word_index = reuters.get_word_index(
        os.path.join(root_path, "data", "reuters", "reuters_word_index.json"))
    logger.info(word_index)
    # Longest sequence length, number of target classes, and vocabulary size
    max_seq_len = max(max([len(x) for x in X_train]), max([len(x) for x in X_test])) + 1
    num_classify = max(max(y_train), max(y_test)) + 1
    num_vocab = max(max([max(x) for x in X_train]), max([max(x) for x in X_test])) + 1
    logger.info("max_seq_len {0}".format(max_seq_len))
    logger.info("num_classify {0}".format(num_classify))
    logger.info("num_vocab {0}".format(num_vocab))

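# A hedged sketch (helper name is my own) that decodes one encoded sample back into
# words: the word index is reversed and indices are shifted by index_from=3, since
# 0, 1 and 2 are reserved for padding, start_char and oov_char in load_data above.
def decode_reuters_sample(sample, word_index, index_from=3):
    reverse_index = {v: k for k, v in word_index.items()}
    return " ".join(reverse_index.get(i - index_from, "?") for i in sample)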