Example #1
def print_config_params(config):
    """Log the hyper-parameters of the current configuration."""
    train_logger.info(f'num_classes: {config.num_classes}')
    train_logger.info(f'num_filters: {config.num_filters}')
    train_logger.info(f'kernel_size: {config.kernel_size}')
    train_logger.info(f'vocab_size: {config.vocab_size}')
    train_logger.info(f'batch_size: {config.batch_size}')
    train_logger.info(f'dropout_keep_prob: {config.dropout_keep_prob}')
    train_logger.info(
        '########################################################')


if __name__ == '__main__':
    # if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
    # raise ValueError("""usage: python run_cnn.py [train / test]""")

    train_logger = logger_factory.get_logger('train', True)
    train_logger.info('train time: {}'.format(datetime.now()))

    train_logger.info('Configuring CNN model...')
    config = TCNNConfig()
    print_config_params(config)
    if not os.path.exists(vocab_txt):  # rebuild the vocabulary if missing
        build_vocab(train_txt, vocab_txt, config.vocab_size)
    categories, cat_to_id = read_category(clf_name_txt)
    words, word_to_id = read_vocab(vocab_txt)
    config.vocab_size = len(words)
    model = TextCNN(config)
    # test()
    answer('/home/tqhy/ip_nlp/resources/questions',
           '/home/tqhy/ip_nlp/resources/answers')
    ans_score()
Example #2
import gzip
import pickle

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.base import TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

from common import logger_factory
from utils import file_utils

logger = logger_factory.get_logger('spacy_nlp')


def load(file_path):
    logger.info(f'load model {file_path}')
    stream = gzip.open(file_path, "rb")
    model = pickle.load(stream)
    stream.close()
    return model


def save(file_path, model):
    logger.info(f'save model {file_path}')
    stream = gzip.open(file_path, "wb")
    pickle.dump(model, stream)
    stream.close()
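

if __name__ == '__main__':
    # Minimal round-trip check for save/load above; the tiny corpus and
    # the temp path are illustrative assumptions, not project data.
    docs = ['patent about batteries', 'patent about neural networks']
    labels = [0, 1]
    pipeline = Pipeline([('tfidf', TfidfVectorizer()),
                         ('clf', LinearSVC())])
    pipeline.fit(docs, labels)
    save('/tmp/tfidf_svc.pkl.gz', pipeline)
    restored = load('/tmp/tfidf_svc.pkl.gz')
    logger.info(f'restored prediction: {restored.predict(docs)}')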
Example #3
# coding: utf-8

import os
from collections import Counter

import numpy as np
import tensorflow.python.keras as kr

from common import logger_factory
from utils import file_utils

logger = logger_factory.get_logger('data_loader')


def build_vocab(train_txt_path, vocab_txt_path, vocab_size=5000):
    """根据训练集构建词汇表,存储"""
    contents = file_utils.read_line(train_txt_path,
                                    lambda line_contents: line_contents[1]
                                    if len(line_contents) > 1 else '',
                                    split='\t')

    counter = Counter(
        [word for content in contents for word in content.split()])
    count_pairs = counter.most_common(vocab_size - 1)
    words, _ = list(zip(*count_pairs))
    # add a <PAD> token so every text can be padded to the same length
    words = ['<PAD>'] + list(words)
    file_utils.save_list2file(words, vocab_txt_path)


def read_vocab(vocab_dir):
    """Read the stored vocabulary and map each word to an integer id."""
    # assumed format: one token per line, as produced by build_vocab above
    with open(vocab_dir, encoding='utf-8') as fp:
        words = [line.strip() for line in fp]
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id
Example #4
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# @Description:
# @File: clf_tokens_task.py
# @Project: ip_nlp
# @Author: Yiheng
# @Email: [email protected]
# @Time: 7/22/2019 14:10
import json
import os.path

from common import logger_factory

logger = logger_factory.get_logger('clf_tokens_task')


def get_tokens(file_path):
    with open(file_path) as f:
        line = f.readline()
        line_json = json.loads(line)
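        # assumed structure: {"key": {"<doc_id>": ["token", ...], ...}}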
        logger.info(f'line_json is {line_json}')
        keys = line_json['key']
        logger.info(f'keys is {keys}')
        for doc_id, tokens in keys.items():
            # logger.info(f'id {doc_id}, tokens {tokens}')
            yield tokens


def write_tokens(store_file_path, clf_name, tokens):
    with open(store_file_path, 'a', encoding='utf-8') as f:
        # assumed output format: the class name, a tab, then the
        # space-joined tokens on one line
        f.write(f'{clf_name}\t{" ".join(tokens)}\n')
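

# Illustrative pipeline combining get_tokens and write_tokens; the paths
# and the class name are assumptions, not taken from the project:
#   for tokens in get_tokens('resources/clf_tokens.json'):
#       write_tokens('resources/tokens.txt', 'some_clf', tokens)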
Example #5
# @Description:
# @File: test.py
# @Project: ip_nlp
# @Author: Yiheng
# @Email: [email protected]
# @Time: 7/15/2019 10:30
import time

from pymongo import ASCENDING

from common import logger_factory
from mongo.connect import get_collection
from mongo.utils.query_filter_utils import get_clf_query_filter

logger = logger_factory.get_logger('doc_service')


def create_index(db_name, clc_name, field_name, sort=ASCENDING):
    """
    create index of doc field to specified db's collection
    :param db_name:
    :param clc_name:
    :param field_name:
    :param sort: default direction is asc
    :return:
    """

    clc = get_collection(db_name, clc_name)
    clc.create_index([(field_name, sort)], background=True)
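

# Illustrative call (the database, collection and field names below are
# assumptions, not taken from the project):
#   create_index('ip_nlp', 'patents', 'doc_id')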
Example #6
# @Description:
# @File: segment.py
# @Project: ip_nlp
# @Author: Yiheng
# @Email: [email protected]
# @Time: 7/17/2019 10:58
import os
import re

import jieba
import jieba.analyse

from common import logger_factory
from common import path_config

logger = logger_factory.get_logger('segment')

jieba.load_userdict(path_config.cnki_dict)
# match a number (optionally decimal, optionally followed by one unit/letter char) or a single letter
digit_pattern = re.compile(r'^[0-9]+(\.[0-9]+)?[a-zA-Z%‰]?|^[a-zA-Z]$')
dna_pattern = re.compile(r'[ACTGU]{8,}')
chinese_pattern = re.compile(r'[\u4E00-\u9FFF]+')
chemistry_pattern1 = re.compile(
    r'(?P<chemistry>[a-zA-Z]+)(?P<digit>[0-9]+)?\.[0-9]+[%‰]?')
chemistry_pattern2 = re.compile(
    r'(?P<chemistry>[a-zA-Z]+)(?P<digit>[0-9]+[%‰])')
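
# Illustrative strings matched by the patterns above (for reference only):
#   digit_pattern:      '12', '3.5%', 'a'
#   dna_pattern:        'ACTGACTGU'
#   chinese_pattern:    '发明专利'
#   chemistry_pattern1: 'CO2.5%'   (chemistry='CO', digit='2')
#   chemistry_pattern2: 'NaCl5%'   (chemistry='NaCl', digit='5%')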

# chinese punctuations


def load_stop_words(file_path):