import pymysql

# `config` is a project-local module exposing get_config(); it is assumed
# to be importable wherever these snippets run.
def get_conn():
    """Open a MySQL connection with parameters read from the project config."""
    host = config.get_config('host')
    user = config.get_config('user')
    password = config.get_config('password')
    database = config.get_config('database')
    return pymysql.connect(host=host,
                           user=user,
                           password=password,
                           database=database,
                           charset='utf8mb4')
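
A minimal usage sketch for get_conn(); the table and column names here are hypothetical:

conn = get_conn()
try:
    with conn.cursor() as cursor:
        # hypothetical table/columns, for illustration only
        cursor.execute("SELECT question, answer FROM qa_pairs LIMIT 5")
        for row in cursor.fetchall():
            print(row)
finally:
    conn.close()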
Example #2
def load_stop_words():
    """Load the stop-word list, one word per line, stripping whitespace."""
    path = config.get_config('stop_word_path')
    with open(path, encoding='utf-8') as f:
        return [line.strip() for line in f]
Example #3
# Requires the legacy torchtext Field/Iterator API (`from torchtext import data`
# on old versions); `dataset` and `load_word_vectors` are project-local helpers.
def load_dataset(text_field, label_field, args, **kwargs):
    train_dataset, dev_dataset = dataset.get_dataset('/home/ubuntu/user_space/lhw/public_opinion_monitoring/app/datas', text_field, label_field)
    vec_name = config.get_config('pretrained-name')
    vec_path = config.get_config('pretrained-path')
    if args.static and args.pretrained_name and args.pretrained_path:
        print('load word vector')
        vectors = load_word_vectors(vec_name, vec_path)
        text_field.build_vocab(train_dataset, dev_dataset, vectors=vectors)
    else:
        text_field.build_vocab(train_dataset, dev_dataset)
    label_field.build_vocab(train_dataset, dev_dataset)
    # The dev iterator uses a single batch covering the whole dev set.
    train_iter, dev_iter = data.Iterator.splits(
        (train_dataset, dev_dataset),
        batch_sizes=(args.batch_size, len(dev_dataset)),
        sort_key=lambda x: len(x.comment),
        **kwargs)
    return train_iter, dev_iter
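
A hedged sketch of consuming the returned iterators. The batch attribute `comment` follows the sort_key above; `label` is an assumption about how label_field was registered:

for batch in train_iter:
    feature, target = batch.comment, batch.label  # assumed field names
    # forward pass / loss computation would go here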
Example #4
import pandas as pd

def load_test_data():
    """Sample 10 random question/answer pairs from the test CSV."""
    path = config.get_config('test_data_path')
    data = pd.read_csv(path, encoding='utf-8')
    sample = data.sample(10).reset_index()
    questions = sample.get('question')
    answer = sample.get('answer')
    return questions, answer
Example #5
import pandas as pd

def split_dataset():
    """Shuffle the dataset and write an 80/20 train/test split to CSV."""
    path = config.get_config('data_path')
    data = pd.read_csv(path, encoding='utf-8')
    shuffle_data = data.sample(frac=1.0).reset_index()
    train_num = int(shuffle_data.shape[0] * 0.8)
    # .loc slicing is inclusive on both ends: rows 0..train_num go to train.
    train_data = shuffle_data.loc[0:train_num].drop(labels='index', axis=1)
    test_data = shuffle_data.loc[train_num + 1:].drop(labels='index', axis=1)
    train_data.to_csv("../datas/train_data.csv", index=False)
    test_data.to_csv("../datas/test_data.csv", index=False)
Example #6
import pandas as pd
from collections import defaultdict

# `generate_index_dict` is a project-local helper that indexes one question.
def load_data():
    path = config.get_config('train_data_path')
    data = pd.read_csv(path, encoding='utf-8')
    questions = data.get('question')
    answer = data.get('answer')
    stop_words = load_stop_words()
    # defaultdict() with no factory behaves like a plain dict here.
    word_dict = defaultdict()
    for each in range(len(questions)):
        try:
            generate_index_dict(questions[each], each, stop_words, word_dict)
        except Exception:
            # Log the offending row and continue.
            print(each)
            print(questions[each])
    return word_dict, stop_words, answer, questions
Example #7
import jieba
import pandas as pd

def handle_corpus():
    """Build a corpus of whitespace-joined tokens from question/answer pairs."""
    path = config.get_config('train_data_path')
    data = pd.read_csv(path, encoding='utf-8')
    questions = data.get('question')
    answer = data.get('answer')
    corpus = []
    for each in range(len(questions)):
        try:
            words = jieba.lcut(questions[each] + ',' + answer[each])
            corpus.append(' '.join(words))
        except Exception:
            # Log the offending row and continue.
            print(each)
            print(questions[each])
    return corpus
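
The space-joined strings returned by handle_corpus() fit a bag-of-words vectorizer directly; a minimal sketch using scikit-learn's TfidfVectorizer (an assumption, not necessarily what this project does downstream):

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = handle_corpus()
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(corpus)  # sparse (n_docs, n_terms) matrix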
Example #8
import gensim

def load_word_embedding_model(path=None):
    # Fall back to the configured path when no explicit path is given.
    if path is None:
        path = config.get_config('word_embedding_path')
    word_embedding = gensim.models.Word2Vec.load(path)
    # Alternative for plain-text/binary vector files:
    # word_embedding = KeyedVectors.load_word2vec_format(path)
    return word_embedding
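
The loaded model exposes lookups and similarity queries through its KeyedVectors; a small sketch (the query word is an arbitrary placeholder):

model = load_word_embedding_model()
vec = model.wv['北京']                              # embedding vector for one word
neighbors = model.wv.most_similar('北京', topn=5)   # nearest words by cosine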
Example #9
def load_word_embedding_model():
    path = config.get_config('')  # config key is blank in the source
    word_embedding = gensim.models.Word2Vec.load(path)
    return word_embedding
Example #10
import threading

# Assumes a module-level `scheduler`; see the sketch below this snippet.
def schedule_task():
    interval = config.get_config("craw_interval")
    # The first crawl fires after 10 seconds; it then reschedules itself.
    scheduler.enter(10, 0, craw_latest_comment, (int(interval), ))
    task = threading.Thread(target=scheduler.run)
    task.start()
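
None of the snippets show the `scheduler` they share; a minimal sketch of the assumed module-level setup with the standard-library sched module:

import sched
import time

# Assumed setup: a scheduler driven by wall-clock time, run on its own thread.
scheduler = sched.scheduler(time.time, time.sleep)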
Example #11
import threading

import jieba
import numpy as np
from gensim.models import KeyedVectors
from scipy.spatial.distance import pdist


def do_job():
    ...  # body truncated in the source; do_job is called by the crawler below
    general_service.plot_latest_chart()


def craw_latest_comment(inc):
    # Re-enter itself so the crawl repeats every `inc` seconds.
    scheduler.enter(inc, 0, craw_latest_comment, (inc, ))
    do_job()


def schedule_task():
    interval = config.get_config("craw_interval")
    scheduler.enter(10, 0, craw_latest_comment, (int(interval), ))
    task = threading.Thread(target=scheduler.run)
    task.start()


# Loaded once at module import time.
path = config.get_config('word_embedding_path')
word_vector_model = KeyedVectors.load_word2vec_format(path)


def cosine(vec1, vec2):
    # pdist returns the cosine *distance* (1 - similarity) of the stacked pair.
    distance = pdist(np.vstack([vec1, vec2]), 'cosine')[0]
    return distance
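
A usage sketch tying cosine() to the vectors loaded above; the words are placeholders:

v1 = word_vector_model['北京']   # KeyedVectors supports direct word indexing
v2 = word_vector_model['上海']
print(cosine(v1, v2))            # small distance for semantically close words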


def extract_key_words(comment):
    """Tokenize a comment and drop short tokens and stop words."""
    comment = comment.replace("\n", "")
    comment = comment.replace("\r", "")
    words = jieba.lcut(comment)
    words = filter(lambda x: len(x) > 1, words)
    # `stop_words` is a module-level list, presumably from load_stop_words().
    words = list(filter(lambda x: x not in stop_words, words))
    if len(words) <= 0: