Example #1
def read_urls(path, vocab, is_train, repr="3gram", ngram_size=3):
    urls = []
    max_len = 0
    if os.path.exists(path):
        with codecs.open(path, "r", "UTF-8") as f:
            for i, line in enumerate(f):
                line = line.strip()
                if len(line) == 0:
                    line = DEFAULT_URL
                marker_index = line.find('?')
                # drop the first 7 characters (the scheme prefix, e.g. "http://") and the query string
                url = line[7:marker_index]
                if sys.version_info[0] < 3:
                    q_tokens = split_sent(normalize_unicode(unicode(url)),
                                          repr, ngram_size)
                else:
                    q_tokens = split_sent(normalize_unicode(str(url)), repr,
                                          ngram_size)
                token_ids = []
                if len(q_tokens) > max_len:
                    max_len = len(q_tokens)
                for token in q_tokens:
                    # grow the url vocabulary only at training time
                    if token not in vocab['url']:
                        if is_train:
                            vocab['url'][token] = len(vocab['url'])
                    if token in vocab['url']:
                        token_ids.append(vocab['url'][token])
                    else:
                        token_ids.append(OOV_WORD_INDEX)
                urls.append(token_ids)
    return urls, max_len
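The exact behaviour of split_sent is not shown in these examples; as a rough, hedged sketch, a character 3-gram split over a URL path and the way the vocab dict grows during training might look like the following (char_ngrams and the sample path are illustrative assumptions, not the repository's code).

# Hedged sketch: an assumed character-3-gram tokenizer standing in for
# split_sent(..., repr="3gram", ngram_size=3).
def char_ngrams(text, n=3):
    return [text[i:i + n] for i in range(max(len(text) - n + 1, 1))]

vocab = {"url": {}}                       # same shape read_urls expects
for token in char_ngrams("news/politics/article.html"):
    vocab["url"].setdefault(token, len(vocab["url"]))   # is_train=True path
print(len(vocab["url"]))                  # number of distinct 3-grams seen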
Example #2
def read_sentences(path,
                   vocab,
                   is_train,
                   repr="word",
                   ngram_size=3,
                   test_vocab=None):
    questions = []
    max_len = 0
    with codecs.open(path, "r", "UTF-8") as f:
        for i, line in enumerate(f):
            q_tokens = split_sent(normalize_unicode(line.strip()), repr,
                                  ngram_size)
            token_ids = []
            if len(q_tokens) > max_len:
                max_len = len(q_tokens)
            for token in q_tokens:
                # training: grow the shared vocab; at test time unseen words get
                # ids appended after the training vocabulary (word repr only)
                if token not in vocab[repr]:
                    if is_train:
                        vocab[repr][token] = len(vocab[repr])
                    elif repr == "word" and token not in test_vocab[repr]:
                        test_vocab[repr][token] = len(vocab[repr]) + len(
                            test_vocab[repr])
                if token in vocab[repr]:
                    token_ids.append(vocab[repr][token])
                elif repr == "word":
                    token_ids.append(test_vocab[repr][token])
                else:
                    token_ids.append(OOV_WORD_INDEX)
            questions.append(token_ids)
    return questions, max_len
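Reduced to plain dicts, the train/test OOV policy above works roughly as follows (a hedged sketch; token_to_id is an illustrative helper, not part of the repository).

OOV_WORD_INDEX = 1   # assumed reserved index, mirroring the constant used above

def token_to_id(token, vocab, test_vocab, is_train, repr="word"):
    # training: grow the shared vocabulary
    if is_train and token not in vocab:
        vocab[token] = len(vocab)
    if token in vocab:
        return vocab[token]
    if repr == "word":
        # unseen test words get fresh ids placed after the training vocabulary
        test_vocab.setdefault(token, len(vocab) + len(test_vocab))
        return test_vocab[token]
    return OOV_WORD_INDEX   # non-word representations fall back to OOV

vocab, test_vocab = {"the": 0, "cat": 1, "sat": 2}, {}
print(token_to_id("zebra", vocab, test_vocab, is_train=False))   # -> 3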
Example #3
def generate_idf(base_dir):
    freq_dict = {"unigram": defaultdict(int),
                 "bigram": defaultdict(int),
                 "trigram": defaultdict(int)}

    total_word_freq, total_bigram_freq, total_ngram_freq = 0, 0, 0

    for dataset in datasets:        # `datasets` and `files` are module-level name lists
        for file in files:
            path = '%s/%s/%s' % (base_dir, dataset, file)
            if os.path.exists(path):
                with codecs.open(path, 'r', 'UTF-8') as f:
                    for i, line in enumerate(f):
                        tokens = split_sent(normalize_unicode(line.strip()), "word")
                        for j in range(len(tokens)):
                            freq_dict["unigram"][tokens[j].lower()] += 1
                            #freq_dict["word"][ps.stem(tokens[j])] += 1
                            if j >= 1:
                                bigram = " ".join([token.lower() for token in tokens[j - 1:j + 1]])
                                #bigram = " ".join([ps.stem(token) for token in tokens[j-1:j+1]])
                                freq_dict["bigram"][bigram] += 1
                            if j >= 2:
                                trigram = " ".join([token.lower() for token in tokens[j - 2:j + 1]])
                                #trigram = " ".join([ps.stem(token) for token in tokens[j-2:j+1]])
                                freq_dict["trigram"][trigram] += 1
                        total_word_freq += len(tokens)

    with open("%s/collection_raw_idf.json" % base_dir, "w") as out_f:
        json.dump(freq_dict, out_f)
    return freq_dict, total_word_freq
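generate_idf only dumps raw collection frequencies; how they are turned into IDF weights downstream is not shown here. A hedged sketch of one common smoothed variant (to_idf is illustrative, not necessarily the repository's formula):

import math

# Hedged sketch: converting the raw frequencies dumped above into IDF-style
# weights; log((total + 1) / (freq + 1)) is one common smoothed choice.
def to_idf(freq_dict, total):
    return {gram: {tok: math.log((total + 1) / (freq + 1))
                   for tok, freq in counts.items()}
            for gram, counts in freq_dict.items()}

raw = {"unigram": {"the": 900, "zebra": 3}, "bigram": {}, "trigram": {}}
idf = to_idf(raw, total=1000)
print(idf["unigram"]["zebra"] > idf["unigram"]["the"])   # rarer -> larger weight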
Example #4
    def fill_example_queue(self):
        input_gen = text_generator(
            utils.example_generator(self._data_path, self._single_pass))

        while True:
            try:
                (op, evd, kp, arg) = next(input_gen)
            except StopIteration:
                tf.logging.info("The example generator for this example "
                                "queue filling thread has exhausted data.")
                if self._single_pass:
                    tf.logging.info(
                        "Finished reading dataset. This thread is stopping!!")
                    self._finished_reading = True
                    break
                else:
                    raise Exception(
                        "The example generator is out of data; error.")

            kp_sents = [sent.strip() for sent in utils.split_sent(kp, "kp")]
            arg_sents = [sent.strip() for sent in utils.split_sent(arg, "arg")]
            example = DataSample(op, evd, kp_sents, arg_sents, self._src_vocab,
                                 self._tgt_vocab, self._hps)
            self._example_queue.put(example)
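fill_example_queue is the producer side of a threaded pipeline: a background thread keeps self._example_queue full while batching code consumes from it. A minimal standard-library sketch of that shape follows (the sentinel-based shutdown is an assumption, not the repository's mechanism).

import queue
import threading

# Hedged sketch of the producer/consumer shape, using only the standard library.
example_queue = queue.Queue(maxsize=100)

def producer(samples):
    for sample in samples:           # stands in for next(input_gen)
        example_queue.put(sample)    # blocks when the queue is full
    example_queue.put(None)          # sentinel: data exhausted (single-pass case)

def consumer():
    while True:
        item = example_queue.get()
        if item is None:
            break
        # batching / training would consume `item` here

threading.Thread(target=producer, args=(range(10),), daemon=True).start()
consumer()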
def predicting(fname, weight_file):
    print('loading models')
    model = model_build(MODEL_NAME, model_cfg, embedding_matrix)
    model.load_weights(weight_file)
    df_predict = pd.read_excel(fname)
    df_predict = df_predict[pd.notnull(df_predict['segment'])]
    p_x = df_predict['segment'].tolist()
    if MODEL_NAME in ['HAN', 'MHAN']:
        p_x = [
            split_sent(sent, MAX_WORDS, MAX_SENTS, CUT_MODE) for sent in p_x
        ]
    p_x = get_sequences(tokenizer, p_x, TEXT_FORMAT, MAX_WORDS)
    print('data shape', p_x.shape)
    print('making predictions...')
    predicted = model.predict(p_x)
    df_predict['label_90'] = [1 if p[0] > 0.9 else 0 for p in predicted]
    #df_predict[df_predict['label_90'] == 1]['cleaned_text'][0:10]
    df_predict.to_excel('./data/taobao_0_30000_predict.xlsx', index=None)
    x_w, x_c, y = get_x_y(DATASET)
    
    tokenizer_w = Tokenizer(filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                            lower = True, split = " ")
    tokenizer_c = Tokenizer(filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                            lower = True, split = " ")
    tokenizer_w.fit_on_texts(x_w)
    tokenizer_c.fit_on_texts(x_c)
    vocab_w = tokenizer_w.word_index
    vocab_w['UNK'] = 0
    vocab_c = tokenizer_c.word_index
    vocab_c['UNK'] = 0
    
    DATE = time.strftime('%Y%m%d%H%M',time.localtime(time.time()))
    m_name = './model/' + MODEL_NAME + '_' + DATE + '.yaml'
    weights_name = './model/' + MODEL_NAME + '_weights_' + DATE + '.hdf5'

    MAX_WORDS = [20,30]
    MAX_SENTS = [6,6]
    
    x_w = [split_sent(sent, MAX_WORDS[0], MAX_SENTS[0], CUT_MODE) for sent in x_w]
    x_c = [split_sent(sent, MAX_WORDS[1], MAX_SENTS[1], CUT_MODE) for sent in x_c]
    
    embed_mat_w = load_embeddings(EMBED_FILE_word, vocab_w, EMBED_DIMS)
    embed_mat_c = load_embeddings(EMBED_FILE_char, vocab_c, EMBED_DIMS)
    
    result = train(CV, [x_w, x_c], y, [tokenizer_w, tokenizer_c], DATE)
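The 'UNK' handling above relies on the Keras Tokenizer assigning word ids from 1 upward, which leaves index 0 free; a small sketch of that behaviour (assuming the tensorflow.keras Tokenizer, whose import path may differ by Keras version):

from tensorflow.keras.preprocessing.text import Tokenizer

# Sketch: Keras word ids start at 1, so index 0 can be reserved for 'UNK'
# exactly as the snippet above does with vocab_w / vocab_c.
tok = Tokenizer(lower=True, split=" ")
tok.fit_on_texts(["the cat sat", "the dog ran"])
vocab = tok.word_index             # e.g. {'the': 1, 'cat': 2, ...}
vocab["UNK"] = 0                   # unknown-token slot at index 0
print(vocab["UNK"], vocab["the"])  # -> 0 1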
DATE = time.strftime('%Y%m%d%H%M', time.localtime(time.time()))
model_file = './model/' + MODEL_NAME + '_' + DATE + '.h5'
weight_file = './model/' + MODEL_NAME + '_weights_' + DATE + '.hdf5'

# Text input format required by HAN-family models
if MODEL_NAME in ['HAN', 'MHAN']:
    print('prepare inputs for HAN series model...')
    if EMBED_TYPE in ('word', 'scratch'):
        MAX_WORDS = 20
        MAX_SENTS = 5
    elif EMBED_TYPE == 'char':
        MAX_WORDS = 30
        MAX_SENTS = 6
    N_LIMIT = MAX_WORDS * MAX_SENTS
    sents = [
        split_sent(sent, MAX_WORDS, MAX_SENTS, CUT_MODE) for sent in sents
    ]
    TEXT_FORMAT = 'seq'
    new_name = MODEL_NAME + '_' + str(MAX_WORDS) + '_' + str(MAX_SENTS)
    model_file = './model/' + new_name + '_' + DATE + '.h5'
    weight_file = './model/' + new_name + '_weights_' + DATE + '.hdf5'

# Initialize configuration settings
model_cfg = ModelConfig(MAX_WORDS,
                        MAX_SENTS,
                        EMBED_DIMS,
                        len(vocab) + 1,
                        MODEL_NAME,
                        ntags=2)
train_cfg = TrainingConfig(ntags=2, model_name=MODEL_NAME)
    # Paths for saving the model and weights
    DATE = time.strftime('%Y%m%d%H%M',time.localtime(time.time()))
    m_name = './model/' + MODEL_NAME + '_' + DATE + '.yaml'
    weights_name = './model/' + MODEL_NAME + '_weights_' + DATE + '.hdf5'

    # Text input format required by HAN-family models
    if MODEL_NAME in ['HAN','HMAN']:
        print('prepare inputs for HAN series model...')
        if EMBED_TYPE in ('word', 'scratch'):
            MAX_WORDS = 20
            MAX_SENTS = 5
        elif EMBED_TYPE == 'char':
            MAX_WORDS = 30
            MAX_SENTS = 6
        N_LIMIT = MAX_WORDS * MAX_SENTS
        x = [split_sent(sent, MAX_WORDS, MAX_SENTS, CUT_MODE) for sent in x]
        if PREDICT:
            p_x = [split_sent(sent, MAX_WORDS, MAX_SENTS, CUT_MODE) for sent in predict_text]
        TEXT_FORMAT = 'seq'
        new_name = MODEL_NAME + '_' + str(MAX_WORDS) + '_' + str(MAX_SENTS)
        m_name = './model/' + new_name + '_' + DATE + '.yaml'
        weights_name = './model/' + new_name + '_weights_' + DATE + '.hdf5'


    
    # Load the pre-trained word-embedding matrix
    if PRE_TRAINED and MODEL_NAME != 'one-hot':
        print('loading word embeddings...')
        embedding_matrix = load_embeddings(EMBED_FILE, vocab, EMBED_DIMS)
    else:
        embedding_matrix = None
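load_embeddings itself is not shown in these examples; the usual pattern such a helper follows, as a hedged sketch (GloVe/word2vec text format and random initialisation for out-of-vocabulary rows are assumptions):

import numpy as np

# Hedged sketch of a typical load_embeddings(embed_file, vocab, dims) helper;
# the repository's actual implementation is not shown in these examples.
def load_embeddings_sketch(embed_file, vocab, dims):
    matrix = np.random.uniform(-0.05, 0.05, (len(vocab) + 1, dims))
    with open(embed_file, encoding="utf-8") as f:
        for line in f:                        # one "word v1 v2 ... vD" per line
            parts = line.rstrip().split(" ")
            word, values = parts[0], parts[1:]
            if word in vocab and len(values) == dims:
                matrix[vocab[word]] = np.asarray(values, dtype="float32")
    return matrix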