Example #1
def ajax_add_corpus():
    # corpus_type can take the values 主题语料 (topic corpus) or 日常语料 (daily corpus)
    corpus_type = request.args.get('corpus_type', '')
    theme_daily_name = request.args.get('theme_daily_name', '')
    text = request.args.get('text', '')
    uid = request.args.get('uid', '')
    mid = request.args.get('mid', '')
    timestamp = int(time.time())
    retweeted = request.args.get('retweeted', '')
    comment = request.args.get('comment', '')
    like = request.args.get('like', '')
    create_type = request.args.get('create_type', '')
    corpus_info = [
        corpus_type, theme_daily_name, text, uid, mid, timestamp, retweeted,
        comment, like, create_type
    ]
    results = create_corpus(corpus_info)
    return json.dumps(results)
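The handler above simply collects its fields from the query string and hands them to create_corpus. As a rough illustration of how a client might call it, here is a minimal sketch; the Flask route path and host are assumptions, not part of the example (the handler itself also needs `from flask import request`, `import json`, `import time`, and a `create_corpus` helper in scope).

# Hypothetical client call for the handler above; the '/ajax_add_corpus' route
# and localhost address are assumptions, not taken from the example.
import requests

params = {
    'corpus_type': '主题语料',      # or '日常语料'
    'theme_daily_name': 'daily',
    'text': 'sample text',
    'uid': '123456',
    'mid': '654321',
    'retweeted': '0',
    'comment': '0',
    'like': '0',
    'create_type': 'manual',
}
resp = requests.get('http://localhost:5000/ajax_add_corpus', params=params)
print(resp.json())  # the value serialized by json.dumps(results) on the server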
Example #2
def main():
    # -------- hyper params --------------
    file_path = "nlp_sample.txt"
    embedding_dim = 200
    hidden_dim = 128
    BATCH_NUM = 100

    epoch = 10
    # loss function
    criterion = gluon.loss.SoftmaxCrossEntropyLoss()
    # optimizer
    opt = "adam"

    save = True

    # ----- data preparation ---------

    input_date, output_date = utils.date_load(file_path)
    # get the sequence lengths of the input and output
    # all sequences have the same length, so len() is taken from the 0th element
    # this data does not need padding
    # input_len = len(input_date[0])  # 29
    # output_len = len(output_date[0])  # 10

    input_data, output_data, char2id, id2char = utils.create_corpus(
        input_date, output_date)
    vocab_size = len(char2id)

    # split into train and test at a 7:3 ratio
    train_x, test_x, train_y, test_y = train_test_split(
        input_data, output_data, train_size=0.7)

    train_x = np.array(train_x)
    train_y = np.array(train_y)
    train_data = mx.io.NDArrayIter(train_x, train_y, BATCH_NUM, shuffle=False)

    # -------- training ---------------

    encoder = model.Encoder(vocab_size, embedding_dim, hidden_dim)
    attn_decoder = model.AttentionDecoder(
        vocab_size, embedding_dim, hidden_dim, BATCH_NUM)

    encoder, attn_decoder = train(encoder, attn_decoder, train_data,
                                  epoch, criterion, opt=opt, save=save)
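In this example utils.create_corpus is expected to return id-encoded input/output sequences together with char2id/id2char vocabularies (note vocab_size = len(char2id) above). A minimal character-level sketch of that kind of mapping is shown below; it is an assumption about the shape of the data, not the actual implementation in utils.

# Assumed behaviour only: a minimal char-level stand-in for utils.create_corpus.
def build_char_corpus(input_seqs, output_seqs):
    chars = sorted({c for seq in input_seqs + output_seqs for c in seq})
    char2id = {c: i for i, c in enumerate(chars)}
    id2char = {i: c for c, i in char2id.items()}
    input_data = [[char2id[c] for c in seq] for seq in input_seqs]
    output_data = [[char2id[c] for c in seq] for seq in output_seqs]
    return input_data, output_data, char2id, id2char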
Example #3
    test_clean = pd.read_csv(nlp_path + 'test_clean.csv')
    print('... done loading data from file !!!')
else:
    train = pd.read_csv('C:/Users/cdoerr1/Desktop/CoronaAi/nlp-getting-started/train.csv')
    test = pd.read_csv('C:/Users/cdoerr1/Desktop/CoronaAi/nlp-getting-started/test.csv')
    train_clean = train['text'].progress_apply(
        lambda x: utils.cleanTweet(x, appostrophes=True, emojis=True, html=True, url=True,
                                   misspellings=False, punctuation=True, lemming=True, stop=True))
    utils.safeIndicators(train_clean, nlp_path, 'train_clean')
    print('finished cleaning train dataset')
    test_clean = test['text'].progress_apply(
        lambda x: utils.cleanTweet(x, appostrophes=True, emojis=True, html=True, url=True,
                                   misspellings=False, punctuation=True, lemming=True, stop=True))
    print('finished cleaning test dataset')
    utils.safeIndicators(test_clean, nlp_path, 'test_clean')

data = pd.concat([train_clean, test_clean])
data_corpus = utils.create_corpus(data)
embedding_dict = {}
# load the pre-trained GloVe vectors into a word -> vector dict
with open('C:/Users/cdoerr1/Desktop/CoronaAi/nlp-getting-started/glove.6B.100d.txt', 'r', encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_dict[word] = vectors

MAX_LEN = 50
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(data_corpus)
sequences = tokenizer_obj.texts_to_sequences(data_corpus)
tweet_pad = pad_sequences(sequences, maxlen=MAX_LEN, truncating='post', padding='post')
word_index = tokenizer_obj.word_index
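The snippet ends after computing word_index; a common follow-up (not shown in the original, sketched here only as an assumption) is to map word_index onto the loaded GloVe vectors to build an embedding matrix for an Embedding layer.

# Hedged continuation sketch: build an embedding matrix from word_index and
# embedding_dict (this step is an assumption, not part of the original snippet).
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 100))  # 100 = dimension of glove.6B.100d
for word, i in word_index.items():
    vec = embedding_dict.get(word)
    if vec is not None:
        embedding_matrix[i] = vec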