import json
import time

from flask import request  # assumes the handler runs inside a Flask request context


def ajax_add_corpus():
    # corpus_type can be "主题语料" (theme corpus) or "日常语料" (daily corpus)
    corpus_type = request.args.get('corpus_type', '')
    theme_daily_name = request.args.get('theme_daily_name', '')
    text = request.args.get('text', '')
    uid = request.args.get('uid', '')
    mid = request.args.get('mid', '')
    timestamp = int(time.time())
    retweeted = request.args.get('retweeted', '')
    comment = request.args.get('comment', '')
    like = request.args.get('like', '')
    create_type = request.args.get('create_type', '')
    corpus_info = [corpus_type, theme_daily_name, text, uid, mid, timestamp,
                   retweeted, comment, like, create_type]
    # create_corpus is a project-level helper imported elsewhere in the original module
    results = create_corpus(corpus_info)
    return json.dumps(results)
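# --- Usage sketch (illustrative, not from the original snippet) ---
# A minimal example of how ajax_add_corpus might be exposed and called, assuming
# a Flask app; the real blueprint and URL are not shown above, so the
# '/add_corpus' rule and the example parameter values below are hypothetical.
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/add_corpus', view_func=ajax_add_corpus)

# Example request:
#   GET /add_corpus?corpus_type=主题语料&theme_daily_name=demo&text=hello&uid=1&mid=2
#       &retweeted=0&comment=0&like=0&create_type=manual
# The handler packs the query fields plus a server-side timestamp into corpus_info
# and returns whatever create_corpus reports, serialized as JSON.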
import mxnet as mx
import numpy as np
from mxnet import gluon
from sklearn.model_selection import train_test_split

import model  # project-local module providing Encoder / AttentionDecoder
import utils  # project-local helpers (date_load, create_corpus); train() is defined elsewhere in the original script


def main():
    # -------- hyper params --------------
    file_path = "nlp_sample.txt"
    embedding_dim = 200
    hidden_dim = 128
    BATCH_NUM = 100
    epoch = 10
    # loss function
    criterion = gluon.loss.SoftmaxCrossEntropyLoss()
    # optimizer
    opt = "adam"
    save = True

    # ----- prepare the data ---------
    input_date, output_date = utils.date_load(file_path)
    # Sequence lengths of input and output: all sequences have the same length,
    # so len() is taken on element 0; no padding is needed for this data.
    # input_len = len(input_date[0])    # 29
    # output_len = len(output_date[0])  # 10

    input_data, output_data, char2id, id2char = utils.create_corpus(
        input_date, output_date)
    vocab_size = len(char2id)

    # split into train and test at 7:3
    train_x, test_x, train_y, test_y = train_test_split(
        input_data, output_data, train_size=0.7)
    train_x = np.array(train_x)
    train_y = np.array(train_y)

    train_data = mx.io.NDArrayIter(train_x, train_y, BATCH_NUM, shuffle=False)

    # -------- training ---------------
    encoder = model.Encoder(vocab_size, embedding_dim, hidden_dim)
    attn_decoder = model.AttentionDecoder(
        vocab_size, embedding_dim, hidden_dim, BATCH_NUM)
    encoder, attn_decoder = train(encoder, attn_decoder, train_data, epoch,
                                  criterion, opt=opt, save=save)
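# --- Batch-iteration sketch (illustrative, not from the original snippet) ---
# mx.io.NDArrayIter simply slices the arrays into fixed-size batches. With dummy
# data standing in for the real corpus (the commented lengths above suggest
# input sequences of length 29 and output sequences of length 10), the iterator
# built in main() behaves like this:
import mxnet as mx
import numpy as np

dummy_x = np.random.randint(0, 50, size=(300, 29))  # 300 fake encoder inputs
dummy_y = np.random.randint(0, 50, size=(300, 10))  # 300 matching decoder targets

it = mx.io.NDArrayIter(dummy_x, dummy_y, batch_size=100, shuffle=False)
for batch in it:
    data = batch.data[0]    # shape (100, 29): one batch of encoder inputs
    label = batch.label[0]  # shape (100, 10): the matching decoder targets
    print(data.shape, label.shape)
it.reset()  # rewind before the next pass, as a multi-epoch training loop would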
    test_clean = pd.read_csv(nlp_path + 'test_clean.csv')
    print('... done loading data from file !!!')
else:
    train = pd.read_csv('C:/Users/cdoerr1/Desktop/CoronaAi/nlp-getting-started/train.csv')
    test = pd.read_csv('C:/Users/cdoerr1/Desktop/CoronaAi/nlp-getting-started/test.csv')
    train_clean = train['text'].progress_apply(
        lambda x: utils.cleanTweet(x, appostrophes=True, emojis=True, html=True, url=True,
                                   misspellings=False, punctuation=True, lemming=True, stop=True))
    utils.safeIndicators(train_clean, nlp_path, 'train_clean')
    print('finished cleaning train dataset')
    test_clean = test['text'].progress_apply(
        lambda x: utils.cleanTweet(x, appostrophes=True, emojis=True, html=True, url=True,
                                   misspellings=False, punctuation=True, lemming=True, stop=True))
    print('finished cleaning test dataset')
    utils.safeIndicators(test_clean, nlp_path, 'test_clean')

data = pd.concat([train_clean, test_clean])
data_corpus = utils.create_corpus(data)

# build a word -> vector lookup from the pre-trained GloVe embeddings
embedding_dict = {}
with open('C:/Users/cdoerr1/Desktop/CoronaAi/nlp-getting-started/glove.6B.100d.txt', 'r',
          encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_dict[word] = vectors

# tokenize the combined corpus and pad every tweet to a fixed length
MAX_LEN = 50
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(data_corpus)
sequences = tokenizer_obj.texts_to_sequences(data_corpus)
tweet_pad = pad_sequences(sequences, maxlen=MAX_LEN, truncating='post', padding='post')
word_index = tokenizer_obj.word_index
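# --- Embedding-matrix sketch (illustrative, not from the original snippet) ---
# A common next step is to turn word_index and embedding_dict into a weight
# matrix for a Keras Embedding layer. This is a minimal sketch assuming the
# 100-dimensional GloVe file loaded above; it is not taken from the original code.
import numpy as np
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Embedding

EMBED_DIM = 100                   # matches glove.6B.100d.txt
num_words = len(word_index) + 1   # +1 because Keras word indices start at 1

embedding_matrix = np.zeros((num_words, EMBED_DIM))
for word, i in word_index.items():
    vec = embedding_dict.get(word)
    if vec is not None:           # words missing from GloVe keep an all-zero row
        embedding_matrix[i] = vec

embedding_layer = Embedding(num_words, EMBED_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            trainable=False)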