# Tokenize the train and test comment texts (list_sentences_train, test_cl,
# max_features and maxlen are defined earlier in the script).
import numpy as np
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

list_sentences_test = test_cl.comment_text
print("....start....pretrain")
print("....At....Tokenizer")

# String used as the out-of-vocabulary token.
punctuate = r'([\.\!\?\:\,])'
tokenizer = Tokenizer(num_words=max_features, oov_token=punctuate)
tokenizer.fit_on_texts(list(list_sentences_train) + list(list_sentences_test))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

# Basic length statistics of the tokenized training comments.
totalNumWords = [len(one_comment) for one_comment in list_tokenized_train]
print("mean length: " + str(np.mean(totalNumWords)))
print("max length: " + str(max(totalNumWords)))
print("std length: " + str(np.std(totalNumWords)))
print("maxlen is: " + str(maxlen))
print("number of different words: " + str(len(tokenizer.word_index)))

# Shrink max_features if the vocabulary is smaller than the requested size.
if len(tokenizer.word_index) < max_features:
    max_features = len(tokenizer.word_index)
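The snippet imports keras.preprocessing.sequence but stops before the padding step. A minimal sketch of that step, assuming the maxlen computed above; the names X_t and X_te are placeholders, not the original variables:

X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)
print("padded train shape:", X_t.shape)
print("padded test shape:", X_te.shape)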
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

# F1 metric; the def line is reconstructed from the fragment, and the helper
# functions precision() and recall() are assumed to be defined elsewhere.
# The local names p and r avoid shadowing the helper functions.
def f1(y_true, y_prediction):
    r = recall(y_true, y_prediction)
    p = precision(y_true, y_prediction)
    return 2 * ((p * r) / (p + r + K.epsilon()))

# Tokenizing the words.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
#X_test = tokenizer.texts_to_sequences(X_test)

# Sequence padding for the model.
from keras.preprocessing import sequence, text
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
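The f1 metric above calls precision() and recall() helpers that are not shown. One common Keras-backend implementation it appears to assume; a sketch, not necessarily the original code:

from tensorflow.python.keras import backend as K

def recall(y_true, y_pred):
    # Fraction of actual positives that were predicted positive.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_positives / (possible_positives + K.epsilon())

def precision(y_true, y_pred):
    # Fraction of predicted positives that are actually positive.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return true_positives / (predicted_positives + K.epsilon())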
# Tail of a commented-out block that flattened one-hot rows of label_arr into
# integer class labels (the opening quotes appear above this excerpt).
        labels.append(0)
    elif (label_arr[i][1] == 1):
        labels.append(1)
    elif (label_arr[i][2] == 1):
        labels.append(2)
    elif (label_arr[i][3] == 1):
        labels.append(3)
'''
#(unique, counts) = np.unique(data_arr.flatten(), return_counts=True)
#vocab_size = len(unique)
#labels_a = np.array(labels)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, SpatialDropout1D, LSTM, Dense
from sklearn.model_selection import train_test_split

# Tokenize and pad the text data.
token = Tokenizer()
token.fit_on_texts(data_arr)
index = token.word_index
index_len = len(index)
new_data = token.texts_to_sequences(data_arr)
new_data = pad_sequences(new_data)
print(new_data.shape)

train, test, train_lab, test_lab = train_test_split(new_data, label_arr)
print(train.shape, train_lab.shape)
print(test.shape, test_lab.shape)
#embed = Word2Vec(train, min_count=1)
print(new_data.shape[1])

# Embedding + LSTM classifier; note that the commented-out code above suggests
# label_arr has four columns, so the output layer size may need to match.
model = Sequential()
model.add(Embedding(index_len * 2, 100, input_length=new_data.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy',
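The compile call above is cut off in the source. A minimal, hedged completion and training call; the optimizer, metrics, epochs, and batch size are assumptions, not the original settings:

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train, train_lab, validation_data=(test, test_lab), epochs=5, batch_size=64)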
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Load the training and evaluation tweets (note: the read_test_data /
# read_train_data names look swapped relative to how their outputs are used).
tweets, y = read_test_data(file_dir + 'AraSenti_all.xlsx')
testTweets, ytest = read_train_data(file_dir + 'KKAISA_tweets.xlsx')

# Tweet preprocessing.
tweets = [tweet_preprocessing(t) for t in tweets]
testTweets = [tweet_preprocessing(t) for t in testTweets]
max_tweet_length = max([len(x.split()) for x in (tweets + testTweets)])

## Tokenization and padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tweets + testTweets)
sequences = tokenizer.texts_to_sequences(tweets)
x_train = pad_sequences(sequences, maxlen=max_tweet_length)
sequences = tokenizer.texts_to_sequences(testTweets)
x_test = pad_sequences(sequences, maxlen=max_tweet_length)
vocab_size = len(tokenizer.word_index) + 1  ## in my dataset

# Create one-hot vectors for the labels.
y_train = to_categorical(y, classes)
y_test = to_categorical(ytest, classes)

# Load the pre-trained embeddings.
embeddings_index = load_embedding()

# Map our data to the word embeddings.
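The snippet ends where the embedding lookup would be built. A common pattern for mapping the tokenizer's vocabulary onto the loaded vectors, as a sketch; the embedding dimension (300 here) and the dict-like shape of embeddings_index are assumptions:

import numpy as np

embedding_dim = 300  # assumed dimensionality of the pre-trained vectors
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    vector = embeddings_index.get(word)
    if vector is not None:
        # Words without a pre-trained vector keep the all-zeros row.
        embedding_matrix[i] = vector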
from nltk.tokenize import word_tokenize
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Continued from a loop over the review texts that collects each review's
# token count in r_len.
word = word_tokenize(text)
l = len(word)
r_len.append(l)

MAX_REVIEW_LEN = np.max(r_len)

max_features = num_unique_word
max_words = MAX_REVIEW_LEN
batch_size = 128
epochs = 3
num_classes = 5

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train_text))
X_train = tokenizer.texts_to_sequences(X_train_text)
X_val = tokenizer.texts_to_sequences(X_val_text)
X_test = tokenizer.texts_to_sequences(test_text)

# Sequence padding.
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_val = sequence.pad_sequences(X_val, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

# Stacked-LSTM classifier.
model1 = Sequential()
model1.add(Embedding(max_features, 100, mask_zero=True))
model1.add(LSTM(64, dropout=0.4, recurrent_dropout=0.4, return_sequences=True))
model1.add(LSTM(32, dropout=0.5, recurrent_dropout=0.5, return_sequences=False))
model1.add(Dense(num_classes, activation='softmax'))
model1.compile(loss='categorical_crossentropy',
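This compile call is also truncated. A hedged completion and training call; the optimizer, the metric, and the label names y_train / y_val are assumptions:

model1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model1.fit(X_train, y_train,
           validation_data=(X_val, y_val),
           batch_size=batch_size, epochs=epochs)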
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, GlobalMaxPool1D, Dropout, Dense

# One-hot label vectors for the 11 entity classes (EVENT, GPE and LANGUAGE are
# built the same way earlier in the script).
LOC_labels     = [[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0] for _ in LOC]      # 3
MONEY_labels   = [[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0] for _ in MONEY]    # 4
NUMBER_labels  = [[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0] for _ in NUMBER]   # 5
ORG_labels     = [[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0] for _ in ORG]      # 6
OTHER_labels   = [[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0] for _ in OTHER]    # 7
PERCENT_labels = [[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0] for _ in PERCENT]  # 8
PERSON_labels  = [[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0] for _ in PERSON]   # 9
TIME_labels    = [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1] for _ in TIME]     # 10

y = np.concatenate([EVENT_labels, GPE_labels, LANGUAGE_labels, LOC_labels,
                    MONEY_labels, NUMBER_labels, ORG_labels, OTHER_labels,
                    PERCENT_labels, PERSON_labels, TIME_labels], 0)

# Tokenize and pad the input texts.
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(x_text))
list_tokenized_train = tokenizer.texts_to_sequences(x_text)
maxlen = 100
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)

# LSTM classifier over the padded sequences, with global max pooling and an
# 11-way softmax output.
inp = Input(shape=(maxlen,))
embed_size = 128
x = Embedding(max_features, embed_size)(inp)
x = LSTM(60, return_sequences=True, name='lstm_layer')(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.1)(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(11, activation="softmax")(x)
model = Model(inputs=inp, outputs=x)
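The snippet stops after building the model. A minimal sketch of compiling and training it on the padded inputs X_t and the one-hot labels y defined above; the optimizer, batch size, epochs, and validation split are assumptions:

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_t, y, batch_size=32, epochs=2, validation_split=0.1)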