Example #1
 def classify_string(self, string):
     num_features = 1000 # Why am I setting this?
     max_len      = 50   # Why am I setting this?
     z            = np.array(one_hot(string_explode(string), num_features, filters = ''))
     z.shape      = (1, z.shape[0])
     z            = sequence.pad_sequences(z, max_len)
     return self.levs[self.model.predict(z, batch_size = 1).argmax()]
Example #2
    def transform(self, sentence):
        x = text.one_hot(sentence, self.max_features, lower=True, filters=" ")
        x_new = [[0 if t != i else 1 for i in range(self.max_features)] for t in x]

        null_vector = [0] * self.max_features
        pad_size = self.maxlen - len(x_new)
        for i in range(pad_size):
            x_new.append(null_vector)
        return x_new
Example #3
 def _format_x(self, z, words):
     return sequence.pad_sequences(
         [
             one_hot(string_explode(x, words = words), self.num_features, filters = '') 
             for x in z
         ], 
         maxlen = self.max_len, 
         # truncating = 'post'
     )
Example #4
def input_data():
    train_file = "3.25-data.txt"
    test_file = "test.txt"

    train_words = []
    train_tags = []

    X = []
    Y = []

    test_words = []
    test_tags = []
    with open(train_file, 'r') as f1:
        for line in f1:
            tks = line.split('\t', 1)
            word = tks[0]
            # word = jieba.cut(word, cut_all=True)
            words = ""
            for i in word:
                words += i + " "
            words = words[:len(words)-1].encode('utf8')
            x = one_hot(n=10000, text=words)
            if len(x) > 300:
                print(len(x))
            try:
                tag = tks[1]
                if tag == "预警\n":
                    tag = [1, 0]
                else:
                    tag = [0, 1]
                train_words.append(x)
                train_tags.append(tag)
            except:
                pass
    # print train_words[0]
    index = [i for i in range(len(train_words))]
    train_words = pad_sentences(train_words)
    train_tags = np.concatenate([train_tags], 0)
    random.shuffle(index)
    for i, j in enumerate(train_words):
        if i < 0.1 * len(train_words):
            test_words.append(train_words[index[i]])
            test_tags.append(train_tags[index[i]])
        else:
            X.append(train_words[index[i]])
            Y.append(train_tags[index[i]])

    # with open(test_file, 'r') as f1:
    #     for line in f1:
    #         tks = line.split('\t', 1)
    #         word = tks[0]
    #         tag = tks[1]
    #         test_words.append(word)
    #         test_tags.append(tag)
    return X, Y, test_words, test_tags
Example #5
def preprocess(tweet):
    tweet = re.sub(r'@\w+', ' ', tweet)
    tweet = re.sub(r'[^A-Za-z1-9!? ]', ' ', tweet)
    tweet = tweet.lower()
    # stop_words = set(stopwords.words('english'))
    # tweet = word_tokenize(tweet)
    # tweet = [w for w in tweet if not w in stop_words]
    # tweet = ' '.join(tweet)
    tweet = one_hot(tweet, 3000, lower=False)
    processed = pad_sequences([tweet], 35, padding='post', truncating='post')
    return processed
Example #6
 def transform_keywords(self, file_name):
     inf_file = open(file_name)
     data = list()
     for one_news in inf_file.readlines():
         single = one_news.strip().split(',')
         mapping = list()
         for one_keyword in single:
             mapping.append(one_hot(one_keyword, 7000)[0])
         data.append(mapping)
     #print(data)
     return data
Example #7
 def transform_titles(self, file_name):
     inf_file = open(file_name)
     data = list()
     for one_news in inf_file.readlines():
         single = nltk.word_tokenize(self.clean_sentence(one_news))
         mapping = list()
         for one_keyword in single:
             mapping.append(one_hot(one_keyword, 7000)[0])
         data.append(mapping)
     # print(data)
     return data
Example #8
def one_encoding(data):
    words = set(
        text_to_word_sequence(str(data),
                              filters="!”#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t'\n"))
    size = len(words)
    result = one_hot(str(data), round(size * 1.3))
    label = []
    train_data = []
    test_data = []
    get_id = {}
    for i in words:
        a = one_hot(str(i), round(size * 1.3))
        b = i
        label.extend([a])
        get_id[b] = str(a)
    for i in range(round(len(label) * 0.75)):
        train_data.append(i)
    for i in range(round(len(label) * 0.25)):
        test_data.append(i)
    return train_data, test_data
Example #9
def transform_titles(text):

    data = list()
    for one_news in text:
        single = nltk.word_tokenize(clean_sentence(one_news))
        mapping = list()
        for one_keyword in single:
            mapping.append(one_hot(one_keyword, 7000)[0])
        data.append(mapping)
        # print(data)
    return data
Example #10
def discriminateur_text(model_disc, text_seq_length, seed_text):

    encoded = [one_hot(d, vocab_size) for d in seed_text]
    print(encoded)
    encoded = pad_sequences(encoded, maxlen=text_seq_length, truncating='pre')
    y = model_disc.predict([encoded])
    print(y)
    for i in y:
        if (i < 0.6):
            print("it's fake")
        else:
            print("it's real")
Example #11
def encoding_question(text):
    text = clean_text(text)
    ques_id = np.load('ques_id.npy').item()
    encoded_ques = []

    encoded_ques = [one_hot(text, 1000)]
    encoded_ques = pad_sequences(encoded_ques, maxlen=55, padding='post')

    encoded_ques = np.array(encoded_ques)
    encoded_ques = np.reshape(encoded_ques, [1, 55])

    return encoded_ques
Example #12
    def encode_artist(self, X_artist):
        # Integer encode artist names.
        # We estimate the vocabulary size as (unique_artists * 1000), which is much larger than
        # needed, to reduce the probability of collisions from the hash function.
        if self.vocab_size is None:
            self.vocab_size = len(X_artist['artist'].unique()) * 1000

        for idx, row in X_artist.iterrows():
            X_artist.at[idx, 'artist'] = one_hot(row['artist'],
                                                 self.vocab_size)

        return X_artist
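# A standalone sketch of the same idea (hypothetical data; the class that owns encode_artist is
# not shown above): oversize the hash space relative to the number of unique artists so that
# one_hot collisions stay unlikely.
import pandas as pd
from keras.preprocessing.text import one_hot

artists = pd.DataFrame({'artist': ['Miles Davis', 'Nina Simone', 'Miles Davis']})
vocab_size = artists['artist'].nunique() * 1000               # oversized hash space to limit collisions
artists['artist'] = artists['artist'].apply(lambda name: one_hot(name, vocab_size))
print(artists)                                                # each name is now a list of hashed integer ids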
Example #13
def one_hot_vec(train_data1, test_data1):
    train_data1 = np.asarray(train_data1)
    test_data1 = np.asarray(test_data1)
    X = np.concatenate((train_data1, test_data1), axis=0)
    vocabulory = np.unique(np.hstack(X))
    print('---length---: ', len(vocabulory))
    X_train = []
    X_test = []
    length = 0
    print('--- one hot encoding ---')
    for i in train_data1:
        temp = text.one_hot(' '.join(i), len(vocabulory))
        length = max(length, len(temp))
        X_train.append(temp)
    for i in test_data1:
        temp = text.one_hot(' '.join(i), len(vocabulory))
        length = max(length, len(temp))
        X_test.append(temp)

    X_train = sequence.pad_sequences(X_train, maxlen=500)
    X_test = sequence.pad_sequences(X_test, maxlen=500)
    return np.asarray(X_train), np.asarray(X_test), len(vocabulory)
Example #14
    def predict(self, text) :

        ts = self.__cws.text_to_sequence(text)
        t0 = ' '.join(ts)

        t1 = one_hot(t0, self.__vocab_size)
        t2 = [t1]

        t3 = pad_sequences(t2, maxlen=self.__docs_max_length, padding='post')

        out = self.model.predict(t3)
        result = out[0][0]
        return result
Example #15
    def getFeatureMatrix(self, df):
        if cfg.input_type == "text":
            from keras.preprocessing.text import one_hot
            from keras.preprocessing.sequence import pad_sequences
            textconverter = lambda x: x
            if sys.version_info[0] == 2:
                textconverter = lambda x: x.encode("utf-8")
            X = pad_sequences(
                df.apply(lambda row: one_hot(
                    textconverter(row[self.text_field]), self.vocabulary_size),
                         axis=1), self.word_limit)
            self.fields = [cfg.text_field]
            self.input_shape = (self.word_limit, )
        elif self.objective == "time_series":
            num_series = 1 + len(self.fields)
            data = [df[self.target].tolist()]
            num_rows = len(data[0])

            for field in self.fields:
                data.append(df[field].tolist())

            instances = []
            target_instances = []

            for index in range(num_rows - (self.window_size + 1)):
                windows = []
                for windex in range(self.window_size):
                    series = []
                    for sindex in range(num_series):
                        series.append(data[sindex][index + windex])
                    windows.append(series)
                target_window = []
                for sindex in range(num_series):
                    target_window.append(data[sindex][index +
                                                      self.window_size])
                instances.append(windows)
                target_instances.append(target_window)

            X = np.array(instances)
            self.seqtargets = np.array(target_instances)

            X = np.reshape(X, (X.shape[0], self.window_size, num_series))
            print(X.shape)
            self.input_shape = (self.window_size, num_series)
        else:
            X = df.as_matrix(self.fields)
            self.input_shape = (len(self.fields), )

        self.model_metadata["predictors"] = self.fields

        return X
Example #16
 def pre_process(self):
     print("Loading...")
     pos_x, neg_x = self.util.get_data()
     pos_y = [[1, 0] for i in pos_x]
     neg_y = [[0, 1] for j in neg_x]
     print("Spliting..")
     pos_x_train, pos_x_test, pos_y_train, pos_y_test = train_test_split(
         pos_x, pos_y, test_size=0.30, random_state=42)
     neg_x_train, neg_x_test, neg_y_train, neg_y_test = train_test_split(
         neg_x, neg_y, test_size=0.30, random_state=42)
     X_train = np.concatenate((pos_x_train, neg_x_train), axis=0)
     Y_train = np.concatenate((pos_y_train, neg_y_train), axis=0)
     X_test = np.concatenate((pos_x_test, neg_x_test), axis=0)
     Y_test = np.concatenate((pos_y_test, neg_y_test), axis=0)
     X_train_encode = [one_hot(d, 2000) for d in X_train]
     X_test_encode = [one_hot(d, 2000) for d in X_test]
     X_train = pad_sequences(X_train_encode, maxlen=200, padding='post')
     X_test = pad_sequences(X_test_encode, maxlen=200, padding='post')
     Y_train = np.array(Y_train)
     X_test = np.array(X_test)
     X_train, Y_train = shuffle(X_train, Y_train, random_state=10)
     X_test, Y_test = shuffle(X_test, Y_test, random_state=10)
     return X_train, Y_train, X_test, Y_test
Example #17
    def predict(self, text):

        ts = self.__cws.text_to_sequence(text)
        t0 = ' '.join(ts)
        print(text, t0)
        t1 = one_hot(t0, self.__vocab_size)
        t2 = [t1]
        print(t2)

        t3 = pad_sequences(t2, maxlen=self.__docs_max_length, padding='post')
        print(t3)

        out = self.model.predict(t3)
        print(out)
Example #18
def one_hot_encode(docs, vocab_size, max_length_factor):
    ''' First converts a text to a sequence of words (or tokens).
    Then with keras.preprocessing.text.one_hot(), which is a wrapper for the hashing_trick()
    function, returns an integer encoded version of the document.
    The use of a hash function means that there may be collisions and not all
    words will be assigned unique integer values.
    Finally pads sequences to the same length.'''
    wordsequence = [text_to_word_sequence(str(d)) for d in docs]
    encoded_docs = [one_hot(str(d), vocab_size) for d in wordsequence]
    padded_docs = pad_sequences(encoded_docs,
                                maxlen=(len(max(encoded_docs, key=len)) *
                                        max_length_factor),
                                padding='post')
    return padded_docs
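# A hedged usage sketch of one_hot_encode (the documents and parameter values below are invented):
sample_docs = ['hello world', 'keras makes text preprocessing easy']
print(one_hot_encode(sample_docs, vocab_size=50, max_length_factor=1))
# -> a (2, 5) array padded to the longest encoded document; the integer ids themselves vary with the hash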
Example #19
    def test_diff(self, data, labels):
        self.labels = labels
        D = data

        vocab_size = 300
        max_length = 200
        embedding_vector_length = 32

        emb_turn1 = sequence.pad_sequences(
            [one_hot(d, vocab_size) for d in D["turn1"]], maxlen=max_length)
        emb_turn2 = sequence.pad_sequences(
            [one_hot(d, vocab_size) for d in D["turn2"]], maxlen=max_length)
        emb_turn3 = sequence.pad_sequences(
            [one_hot(d, vocab_size) for d in D["turn3"]], maxlen=max_length)

        D = D.drop(['turn1', 'turn2', 'turn3'], axis=1).values

        results = self.model.evaluate([D, emb_turn1, emb_turn2, emb_turn3],
                                      self.labels,
                                      batch_size=32)
        print(results)
        print("Done testing")
        return results
Example #20
	def preprocess(self):
		global review_int
		#Stemming & Remove Stopwords
		data2 = data
		wn = nltk.wordnet.WordNetLemmatizer()
		lc = nltk.stem.SnowballStemmer('english')

		sw = set(stopwords.words('english'))
		hasStop = data2['text'].tolist()
		noStop = []
		for item in hasStop:
			filtered = []
			wt = word_tokenize(item)
			for wo in wt:
				if wo == "not":
					filtered.append(wo)
				elif not wo in sw:
					filtered.append(wo)
			filtered = [wn.lemmatize(w) for w in filtered]
			filtered = [lc.stem(w) for w in filtered]
			noStop.append(' '.join(filtered))
		temp = pd.Series(noStop)
		data2['text'] = temp.values

		#Embedding Word
		with open('vocab.json','r') as json_data:
			voc = json.load(json_data)
			
		from keras.preprocessing.text import one_hot
		import random
		dataList = data2['text'].tolist()
		vocab_int = voc
		data3 =[]
		vocab_size = 200
		for item in dataList:
			notDone = True
			temp1 = (one_hot(item,vocab_size))
			temp2 = item.split()
			for i in range(len(temp2)):
				if temp2[i] in vocab_int:
					continue
				else:
					notDone = True  # reset for each unseen word so later collisions are also re-drawn
					while notDone:
						if temp1[i] in vocab_int.values():
							temp1[i] = random.randrange(1, vocab_size)
						else:
							notDone = False
				vocab_int[temp2[i]] = temp1[i]
			data3.append(temp1)
		review_int = data3
Example #21
    def prediction(self,user_text):

        # Encode the text
        encoded_docs = [one_hot(user_text, conf_keras_first_go.vocab_size)]
        # pad documents to a max length
        padded_text = pad_sequences(encoded_docs, maxlen=conf_keras_first_go.max_length, padding='post')
        # Prediction based on model
        prediction = self.model.predict(padded_text)
        # Decode the prediction
        encoder = LabelBinarizer()
        encoder.fit(self.test_labels)
        result = encoder.inverse_transform(prediction)

        return result[0]
Example #22
    def test(self, D):
        self.labels = pd.get_dummies(D[output_emocontext])
        D = D.drop(output_emocontext, axis=1)

        vocab_size = 300
        max_length = 200
        embedding_vector_length = 32

        emb_turn1 = sequence.pad_sequences(
            [one_hot(d, vocab_size) for d in D["turn1"]], maxlen=max_length)
        emb_turn2 = sequence.pad_sequences(
            [one_hot(d, vocab_size) for d in D["turn2"]], maxlen=max_length)
        emb_turn3 = sequence.pad_sequences(
            [one_hot(d, vocab_size) for d in D["turn3"]], maxlen=max_length)

        D = D.drop(['turn1', 'turn2', 'turn3'], axis=1).values

        results = self.model.evaluate([D, emb_turn1, emb_turn2, emb_turn3],
                                      self.labels,
                                      batch_size=32)
        print(results)
        print("Done testing")
        return results
Example #23
    def load_data(self):
        # load train data
        x_train = [one_hot(q, self._vocab_size) for q in self._questions]
        x_train = pad_sequences(x_train,
                                maxlen=self._max_length,
                                padding='post')
        x_train = np.array(x_train)
        top_answers, answers_info = self.get_top_answers()
        print(answers_info)
        y_train = []
        for i in range(len(x_train)):
            rand = random.randint(0, 999)
            y_train.append(answers_info[top_answers[rand][0]])

        # Store the unique IDs of answers
        unique_answers = {key: 1 for key in y_train}
        self.targets_size = len(unique_answers)

        # load val dataset
        x_val = [one_hot(q, self._vocab_size) for q in self._questions_val]
        x_val = pad_sequences(x_val, maxlen=self._max_length, padding='post')
        x_val = np.array(x_val)
        y_val = []
        for i in range(len(x_val)):
            rand = random.randint(0, 999)
            y_val.append(answers_info[top_answers[rand][0]])

        # Encoding the output data
        y_train, y_val = np.array(y_train), np.array(y_val)
        encoder = LabelEncoder()
        encoder.fit(np.concatenate([y_train, y_val]))  # fit on all labels so both splits transform without unseen-label errors
        encoded_y_train, encoded_y_val = encoder.transform(
            y_train), encoder.transform(y_val)
        y_train, y_val = np_utils.to_categorical(
            encoded_y_train), np_utils.to_categorical(encoded_y_val)
        return x_train, y_train, x_val, y_val
Example #24
def load_data(vocab_size, num_classes):
    with open('/Users/nadeau/Documents/Metagenome_Classification/train_test_set/X_test150.pickle', 'rb') as f:
        x_test = pickle.load(f)
        # make sequences into sentences of words
        # https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
        x_test = [[letter for letter in word] for word in x_test]
        x_test = [" ".join(letters) for letters in x_test]
        f.close()
    with open('/Users/nadeau/Documents/Metagenome_Classification/train_test_set/X_train150.pickle', 'rb') as f:
        x_train = pickle.load(f)
        x_train = [[letter for letter in word] for word in x_train]
        x_train = [" ".join(letters) for letters in x_train]
        f.close()
    with open('/Users/nadeau/Documents/Metagenome_Classification/train_test_set/y_test150.pickle', 'rb') as f:
        y_test_str = pickle.load(f)
        y_test = enumerate_y_labels(y_test_str)
        f.close()
    with open('/Users/nadeau/Documents/Metagenome_Classification/train_test_set/y_train150.pickle', 'rb') as f:
        y_train_str = pickle.load(f)
        y_train = enumerate_y_labels(y_train_str)
        f.close()

    # integer encode the "words" in sequences
    x_test = [[one_hot(s, vocab_size)] for s in x_test]
    x_test = np.array(x_test)
    x_train = [[one_hot(s, vocab_size)] for s in x_train]
    x_train = np.array(x_train)
    print('x train shape: {}'.format(x_train.shape))

    # covert int y label vectors to one hot matrices
    y_test_1D = y_test
    y_train_1D = y_train
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)
    print('y train shape: {}'.format(y_train.shape))

    return y_train_1D, y_test_1D, x_train, y_train, x_test, y_test
Example #25
    def __split_and_one_hot_and_padded_docs (self, docs) :


        enc_docs = []

        for doc in docs :
            ts = self.__cws.text_to_sequence(doc)
            t0 = ' '.join(ts)
            x = one_hot(t0, self.__get_vocab_size())
            enc_docs.append(x)

        pad_docs = pad_sequences(enc_docs,\
                             maxlen=self.__get_docs_max_length(), padding='post')

        return pad_docs
Example #26
def generate(skip=False):
    with open('database/banki_ru_train.csv') as f:
        reader = csv.reader(f)
        first = True
        if skip:
            m = random.randint(1, 40000)
        for row in reader:
            if first:
                first = False
                continue
            x_train = text.one_hot(row[2], vocab_size)
            y_train = np.array(percent(row[2])).reshape(-1, 1)
            x_train = sequence.pad_sequences([x_train], maxlen=max_len)
            res = x_train, y_train
            yield res
Example #27
def preprocess_data(text):
    text = re.sub('[^a-zA-Z0-9]', ' ', text)                    # Keep only letters and digits (drop punctuation and other special characters)
    text = text.lower()                                         # Convert to lower case
    text = text.split()                                         # Tokenization
    text = [word for word in text if word not in sw]            # Removing stopwords
    text = [lemma.lemmatize(word=w, pos='v') for w in text]     # Lemmatization
    text = [k for k in text if len(k) > 2]                      # Drop words of length <= 2
    text = ' '.join(text)
    
  
    ohe = [one_hot(text, vocab_size)]  # encode the cleaned text as one integer sequence
    padded = pad_sequences(ohe, padding=padding_type, truncating=trunc_type)
    fd = (pd.DataFrame(padded)).transpose()
    return fd
Example #28
def preprocess_data(stored_contents):
    from keras.preprocessing.text import text_to_word_sequence, one_hot
    from keras.preprocessing.sequence import pad_sequences

    #see: https://machinelearningmastery.com/prepare-text-data-deep-learning-keras/
    
    # tokenize the document
    word_sequence=text_to_word_sequence(filter_data(stored_contents))
    words = set(word_sequence)  # set() deduplicates the tokens
    vocab_size = len(words)  # vocabulary size; used as the hashing range below
    tokenized_array = one_hot(stored_contents, round(vocab_size))  # integer (hash) encode the input data

    #data_to_predict = pad_sequences(tokenized_array, maxlen = 9000)
    #return data_to_predict
    return tokenized_array
Example #29
def word_vectorizing_keras(csv_file, max_features, max_len):
    """
    Vectorize texts using Keras's built-in utilities.
    :param csv_file: the dataset
    :param max_features: size of the vocabulary (alphabet)
    :param max_len: maximum length of a text vector
    :return:
    """
    labels = np.asarray(list(csv_file['1'])).astype('float32')
    texts = list(csv_file['0'])

    X = [one_hot(text, max_features) for text in texts]
    X = pad_sequences(X, maxlen=max_len)

    return X, labels
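# A hedged usage sketch (the two-row DataFrame below is invented; in practice csv_file is the
# caller's dataset with texts in column '0' and labels in column '1'):
import pandas as pd
df = pd.DataFrame({'0': ['good movie', 'bad movie'], '1': [1, 0]})
X, labels = word_vectorizing_keras(df, max_features=1000, max_len=10)
print(X.shape, labels)  # (2, 10) [1. 0.]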
Example #30
def embedded(data, v_size):
    from keras.preprocessing.text import one_hot
    dataList = data['text'].tolist()
    vocab_int = {}
    encoded = []
    vocab_size = v_size
    for item in dataList:
        temp1 = (one_hot(item, vocab_size))
        temp2 = item.split()
        for i in range(len(temp2)):
            if temp2[i] in vocab_int:
                continue
            else:
                vocab_int[temp2[i]] = temp1[i]
        encoded.append(temp1)
    return encoded, vocab_int
Example #31
def generate(skip=False):
    with open('database/banki_ru_train.csv') as f:
        reader = csv.reader(f)
        first = True
        if skip:
            m = random.randint(1, 40000)
        for row in reader:
            if first:
                first = False
                continue
            x_train = text.one_hot(row[2], vocab_size)
            y_train = int(row[4])
            x_train = sequence.pad_sequences([x_train], maxlen=max_len)
            y_train = np_utils.to_categorical([y_train], 6)
            res = x_train, y_train
            yield res
Example #32
 def format_testcase(self, string, type, max_len):
     #titles
     single = list()
     if type == 0:
         single = nltk.word_tokenize(self.clean_sentence(string))
     #keywords
     else:
         single = string
     mapping = list()
     for one_keyword in single:
         mapping.append(one_hot(one_keyword, 7000)[0])
     while len(mapping) < max_len:
         mapping.append(0)
     data = list()
     data.append(mapping)
     print(data)
     return mapping
Example #33
def input_data_gen():
    train_file = "total-data.txt"
    train_words = []
    train_tags = []

    X = []
    Y = []

    test_words = []
    test_tags = []
    with open(train_file, 'r') as f1:
        for line in f1:

            # line = line.decode('utf-8')
            tks = line.split('-0-')
            # print tks
            word = tks[0]
            x = one_hot(n=10000, text=word)
            # try:
            # print tks
            tag = tks[1]
            if tag == "+":
                tag = [1, 0, 0]
            elif tag == "-":
                tag = [0, 1, 0]
            else:
                tag = [0, 0, 1]
            train_words.append(x)
            train_tags.append(tag)
            # except Exception as e:
            #     print e.message
    # print train_words[0]
    index = [i for i in range(len(train_words))]
    train_words = pad_sentences(train_words)
    train_tags = np.concatenate([train_tags], 0)
    random.shuffle(index)
    for i, j in enumerate(train_words):
        if i < 0.1 * len(train_words):
            test_words.append(train_words[index[i]])
            test_tags.append(train_tags[index[i]])
        else:
            X.append(train_words[index[i]])
            Y.append(train_tags[index[i]])

    return X, Y, test_words, test_tags
Example #34
def genData():
    #X, Y arrays for all data
    X = []
    Y = []

    #generate 20% as test set
    train_count=0
    validate_count = 0
    test_count = 0
    line_count = 0

    with open('./hs.csv', encoding='latin-1') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter='\t')
        for row in csv_reader:
            X.append(row[0])
            Y.append(row[1])
            line_count += 1
        print("actual vocab size",len(set(X)))  
        output_size = len(set(Y))
        #encode the vocab
        encoded_X = [one_hot(d, vocab_size) for d in X]
        padded_X = pad_sequences(encoded_X, maxlen=max_len, padding="post")
        encoder = LabelEncoder()
        encoder.fit(Y)
        encoded_Y = encoder.transform(Y)
        Y_one_hot = np_utils.to_categorical(encoded_Y)

        #Training, Test
        X_train = []
        Y_train = []
        X_test = []
        Y_test = []

        # generate sets for train, test and validate
        np.random.seed(1)
        for i in range(line_count):
            if(round(np.random.rand()*100) < 81):
                X_train.append(padded_X[i])
                Y_train.append(Y_one_hot[i])
                train_count = train_count+1
            else:
                X_test.append(padded_X[i])
                Y_test.append(Y_one_hot[i])
                test_count = test_count+1
    return X_train, Y_train, X_test, Y_test, output_size
Example #35
 def get_iemocap_data(self):
     X, Y = self.iemocap_util.read_iemocap_data()
     X, Y = shuffle(X, Y, random_state=42)
     Y = [
         self.encode_class(y, ["Positive", "Neutral", "Negative"])
         for y in Y
     ]
     X = [one_hot(d, 2000) for d in X]
     X = pad_sequences(X, maxlen=50, padding='post')
     x_train, x_test, y_train, y_test = train_test_split(X,
                                                         Y,
                                                         test_size=0.30,
                                                         random_state=42)
     X_train = np.array(x_train)
     Y_train = np.array(y_train)
     X_test = np.array(x_test)
     Y_test = np.array(y_test)
     return X_train, Y_train, X_test, Y_test
Example #36
    #
    text = text.lower()
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    doc = tokenizer.tokenize(text)
    # Stopword removal
    # DONE: Add your code here. Store results to a list with name 'doc'
    #
    doc = [word for word in doc if word not in stopwords]
    # Stemming
    # DONE: Add your code here. Store results to a list with name 'doc'
    #
    stemmer = PorterStemmer()
    doc = [stemmer.stem(word) for word in doc]
    # Convert list of words to one string
    doc = ' '.join(w for w in doc).encode('ascii')
    doc = one_hot(doc, vocab_size, split=' ')
    data[doc_id] = doc   # list data contains the preprocessed document


data_train, data_test, labels_train, labels_test = cross_validation.train_test_split(data, labels, test_size=0.4, random_state=1033)


# Model learning and prediction
# TODO: test different learning algorithms

y_train = np.array(labels_train)
y_test = np.array(labels_test)
y_train = (y_train == 1).astype('float32')
y_test = (y_test == 1).astype('float32')

print("Pad sequences (samples x time)")
Example #37
    line = line.strip().decode("ascii", "ignore").encode("utf-8")
    if len(line) == 0:
        continue
    lines.append(line)
fin.close()

sents = nltk.sent_tokenize(" ".join(lines))

tokenizer = Tokenizer(5000)  # use top 5000 words only
tokens = tokenizer.fit_on_texts(sents)
vocab_size = len(tokenizer.word_counts) + 1

xs = []
ys = []
for sent in sents:
    embedding = one_hot(sent, vocab_size)
    triples = list(nltk.trigrams(embedding))
    w_lefts = [x[0] for x in triples]
    w_centers = [x[1] for x in triples]
    w_rights = [x[2] for x in triples]
    xs.extend(w_centers)
    ys.extend(w_lefts)
    xs.extend(w_centers)
    ys.extend(w_rights)

ohe = OneHotEncoder(n_values=vocab_size)
X = ohe.fit_transform(np.array(xs).reshape(-1, 1)).todense()
Y = ohe.fit_transform(np.array(ys).reshape(-1, 1)).todense()
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3,
                                                random_state=42)
print(Xtrain.shape, Xtest.shape, Ytrain.shape, Ytest.shape)
Example #38
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

if (mode =='train'):
    #save all checkpoints        
    checkpointer = ModelCheckpoint(filepath=fdir+"/weights.hdf5", verbose=1, save_best_only=False)
    history = LossHistory()
    sample = Sample()
    
    print("Training...")
    
    for e in range(nb_epoch):
        print("epoch %d" % e)
        #for X_batch,Y_batch in zip(batches_X,batches_Y):
        for i, batch in enumerate(batches_X):
            X_batch= batches_X[i]
            Y_batch = one_hot(batches_Y[i],max_features_Y)
            model.fit(X_batch, Y_batch, batch_size=batch_size, nb_epoch=1, validation_split=0.1, callbacks=[checkpointer,history,sample])
            f = file(fdir+'/losses.pkl', 'wb')
            pkl.dump(history.losses, f, protocol=pkl.HIGHEST_PROTOCOL)
            f.close()

else:
    preds = model.predict_classes(X_test,batch_size=1, verbose=1)

    print(preds[0])
    get_activations = theano.function([model.layers[3].input], model.layers[4].output(train=False), allow_input_downcast=True)
    activations = get_activations(X_test)
    
    
    
    print (activations.shape)
Example #39
# a = ["a d d", "d a"]
# a = ["我是一个爱生活的人", "他也是一个爱生活的人"]
# one_h = one_hot(filters=base_filter(), n=30, text=a)
# # o.fit_on_texts(a)
# # b = one_h(a)
# print one_hot(filters=base_filter(), n=30, text=a)
# print one_hot(filters=base_filter(), n=30, text=a)

# a=['hello world', 'foo bar']
# tokenizer = Tokenizer()
# train_tokens = tokenizer.fit_transform(a)
# print train_tokens
# comma_tokenizer = lambda x: jieba.cut(x, cut_all=True)
# from sklearn.feature_extraction.text import HashingVectorizer
# v = HashingVectorizer(tokenizer=comma_tokenizer, n_features=30000, non_negative=True)
# train_data = v.fit_transform(a)
# print train_data

# import jieba
a = "我是一个男孩"
c = jieba.cut(a, cut_all=False)
w = ""
# print(", ".join(c))
for i in c:
    w += i + " "
    # print i
w = w[:len(w)-1].encode('utf8')
# w = "我 是 一个男孩"
print(one_hot(filters=base_filter(), n=30000, text=w))
# print w
# # print c.next()
Example #40
def test_one_hot():
    text = 'The cat sat on the mat.'
    encoded = one_hot(text, 5)
    assert len(encoded) == 6
    assert np.max(encoded) <= 4
    assert np.min(encoded) >= 0
Example #41
def one_hot(word_model, n):
    return text.one_hot(
        word_model, n, filters=text_filter(), lower=False, split=" ")
Example #42
#!/usr/bin/python3
# coding: utf-8
# https://github.com/EliasCai/sentiment/blob/master/sentiment_words.py#L78
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import hashing_trick
##################################################################
## 1. text_to_word_sequence, one_hot, hashing_trick
texts = ['some thing to eat', 'some thing to drink']
print(text_to_word_sequence(texts[0]))  # ['some', 'thing', 'to', 'eat']; a simple whitespace split
print(one_hot(texts[0], 10))  # [5, 7, 5, 7]; (10 means the integer ids are kept within 10)
print(one_hot(texts[1], 10))  # [5, 7, 5, 5]; hash is used internally, so once (text, n) is fixed the same str always gets the same id
# This is a wrapper to the `hashing_trick` function using `hash` as the hashing function, unicity of word to index mapping non-guaranteed.
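# A small follow-on illustration (not part of the original notes): hashing_trick() with a stable
# hash function such as 'md5' produces ids that are reproducible across runs, unlike the built-in hash().
print(hashing_trick(texts[0], 10, hash_function='md5'))  # a list of 4 ids in [1, 9], deterministic for md5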
##################################################################
## 2. Tokenizer: indices follow word frequency / order of appearance
# keras.preprocessing.text.Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n', lower=True, split=" ", char_level=False)
# Tokenizer is a class for vectorizing texts, i.e. turning a text into a sequence (the list of word indices in the dictionary, counted from 1).
# num_words: None or int, the maximum number of words to keep. If set to an int, the tokenizer is restricted to the num_words most frequent words in the dataset.
# char_level: if True, every character is treated as a token
texts = ['some thing to eat', 'some thing to drink']
tmp_tokenizer = Tokenizer(num_words=None)  # num_words: None or int, maximum number of words to keep; less frequent words are dropped
tmp_tokenizer.fit_on_texts(texts)
# tmp_tokenizer.fit_on_texts(texts[0]); tmp_tokenizer.fit_on_texts(texts[1])  # don't do this: fitting on a bare string counts individual characters
# Attributes
print(tmp_tokenizer.word_counts)  # OrderedDict([('some', 2), ('thing', 2), ('to', 2), ('eat', 1), ('drink', 1)]); how often each word was seen during fitting
print(tmp_tokenizer.word_docs)  # {'thing': 2, 'eat': 1, 'to': 2, 'some': 2, 'drink': 1}; in how many documents/texts each word appeared
print(tmp_tokenizer.word_index)  # {'some': 1, 'thing': 2, 'to': 3, 'eat': 4, 'drink': 5}; rank / index of each word
print(len(tmp_tokenizer.word_index))  # 5; vocabulary size
print(tmp_tokenizer.index_docs)  # {2: 2, 4: 1, 3: 2, 1: 2, 5: 1}; word_docs keyed by index (word_index and word_docs combined)
print(tmp_tokenizer.document_count)  # 2; number of documents used for fitting
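# A short extra sketch (not in the original notes): the fitted Tokenizer can also map texts to index
# sequences or to fixed-width matrices.
print(tmp_tokenizer.texts_to_sequences(texts))  # [[1, 2, 3, 4], [1, 2, 3, 5]]; indices taken from word_index
print(tmp_tokenizer.texts_to_matrix(texts, mode='count').shape)  # (2, 6); rows = texts, columns = len(word_index) + 1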
Example #43
def get_input_data(train_file="rm_result.txt", test_file=None, split=0.1, label_func=get_label_rm):

    X = []
    Y = []
    train_words = []
    train_tags = []
    test_len = 0
    if test_file is not None:
        with open(test_file, 'r') as f1:

            for line in f1:
                line = line.replace("\n", "")
                tks = line.split('-0-')
                word = tks[0]
                x = one_hot(n=10000, text=word)

                if len(x) > 500:
                    continue
                try:
                    tag = label_func(tks[1])

                    train_words.append(x)
                    train_tags.append(tag)
                except:
                    pass
        test_len = len(train_words)

    with open(train_file, 'r') as f1:

        for line in f1:
            line = line.replace("\n", "")
            tks = line.split('-0-')
            word = tks[0]

            x = one_hot(n=10000, text=word)

            if len(x) > 500:
                continue
            try:
                tag = label_func(tks[1])

                train_words.append(x)
                train_tags.append(tag)
            except:
                pass
    # print train_words[0]
    index = [i for i in range(len(train_words))]
    print "padding"
    train_words = pad_sentences(train_words)
    train_tags = np.concatenate([train_tags], 0)
    print "end padding"

    if test_file is None:
        random.shuffle(index)
        test_len = int(split * len(train_words))

    test_words = []
    test_tags = []
    for i, j in enumerate(train_words):
        if i < test_len:
            test_words.append(train_words[index[i]])
            test_tags.append(train_tags[index[i]])
        else:
            X.append(train_words[index[i]])
            Y.append(train_tags[index[i]])



    return X, Y, test_words, test_tags
Example #44
__author__ = 'bohaohan'
# from keras.datasets import imdb
# from nltk.stem import WordNetLemmatizer
# (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=20,
#                                                       test_split=0.2)
# for i in X_test:
#     print i

# print WordNetLemmatizer().lemmatize("lives")
# import nltk
# nltk.download()
from keras.datasets import imdb, reuters
from get_data import input_data
max_features = 20000
maxlen = 100  # cut texts after this number of words (among top max_features most common words)
batch_size = 32

print('Loading data...')
# (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features, test_split=0.2)
# (X_train, y_train), (X_test, y_test) = reuters.load_data(nb_words=1000, test_split=0.2)
# X_train, y_train, X_test, y_test = input_data()
# print(len(X_train), 'train sey_trainquences')
# print(len(X_test), 'test sequences')
# print(X_train[0], 'train sequences')
# tokenizer = Tokenizer(nb_words=1000)
# X_train = sequence.pad_sequences(X_train, maxlen=100)
# print(X_train[0], 'train sequences')
from keras.preprocessing.text import one_hot
x = "你 我 他"
print(one_hot(n=10000, text=x))
Example #45
#!/usr/bin/python3
# coding: utf-8
# Reference: [use-word-embedding-layers-deep-learning-keras](https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/)
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers.embeddings import Embedding
##################################################################
## Part 1: a brief introduction to Embedding
# We will define a small problem where we have 10 text documents, each with a comment about a piece of work a student submitted.
# Each text document is classified as positive "1" or negative "0". This is a simple sentiment analysis problem.
docs = ['Well done!', 'Good work', 'Great effort', 'nice work', 'Excellent!', 'Weak', 'Poor effort!', 'not good', 'poor work', 'Could have done better.']
labels = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]  # define class labels
vocab_size = 50  # integer encode the documents; vocabulary size, kept large (50) to reduce hash collisions...
encoded_docs = [one_hot(d, vocab_size) for d in docs]  # one_hot assigns each word an integer id; ids may repeat (collide), so vocab_size should be generous
print(encoded_docs)  # [[18, 44], [37, 9], [34, 24], [39, 9], [44], [39], [36, 24], [9, 37], [36, 9], [29, 39, 44, 49]]
# one_hot() is just one approach, similar in spirit to tf-idf or a bag-of-words model
# Keras prefers inputs to be vectorized and all inputs to have the same length
max_length = 4  # pad documents to a max length of 4 words
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)  # [[18 44  0  0] [37  9  0  0] [34 24  0  0] [39  9  0  0] [44  0  0  0] [39  0  0  0] [36 24  0  0] [ 9 37  0  0] [36  9  0  0] [29 39 44 49]]
# At this point each document is represented by a 4-dimensional vector; we are now ready to define our Embedding layer as part of our neural network model.
# The Embedding has a vocabulary of 50 and an input length of 4. We will choose a small embedding space of 8 dimensions.
##################################################################
## Embedding(input_dim, output_dim, input_length) defines the model; (vocabulary size, word-vector size, number of words per sentence)
# Embedding requires that the input data be integer encoded, so that each word is represented by a unique integer.
#     This data preparation step can be performed using the Tokenizer API also provided with Keras.
# input_dim: This is the size of the vocabulary in the text data. For example, if your data is integer encoded to values between 0-10,
#     then the size of the vocabulary would be 11 words.
# output_dim: This is the size of the vector space in which words will be embedded. It defines the size of the output vectors
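# A minimal sketch tying the pieces above together, following the setup described in these notes
# (vocabulary of 50, input length of 4, an 8-dimensional embedding space); the Flatten + Dense
# classifier and the 'adam' optimizer are illustrative assumptions, not prescribed by the notes:
import numpy as np
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))  # per document: a 4 x 8 matrix of word vectors
model.add(Flatten())                                          # 4 * 8 = 32 values feed the classifier
model.add(Dense(1, activation='sigmoid'))                     # positive / negative sentiment
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()                                               # the Embedding layer holds 50 * 8 = 400 trainable weights
model.fit(padded_docs, np.array(labels), epochs=50, verbose=0)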