def text2hash(df):
    # Relies on a module-level `columnsHead` list for the dataframe column names.
    df.columns = columnsHead
    df['protocol_type'] = df['protocol_type'].apply(
        lambda x: hashing_trick(x, 200, hash_function='md5',
                                filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~ '))
    df['service'] = df['service'].apply(
        lambda x: hashing_trick(x, 200, hash_function='md5',
                                filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~ '))
    df['flag'] = df['flag'].apply(
        lambda x: hashing_trick(x, 200, hash_function='md5',
                                filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~ '))
def compile_tag_data(self, cat):
    """Compile data for the given category (indexed in the following list)"""
    n = self.n
    isize = self.isize
    if cat == 2:
        return [], []  # no irish data
    text = ""
    with open(self.categories[cat] + "_data") as tf:
        for line in tf:
            text += line.strip()
    # setup for character level hashing
    raw_data = hashing_trick(text, n, hash_function='md5', lower=False, split=' ')
    # with this data, will cut it into groups of isize (this is just a list)
    # and also make a label for each of these groups with the category index
    labels = []
    data = []
    for i in range(0, len(raw_data), isize):
        if i + isize < len(raw_data):  # in bounds
            data.append(raw_data[i:i + isize])
        else:
            zeros = [0] * (i + isize - len(raw_data))  # pad the rest of the values
            data.append(raw_data[i:len(raw_data)] + zeros)
        labels.append(cat)  # an index for each category is the training target
    return data, labels  # return two arrays
def compile_data(self, cat):
    """Compile data for the given category (indexed in the following list)"""
    n = self.n
    isize = self.isize
    text = ""
    for filename in os.listdir("./langs/" + self.categories[cat]):
        with open("./langs/" + self.categories[cat] + "/" + filename) as tf:
            for line in tf:
                for c in line:
                    if c.isalpha():
                        text += c + " "  # setup for character level hashing
                    elif c == " ":
                        text += "_ "
    raw_data = hashing_trick(text, n, hash_function='md5', lower=False, split=' ')
    # with this data, will cut it into groups of isize (this is just a list)
    # and also make a label for each of these groups with the category index
    labels = []
    data = []
    for i in range(0, len(raw_data), isize):
        if i + isize < len(raw_data):  # in bounds
            data.append(raw_data[i:i + isize])
        else:
            zeros = [0] * (i + isize - len(raw_data))  # pad the rest of the values
            data.append(raw_data[i:len(raw_data)] + zeros)
        labels.append(cat)  # an index for each category is the training target
    return data, labels  # return two arrays
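A quick sanity check of the chunk-and-pad loop shared by the two compile functions above. This is a minimal sketch with made-up token ids and an arbitrary isize of 4, not data from the original project; plain lists, no Keras required.

# Chunk a token-id list into fixed-size groups, zero-padding the last group,
# mirroring the loop in compile_data / compile_tag_data above.
raw_data = [7, 3, 9, 1, 4, 8, 2]
isize = 4
data = []
for i in range(0, len(raw_data), isize):
    if i + isize < len(raw_data):
        data.append(raw_data[i:i + isize])
    else:
        zeros = [0] * (i + isize - len(raw_data))
        data.append(raw_data[i:len(raw_data)] + zeros)
print(data)  # [[7, 3, 9, 1], [4, 8, 2, 0]]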
from keras.preprocessing.text import hashing_trick


def encode_string(text, num_words=2000):
    return hashing_trick(text,
                         num_words,
                         filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                         lower=True,
                         split=' ',
                         hash_function='md5')
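A small usage sketch for encode_string, assuming Keras is installed and the function above is in scope. hashing_trick maps each word to an integer in [1, num_words - 1] by reducing its md5 digest modulo num_words - 1 and adding 1, so no vocabulary is fitted and distinct words can collide. The sample sentence is illustrative, not from the original source.

ids = encode_string("The quick brown fox jumped over the lazy dog.")
print(ids)       # nine integers, one per word, each in [1, 1999]
print(len(ids))  # 9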
def encode(text_list):
    encoded_list = []
    for i in range(len(text_list)):
        text = text_list[i]
        encoded = hashing_trick(text, vocab_size, hash_function='md5')
        encoded_list.append(encoded)
    return encoded_list
def read_program():
    vocab = list()
    vocab_one_hot = list()
    vocab_hash = list()
    filters = '\n'  # renamed from `filter` to avoid shadowing the builtin
    for file in glob.glob('/root/eclipse-workspace/Ranking/src/C/*.c'):
        print(file)
        with open(file, "r") as f:  # context manager so the file is closed
            text = f.read()
        print(text)
        vocab.append(text_to_word_sequence(text, filters=filters, split=' '))
        print(vocab[-1])
        words = set(vocab[-1])
        vocab_size = len(words)
        print("vocab_size = ", vocab_size)
        # One-hot encoding
        print('One-hot encoding:')
        vocab_one_hot.append(one_hot(text, round(vocab_size * 1.3), filters=filters))
        print(vocab_one_hot[-1])
        # Hash encoding
        print('Hash encoding:')
        vocab_hash.append(hashing_trick(text, round(vocab_size * 1.3),
                                        hash_function='md5', filters=filters))
        print(vocab_hash[-1])
        print('--------------------------')
def text2hash(df, cols, toHash=['service', 'flag', 'protocol_type']):
    df.columns = cols
    for el in toHash:
        df[el] = df[el].apply(
            lambda x: hashing_trick(x, 200, hash_function='md5',
                                    filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~ '))
def test_hashing_trick_md5(self):
    text = 'The cat sat on the mat.'
    encoded = preprocessing_text.hashing_trick(text, 5, hash_function='md5')
    self.assertEqual(len(encoded), 6)
    self.assertLessEqual(np.max(encoded), 4)
    self.assertGreaterEqual(np.min(encoded), 1)
def build_embedding(graph):
    """
    Given a graph, creates word embeddings for the names of all nodes.

    First the nodes in the graph are ordered by their labels. Next, for each node
    in the ordered list of nodes, an integer encoding is computed (via the hashing
    trick) for its name and, when present, its cmdline. These lists are padded.
    The padded lists of integers for all nodes are combined to form a single list
    of integers, which is returned.

    :param graph: A Graph object describing the input data
    :return: A 1D numpy array of shape (2*MAX_NODES*EMBEDDING_LENGTH,)
    """
    nodes_list = list(graph.nodes.values())
    sorted_nodes = sorted(nodes_list,
                          key=lambda x: hash_labels_only(
                              labels=x.labels, node_label_hash=NODE_TYPE_HASH))
    embedding = []
    for i in range(MAX_NODES):
        if i < len(sorted_nodes) and "name" in sorted_nodes[i].properties and \
                sorted_nodes[i].properties["name"] != []:
            # The 'name' property on each node is a list, the current solution is to
            # take the first element.
            name = sorted_nodes[i].properties["name"][0]
            encoded_name = hashing_trick(name, VOCAB_SIZE, hash_simhash)
            if "cmdline" in sorted_nodes[i].properties:
                cmdline = sorted_nodes[i].properties["cmdline"]
                encoded_cmdline = hashing_trick(cmdline, VOCAB_SIZE, hash_simhash)
            else:
                encoded_cmdline = []
            embedding += [encoded_name, encoded_cmdline]
        else:
            embedding += [[], []]
    padded_embedding = pad_sequences(embedding, maxlen=EMBEDDING_LENGTH)
    combined_embedding = [
        num for sublist in padded_embedding for num in sublist
    ]
    return np.asarray(combined_embedding, dtype=np.int16)
def gethash(self, big_input, max_words=10, hash_mole=20000):
    hashed = np.zeros((len(big_input), max_words), dtype='float32')
    for i in range(len(big_input)):
        j = 0
        for h in text.hashing_trick(big_input[i], hash_mole, hash_function='md5'):
            if j == max_words:
                print('haiku too long? ', big_input[i])
                break  # stop before writing past the fixed-width row
            hashed[i][j] = h
            j += 1
    return hashed
def text2hash(df):
    # hash all the columns which contain string values in a dataframe
    for el in df.columns:
        if not isNumeric(df[el]):
            # print(el, df[el][0])
            df[el] = df[el].apply(lambda x: hashing_trick(
                str(x), 200, hash_function='md5',
                filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~ '))
def gethash(self, big_input, max_words=10, hash_mole=20000):
    hashed = np.ones((len(big_input), max_words + 2), dtype='float32')
    for i in range(len(big_input)):
        hashed[i][0] = 0
        j = 1
        for h in text.hashing_trick(big_input[i], hash_mole, hash_function='md5'):
            if j == max_words:
                print('input too long? ', big_input[i])
                break
            hashed[i][j] = h
            j += 1
    return hashed
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import hashing_trick


def IntEncodeWords(wordlist):
    vocab_size = 200000
    max_length = 10
    # integer encoding the syllables
    encoded_words = [
        hashing_trick(d, vocab_size, hash_function='md5') for d in wordlist
    ]
    # padding to a max length of 10
    padded_words = pad_sequences(encoded_words, maxlen=max_length, padding='post')
    return padded_words
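A usage sketch for IntEncodeWords, assuming the imports above. Each entry of wordlist is hashed word-by-word (split on spaces) and post-padded with zeros to length 10, so the result is a (len(wordlist), 10) integer array. The sample inputs are hypothetical, not from the original source.

syllables = ["con sti tu tion", "cat", "un be liev ab ly"]
padded = IntEncodeWords(syllables)
print(padded.shape)  # (3, 10): one row per entry, post-padded with zeros
print(padded[1])     # single-word entry -> one hashed id followed by nine 0s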
def prepare_data(data, classes, vocab):
    inputs = []
    labels = []
    for line, char in zip(*data):
        # inputs.append(one_hot(line, len(vocab)))
        inputs.append(hashing_trick(line, len(vocab), hash_function='md5'))
        one_hot_out = np.zeros(len(classes))
        one_hot_out[classes.index(char)] = 1
        labels.append(one_hot_out)
    return np.asarray(pad_sequences(inputs, padding='post')), np.asarray(labels)
def text_to_ids(text):
    if isinstance(text, str):
        text = [text]
    encoded_texts = [
        hashing_trick(t, settings.VOCABULARY_SIZE, hash_function='md5')
        for t in text
    ]
    padded_texts = pad_sequences(encoded_texts,
                                 maxlen=settings.PHRASE_MAX_LENGTH,
                                 padding='post')
    return padded_texts
def generator_inout(d1, d2, cat1, cat2, batch_size, vocab):
    c1 = True
    c2 = True
    file1 = open(d1, "r")
    file2 = open(d2, "r")
    while True:
        batch_input = []
        batch_output = []
        for i in range(0, batch_size):
            if c1 and c2:
                if random.randint(0, 1) > 0:
                    text = file1.readline()
                    cat = cat1
                    if text == "":
                        c1 = False
                else:
                    text = file2.readline()
                    cat = cat2
                    if text == "":
                        c2 = False
            else:
                if c1:
                    text = file1.readline()
                    cat = cat1
                    if text == "":
                        c1 = False
                if c2:
                    text = file2.readline()
                    cat = cat2
                    if text == "":
                        c2 = False
            if not c1 and not c2:
                # both files exhausted: rewind and start over
                file1.close()
                file2.close()
                file1 = open(d1, "r")
                file2 = open(d2, "r")
                c1 = True
                c2 = True
            input = hashing_trick(text, round(vocab * 1.3), hash_function='md5')
            output = cat
            batch_input += [input]
            batch_output += [output]
        batch_x = pad_sequences(np.array(batch_input), maxlen=14)
        batch_y = to_categorical(np.array(batch_output), num_classes=2)
        yield batch_x, batch_y
def text2HashIntegers(word,
                      max_nb_words,
                      filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                      lower=False,
                      split=" "):
    if not uStr.isValidWord(word):
        return []
    nbs = text.hashing_trick(
        text=word,
        n=max_nb_words,
        hash_function='md5',  # 'md5' is a stable hashing function, consistent across runs
        filters=filters,
        lower=lower,
        split=split)
    return nbs
def neroset(request):
    com = list()
    json_file = open("imdb_model.json", "r")
    loaded_model_json = json_file.read()
    json_file.close()
    loaded_model = model_from_json(loaded_model_json)
    loaded_model.load_weights("imdb_model.h5")
    loaded_model.compile(loss="binary_crossentropy",
                         optimizer="adam",
                         metrics=["accuracy"])
    for i in range(0, 2):
        text = Comment.text[i]
        sequence = hashing_trick(text,
                                 n=64,
                                 hash_function=None,
                                 filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
                                 lower=True,
                                 split=' ')
        scores = skipgrams(sequence,
                           vocabulary_size=5000,
                           window_size=32,
                           negative_samples=1.0,
                           shuffle=False,
                           categorical=False,
                           sampling_table=None,
                           seed=None)
        b = scores[1:2]
        kol = 0
        num2 = 0
        for g in b:
            num1 = g
            for c in num1:
                num2 += c
            kol += 1
        k = num2 / kol
        if k >= 0.5:
            com.append('Good')   # originally 'Хорошая' (positive review)
        else:
            com.append('Bad')    # originally 'Плохая' (negative review)
    return render(request, 'blog/home.html', com)
def preprocessing_tokenize(X, num_words=160):
    # _tokenizer = Tokenizer(num_words=NUM_WORDS,
    #                        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    #                        lower=True,
    #                        split=" ",
    #                        char_level=False)
    #
    # _tokenizer.fit_on_texts(X)
    # print("There where found {} unique tokens. ".format(len(_tokenizer.word_index)))
    # _X = _tokenizer.texts_to_matrix(X)
    _X = []
    for x in X:
        _X.append(hashing_trick(x, NUM_WORDS, hash_function='md5', split=' '))
    _X = pad_sequences(_X, maxlen=MAX_SEQUENCE_LENGTH)
    return _X
def make_test_data(self, line):
    """Make a data set to predict the languages in a line of Finnegans Wake."""
    line = line.strip()
    text = ""
    for c in line:
        if c == " ":
            text += "_ "  # encode spaces just like the training data
        else:
            text += c + " "
    raw_data = hashing_trick(text, self.n, hash_function='md5', lower=False, split=' ')
    data = []
    if len(raw_data) < self.isize:  # add padding
        data.append(raw_data + [0] * (self.isize - len(raw_data)))  # add 0 up to isize
    for i in range(len(raw_data) - self.isize):  # get every isize-length window
        data.append(raw_data[i:i + self.isize])  # get encodings for this window
    return data
def prepareData(vocabulary_size, list_of_intent_dicts):
    x_train, y_train = [], []
    intents = []
    for intent_dict in list_of_intent_dicts:
        if intent_dict['intent'] not in intents:
            intents = intents + [intent_dict['intent']]
        for sentence in intent_dict['sentences']:
            sentence_vec = text.hashing_trick(
                sentence,
                vocabulary_size,
                hash_function='md5',
                filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                lower=True,
                split=' ')
            x_train.append(sentence_vec)
            y_train.append(
                [1 if i == intent_dict['intent'] else 0 for i in intents])
    return intents, x_train, y_train
def preprocessing_data_hashing_trick(self, vocab_size=5000, max_len=100):
    print('Preprocessing data...')
    y = self.df['label'].astype('U')
    data_f = self.df['text'].astype('U')
    data_f = self.clear_text(data_f)
    data = []
    for xt in data_f:
        xt = ' '.join(text_to_word_sequence(xt))
        data.append(
            hashing_trick(xt.lower(), vocab_size, hash_function='md5'))
    x_train, x_test, y_train, y_test = train_test_split(data,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=255)
    x_train = sequence.pad_sequences(x_train, maxlen=max_len)
    x_test = sequence.pad_sequences(x_test, maxlen=max_len)
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)
    return x_train, x_test, y_train, y_test
# (continuation of the preceding one_hot example: `text` and `words` are defined above)
vocab_size = len(words)
print(vocab_size)
# integer encode the document
result = one_hot(text, round(vocab_size * 1.3))
print(result)

from keras.preprocessing.text import hashing_trick
from keras.preprocessing.text import text_to_word_sequence

# define the document
text = 'The quick brown fox jumped over the lazy dog.'
# estimate the size of the vocabulary
words = set(text_to_word_sequence(text))
vocab_size = len(words)
print(vocab_size)
# integer encode the document
result = hashing_trick(text, round(vocab_size * 1.3), hash_function='md5')
print(result)

from keras.preprocessing.text import Tokenizer

# define 5 documents
docs = ['Well done!', 'Good work', 'Great effort', 'nice work', 'Excellent!']
# create the tokenizer
t = Tokenizer()
# fit the tokenizer on the documents
t.fit_on_texts(docs)
# summarize what was learned
print(t.word_counts)
print(t.document_count)
print(t.word_index)
print(t.word_docs)
model = Sequential()
model.add(Dense(1000, activation='relu', input_shape=(14, )))
model.add(Dropout(0.1))
model.add(Dense(500, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(200, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(2, activation='softmax'))

sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])
model.load_weights("thought_model.h5")

inp = input('enter a sentence: ')
words = inp.split()
perm = permutations(words)
for i in perm:
    preinp = hashing_trick(str(i), round(vocab_size * 1.3), hash_function='md5')
    readyinp = pad_sequences(np.array([preinp]), maxlen=14)
    output = model.predict_classes(readyinp)
    # print(output)
    if output[0] == 0:
        print(i)
def extract(infile, outfile, dict_keys, stem=False, lemma=False, element="narrative", arg_rebalance=""):
    train = False
    narratives = []
    keywords = []

    # Get the xml from file
    root = etree.parse(infile).getroot()

    if dict_keys == None:
        train = True
        # Set up the keys for the feature vector
        dict_keys = ["MG_ID", labelname]
        if checklist in featurenames:
            dict_keys = dict_keys + ["CL_DeathAge", "CL_ageunit", "CL_DeceasedSex", "CL_Occupation",
                                     "CL_Marital", "CL_Hypertension", "CL_Heart", "CL_Stroke",
                                     "CL_Diabetes", "CL_TB", "CL_HIV", "CL_Cancer", "CL_Asthma",
                                     "CL_InjuryHistory", "CL_SmokeD", "CL_AlcoholD", "CL_ApplytobaccoD"]
        elif dem in featurenames:
            dict_keys = dict_keys + ["CL_DeathAge", "CL_DeceasedSex"]
        print "dict_keys: " + str(dict_keys)

    #keywords = set([])
    #narrwords = set([])
    print "train: " + str(train)
    print "stem: " + str(stem)
    print "lemma: " + str(lemma)

    # Extract features
    matrix = []
    for child in root:
        features = {}
        if rec_type in featurenames:
            features["CL_" + rec_type] = child.tag

        # CHECKLIST features
        for key in dict_keys:
            if key[0:3] == "CL_":
                key = key[3:]
                item = child.find(key)
                value = "0"
                if item != None:
                    value = item.text
                if key == "AlcoholD" or key == "ApplytobaccoD":
                    if value == 'N':
                        value = 9
                features[key] = value
                #print "-- value: " + value
                #if key == "MG_ID":
                #    print "extracting features from: " + value

        # KEYWORD features
        if kw_features:
            keyword_string = get_keywords(child)
            # Remove punctuation and trailing spaces from keywords
            words = [s.strip().translate(string.maketrans("", ""), string.punctuation)
                     for s in keyword_string.split(',')]
            # Split keyword phrases into individual words
            for word in words:
                w = word.split(' ')
                words.remove(word)
                for wx in w:
                    words.append(wx.strip().strip('–'))
            keywords.append(" ".join(words))

        # NARRATIVE features
        if narr_features or ((not train) and (symp_train in featurenames)):
            narr_string = ""
            item = child.find(element)
            if item != None:
                if item.text != None:
                    narr_string = item.text.encode("utf-8")
                else:
                    print "warning: empty narrative"
                narr_words = [w.strip() for w in narr_string.lower().translate(
                    string.maketrans("", ""), string.punctuation).split(' ')]
                text = " ".join(narr_words)
                if stem:
                    narr_string = preprocessing.stem(text)
                elif lemma:
                    narr_string = preprocessing.lemmatize(text)
            narratives.append(narr_string.strip().lower())
            #print "Adding narr: " + narr_string.lower()

        # SYMPTOM features
        elif train and (symp_train in featurenames):
            narr_string = ""
            item = child.find("narrative_symptoms")
            if item != None:
                item_text = item.text
                if item_text != None and len(item_text) > 0:
                    narr_string = item.text.encode("utf-8")
                    #narr_words = [w.strip() for w in narr_string.lower().translate(string.maketrans("",""), string.punctuation).split(' ')]
            narratives.append(narr_string.lower())
            print "Adding symp_narr: " + narr_string.lower()

        # Save features
        matrix.append(features)

    # Construct the feature matrix
    # COUNT or TFIDF features
    if narr_count in featurenames or kw_count in featurenames or narr_tfidf in featurenames \
            or kw_tfidf in featurenames or lda in featurenames or symp_train in featurenames:
        documents = []
        if narr_count in featurenames or narr_tfidf in featurenames or lda in featurenames \
                or symp_train in featurenames:
            documents = narratives
            print "narratives: " + str(len(narratives))
        elif kw_count in featurenames or kw_tfidf in featurenames:
            documents = keywords
            print "keywords: " + str(len(keywords))

        # Create count matrix
        global count_vectorizer
        if train:
            print "training count_vectorizer"
            count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
                ngram_range=(min_ngram, max_ngram), stop_words=stopwords)
            count_vectorizer.fit(documents)
            dict_keys = dict_keys + count_vectorizer.get_feature_names()
        print "transforming data with count_vectorizer"
        count_matrix = count_vectorizer.transform(documents)
        matrix_keys = count_vectorizer.get_feature_names()

        print "writing count matrix to file"
        out_matrix = open(infile + ".countmatrix", "w")
        out_matrix.write(str(count_matrix))
        out_matrix.close()

        # Add count features to the dictionary
        for x in range(len(matrix)):
            feat = matrix[x]
            for i in range(len(matrix_keys)):
                key = matrix_keys[i]
                val = count_matrix[x, i]
                feat[key] = val

        # Convert counts to TFIDF
        if (narr_tfidf in featurenames) or (kw_tfidf in featurenames):
            print "converting to tfidf..."
            print "matrix_keys: " + str(len(matrix_keys))
            # Use the training count matrix for fitting
            if train:
                global tfidfTransformer
                tfidfTransformer = sklearn.feature_extraction.text.TfidfTransformer()
                tfidfTransformer.fit(count_matrix)
            # Convert matrix to tfidf
            tfidf_matrix = tfidfTransformer.transform(count_matrix)
            print "count_matrix: " + str(count_matrix.shape)
            print "tfidf_matrix: " + str(tfidf_matrix.shape)
            # Replace features in matrix with tfidf
            for x in range(len(matrix)):
                feat = matrix[x]
                #values = tfidf_matrix[x,0:]
                #print "values: " + str(values.shape[0])
                for i in range(len(matrix_keys)):
                    key = matrix_keys[i]
                    val = tfidf_matrix[x, i]
                    feat[key] = val

        # LDA topic modeling features
        if lda in featurenames:
            global ldaModel
            if train:
                ldaModel = LatentDirichletAllocation(n_topics=num_topics)
                ldaModel.fit(count_matrix)
            lda_matrix = ldaModel.transform(count_matrix)
            for t in range(0, num_topics):
                dict_keys.append("lda_topic_" + str(t))
            for x in range(len(matrix)):
                for y in range(len(lda_matrix[x])):
                    val = lda_matrix[x][y]
                    matrix[x]["lda_topic_" + str(y)] = val
            # TODO: Print LDA topics

    # WORD2VEC features
    elif narr_vec in featurenames:
        print "Warning: using word2vec features, ignoring all other features"
        # Create word2vec mapping
        word2vec, dim = load_word2vec(vecfile)

        # Convert words to vectors and add to matrix
        dict_keys.append(narr_vec)
        global max_seq_len
        max_seq_len = 200
        #if train:
        #    max_seq_len = 0
        print "word2vec dim: " + str(dim)
        print "initial max_seq_len: " + str(max_seq_len)
        zero_vec = []
        for z in range(0, dim):
            zero_vec.append(0)
        for x in range(len(matrix)):
            narr = narratives[x]
            #print "narr: " + narr
            vectors = []
            vec = zero_vec
            for word in narr.split(' '):
                if len(word) > 0:
                    #if word == "didnt":
                    #    word = "didn't"
                    if word in word2vec:
                        vec = word2vec[word]
                    vectors.append(vec)
            length = len(vectors)
            if length > max_seq_len:
                #if train:
                #    max_seq_len = length
                vectors = vectors[(-1 * max_seq_len):]
            (matrix[x])[narr_vec] = vectors

        # Pad the narr_vecs with 0 vectors
        print "padding vectors to reach maxlen " + str(max_seq_len)
        for x in range(len(matrix)):
            length = len(matrix[x][narr_vec])
            matrix[x]['max_seq_len'] = max_seq_len
            if length < max_seq_len:
                for k in range(0, max_seq_len - length):
                    matrix[x][narr_vec].insert(0, zero_vec)  # use insert for pre-padding

    # narr_seq for RNN
    elif narr_seq in featurenames:
        global vocab_size, max_seq_len
        if train:
            dict_keys.append(narr_seq)
            dict_keys.append('vocab_size')
            dict_keys.append('max_seq_len')
            vocab = set()
            for narr in narratives:
                words = narr.split(' ')
                for word in words:
                    vocab.add(word)
            vocab_size = len(vocab)
            max_seq_len = 0
        sequences = []

        # Convert text into integer sequences
        for x in range(len(matrix)):
            narr = narratives[x]
            seq = hashing_trick(narr, vocab_size, hash_function='md5',
                                filters='\t\n', lower=True, split=' ')
            if len(seq) > max_seq_len:
                max_seq_len = len(seq)
            sequences.append(seq)

        # Pad the sequences
        sequences = pad_sequences(sequences, maxlen=max_seq_len, dtype='int32', padding='pre')
        for x in range(len(matrix)):
            matrix[x]['narr_seq'] = sequences[x]
            matrix[x]['vocab_size'] = vocab_size
            matrix[x]['max_seq_len'] = max_seq_len

    #if arg_rebalance != "":
    #    matrix_re = rebalance_data(matrix, dict_keys, arg_rebalance)
    #    write_to_file(matrix_re, dict_keys, outfile)
    #else:
    data_util.write_to_file(matrix, dict_keys, outfile)
# 'Transportation mode': ''}
# temp[col] is the mapped value of a specific column (e.g. a string)
pos_body, list2_body, pos_subj, list2_subj, neg_subj, neu_subj, posi_subj, compound_subj, \
    neg_body, neu_body, posi_body, compound_body = clean(temp["description"])
temp["posi_subj"] = posi_subj
temp["neu_subj"] = neu_subj
temp["neg_subj"] = neg_subj
temp["compound_subj"] = compound_subj
temp["posi_body"] = posi_body
temp["neu_body"] = neu_body
temp["neg_body"] = neg_body
temp["compound_body"] = compound_body

# Hash subject POS
pos_subj_hash = ""
for i in pos_subj:
    pos_subj_hash += hashlib.sha512((salt + str(i)).encode('utf-8')).hexdigest() + " "
temp["pos_subj"] = hashing_trick(pos_subj_hash, 1000000, hash_function='md5')

# Hash mail subject text
txt_subj_hash = ""
for i in list2_subj:
    txt_subj_hash += hashlib.sha512((salt + str(i)).encode('utf-8')).hexdigest() + " "
temp["txt_subj"] = hashing_trick(txt_subj_hash, 1000000, hash_function='md5')

# Hash body POS
pos_main_hash = ""
for i in pos_body:
    pos_main_hash += hashlib.sha512((salt + str(i)).encode('utf-8')).hexdigest() + " "
temp["pos_main"] = hashing_trick(pos_main_hash, 1000000, hash_function='md5')

# Hash body text
txt_main_hash = ""
for i in list2_body:
    txt_main_hash += hashlib.sha512((salt + str(i)).encode('utf-8')).hexdigest() + " "
temp["txt_main"] = hashing_trick(txt_main_hash, 1000000, hash_function='md5')
def test_hashing_trick_md5():
    text = 'The cat sat on the mat.'
    encoded = hashing_trick(text, 5, hash_function='md5')
    assert len(encoded) == 6
    assert np.max(encoded) <= 4
    assert np.min(encoded) >= 1
def test_hashing_trick_hash():
    text = 'The cat sat on the mat.'
    encoded = hashing_trick(text, 5)
    assert len(encoded) == 6
    assert np.max(encoded) <= 4
    assert np.min(encoded) >= 1
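A brief note on the two variants tested above: with hash_function='md5' the word ids are stable across processes, while the default relies on Python's built-in hash(), which is salted per process (see PYTHONHASHSEED), so the same text can map to different ids on different runs. A minimal sketch illustrating the difference, assuming Keras is installed:

from keras.preprocessing.text import hashing_trick

sample = 'The cat sat on the mat.'
print(hashing_trick(sample, 5, hash_function='md5'))  # same six ids every run
print(hashing_trick(sample, 5))                       # may differ between runs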
def test_hashing_trick_md5(self):
    sample_text = "The cat sat on the mat."
    encoded = text.hashing_trick(sample_text, 5, hash_function="md5")
    self.assertLen(encoded, 6)
    self.assertLessEqual(np.max(encoded), 4)
    self.assertGreaterEqual(np.min(encoded), 1)
def tokenize(self, text: str):
    return hashing_trick(text, self.max_words, hash_function='md5')
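A minimal sketch of how a method like tokenize above might be wired into a wrapper class. The class name HashTokenizer and the default max_words=20000 are assumptions for illustration, not taken from the original source.

from keras.preprocessing.text import hashing_trick


class HashTokenizer:
    """Hypothetical wrapper: holds the hash-space size used by tokenize()."""

    def __init__(self, max_words=20000):
        self.max_words = max_words

    def tokenize(self, text: str):
        # Same call as above: one md5-hashed integer id per word.
        return hashing_trick(text, self.max_words, hash_function='md5')


# Example: six ids in [1, max_words - 1], one per word.
# print(HashTokenizer().tokenize("the cat sat on the mat"))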