def fit(self, text_data_model, text_label_pairs, model_dir_path,
        batch_size=None, epochs=None, test_size=None, random_state=None):
    if batch_size is None:
        batch_size = 64
    if epochs is None:
        epochs = 20
    if test_size is None:
        test_size = 0.3
    if random_state is None:
        random_state = 42

    self.config = text_data_model
    self.idx2word = self.config['idx2word']
    self.word2idx = self.config['word2idx']
    self.max_len = self.config['max_len']
    self.vocab_size = self.config['vocab_size']
    self.labels = self.config['labels']

    np.save(self.get_config_file_path(model_dir_path), self.config)

    self.create_model()
    # persist the model architecture as JSON (with-block ensures the file is closed)
    with open(self.get_architecture_file_path(model_dir_path), 'w') as f:
        f.write(self.model.to_json())

    # convert each text into a sequence of word indices (unknown words map to 0)
    xs = []
    ys = []
    for text, label in text_label_pairs:
        tokens = [x.lower() for x in word_tokenize(text)]
        wid_list = list()
        for w in tokens:
            wid = self.word2idx[w] if w in self.word2idx else 0
            wid_list.append(wid)
        xs.append(wid_list)
        ys.append(self.labels[label])

    X = pad_sequences(xs, maxlen=self.max_len)
    Y = np_utils.to_categorical(ys, len(self.labels))

    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=random_state)
    print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

    weight_file_path = self.get_weight_file_path(model_dir_path)
    checkpoint = ModelCheckpoint(weight_file_path)

    history = self.model.fit(x=x_train, y=y_train,
                             batch_size=batch_size, epochs=epochs,
                             validation_data=(x_test, y_test),
                             callbacks=[checkpoint],
                             verbose=1)
    self.model.save_weights(weight_file_path)

    np.save(model_dir_path + '/' + WordVecCnnLstm.model_name + '-history.npy', history.history)

    score = self.model.evaluate(x=x_test, y=y_test, batch_size=batch_size, verbose=1)
    print('score: ', score[0])
    print('accuracy: ', score[1])

    return history
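# The fit() above calls self.create_model(), whose body is not shown in this excerpt. The sketch
# below illustrates one plausible CNN-LSTM classifier over the padded word-index sequences that
# fit() produces; the layer sizes, dropout rates and optimizer are illustrative assumptions, not
# the library's actual values.
from keras.models import Sequential
from keras.layers import Embedding, SpatialDropout1D, Conv1D, MaxPooling1D, LSTM, Dense


def create_model(self):
    model = Sequential()
    # embed each word index (0 .. vocab_size-1) into a dense vector
    model.add(Embedding(input_dim=self.vocab_size, output_dim=128, input_length=self.max_len))
    model.add(SpatialDropout1D(0.2))
    # 1-D convolution extracts local n-gram features before the recurrent layer
    model.add(Conv1D(filters=64, kernel_size=5, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=4))
    # the LSTM summarises the sequence; softmax scores each label in self.labels
    model.add(LSTM(64))
    model.add(Dense(len(self.labels), activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    self.model = model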
def fit(self, text_data_model, text_label_pairs, model_dir_path,
        batch_size=None, epochs=None, test_size=None, random_state=None):
    if batch_size is None:
        batch_size = 64
    if epochs is None:
        epochs = 20
    if test_size is None:
        test_size = 0.3
    if random_state is None:
        random_state = 42

    self.config = text_data_model
    self.idx2word = self.config['idx2word']
    self.word2idx = self.config['word2idx']
    self.max_len = self.config['max_len']
    self.vocab_size = self.config['vocab_size']
    self.labels = self.config['labels']

    np.save(self.get_config_file_path(model_dir_path), self.config)

    self.create_model()
    with open(self.get_architecture_file_path(model_dir_path), 'w') as f:
        f.write(self.model.to_json())

    # encode each document as the sum of its GloVe word vectors
    ys = []
    X = np.zeros(shape=(len(text_label_pairs), self.glove_model.embedding_dim))
    for i, (text, label) in enumerate(text_label_pairs):
        words = [w.lower() for w in word_tokenize(text)]
        E = np.zeros(shape=(self.glove_model.embedding_dim, self.max_len))
        # cap at max_len so long documents do not index past the end of E
        for j in range(min(len(words), self.max_len)):
            word = words[j]
            try:
                E[:, j] = self.glove_model.encode_word(word)
            except KeyError:
                pass
        X[i, :] = np.sum(E, axis=1)
        ys.append(self.labels[label])

    Y = np_utils.to_categorical(ys, len(self.labels))

    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=random_state)
    print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

    weight_file_path = self.get_weight_file_path(model_dir_path)
    checkpoint = ModelCheckpoint(weight_file_path)

    history = self.model.fit(x=x_train, y=y_train,
                             batch_size=batch_size, epochs=epochs,
                             validation_data=(x_test, y_test),
                             callbacks=[checkpoint],
                             verbose=1)
    self.model.save_weights(weight_file_path)

    np.save(model_dir_path + '/' + WordVecGloveFFN.model_name + '-history.npy', history.history)

    score = self.model.evaluate(x=x_test, y=y_test, batch_size=batch_size, verbose=1)
    print('score: ', score[0])
    print('accuracy: ', score[1])

    return history
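# The corresponding create_model() for WordVecGloveFFN is not shown either. Since fit() encodes
# every document as a single summed GloVe vector of length embedding_dim, the model must be a
# plain feed-forward network over that vector. A minimal sketch under that assumption; the hidden
# layer sizes and dropout rate are illustrative, not the library's values.
from keras.models import Sequential
from keras.layers import Dense, Dropout


def create_model(self):
    model = Sequential()
    # input is the summed GloVe vector built in fit()/predict()
    model.add(Dense(256, input_dim=self.glove_model.embedding_dim, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(len(self.labels), activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    self.model = model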
def predict(self, sentence):
    xs = []
    tokens = [w.lower() for w in word_tokenize(sentence)]
    wid = [self.word2idx[token] if token in self.word2idx else len(self.word2idx) for token in tokens]
    xs.append(wid)
    x = pad_sequences(xs, maxlen=self.max_len)
    output = self.model.predict(x)
    return output[0]
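# The resume parsers further down call predict_class() on the line-label and line-type
# classifiers. Assuming predict() returns a softmax vector ordered like self.labels, a minimal
# predict_class() just takes the argmax and maps it back to the label name -- a sketch of the
# expected behaviour, not necessarily the library's exact code.
def predict_class(self, sentence):
    predicted = self.predict(sentence)
    idx2label = dict([(idx, label) for label, idx in self.labels.items()])
    return idx2label[np.argmax(predicted)]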
def fit(self, text_data_model, text_label_pairs, model_dir_path,
        test_size=None, random_state=None, epochs=None, batch_size=None):
    if epochs is None:
        epochs = 10
    if batch_size is None:
        batch_size = 16
    if test_size is None:
        test_size = 0.3
    if random_state is None:
        random_state = 42

    self.config = text_data_model
    self.idx2word = self.config['idx2word']
    self.word2idx = self.config['word2idx']
    self.max_len = self.config['max_len']
    self.vocab_size = self.config['vocab_size']
    self.labels = self.config['labels']

    verbose = 1

    config_file_path = WordVecMultiChannelCnn.get_config_file_path(model_dir_path)
    np.save(config_file_path, text_data_model)

    max_input_tokens = len(self.word2idx)
    self.model = self.define_model(self.max_len, max_input_tokens)
    with open(self.get_architecture_file_path(model_dir_path), 'wt') as f:
        f.write(self.model.to_json())

    xs = []
    ys = []
    for text, label in text_label_pairs:
        tokens = [x.lower() for x in word_tokenize(text)]
        wid_list = list()
        for w in tokens:
            wid = self.word2idx[w] if w in self.word2idx else 0
            wid_list.append(wid)
        xs.append(wid_list)
        ys.append(self.labels[label])

    X = pad_sequences(xs, maxlen=self.max_len)
    Y = np_utils.to_categorical(ys, len(self.labels))

    weight_file_path = WordVecMultiChannelCnn.get_weight_file_path(model_dir_path)
    checkpoint = ModelCheckpoint(weight_file_path)

    # the same padded sequence is fed to all three convolutional channels
    history = self.model.fit([X, X, X], Y,
                             epochs=epochs, batch_size=batch_size,
                             validation_split=test_size,
                             verbose=verbose, callbacks=[checkpoint])

    # save the model
    self.model.save(weight_file_path)
    np.save(model_dir_path + '/' + WordVecMultiChannelCnn.model_name + '-history.npy', history.history)

    return history
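# define_model() is not shown. The call above trains on [X, X, X], so the model must expose three
# input channels fed with the same padded sequence. The sketch below follows the common
# multi-channel CNN pattern (one kernel size per channel); the kernel sizes, filter counts and
# dense width are assumptions, only the three-input structure is implied by the fit() call.
from keras.models import Model
from keras.layers import Input, Embedding, Conv1D, Dropout, MaxPooling1D, Flatten, Dense, concatenate


def define_model(self, length, vocab_size):
    inputs = []
    channels = []
    for kernel_size in (4, 6, 8):  # illustrative kernel sizes, one per channel
        inp = Input(shape=(length,))
        x = Embedding(vocab_size, 100)(inp)
        x = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(x)
        x = Dropout(0.5)(x)
        x = MaxPooling1D(pool_size=2)(x)
        x = Flatten()(x)
        inputs.append(inp)
        channels.append(x)
    merged = concatenate(channels)
    dense = Dense(10, activation='relu')(merged)
    outputs = Dense(len(self.labels), activation='softmax')(dense)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model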
def encode_docs(self, docs, max_allowed_doc_length=None):
    if max_allowed_doc_length is None:
        max_allowed_doc_length = 500

    doc_count = len(docs)
    X = np.zeros(shape=(doc_count, self.embedding_dim))

    # length of the longest document (in tokens), capped at max_allowed_doc_length
    max_len = 0
    for doc in docs:
        max_len = max(max_len, len(word_tokenize(doc)))
    max_len = min(max_len, max_allowed_doc_length)

    for i in range(0, doc_count):
        doc = docs[i]
        words = [w.lower() for w in word_tokenize(doc)]
        E = np.zeros(shape=(self.embedding_dim, max_len))
        # stop at the document's own length so shorter documents do not index past words[]
        for j in range(min(len(words), max_len)):
            word = words[j]
            try:
                E[:, j] = self.word2em[word]
            except KeyError:
                pass
        X[i, :] = np.sum(E, axis=1)
    return X
def predict(self, sentence):
    tokens = [w.lower() for w in word_tokenize(sentence)]
    X = np.zeros(shape=(1, self.glove_model.embedding_dim))
    E = np.zeros(shape=(self.glove_model.embedding_dim, self.max_len))
    # cap at max_len so long sentences do not index past the end of E
    for j in range(min(len(tokens), self.max_len)):
        word = tokens[j]
        try:
            E[:, j] = self.glove_model.encode_word(word)
        except KeyError:
            pass
    X[0, :] = np.sum(E, axis=1)
    output = self.model.predict(X)
    return output[0]
def parse(self, texts, print_line=False):
    self.raw = texts
    for p in texts:
        if len(p) > 10:
            s = word_tokenize(p.lower())
            unknown = True

            name = extract_name(s, p)
            email = extract_email(s, p)
            sex = extract_sex(s, p)
            race = extract_ethnicity(s, p)
            education = extract_education(s, p)
            experience = extract_experience(s, p)
            objective = extract_objective(s, p)
            expertise = extract_expertise(s, p)
            mobile = extract_mobile(s, p)

            if name is not None:
                self.name = name
                unknown = False
            if email is not None:
                self.email = email
                unknown = False
            if sex is not None:
                self.sex = sex
                unknown = False
            if race is not None:
                self.ethnicity = race
                unknown = False
            if education is not None:
                self.education = education
                unknown = False
            if experience is not None:
                self.experience = experience
                unknown = False
            if objective is not None:
                self.objective = objective
                unknown = False
            if expertise is not None:
                self.expertise.append(expertise)
                unknown = False
            if mobile is not None:
                self.mobile = mobile
                unknown = False

            if unknown is False:
                self.unknown = unknown

            if print_line:
                print('parsed: ', p)
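# Helper extractors such as extract_email() / extract_mobile() are defined elsewhere in the
# package; parse() only relies on them returning the matched value or None. As an illustration,
# a regex-based sketch of the e-mail case (the pattern is an assumption, not the package's own):
import re


def extract_email(tokens, line):
    # return the first e-mail-looking substring, or None so parse() leaves self.email untouched
    match = re.search(r'[\w.+-]+@[\w-]+\.[\w.-]+', line)
    return match.group(0) if match else None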
def encode_doc(self, doc, max_allowed_doc_length=None):
    if max_allowed_doc_length is None:
        max_allowed_doc_length = 500

    words = [w.lower() for w in word_tokenize(doc)]
    max_len = min(len(words), max_allowed_doc_length)
    E = np.zeros(shape=(self.embedding_dim, max_len))
    X = np.zeros(shape=(self.embedding_dim, ))
    for j in range(max_len):
        word = words[j]
        try:
            E[:, j] = self.word2em[word]
        except KeyError:
            pass
    X[:] = np.sum(E, axis=1)
    return X
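# encode_doc()/encode_docs() assume self.word2em maps a lower-cased word to its GloVe vector of
# length embedding_dim. A sketch of how such a dictionary is typically built from a standard
# glove.6B.*d.txt file; the helper name and path are placeholders, not the library's API.
def load_glove_word2em(glove_file_path):
    word2em = dict()
    with open(glove_file_path, mode='rt', encoding='utf8') as f:
        for line in f:
            parts = line.split()
            # first token is the word, the remaining tokens are the embedding components
            word2em[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return word2em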
def fit_text(data_dir_path, max_vocab_size=None, label_type=None):
    if label_type is None:
        label_type = 'line_type'
    if max_vocab_size is None:
        max_vocab_size = 5000

    counter = collections.Counter()
    max_len = 0
    labels = dict()

    for f in os.listdir(data_dir_path):
        data_file_path = os.path.join(data_dir_path, f)
        if os.path.isfile(data_file_path) and f.lower().endswith('.txt'):
            with open(data_file_path, mode='rt', encoding='utf8') as file:
                for line in file:
                    res = line.strip().split('\t')
                    if len(res) == 3:
                        line_type, line_label, sentence = res[0], res[1], res[2]
                        tokens = [x.lower() for x in word_tokenize(sentence)]
                        for token in tokens:
                            counter[token] += 1
                        max_len = max(max_len, len(tokens))
                        label = line_label
                        if label_type != 'line_label':
                            label = line_type
                        if label not in labels:
                            labels[label] = len(labels)

    # keep the max_vocab_size most frequent words; unseen words fall back to index 0
    word2idx = collections.defaultdict(int)
    for idx, word in enumerate(counter.most_common(max_vocab_size)):
        word2idx[word[0]] = idx
    idx2word = {v: k for k, v in word2idx.items()}
    vocab_size = len(word2idx) + 1

    model = dict()
    model['word2idx'] = word2idx
    model['idx2word'] = idx2word
    model['vocab_size'] = vocab_size
    model['max_len'] = max_len
    model['labels'] = labels
    return model
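# fit_text() only builds the vocabulary and label dictionaries; the (text, label) pairs passed to
# the fit() methods above still have to be collected from the same tab-separated files. A sketch
# of such a loader, assuming the same three-column layout (line_type, line_label, sentence); the
# helper name is a placeholder, not necessarily the library's own.
def load_text_label_pairs(data_dir_path, label_type='line_type'):
    result = []
    for f in os.listdir(data_dir_path):
        data_file_path = os.path.join(data_dir_path, f)
        if os.path.isfile(data_file_path) and f.lower().endswith('.txt'):
            with open(data_file_path, mode='rt', encoding='utf8') as file:
                for line in file:
                    res = line.strip().split('\t')
                    if len(res) == 3:
                        line_type, line_label, sentence = res
                        label = line_label if label_type == 'line_label' else line_type
                        result.append((sentence, label))
    return result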
def parse(self, texts, print_line=False):
    self.raw = texts
    for p in texts:
        if len(p) > 10:
            s = word_tokenize(p.lower())
            line_label = self.line_label_classifier.predict_class(sentence=p)
            line_type = self.line_type_classifier.predict_class(sentence=p)
            unknown = True

            name = extract_name(s, p)
            email = extract_email(s, p)
            sex = extract_sex(s, p)
            race = extract_ethnicity(s, p)
            education = self.extract_education(line_label, p)
            project = self.extract_project(line_label, p)
            experience = self.extract_experience(line_label, p)
            objective = extract_objective(s, p)
            knowledge = self.extract_knowledge(line_label, p)
            mobile = extract_mobile(s, p)

            if name is not None:
                self.name = name
                unknown = False
            if email is not None:
                self.email = email
                unknown = False
            if sex is not None:
                self.sex = sex
                unknown = False
            if race is not None:
                self.ethnicity = race
                unknown = False
            if education is not None:
                self.education.append(education)
                unknown = False
            if knowledge is not None:
                self.knowledge.append(knowledge)
                unknown = False
            if project is not None:
                self.project.append(project)
                unknown = False
            if objective is not None:
                self.objective = objective
                unknown = False
            if experience is not None:
                self.experience.append(experience)
                unknown = False
            if mobile is not None:
                self.mobile = mobile
                unknown = False

            if line_type == 'meta':
                self.meta.append(p)
                unknown = False
            if line_type == 'header':
                self.header.append(p)

            if unknown is False:
                self.unknown = unknown

            if print_line:
                print('parsed: ', p)
def parse(self, texts, print_line=False):
    self.raw = texts
    proc = TextPreprocessor(n_jobs=-1)  # assumption: -1 requests all available cores
    predictions = {'line': [], 'type': [], 'label': []}
    for p in texts:
        if len(p) > 10:
            s = word_tokenize(p)
            original_line = deepcopy(p).lower()
            p = proc._preprocess_text(p)
            line_label = self.line_label_classifier.predict_class(sentence=p)
            line_type = self.line_type_classifier.predict_class(sentence=p)
            predictions['line'].append(p)
            unknown = True

            # Find out whether the line belongs to the header
            name = extract_name(s, original_line)
            email = extract_email(s, original_line)
            sex = extract_sex(s, original_line)
            race = extract_ethnicity(s, original_line)
            education = self.extract_education(line_label, p)
            project = self.extract_project(line_label, p)
            experience = self.extract_experience(line_label, p)
            objective = extract_objective(s, p)
            knowledge = self.extract_knowledge(line_label, original_line)
            mobile = extract_mobile(s, original_line)

            if mobile or name or email or sex or race:
                predictions['type'].append('header')
                predictions['label'].append('personal')
            else:
                predictions['type'].append(line_type)
                predictions['label'].append(line_label)

            if name is not None:
                self.name = name
                unknown = False
            if email is not None:
                self.email = email
                unknown = False
            if sex is not None:
                self.sex = sex
                unknown = False
            if race is not None:
                self.ethnicity = race
                unknown = False
            if education is not None:
                self.education.append(education)
                unknown = False
            if knowledge is not None:
                self.knowledge.append(knowledge)
                unknown = False
            if project is not None:
                self.project.append(project)
                unknown = False
            if objective is not None:
                self.objective = objective
                unknown = False
            if experience is not None:
                self.experience.append(experience)
                unknown = False
            if mobile is not None:
                self.mobile = mobile
                unknown = False

            if line_type == 'meta':
                self.meta.append(p)
                unknown = False
            if line_type == 'header':
                self.header.append(p)

            if unknown is False:
                self.unknown = unknown

    return predictions