def __init__(self):
    """Create the two line classifiers and reset every parsed field."""
    # One classifier predicts a line's label, the other its type.
    self.line_label_classifier = WordVecBidirectionalLstmSoftmax()
    self.line_type_classifier = WordVecBidirectionalLstmSoftmax()

    # Single-valued fields start out unset.
    for scalar_field in ('email', 'name', 'sex', 'ethnicity',
                         'objective', 'mobile', 'raw'):
        setattr(self, scalar_field, None)

    # Multi-valued fields each get their own fresh list.
    for list_field in ('education', 'experience', 'knowledge',
                       'project', 'meta', 'header'):
        setattr(self, list_field, [])

    # Stays True until something is recognised.
    self.unknown = True
def main():
    """Fit a WordVecBidirectionalLstmSoftmax on the 'line_label' data.

    Seeds NumPy with 42, reads the training data from
    '../data/training_data' and passes './models' as the model directory.
    """
    seed = 42
    np.random.seed(seed)

    training_dir = '../data/training_data'
    label_kind = 'line_label'

    fitted_text_model = fit_text(training_dir, label_type=label_kind)
    labelled_lines = load_text_label_pairs(training_dir,
                                           label_type=label_kind)

    fit_options = dict(
        text_data_model=fitted_text_model,
        model_dir_path='./models',
        text_label_pairs=labelled_lines,
        batch_size=64,
        epochs=20,
        test_size=0.3,
        random_state=seed,
    )
    WordVecBidirectionalLstmSoftmax().fit(**fit_options)
def main():
    """Load a trained classifier and print predictions for up to 20 samples.

    Seeds NumPy with 42, loads the model from './models', shuffles the
    text/label pairs read from '../data/training_data' and, for each of the
    first 20 pairs (or fewer when less data is available), prints the raw
    prediction output, the sentence, and the predicted versus actual label.
    """
    random_state = 42
    np.random.seed(random_state)

    model_dir_path = './models'
    data_file_path = '../data/training_data'
    sample_count = 20

    text_label_pairs = load_text_label_pairs(data_file_path)

    classifier = WordVecBidirectionalLstmSoftmax()
    classifier.load_model(model_dir_path=model_dir_path)

    shuffle(text_label_pairs)

    # Slicing instead of indexing range(20) avoids an IndexError when the
    # data set holds fewer than 20 pairs.
    for text, label in text_label_pairs[:sample_count]:
        print('Output: ', classifier.predict(sentence=text))
        predicted_label = classifier.predict_class(text)
        print('Sentence: ', text)
        print('Predicted: ', predicted_label, 'Actual: ', label)
class ResumeParser(object):
    """Collect resume fields from text lines using two line classifiers.

    ``line_label_classifier`` predicts a per-line label; the labels acted on
    here are 'education', 'project', 'knowledge' and 'experience'.
    ``line_type_classifier`` predicts a per-line type; the types acted on
    here are 'meta' and 'header'. Parsed values accumulate on the instance.
    """

    def __init__(self):
        """Create both classifiers and reset every parsed field."""
        self.line_label_classifier = WordVecBidirectionalLstmSoftmax()
        self.line_type_classifier = WordVecBidirectionalLstmSoftmax()
        self.email = None
        self.name = None
        self.sex = None
        self.ethnicity = None
        self.education = []
        self.objective = None
        self.mobile = None
        self.experience = []
        self.knowledge = []
        self.project = []
        self.meta = list()
        self.header = list()
        # Flipped to False by parse() once any line yields a known field.
        self.unknown = True
        self.raw = None

    def load_model(self, model_dir_path):
        """Load both classifiers from sub-directories of ``model_dir_path``.

        The label classifier is loaded from ``<model_dir_path>/line_label``
        and the type classifier from ``<model_dir_path>/line_type``.
        """
        self.line_label_classifier.load_model(
            model_dir_path=os.path.join(model_dir_path, 'line_label'))
        self.line_type_classifier.load_model(
            model_dir_path=os.path.join(model_dir_path, 'line_type'))

    def fit(self, training_data_dir_path, model_dir_path, batch_size=None,
            epochs=None, test_size=None, random_state=None):
        """Train the label classifier, then the type classifier.

        Returns:
            A two-item list: ``[line_label_history, line_type_history]``.
        """
        line_label_history = self.fit_line_label(
            training_data_dir_path, model_dir_path=model_dir_path,
            batch_size=batch_size, epochs=epochs, test_size=test_size,
            random_state=random_state)
        line_type_history = self.fit_line_type(
            training_data_dir_path, model_dir_path=model_dir_path,
            batch_size=batch_size, epochs=epochs, test_size=test_size,
            random_state=random_state)
        history = [line_label_history, line_type_history]
        return history

    def _fit_classifier(self, classifier, label_type, training_data_dir_path,
                        model_dir_path, batch_size, epochs, test_size,
                        random_state):
        """Fit ``classifier`` on the data selected by ``label_type``.

        ``label_type`` is used both to select the training labels and as the
        name of the sub-directory of ``model_dir_path`` handed to the
        classifier. ``batch_size`` defaults to 64 and ``epochs`` to 20 when
        passed as None.
        """
        text_data_model = fit_text(training_data_dir_path,
                                   label_type=label_type)
        text_label_pairs = load_text_label_pairs(training_data_dir_path,
                                                 label_type=label_type)
        if batch_size is None:
            batch_size = 64
        if epochs is None:
            epochs = 20
        return classifier.fit(
            text_data_model=text_data_model,
            model_dir_path=os.path.join(model_dir_path, label_type),
            text_label_pairs=text_label_pairs,
            batch_size=batch_size,
            epochs=epochs,
            test_size=test_size,
            random_state=random_state)

    def fit_line_label(self, training_data_dir_path, model_dir_path,
                       batch_size=None, epochs=None, test_size=None,
                       random_state=None):
        """Train ``line_label_classifier`` on the 'line_label' data.

        Returns whatever the classifier's ``fit`` returns.
        """
        return self._fit_classifier(
            self.line_label_classifier, 'line_label',
            training_data_dir_path, model_dir_path, batch_size, epochs,
            test_size, random_state)

    def fit_line_type(self, training_data_dir_path, model_dir_path,
                      batch_size=None, epochs=None, test_size=None,
                      random_state=None):
        """Train ``line_type_classifier`` on the 'line_type' data.

        Returns whatever the classifier's ``fit`` returns.
        """
        # Must target the *type* classifier: fitting the label classifier
        # here would leave line_type_classifier untrained and overwrite the
        # label model that fit_line_label just produced.
        return self._fit_classifier(
            self.line_type_classifier, 'line_type',
            training_data_dir_path, model_dir_path, batch_size, epochs,
            test_size, random_state)

    @staticmethod
    def extract_education(label, text):
        """Return ``text`` when ``label`` is 'education', otherwise None."""
        if label == 'education':
            return text
        return None

    @staticmethod
    def extract_project(label, text):
        """Return ``text`` when ``label`` is 'project', otherwise None."""
        if label == 'project':
            return text
        return None

    @staticmethod
    def extract_knowledge(label, text):
        """Return ``text`` when ``label`` is 'knowledge', otherwise None."""
        if label == 'knowledge':
            return text
        return None

    @staticmethod
    def extract_experience(label, text):
        """Return ``text`` when ``label`` is 'experience', otherwise None."""
        if label == 'experience':
            return text
        return None

    def parse(self, texts, print_line=False):
        """Classify each line of ``texts`` and store what was recognised.

        Lines of 10 characters or fewer are skipped. Single-valued fields
        (name, email, sex, ethnicity, objective, mobile) keep the last match;
        list fields are appended to.

        Args:
            texts: iterable of line strings; also stored as ``self.raw``.
            print_line: when True, print each processed line.
        """
        self.raw = texts
        for p in texts:
            if len(p) <= 10:
                continue

            s = word_tokenize(p.lower())
            line_label = self.line_label_classifier.predict_class(sentence=p)
            line_type = self.line_type_classifier.predict_class(sentence=p)
            unknown = True

            name = extract_name(s, p)
            email = extract_email(s, p)
            sex = extract_sex(s, p)
            race = extract_ethnicity(s, p)
            education = self.extract_education(line_label, p)
            project = self.extract_project(line_label, p)
            experience = self.extract_experience(line_label, p)
            objective = extract_objective(s, p)
            knowledge = self.extract_knowledge(line_label, p)
            mobile = extract_mobile(s, p)

            if name is not None:
                self.name = name
                unknown = False
            if email is not None:
                self.email = email
                unknown = False
            if sex is not None:
                self.sex = sex
                unknown = False
            if race is not None:
                self.ethnicity = race
                unknown = False
            if education is not None:
                self.education.append(education)
                unknown = False
            if knowledge is not None:
                self.knowledge.append(knowledge)
                unknown = False
            if project is not None:
                self.project.append(project)
                unknown = False
            if objective is not None:
                self.objective = objective
                unknown = False
            if experience is not None:
                self.experience.append(experience)
                unknown = False
            if mobile is not None:
                self.mobile = mobile
                unknown = False
            if line_type == 'meta':
                self.meta.append(p)
                unknown = False
            # A 'header' line is recorded but does not count as recognised.
            if line_type == 'header':
                self.header.append(p)

            if unknown is False:
                self.unknown = unknown

            if print_line:
                print('parsed: ', p)

    def summary(self):
        """Return the parsed fields as 'key: value' lines.

        Unset single-valued fields are omitted, list fields contribute one
        line per entry, and the result is stripped of surrounding whitespace.
        Header lines are not included.
        """
        parts = []
        single_valued = (
            ('name', self.name),
            ('email', self.email),
            ('mobile', self.mobile),
            ('ethnicity', self.ethnicity),
            ('sex', self.sex),
            ('objective', self.objective),
        )
        for key, value in single_valued:
            if value is not None:
                parts.append('{}: {}\n'.format(key, value))

        multi_valued = (
            ('experience', self.experience),
            ('education', self.education),
            ('knowledge', self.knowledge),
            ('project', self.project),
            ('meta', self.meta),
        )
        for key, values in multi_valued:
            for value in values:
                parts.append('{}: {}\n'.format(key, value))

        return ''.join(parts).strip()
class ResumeParser(object):
    """Collect resume fields from text lines and report per-line predictions.

    ``line_label_classifier`` predicts a per-line label; the labels acted on
    here are 'education', 'project', 'knowledge' and 'experience'.
    ``line_type_classifier`` predicts a per-line type; the types acted on
    here are 'meta' and 'header'. Parsed values accumulate on the instance.
    """

    def __init__(self):
        """Create both classifiers and reset every parsed field."""
        self.line_label_classifier = WordVecBidirectionalLstmSoftmax()
        self.line_type_classifier = WordVecBidirectionalLstmSoftmax()
        self.email = None
        self.name = None
        self.sex = None
        self.ethnicity = None
        self.education = []
        self.objective = None
        self.mobile = None
        self.experience = []
        self.knowledge = []
        self.project = []
        self.meta = list()
        self.header = list()
        # Flipped to False by parse() once any line yields a known field.
        self.unknown = True
        self.raw = None

    def load_model(self, model_dir_path):
        """Load both classifiers from sub-directories of ``model_dir_path``.

        The label classifier is loaded from ``<model_dir_path>/line_label``
        and the type classifier from ``<model_dir_path>/line_type``.
        """
        self.line_label_classifier.load_model(
            model_dir_path=os.path.join(model_dir_path, 'line_label'))
        self.line_type_classifier.load_model(
            model_dir_path=os.path.join(model_dir_path, 'line_type'))

    @staticmethod
    def extract_education(label, text):
        """Return ``text`` when ``label`` is 'education', otherwise None."""
        if label == 'education':
            return text
        return None

    @staticmethod
    def extract_project(label, text):
        """Return ``text`` when ``label`` is 'project', otherwise None."""
        if label == 'project':
            return text
        return None

    @staticmethod
    def extract_knowledge(label, text):
        """Return ``text`` when ``label`` is 'knowledge', otherwise None."""
        if label == 'knowledge':
            return text
        return None

    @staticmethod
    def extract_experience(label, text):
        """Return ``text`` when ``label`` is 'experience', otherwise None."""
        if label == 'experience':
            return text
        return None

    def parse(self, texts, print_line=False):
        """Classify each line of ``texts`` and return per-line predictions.

        Lines of 10 characters or fewer are skipped and produce no entry in
        the result. Recognised values are also stored on the instance.

        Args:
            texts: iterable of line strings; also stored as ``self.raw``.
            print_line: accepted for interface compatibility; not used here.

        Returns:
            dict with parallel lists under 'line' (the preprocessed text),
            'type' and 'label'. A line for which any of mobile, name, email,
            sex or ethnicity was extracted is reported as type 'header' and
            label 'personal'; otherwise the classifier outputs are reported.
        """
        self.raw = texts
        # NOTE(review): `-0` evaluates to 0; confirm whether -1 was intended.
        proc = TextPreprocessor(n_jobs=-0)
        predictions = {'line': [], 'type': [], 'label': []}

        for p in texts:
            if len(p) <= 10:
                continue

            s = word_tokenize(p)
            # Lower-cased raw line, kept for the extractors that receive the
            # unprocessed text. Strings are immutable, so no copy is needed.
            original_line = p.lower()
            p = proc._preprocess_text(p)
            line_label = self.line_label_classifier.predict_class(sentence=p)
            line_type = self.line_type_classifier.predict_class(sentence=p)
            predictions['line'].append(p)
            unknown = True

            # Find if the line belongs to header
            name = extract_name(s, original_line)
            email = extract_email(s, original_line)
            sex = extract_sex(s, original_line)
            race = extract_ethnicity(s, original_line)
            education = self.extract_education(line_label, p)
            project = self.extract_project(line_label, p)
            experience = self.extract_experience(line_label, p)
            objective = extract_objective(s, p)
            knowledge = self.extract_knowledge(line_label, original_line)
            mobile = extract_mobile(s, original_line)

            if mobile or name or email or sex or race:
                predictions['type'].append('header')
                predictions['label'].append('personal')
            else:
                predictions['type'].append(line_type)
                predictions['label'].append(line_label)

            if name is not None:
                self.name = name
                unknown = False
            if email is not None:
                self.email = email
                unknown = False
            if sex is not None:
                self.sex = sex
                unknown = False
            if race is not None:
                self.ethnicity = race
                unknown = False
            if education is not None:
                self.education.append(education)
                unknown = False
            if knowledge is not None:
                self.knowledge.append(knowledge)
                unknown = False
            if project is not None:
                self.project.append(project)
                unknown = False
            if objective is not None:
                self.objective = objective
                unknown = False
            if experience is not None:
                self.experience.append(experience)
                unknown = False
            if mobile is not None:
                self.mobile = mobile
                unknown = False
            if line_type == 'meta':
                self.meta.append(p)
                unknown = False
            # A 'header' line is recorded but does not count as recognised.
            if line_type == 'header':
                self.header.append(p)

            if unknown is False:
                self.unknown = unknown

        return predictions

    def detect_blocks(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add a boolean 'has_date' column to ``df`` and return it.

        ``df`` is expected to carry a 'line' column (for example a frame
        built from the dict returned by parse()). Each row's 'has_date' is
        True when ``has_date(row['line'])`` is truthy. ``df`` is modified in
        place.
        """
        # Whole-column assignment; the scalar accessor `.at` does not accept
        # a slice.
        df['has_date'] = pd.Series(
            [bool(has_date(line)) for line in df['line']],
            index=df.index, dtype=bool)
        return df

    def define_header_lines(self, df_predictions: pd.DataFrame):
        """Mark the rows before the last early 'personal' row as header.

        Only 'personal' rows whose index is within the first 10% of the
        frame are considered. When at least one exists, every row positioned
        before the last of them gets type 'header' and label 'personal'.
        ``df_predictions`` is modified in place and returned.

        Index labels are used as row positions, so a default 0..n-1 index is
        assumed.
        """
        header_limit_percent = 0.1
        max_allowed_header_idx = int(
            df_predictions.shape[0] * header_limit_percent)

        personal_indexes = df_predictions[
            df_predictions['label'] == 'personal'].index
        personal_indexes = [
            i for i in personal_indexes if i <= max_allowed_header_idx]

        if personal_indexes:
            last_idx = personal_indexes[-1]
            # Assign through a single indexer per column: a chained
            # `iloc[...][[...]] = ...` writes to a temporary object and
            # leaves df_predictions untouched.
            type_col = df_predictions.columns.get_loc('type')
            label_col = df_predictions.columns.get_loc('label')
            df_predictions.iloc[:last_idx, type_col] = 'header'
            df_predictions.iloc[:last_idx, label_col] = 'personal'

        return df_predictions