Exemplo n.º 1
0
 def __init__(self):
     """Build the two line classifiers and reset every parsed-field slot."""
     # Two separate model instances, one per prediction task.
     self.line_label_classifier = WordVecBidirectionalLstmSoftmax()
     self.line_type_classifier = WordVecBidirectionalLstmSoftmax()

     # Single-valued fields start out empty (None).
     self.email = self.name = self.sex = self.ethnicity = None
     self.education = []
     self.objective = self.mobile = None

     # Multi-valued fields: each one gets its own fresh list.
     self.experience = []
     self.knowledge = []
     self.project = []
     self.meta = []
     self.header = []

     # Starts as True; presumably cleared once something is recognised
     # (the code that does so is outside this snippet).
     self.unknown = True
     self.raw = None
Exemplo n.º 2
0
def main():
    """Fit a WordVecBidirectionalLstmSoftmax on the 'line_label' training data.

    Seeds NumPy's global RNG with 42, loads the text model and the
    (text, label) pairs from '../data/training_data', then calls the
    classifier's ``fit`` with './models' as the model directory.
    """
    seed = 42
    np.random.seed(seed)

    models_dir = './models'
    training_dir = '../data/training_data'
    label_kind = 'line_label'

    fitted_text_model = fit_text(training_dir, label_type=label_kind)
    labelled_lines = load_text_label_pairs(training_dir, label_type=label_kind)

    model = WordVecBidirectionalLstmSoftmax()
    fit_options = {
        'batch_size': 64,
        'epochs': 20,
        'test_size': 0.3,
        'random_state': seed,
    }
    # The value returned by fit() is not used by this script.
    model.fit(text_data_model=fitted_text_model,
              model_dir_path=models_dir,
              text_label_pairs=labelled_lines,
              **fit_options)
Exemplo n.º 3
0
def main():
    """Load a saved classifier and print predictions for up to 20 samples.

    Seeds NumPy's global RNG with 42, loads the (text, label) pairs from
    '../data/training_data', loads the classifier from './models', shuffles
    the pairs in place and prints the raw output, the sentence, the
    predicted class and the actual label for each sampled pair.
    """
    random_state = 42
    np.random.seed(random_state)

    model_dir_path = './models'
    data_file_path = '../data/training_data'
    text_label_pairs = load_text_label_pairs(data_file_path)

    classifier = WordVecBidirectionalLstmSoftmax()
    classifier.load_model(model_dir_path=model_dir_path)

    # NOTE(review): only NumPy's RNG is seeded above; if `shuffle` comes from
    # the stdlib `random` module the order is not reproducible - confirm.
    shuffle(text_label_pairs)

    # Slice rather than index range(20): a data set with fewer than 20 pairs
    # previously raised IndexError; now it simply prints what is available.
    for text, label in text_label_pairs[:20]:
        print('Output: ', classifier.predict(sentence=text))
        predicted_label = classifier.predict_class(text)
        print('Sentence: ', text)
        print('Predicted: ', predicted_label, 'Actual: ', label)
Exemplo n.º 4
0
class ResumeParser(object):
    """Collect resume fields from lines of text.

    Two WordVecBidirectionalLstmSoftmax classifiers are used per line:

    * ``line_label_classifier`` - its predicted class is compared against
      'education', 'project', 'knowledge' and 'experience'.
    * ``line_type_classifier`` - its predicted class is compared against
      'meta' and 'header'.

    Parsed values accumulate on the instance attributes; nothing in this
    class resets them between ``parse`` calls.
    """

    def __init__(self):
        """Create both classifiers and initialise all parsed fields as empty."""
        self.line_label_classifier = WordVecBidirectionalLstmSoftmax()
        self.line_type_classifier = WordVecBidirectionalLstmSoftmax()
        self.email = None
        self.name = None
        self.sex = None
        self.ethnicity = None
        self.education = []
        self.objective = None
        self.mobile = None
        self.experience = []
        self.knowledge = []
        self.project = []
        self.meta = list()
        self.header = list()
        # True until parse() recognises at least one line.
        self.unknown = True
        # Last `texts` argument handed to parse().
        self.raw = None

    def load_model(self, model_dir_path):
        """Load both classifiers from sub-directories of ``model_dir_path``.

        The label model is read from ``<model_dir_path>/line_label`` and the
        type model from ``<model_dir_path>/line_type``.
        """
        self.line_label_classifier.load_model(
            model_dir_path=os.path.join(model_dir_path, 'line_label'))
        self.line_type_classifier.load_model(
            model_dir_path=os.path.join(model_dir_path, 'line_type'))

    def fit(self,
            training_data_dir_path,
            model_dir_path,
            batch_size=None,
            epochs=None,
            test_size=None,
            random_state=None):
        """Fit the line-label classifier, then the line-type classifier.

        All arguments are forwarded unchanged to ``fit_line_label`` and
        ``fit_line_type``.

        Returns:
            A two-element list ``[line_label_history, line_type_history]``.
        """
        line_label_history = self.fit_line_label(training_data_dir_path,
                                                 model_dir_path=model_dir_path,
                                                 batch_size=batch_size,
                                                 epochs=epochs,
                                                 test_size=test_size,
                                                 random_state=random_state)

        line_type_history = self.fit_line_type(training_data_dir_path,
                                               model_dir_path=model_dir_path,
                                               batch_size=batch_size,
                                               epochs=epochs,
                                               test_size=test_size,
                                               random_state=random_state)

        return [line_label_history, line_type_history]

    def fit_line_label(self,
                       training_data_dir_path,
                       model_dir_path,
                       batch_size=None,
                       epochs=None,
                       test_size=None,
                       random_state=None):
        """Fit ``line_label_classifier`` on the 'line_label' training data.

        ``batch_size`` defaults to 64 and ``epochs`` to 20 when None.
        The model directory used is ``<model_dir_path>/line_label``.

        Returns:
            Whatever the classifier's ``fit`` returns.
        """
        return self._fit_classifier(self.line_label_classifier,
                                    'line_label',
                                    training_data_dir_path,
                                    model_dir_path,
                                    batch_size=batch_size,
                                    epochs=epochs,
                                    test_size=test_size,
                                    random_state=random_state)

    def fit_line_type(self,
                      training_data_dir_path,
                      model_dir_path,
                      batch_size=None,
                      epochs=None,
                      test_size=None,
                      random_state=None):
        """Fit ``line_type_classifier`` on the 'line_type' training data.

        ``batch_size`` defaults to 64 and ``epochs`` to 20 when None.
        The model directory used is ``<model_dir_path>/line_type``.

        Returns:
            Whatever the classifier's ``fit`` returns.
        """
        # Previously this trained line_label_classifier by mistake, leaving
        # the type classifier unfitted and refitting the label classifier
        # on line-type data.
        return self._fit_classifier(self.line_type_classifier,
                                    'line_type',
                                    training_data_dir_path,
                                    model_dir_path,
                                    batch_size=batch_size,
                                    epochs=epochs,
                                    test_size=test_size,
                                    random_state=random_state)

    @staticmethod
    def _fit_classifier(classifier,
                        label_type,
                        training_data_dir_path,
                        model_dir_path,
                        batch_size=None,
                        epochs=None,
                        test_size=None,
                        random_state=None):
        """Fit one classifier on the training data for ``label_type``.

        Shared by ``fit_line_label`` and ``fit_line_type`` so both use the
        same defaults and the same ``<model_dir_path>/<label_type>`` layout.
        """
        text_data_model = fit_text(training_data_dir_path,
                                   label_type=label_type)
        text_label_pairs = load_text_label_pairs(training_data_dir_path,
                                                 label_type=label_type)

        if batch_size is None:
            batch_size = 64
        if epochs is None:
            epochs = 20
        return classifier.fit(
            text_data_model=text_data_model,
            model_dir_path=os.path.join(model_dir_path, label_type),
            text_label_pairs=text_label_pairs,
            batch_size=batch_size,
            epochs=epochs,
            test_size=test_size,
            random_state=random_state)

    @staticmethod
    def extract_education(label, text):
        """Return ``text`` when ``label`` is 'education', otherwise None."""
        if label == 'education':
            return text
        return None

    @staticmethod
    def extract_project(label, text):
        """Return ``text`` when ``label`` is 'project', otherwise None."""
        if label == 'project':
            return text
        return None

    @staticmethod
    def extract_knowledge(label, text):
        """Return ``text`` when ``label`` is 'knowledge', otherwise None."""
        if label == 'knowledge':
            return text
        return None

    @staticmethod
    def extract_experience(label, text):
        """Return ``text`` when ``label`` is 'experience', otherwise None."""
        if label == 'experience':
            return text
        return None

    def parse(self, texts, print_line=False):
        """Scan ``texts`` line by line and store whatever is recognised.

        Lines of 10 characters or fewer are skipped. For every other line the
        two classifiers and the ``extract_*`` helpers are run; scalar fields
        (name, email, sex, ethnicity, objective, mobile) are overwritten by
        the latest match, list fields are appended to.

        Args:
            texts: iterable of text lines; also stored on ``self.raw``.
            print_line: when true, print each processed line prefixed with
                'parsed: '.
        """
        self.raw = texts
        for p in texts:
            if len(p) > 10:
                s = word_tokenize(p.lower())
                line_label = self.line_label_classifier.predict_class(
                    sentence=p)
                line_type = self.line_type_classifier.predict_class(sentence=p)
                unknown = True
                name = extract_name(s, p)
                email = extract_email(s, p)
                sex = extract_sex(s, p)
                race = extract_ethnicity(s, p)
                education = self.extract_education(line_label, p)
                project = self.extract_project(line_label, p)
                experience = self.extract_experience(line_label, p)
                objective = extract_objective(s, p)
                knowledge = self.extract_knowledge(line_label, p)
                mobile = extract_mobile(s, p)
                if name is not None:
                    self.name = name
                    unknown = False
                if email is not None:
                    self.email = email
                    unknown = False
                if sex is not None:
                    self.sex = sex
                    unknown = False
                if race is not None:
                    self.ethnicity = race
                    unknown = False
                if education is not None:
                    self.education.append(education)
                    unknown = False
                if knowledge is not None:
                    self.knowledge.append(knowledge)
                    unknown = False
                if project is not None:
                    self.project.append(project)
                    unknown = False
                if objective is not None:
                    self.objective = objective
                    unknown = False
                if experience is not None:
                    self.experience.append(experience)
                    unknown = False
                if mobile is not None:
                    self.mobile = mobile
                    unknown = False

                if line_type == 'meta':
                    self.meta.append(p)
                    unknown = False
                # A 'header' line is recorded but does not by itself clear
                # the unknown flag.
                if line_type == 'header':
                    self.header.append(p)

                # self.unknown only ever moves from True to False.
                if unknown is False:
                    self.unknown = unknown

                if print_line:
                    print('parsed: ', p)

    def summary(self):
        """Return the parsed fields as 'key: value' lines.

        Scalar fields that are None are omitted; list fields contribute one
        line per entry. Surrounding whitespace of the result is stripped.
        Header lines are not included.
        """
        lines = []
        scalar_fields = (
            ('name', self.name),
            ('email', self.email),
            ('mobile', self.mobile),
            ('ethnicity', self.ethnicity),
            ('sex', self.sex),
            ('objective', self.objective),
        )
        for key, value in scalar_fields:
            if value is not None:
                lines.append('{}: {}'.format(key, value))

        list_fields = (
            ('experience', self.experience),
            ('education', self.education),
            ('knowledge', self.knowledge),
            ('project', self.project),
            ('meta', self.meta),
        )
        for key, values in list_fields:
            for value in values:
                lines.append('{}: {}'.format(key, value))

        return '\n'.join(lines).strip()
class ResumeParser(object):
    """Collect resume fields from lines of text and expose per-line predictions.

    Two WordVecBidirectionalLstmSoftmax classifiers are used per line:
    ``line_label_classifier`` (compared against 'education', 'project',
    'knowledge', 'experience') and ``line_type_classifier`` (compared
    against 'meta' and 'header'). Parsed values accumulate on the instance.
    """

    def __init__(self):
        """Create both classifiers and initialise all parsed fields as empty."""
        self.line_label_classifier = WordVecBidirectionalLstmSoftmax()
        self.line_type_classifier = WordVecBidirectionalLstmSoftmax()
        self.email = None
        self.name = None
        self.sex = None
        self.ethnicity = None
        self.education = []
        self.objective = None
        self.mobile = None
        self.experience = []
        self.knowledge = []
        self.project = []
        self.meta = list()
        self.header = list()
        # True until parse() recognises at least one line.
        self.unknown = True
        # Last `texts` argument handed to parse().
        self.raw = None

    def load_model(self, model_dir_path):
        """Load both classifiers from sub-directories of ``model_dir_path``.

        The label model is read from ``<model_dir_path>/line_label`` and the
        type model from ``<model_dir_path>/line_type``.
        """
        self.line_label_classifier.load_model(model_dir_path=os.path.join(model_dir_path, 'line_label'))
        self.line_type_classifier.load_model(model_dir_path=os.path.join(model_dir_path, 'line_type'))

    @staticmethod
    def extract_education(label, text):
        """Return ``text`` when ``label`` is 'education', otherwise None."""
        if label == 'education':
            return text
        return None

    @staticmethod
    def extract_project(label, text):
        """Return ``text`` when ``label`` is 'project', otherwise None."""
        if label == 'project':
            return text
        return None

    @staticmethod
    def extract_knowledge(label, text):
        """Return ``text`` when ``label`` is 'knowledge', otherwise None."""
        if label == 'knowledge':
            return text
        return None

    @staticmethod
    def extract_experience(label, text):
        """Return ``text`` when ``label`` is 'experience', otherwise None."""
        if label == 'experience':
            return text
        return None

    def parse(self, texts, print_line=False):
        """Scan ``texts`` and return per-line type/label predictions.

        Lines of 10 characters or fewer are skipped. Each remaining line is
        tokenised, preprocessed with ``TextPreprocessor`` and classified;
        recognised fields are stored on the instance as a side effect.

        Args:
            texts: iterable of text lines; also stored on ``self.raw``.
            print_line: accepted for interface compatibility; not used here.

        Returns:
            dict with parallel lists under 'line' (preprocessed text),
            'type' and 'label'. Lines where a mobile, name, email, sex or
            ethnicity value was found are reported as type 'header' and
            label 'personal'; otherwise the classifier outputs are used.
        """
        self.raw = texts
        # NOTE(review): `-0` evaluates to 0; confirm whether -1 was intended.
        proc = TextPreprocessor(n_jobs=-0)
        predictions = {'line': [], 'type': [], 'label': []}
        for p in texts:
            if len(p) > 10:
                s = word_tokenize(p)
                # Strings are immutable, so lower() alone yields an
                # independent lowercase copy of the raw line.
                original_line = p.lower()
                p = proc._preprocess_text(p)
                line_label = self.line_label_classifier.predict_class(sentence=p)
                line_type = self.line_type_classifier.predict_class(sentence=p)
                predictions['line'].append(p)
                unknown = True
                # Find if the line belongs to header
                name = extract_name(s, original_line)
                email = extract_email(s, original_line)
                sex = extract_sex(s, original_line)
                race = extract_ethnicity(s, original_line)
                education = self.extract_education(line_label, p)
                project = self.extract_project(line_label, p)
                experience = self.extract_experience(line_label, p)
                objective = extract_objective(s, p)
                knowledge = self.extract_knowledge(line_label, original_line)
                mobile = extract_mobile(s, original_line)
                # Any truthy personal-data hit overrides the classifier output.
                if mobile or name or email or sex or race:
                    predictions['type'].append('header')
                    predictions['label'].append('personal')
                else:
                    predictions['type'].append(line_type)
                    predictions['label'].append(line_label)
                if name is not None:
                    self.name = name
                    unknown = False
                if email is not None:
                    self.email = email
                    unknown = False
                if sex is not None:
                    self.sex = sex
                    unknown = False
                if race is not None:
                    self.ethnicity = race
                    unknown = False
                if education is not None:
                    self.education.append(education)
                    unknown = False
                if knowledge is not None:
                    self.knowledge.append(knowledge)
                    unknown = False
                if project is not None:
                    self.project.append(project)
                    unknown = False
                if objective is not None:
                    self.objective = objective
                    unknown = False
                if experience is not None:
                    self.experience.append(experience)
                    unknown = False
                if mobile is not None:
                    self.mobile = mobile
                    unknown = False

                if line_type == 'meta':
                    self.meta.append(p)
                    unknown = False
                # A 'header' line is recorded but does not by itself clear
                # the unknown flag.
                if line_type == 'header':
                    self.header.append(p)

                # self.unknown only ever moves from True to False.
                if unknown is False:
                    self.unknown = unknown
        return predictions

    def detect_blocks(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Add a boolean 'has_date' column to the predictions frame.

        Takes the results from parse() as a data frame in the form:
        | line  | type  |   label |
        -----------------------
        |'same' | header | personal|

        Each row's 'line' is passed to has_date(); rows where it is truthy
        get has_date=True, all others False. The frame is modified in place
        and also returned.
        """
        # Plain column assignment; `.at` is a scalar accessor and must not
        # be used with a slice.
        df['has_date'] = False
        for i, row in df.iterrows():
            if has_date(row['line']):
                df.at[i, 'has_date'] = True
        return df

    def define_header_lines(self, df_predictions: pd.DataFrame):
        """
        If predictions contain personal/header information label all prior rows as personal

        Only 'personal' rows whose index is within the first 10% of the
        frame are considered. Every row positioned before the last such row
        gets type='header' and label='personal'. The frame is modified in
        place and also returned.

        NOTE(review): index labels are compared with, and used as, row
        positions, so this assumes a default 0..n-1 index - confirm callers.
        """
        header_limit_percent = 0.1
        max_allowed_header_idx = int(df_predictions.shape[0] * header_limit_percent)
        personal_indexes = df_predictions[df_predictions['label'] == 'personal'].index
        personal_indexes = [i for i in personal_indexes if i <= max_allowed_header_idx]
        if personal_indexes:
            last_idx = personal_indexes[-1]
            # Assign directly through iloc: the former chained form
            # `df.iloc[...][[cols]] = ...` wrote to a temporary object and
            # left df_predictions untouched.
            type_col = df_predictions.columns.get_loc('type')
            label_col = df_predictions.columns.get_loc('label')
            df_predictions.iloc[:last_idx, type_col] = 'header'
            df_predictions.iloc[:last_idx, label_col] = 'personal'
        return df_predictions