def get_zylon_parser_scores(self):
    """
    Score zylon's parser output against the true resume labels.

    Reads zylon's xml label files and, for the same file names, the true
    xml label files from the raw dataset folder. Education institutions,
    education majors, employer names and job titles are pulled out of both
    sets, tokenised into words and lower-cased, and every zylon list is
    then compared with its true counterpart through self.score_matches
    (a higher score is better).

    parameters: none

    return: edu_insts_match_score, edu_majors_match_score,
            emp_names_match_score, emp_jtitles_match_score
    """
    extractor = Extractor()
    labels_dir = self.__zylon_parser_labels_folder
    names = extractor.populate_file_names(labels_dir)
    predicted_trees = extractor.read_resume_labels(labels_dir, names)
    reference_trees = extractor.read_resume_labels(
        self.__dataset_raw_folder, names)

    def collect(getter, trees):
        # one entity list per xml tree
        return [getter(tree) for tree in trees]

    # Every tuple below keeps the same field order:
    # institutions, majors, employer names, job titles.
    reference = (
        collect(extractor.get_edu_institutions, reference_trees),
        collect(extractor.get_edu_majors, reference_trees),
        collect(extractor.get_company_names, reference_trees),
        collect(extractor.get_job_titles, reference_trees),
    )
    predicted = (
        collect(extractor.get_edu_institutions_zy, predicted_trees),
        collect(extractor.get_edu_majors_zy, predicted_trees),
        collect(extractor.get_company_names_zy, predicted_trees),
        collect(extractor.get_job_titles_zy, predicted_trees),
    )

    tokeniser = Tokeniser()

    def normalise(docs):
        # split entity lines into words, then lower-case them
        return tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(docs))

    reference = [normalise(docs) for docs in reference]
    predicted = [normalise(docs) for docs in predicted]

    return tuple(
        self.score_matches(found, expected)
        for found, expected in zip(predicted, reference)
    )
def get_ies_scores(self):
    """
    Score the trained IES tagger against the true resume labels.

    Steps, all driven from the folder held in self.__ies_accuracy_test:
      1. list the files, keep those with valid extensions, read their
         content through extractor.read_resume_content_tika_api and drop
         the resumes reported as empty;
      2. read the true xml labels for the remaining files and extract
         education institutions, education majors, employer names and
         job titles from them;
      3. load the CrfSuite tagger, annotate every remaining file with
         Annotator.annotate_using_trained_model, tag it, and extract the
         same four entity kinds from the predicted entity lists;
      4. tokenise into words and lower-case both sides, then compare each
         predicted list with its true counterpart via self.score_matches.

    The four scores are printed (as before) and also returned, so that
    this method is consistent with get_zylon_parser_scores and callers
    can use the values programmatically.

    parameters: none

    return: edu_insts_match_score, edu_majors_match_score,
            emp_names_match_score, emp_jtitles_match_score
    """
    extractor = Extractor()
    test_folder = self.__ies_accuracy_test

    ies_filenames = extractor.populate_file_names(test_folder)
    ies_filenames = extractor.filter_by_valid_exts(ies_filenames)
    filenames, resume_content = extractor.read_resume_content_tika_api(
        ies_filenames, test_folder)
    # Only the surviving file names are used below; the filtered content
    # itself is not needed because each file is re-read by the annotator.
    filenames, _ = extractor.remove_empty_resumes(filenames, resume_content)
    resume_labels = extractor.read_resume_labels(test_folder, filenames)

    true_edu_insts = [
        extractor.get_edu_institutions(xml_tree) for xml_tree in resume_labels
    ]
    true_edu_majors = [
        extractor.get_edu_majors(xml_tree) for xml_tree in resume_labels
    ]
    true_emp_names = [
        extractor.get_company_names(xml_tree) for xml_tree in resume_labels
    ]
    true_emp_jtitles = [
        extractor.get_job_titles(xml_tree) for xml_tree in resume_labels
    ]

    cs = CrfSuite()
    cs.load_tagger()
    annotator = Annotator()
    # NOTE(review): filename[0] + filename[1] is presumably (name, extension)
    # as produced by the extractor -- confirm against Extractor.
    annotated_resumes = [
        annotator.annotate_using_trained_model(
            test_folder + self.__seperator + filename[0] + filename[1])
        for filename in filenames
    ]
    predicted_entity_list = [
        cs.tag_doc(resume) for resume in annotated_resumes
    ]

    ies_edu_insts = [
        extractor.get_edu_institutions_from_list(entity_list)
        for entity_list in predicted_entity_list
    ]
    ies_edu_majors = [
        extractor.get_edu_major_from_list(entity_list)
        for entity_list in predicted_entity_list
    ]
    ies_emp_names = [
        extractor.get_company_names_from_list(entity_list)
        for entity_list in predicted_entity_list
    ]
    ies_emp_jtitles = [
        extractor.get_company_position_from_list(entity_list)
        for entity_list in predicted_entity_list
    ]

    tokeniser = Tokeniser()

    def normalise(docs):
        # split entity lines into words, then lower-case them
        return tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(docs))

    true_edu_insts = normalise(true_edu_insts)
    true_edu_majors = normalise(true_edu_majors)
    true_emp_names = normalise(true_emp_names)
    true_emp_jtitles = normalise(true_emp_jtitles)
    ies_edu_insts = normalise(ies_edu_insts)
    ies_edu_majors = normalise(ies_edu_majors)
    ies_emp_names = normalise(ies_emp_names)
    ies_emp_jtitles = normalise(ies_emp_jtitles)

    edu_insts_match_score = self.score_matches(ies_edu_insts, true_edu_insts)
    edu_majors_match_score = self.score_matches(ies_edu_majors,
                                                true_edu_majors)
    emp_names_match_score = self.score_matches(ies_emp_names, true_emp_names)
    emp_jtitles_match_score = self.score_matches(ies_emp_jtitles,
                                                 true_emp_jtitles)

    print(edu_insts_match_score)
    print(edu_majors_match_score)
    print(emp_names_match_score)
    print(emp_jtitles_match_score)

    # Previously the scores were only printed; returning them as well keeps
    # existing callers working and matches get_zylon_parser_scores.
    return (edu_insts_match_score, edu_majors_match_score,
            emp_names_match_score, emp_jtitles_match_score)
class Annotator:
    """
    Annotates tokenised resume documents.

    Training documents are tagged with their true entity labels (taken from
    the accompanying xml structure), default entity tags, POS tags and
    non-local NER tags. A single file can also be prepared for tagging by an
    already trained model via annotate_using_trained_model.
    """

    # entity tags handed to the tagger when a label is matched in a document
    __job_position_tag = "EMP-POS"
    __job_company_tag = "EMP-COMP"
    __education_course_tag = "EDU-MAJOR"
    __education_institution_tag = "EDU-INST"

    def __init__(self):
        self.__extractor = Extractor()
        self.__tokeniser = Tokeniser()
        self.__tagger = Tagger()
        self.__dataset = Dataset()
        self.__logger = Logger()

    def prepare_dataset(self, nr_of_docs=-1):
        """
        Read the raw files, tokenise and annotate them, then save the dataset.

        nr_of_docs: forwarded unchanged to the extractor's read_raw_files
        """
        resumes, labels = self.__extractor.read_raw_files(nr_of_docs)
        tokenised = self.__tokeniser.tokenise_doclines_to_words(
            self.__tokeniser.tokenise_docs_to_lines(resumes))
        self.__dataset.resume_content = self.annotate_docs(tokenised, labels)
        self.__dataset.save()

    def annotate_docs(self, resumes, labels):
        """
        Annotate every resume with its true labels, POS tags and NER tags.

        resumes: list of resume docs, tokenised by line and by word
        labels: xml structures holding the labels; labels[i] belongs to
                resumes[i]
        return: the list of annotated resumes
        """
        log = self.__logger.println
        log("annotating resumes")
        annotated = []
        for position, resume in enumerate(resumes, start=1):
            annotated.append(self.annotate_doc(resume, labels[position - 1]))
            log("annotating resume %s/%s with true labels and pos tags" %
                (position, len(resumes)))
        # the non-local NER tagger is run once over the whole dataset
        # (rather than per document) for speed
        annotated = self.__tagger.nonlocal_ner_tag(annotated)
        log("completed annotating resumes")
        return annotated

    def annotate_doc(self, doc, labels):
        """
        Annotate one resume with entity tags and POS tags.

        doc: a single resume document with token strings in each list slot
        labels: xml structure storing the pre-extracted information
        return: the prepared and tagged document
        """
        extractor = self.__extractor
        # (entities, tag) pairs in the order they are matched;
        # more label kinds can be added here
        labelled_entities = (
            (extractor.get_job_titles(labels), self.__job_position_tag),
            (extractor.get_company_names(labels), self.__job_company_tag),
            (extractor.get_edu_majors(labels), self.__education_course_tag),
            (extractor.get_edu_institutions(labels),
             self.__education_institution_tag),
        )
        tagged = self.__tagger.prepare_doc(doc)
        for entities, tag in labelled_entities:
            tagged = self.__match_entity(tagged, entities, tag)
        tagged = self.__tagger.add_default_entity_tags(tagged)
        return self.__tagger.pos_tag(tagged)

    def __match_entity(self, doc, entity_list, tag):
        """
        Pass every entity of entity_list to the tagger's match_label.

        doc: resume doc to be annotated
        entity_list: list of labels to be matched in doc
        tag: tag to be assigned if a match is found
        return: the doc as returned by the last match_label call
        """
        for label in entity_list:
            doc = self.__tagger.match_label(doc, label, tag)
        return doc

    def annotate_using_trained_model(self, filepath):
        """
        Read the file at filepath and annotate it so that it can be tagged.

        Ideally used as a one off for testing. Only the first document of
        the tokenised content is prepared.

        filepath: path to the resume
        return: the prepared document with POS and non-local NER tags
        """
        content = self.__extractor.read_resume_content(filepath)
        by_line = self.__tokeniser.tokenise_docs_to_lines(content)
        by_word = self.__tokeniser.tokenise_doclines_to_words(by_line)
        prepared = self.__tagger.pos_tag(
            self.__tagger.prepare_doc(by_word[0]))
        return self.__tagger.nonlocal_ner_tag([prepared])[0]