def extract_features(res_text, xml_text):
    """Build a feature mapping for a single resume.

    Term counts of stemmed, stopword-filtered tokens from `res_text` are
    combined with two XML-derived features: the (normalized) institution
    name and the greatest degree-level attribute found in `xml_text`.

    Returns a Counter keyed by stemmed term, plus the special keys
    'university' and 'degree_level' when the XML provides them.
    """
    doc = etree.fromstring(xml_text)

    # Stem every token longer than two characters that is not a stopword.
    stems = []
    for token in nltk.word_tokenize(res_text.lower()):
        if token in stopwords or len(token) <= 2:
            continue
        stems.append(st.stem(token))
    features = Counter(stems)

    institutions = doc.xpath('//institution/text()')
    if institutions:
        raw_name = institutions[0]
        normalized = extract_univ(raw_name, univ_dict, univ_normalize)
        # Fall back to the raw institution string when normalization fails.
        features['university'] = normalized or raw_name

    levels = doc.xpath('//degree/@level')
    if levels:
        # Keep the lexicographically greatest level attribute value.
        features['degree_level'] = max(levels)

    return features
def analyze(): # global results_json global university global tree_json if request.method: # Get and save file from browser upload files = request.files['file'] if files: filename = str(files.filename) extension = filename.rsplit('.', 1)[1] filename_without_extension = filename.rsplit('.', 1)[0] files.save(os.path.join(iHire.config['UPLOAD_FOLDER'], filename)) if extension == 'pdf': text_from_pdf = extract_text_from_pdf(filename) text_from_pdf = text_from_pdf.replace('\xc2\xa0', ' ') with open(filename_without_extension + '.txt', 'wb') as write_file: write_file.write(text_from_pdf) textfile_name = filename_without_extension + '.txt' else: textfile_name = filename university = extract_univ(open(textfile_name).read(), univ_dict, univ_normalize) print filename # create_data_for_graph(university, "", skills_employer, univ_major_number, major_code_lookup) tree_json = create_data_for_tree( university, "", skills_employer_tree, univ_major_number, major_code_lookup, employer_second_degree_tree ) resume_text = [open(textfile_name).read()] predicted_decision = model.decision_function(resume_text) top_predictions, normalized_prediction_score = get_top_predictions(predicted_decision) out = dict() skills_map_with_percent_list = [] titles = sorted(skills_map_with_percent.keys()) for title in titles: temp_skill_map = dict() temp_skill_map[title] = skills_map_with_percent[title] skills_map_with_percent_list.append(temp_skill_map) out["university"] = university out["skills_map"] = skills_map_with_percent_list out["titles"] = titles out["candidate_skills"] = dict() out["title_data"] = dict() try: tokens = nltk.word_tokenize(resume_text[0].lower()) except UnicodeDecodeError: tokens = nltk.word_tokenize(resume_text[0].decode('utf-8').lower()) skill_score = [] for pred in top_predictions: try: top15 = skills_map_with_percent[title_title_map[pred]]["skills"][:15] except KeyError: top15 = [] temp_skill_list = [t for t in top15 if len(t) > 1 and t.lower() in tokens] 
out["candidate_skills"][title_title_map[pred]] = temp_skill_list out["title_data"][title_title_map[pred]] = titles_data[title_title_map[pred]] skill_score.append(int(len(temp_skill_list) / 15.0 * 100.0)) final_score = [sum(x)/2 for x in zip(normalized_prediction_score, skill_score)] final_titles_list = [] sorted_score_indexes = [i[0] for i in sorted(enumerate(final_score), key=lambda x:x[1], reverse=True)] for s in sorted_score_indexes: final_titles_list.append(title_title_map[top_predictions[s]]) final_score_sorted = sorted(final_score, reverse=True) out["final_prediction_list"] = final_titles_list out["final_score_sorted"] = final_score_sorted out["tree_json"] = json.dumps(tree_json) print final_titles_list[:5] print final_score_sorted[:5] if os.path.isfile(textfile_name): os.remove(textfile_name) if os.path.isfile(filename): os.remove(filename) # results_json = OrderedDict(out) return json.dumps(OrderedDict(out))
# Collect (university, major) pairs and candidate names from each parsed
# XML resume in `xml_directory`.
# NOTE(review): the `try:` below is truncated in this view -- its `except`
# clause lies outside the visible chunk.
for f in files:
    try:
        xml = etree.parse(xml_directory + '/' + f)
        # Full candidate name; collected only to de-duplicate resumes.
        name = xml.xpath('//givenname/text()')[0] + ' ' + xml.xpath('//surname/text()')[0]
        if name not in names:
            names.append(name)
        education = xml.xpath('//education')[0]
        # NOTE(review): '//school' is an absolute XPath and searches the
        # whole document rather than only this <education> node -- confirm
        # that is intended.
        schools = education.xpath('//school')
        resume_id = f.split('.')[0]
        temp_univ_major_list = []
        for school in schools:
            # Plain-text content of the <school> element.
            school_text = stripxml(etree.tostring(school))
            #institution = school.xpath('institution/text()')[0]
            institution = extract_univ( school_text, univ_dict, univ_normalize)
            # Replace non-alphanumeric characters with spaces.
            institution = re.sub ('[^A-Za-z0-9 ]+',' ',str(institution))
            # NOTE(review): single-space pattern replaced by a single space
            # is a no-op as written -- presumably meant to collapse runs of
            # spaces; confirm against the original file.
            institution = re.sub (' ',' ',str(institution))
            #print institution
            # Map known aliases onto a canonical university name.
            if institution.lower() in univ_normalize:
                #print "NORMALIZED"
                institution = univ_normalize[institution]
            degree_level = school.xpath('degree/@level')[0]
            degree = school.xpath('degree/text()')[0]
            major_code = str(school.xpath('major/@code')[0])
            major = school.xpath('major/text()')[0]
            temp_univ_major_list.append(str(institution + '_' + major_code).lower())
            # Count previously unseen university_major pairs.
            if str(institution + '_' + major_code).lower() not in univ_major_list:
                counter += 1
def analyze(): # global results_json global university global tree_json if request.method: # Get and save file from browser upload files = request.files['file'] if files: filename = str(files.filename) extension = filename.rsplit('.', 1)[1] filename_without_extension = filename.rsplit('.', 1)[0] files.save(os.path.join(iHire.config['UPLOAD_FOLDER'], filename)) if extension == 'pdf': text_from_pdf = extract_text_from_pdf(filename) text_from_pdf = text_from_pdf.replace('\xc2\xa0', ' ') with open(filename_without_extension + '.txt', 'wb') as write_file: write_file.write(text_from_pdf) textfile_name = filename_without_extension + '.txt' else: textfile_name = filename university = extract_univ( open(textfile_name).read(), univ_dict, univ_normalize) print filename # create_data_for_graph(university, "", skills_employer, univ_major_number, major_code_lookup) tree_json = create_data_for_tree(university, "", skills_employer_tree, univ_major_number, major_code_lookup, employer_second_degree_tree) resume_text = [open(textfile_name).read()] predicted_decision = model.decision_function(resume_text) top_predictions, normalized_prediction_score = get_top_predictions( predicted_decision) out = dict() skills_map_with_percent_list = [] titles = sorted(skills_map_with_percent.keys()) for title in titles: temp_skill_map = dict() temp_skill_map[title] = skills_map_with_percent[title] skills_map_with_percent_list.append(temp_skill_map) out["university"] = university out["skills_map"] = skills_map_with_percent_list out["titles"] = titles out["candidate_skills"] = dict() out["title_data"] = dict() try: tokens = nltk.word_tokenize(resume_text[0].lower()) except UnicodeDecodeError: tokens = nltk.word_tokenize( resume_text[0].decode('utf-8').lower()) skill_score = [] for pred in top_predictions: try: top15 = skills_map_with_percent[ title_title_map[pred]]["skills"][:15] except KeyError: top15 = [] temp_skill_list = [ t for t in top15 if len(t) > 1 and t.lower() in tokens ] 
out["candidate_skills"][ title_title_map[pred]] = temp_skill_list out["title_data"][title_title_map[pred]] = titles_data[ title_title_map[pred]] skill_score.append(int(len(temp_skill_list) / 15.0 * 100.0)) final_score = [ sum(x) / 2 for x in zip(normalized_prediction_score, skill_score) ] final_titles_list = [] sorted_score_indexes = [ i[0] for i in sorted( enumerate(final_score), key=lambda x: x[1], reverse=True) ] for s in sorted_score_indexes: final_titles_list.append(title_title_map[top_predictions[s]]) final_score_sorted = sorted(final_score, reverse=True) out["final_prediction_list"] = final_titles_list out["final_score_sorted"] = final_score_sorted out["tree_json"] = json.dumps(tree_json) print final_titles_list[:5] print final_score_sorted[:5] if os.path.isfile(textfile_name): os.remove(textfile_name) if os.path.isfile(filename): os.remove(filename) # results_json = OrderedDict(out) return json.dumps(OrderedDict(out))
# Parse one XML resume `f` and collect its (university, major) pairs.
# NOTE(review): this fragment is truncated at both ends in this view --
# the enclosing loop that binds `f` precedes it and the `except` clause
# for this `try` follows it.
try:
    xml = etree.parse(xml_directory + '/' + f)
    # Full candidate name; collected only to de-duplicate resumes.
    name = xml.xpath('//givenname/text()')[0] + ' ' + xml.xpath(
        '//surname/text()')[0]
    if name not in names:
        names.append(name)
    education = xml.xpath('//education')[0]
    # NOTE(review): '//school' is an absolute XPath and searches the whole
    # document rather than only this <education> node -- confirm intent.
    schools = education.xpath('//school')
    resume_id = f.split('.')[0]
    temp_univ_major_list = []
    for school in schools:
        # Plain-text content of the <school> element.
        school_text = stripxml(etree.tostring(school))
        #institution = school.xpath('institution/text()')[0]
        institution = extract_univ(school_text, univ_dict, univ_normalize)
        # Replace non-alphanumeric characters with spaces.
        institution = re.sub('[^A-Za-z0-9 ]+', ' ', str(institution))
        # NOTE(review): single-space pattern replaced by a single space is
        # a no-op as written -- presumably meant to collapse runs of
        # spaces; confirm against the original file.
        institution = re.sub(' ', ' ', str(institution))
        #print institution
        # Map known aliases onto a canonical university name.
        if institution.lower() in univ_normalize:
            #print "NORMALIZED"
            institution = univ_normalize[institution]
        degree_level = school.xpath('degree/@level')[0]
        degree = school.xpath('degree/text()')[0]
        major_code = str(school.xpath('major/@code')[0])
        major = school.xpath('major/text()')[0]
        temp_univ_major_list.append(
            str(institution + '_' + major_code).lower())