import os
import tempfile

import requests
import textract
from google.cloud import firestore
from resume_parser import resumeparse

client = firestore.Client()


def hello_firestore(event, context):
    # Work out which Firestore document triggered this function.
    path_parts = context.resource.split('/documents/')[1].split('/')
    collection_path = path_parts[0]
    document_path = '/'.join(path_parts[1:])
    affected_doc = client.collection(collection_path).document(document_path)

    file_url = event["value"]["fields"]["url"]["stringValue"]
    affected_doc.update({
        u'checking': "checking",
        u'gotUrl': file_url,
    })

    # Download the resume to a temp file so the parsers can read it from disk.
    response = requests.get(file_url)
    file_name = os.path.join(tempfile.gettempdir(), "metadata.pdf")
    with open(file_name, 'wb') as f:
        f.write(response.content)

    text = textract.process(file_name, method='pdfminer', encoding='ascii')
    data = resumeparse.read_file(file_name)
    affected_doc.update({u'resumeData': data})
    affected_doc.update({u'original': file_url})
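# For local testing it helps to see the payload shape this trigger expects.
# Below is a minimal sketch of a fake event/context pair; the project ID,
# collection, document ID, and URL are placeholders, not values from the
# original code. The function reads event["value"]["fields"]["url"]
# ["stringValue"], so any test payload must follow that nesting.
class FakeContext:
    resource = ("projects/my-project/databases/(default)/"
                "documents/resumes/abc123")

fake_event = {
    "value": {
        "fields": {
            "url": {"stringValue": "https://example.com/resume.pdf"}
        }
    }
}

# hello_firestore(fake_event, FakeContext())  # would download and parse the PDF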
def extract_fields(txt):
    # NOTE: the txt argument is currently unused; a hardcoded path is parsed.
    resume_fields = resumeparse.read_file(
        '/home/ebabu/Downloads/Sanidhya_CV-converted.docx')
    return resume_fields

# def main():
#     text = extract_text_from_docx('/home/dell/Downloads/Sanidhya_CV-converted.docx')
#     experience = extract_experience(text)
# main()
def index():
    if request.method == 'GET':
        return "Hello"
    # request.json is a property in Flask; use get_json() to read the body.
    content = request.get_json()
    data = resumeparse.read_file('/content/Anubhab_Cover letter.pdf')
    result = data
    return jsonify({'solution_text': result})
# from pyresparser import ResumeParser
from resume_parser import resumeparse

if __name__ == '__main__':
    # data = ResumeParser(r'CV_AliDoggaz.pdf').get_extracted_data()
    data = resumeparse.read_file('CV_AliDoggaz.pdf')
    print(data)
from resume_parser import resumeparse
import os
import warnings

warnings.simplefilter('ignore')

path = 'C:/Users/91884/Desktop/OCR_Resume/OCR_Resume/'
resumes = os.listdir(path)

c = 0
try:
    # Parse only resumes 8 and 9 from the folder.
    for i in range(8, 10):
        print(resumes[i])
        data = resumeparse.read_file(f'{path}{resumes[i]}')
        print(data['skills'])
        c += 1
except Exception:
    pass

print(c)
print(len(resumes) + 1 - 9)  # 8
# All Resume Parser dependencies must be installed.
# Java must be running.
from resume_parser import resumeparse

data = resumeparse.read_file(r"C:\Users\some_\Downloads\Profile.pdf")
# PS: the first run takes a while.
def home():
    filename = request.args.get("filename")
    data = resumeparse.read_file('pdfs/' + filename)

    name = data['name'].lower().title()
    email = data['email']
    phone = data['phone']
    try:
        school = data['university'][0].title()
    except Exception:
        school = ""

    # Keep at most the first six skills, stripped of whitespace.
    skills = []
    count = 0
    for x in data['skills']:
        skills.append(x.strip())
        if count == 5:
            break
        count += 1
    skills = list(filter(None, skills))
    skillstr = ', '.join(skills)

    matchings = []
    listings = [[]]
    best_jobs = []
    concentration = []
    organization, job_type, job_title, job_description, location = [], [], [], [], []

    # Converting the pdf to a txt file for the matching step
    def pdf2txt(PDFfile, TXTfile):
        res_mgr = PDFResourceManager()
        data = io.StringIO()
        with open(PDFfile, 'rb') as in_file:
            TxtConverter = TextConverter(res_mgr, data, laparams=LAParams())
            interpreter = PDFPageInterpreter(res_mgr, TxtConverter)
            for page in PDFPage.get_pages(in_file):
                interpreter.process_page(page)
        txt = data.getvalue()
        with open(TXTfile, 'w') as f:
            f.write(txt)

    # Function that searches for the best matches in the jobs dataset
    def find_matches(user_resume):
        with open(user_resume, 'r') as resume:
            with open('small_jobs_dataset.csv', 'r') as job_listings_csv:
                # Splitting dataset rows by delimiter
                csv_reader = csv.reader(job_listings_csv)
                count = 0
                # Reading the user's resume into a variable
                resume_var = resume.read()
                for row in csv_reader:
                    str_row = str(row)
                    job_as_list = pp.commaSeparatedList.parseString(
                        str_row).asList()
                    # Storing the job description
                    job_desc = job_as_list[4]
                    if count > 0:  # skip the header row
                        # Feature extraction on job description and resume
                        text = [resume_var, job_desc]
                        count_vec = CountVectorizer()
                        count_matrix = count_vec.fit_transform(text)
                        match = cosine_similarity(count_matrix)[0][1] * 100
                        matchings.append((match, count))
                        listings.append(job_as_list)
                    count += 1
        # Sorting so the jobs with the highest match come first
        matchings.sort(reverse=True)
        # Storing the jobs with the highest match and the user's concentration
        for i in range(5):
            match = matchings[i]
            job = listings[int(match[1])]
            split_string = job
            organization.append(split_string[0].strip('\"'))
            job_type.append(split_string[3].strip('\"'))
            job_title.append(split_string[7].strip('\"'))
            job_description.append(split_string[4].strip('\"'))
            location.append(split_string[6].strip('\"'))
            if i == 0:
                job_industry = split_string[3].strip('\"')
                concentration.append(job_industry)

    # The resume needs to be converted from pdf to txt
    PDFfile = 'pdfs/' + filename
    TXTfile = 'parsed_resume.txt'
    pdf2txt(PDFfile, TXTfile)
    find_matches('parsed_resume.txt')

    # Collecting the top three results
    company_names = []
    company_desc = []
    company_match = []
    for i in range(3):
        try:
            company_names.append(re.sub(r'[^A-Za-z0-9 ]+', '', organization[i]))
            company_desc.append(job_title[i].strip())
            matchc = round(matchings[i][0])
            company_match.append(matchc)
        except Exception:
            pass

    try:
        avg = 0
        for x in company_match:
            if x < 80:
                avg += x + 20
            else:
                avg += x
        avg = round(avg / 3)
    except Exception:
        print("didn't work")

    import os
    os.remove("parsed_resume.txt")
    return render_template("results.html", name=name, email=email, phone=phone,
                           school=school, skills=skillstr,
                           company_desc=company_desc,
                           company_match=company_match,
                           company_names=company_names, avg=avg)
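# The match percentage in home() comes from bag-of-words cosine similarity.
# Here is a self-contained sketch of that step in isolation; the resume and
# job-description strings below are made up for illustration.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

resume_text = "python flask sql data analysis machine learning"
job_desc = "looking for a python developer with sql and flask experience"

count_matrix = CountVectorizer().fit_transform([resume_text, job_desc])
# cosine_similarity returns a 2x2 matrix; entry [0][1] is resume-vs-job.
match = cosine_similarity(count_matrix)[0][1] * 100
print(f"match: {match:.1f}%")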
from resume_parser import resumeparse
# nltk.download('popular')

data = resumeparse.read_file(
    '/home/ebabu/Downloads/SrishtiJain 2021-converted.docx')
print(data)

# Alternative: pyresparser
# from pyresparser import ResumeParser
# data = ResumeParser('/path/to/resume/file').get_extracted_data()
# resume_fields = ResumeParser('/home/ebabu/Downloads/Sanidhya_CV-converted.docx').get_extracted_data()
# print(resume_fields)
# This file contains Python code to extract skills from an uploaded resume
# using natural language processing.
import json
import re
import os
import sys
from itertools import filterfalse

from nltk.corpus import stopwords
from resume_parser import resumeparse

stopwords = set(stopwords.words('english'))

file_name = "./backend-engine/" + sys.argv[1]
data = resumeparse.read_file(file_name)
skills = data["skills"]

# Remove all non-alphanumeric characters and strip
# leading and trailing whitespace.
for i in range(len(skills)):
    skills[i] = re.sub(r'[^0-9a-zA-Z\s]+', '', skills[i])
    skills[i] = skills[i].strip()

# Remove entries with more than two words.
skills[:] = filterfalse(lambda elm: len(elm.split()) > 2, skills)

# Remove any stopwords.
for i in range(len(skills)):
    skills[i] = ' '.join(
        filter(lambda w: w not in stopwords, skills[i].split()))

# Remove empty strings.
skills[:] = filterfalse(lambda elm: len(elm) == 0, skills)
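# For illustration, the same cleaning steps applied to a hardcoded, made-up
# skills list (reuses re, filterfalse, and stopwords from above):
sample = ['  C++!! ', 'machine learning', 'and', 'natural language processing', '']
sample = [re.sub(r'[^0-9a-zA-Z\s]+', '', s).strip() for s in sample]
sample[:] = filterfalse(lambda elm: len(elm.split()) > 2, sample)
sample = [' '.join(w for w in s.split() if w not in stopwords) for s in sample]
sample[:] = filterfalse(lambda elm: len(elm) == 0, sample)
print(sample)  # ['C', 'machine learning']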
def find_cv():
    """
    :param: N/A
    :return: The name of the file that is most likely to be the candidate's
        resume/CV, as well as the candidate's email address, phone number,
        and name.
    """
    # Special cases: the candidate's folder is empty or contains only one file.
    if len(os.listdir('PDF_Converted_Files')) == 0:
        return
    if len(os.listdir('PDF_Converted_Files')) == 1:
        return os.listdir('PDF_Converted_Files')[0]

    # If a file name contains the words CV/Resume/etc., return that file directly.
    for name in os.listdir('PDF_Converted_Files'):
        if name.lower().startswith("cv"):
            data = resumeparse.read_file('PDF_Converted_Files' + os.sep + name)
            return name, data['email'], data['phone'], data['name']
        for keyword in [
                'cv.', 'resume', 'résumé', 'curriculum vitae', 'curriculumvitae'
        ]:
            if keyword in name.lower():
                data = resumeparse.read_file(
                    'PDF_Converted_Files' + os.sep + name)
                return name, data['email'], data['phone'], data['name']

    # Otherwise, attribute a score to each file. The score estimates the
    # probability of that file being the candidate's resume.
    # "maxi" stores the highest score reached so far.
    maxi = 0

    # Loop over all files and score each one. If a file's score is >= maxi,
    # update maxi and store the email, phone number, and full name found in
    # that file.
    for name in os.listdir('PDF_Converted_Files'):
        # Parse the file, looking for relevant info (email, skills, education, etc.).
        score = 0
        data = resumeparse.read_file('PDF_Converted_Files' + os.sep + name)

        # Increase the score for each piece of relevant info found in the file.
        # Skills are the strongest signal that this file is the candidate's
        # resume, so they are worth the most (5 points).
        if data['skills']:
            score += 5
        if data['email']:
            score += 1
        if data['phone']:
            score += 1
        if data['degree']:
            score += 3
        if data['university']:
            score += 2
        if data['total_exp']:
            score += 2

        if score >= maxi:
            cv_name, email, phone, FullName = (
                name, data['email'], data['phone'], data['name'])
            maxi = score

    # Return the file with the highest score (highest chance of being the resume).
    return cv_name, email, phone, FullName
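# find_cv() can return None, a single filename, or a 4-tuple, so callers
# should handle all three shapes. A small usage sketch (the folder contents
# are hypothetical):
result = find_cv()
if result is None:
    print("No files found in PDF_Converted_Files.")
elif isinstance(result, tuple):
    cv_name, email, phone, full_name = result
    print(f"Likely CV: {cv_name} -> {full_name}, {email}, {phone}")
else:
    # Only one file in the folder; find_cv returns just its name.
    print(f"Single file found: {result}")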