def extract_info(cv_dir):
    """Build one info record for every supported CV file in ``cv_dir``.

    Files ending in ``.pdf``, ``.doc`` or ``.docx`` are converted to text
    with ``convert2txt.extract_text``; ``.txt`` files are read directly as
    ``utf-8-sig``, so a leading byte-order mark is dropped. Every other
    directory entry is skipped.

    For each supported file a record is created with ``create_dic(file)``
    and passed to ``get_info(text, record)``. The return value of
    ``get_info`` is ignored, so it presumably fills the record in place --
    confirm against its definition.

    Args:
        cv_dir: Directory containing the CV files. A trailing path
            separator is optional.

    Returns:
        A list with one record per supported file, in ``os.listdir`` order.
    """
    # NOTE(review): this file defines a second ``extract_info`` that takes
    # ``(cv_dir, word_limit)``; the later definition replaces this one at
    # import time -- confirm which one callers are meant to get.
    converted_suffixes = ('.pdf', '.doc', '.docx')
    extracted_info = []
    for file in os.listdir(cv_dir):
        # os.path.join works with or without a trailing separator on
        # cv_dir; plain ``cv_dir + file`` silently built a wrong path.
        path = os.path.join(cv_dir, file)
        if file.endswith('.txt'):
            with open(path, encoding='utf-8-sig') as f:
                text = f.read()
        else:
            suffix = next(
                (s for s in converted_suffixes if file.endswith(s)), None
            )
            if suffix is None:
                continue  # not a supported CV format
            text = convert2txt.extract_text(path, suffix)
        result = create_dic(file)
        get_info(text, result)
        extracted_info.append(result)
    return extracted_info
def extract_mobile_numbers(cv_dir, word_limit):
    """Find phone-number-like strings near the start of each CV in ``cv_dir``.

    Files ending in ``.pdf``, ``.doc`` or ``.docx`` are converted to text
    with ``convert2txt.extract_text``; ``.txt`` files are read as UTF-8.
    Every other directory entry is skipped. Only the first ``word_limit``
    whitespace-separated words of each document are searched.

    Args:
        cv_dir: Directory containing the CV files. A trailing path
            separator is optional.
        word_limit: Number of leading words of each document to search.

    Returns:
        A dict mapping each supported file name to the list of matches
        (possibly empty) returned by ``pattern.findall``.
    """
    # Three alternatives: a number prefixed with "88" (optionally wrapped
    # in "+", parentheses and zeros), an 11-digit 4-3-4 grouping, and a
    # dashed 5-6 grouping. The pattern is kept exactly as it was.
    pattern = re.compile(
        r"\(?\+?[8]{2}?0?\)?\0?-?0?[0-9]{3}-?[0-9]{3}-?[0-9]{4}|[0-9]{4}-?[0-9]{3}-?[0-9]{4}|[0-9]{5}-[0-9]{6}"
    )
    converted_suffixes = ('.pdf', '.doc', '.docx')
    extracted_mobile_numbers = {}
    for file in os.listdir(cv_dir):
        # os.path.join works with or without a trailing separator on
        # cv_dir; plain ``cv_dir + file`` silently built a wrong path.
        path = os.path.join(cv_dir, file)
        if file.endswith('.txt'):
            with open(path, encoding='utf-8') as f:
                text = f.read()
        else:
            suffix = next(
                (s for s in converted_suffixes if file.endswith(s)), None
            )
            if suffix is None:
                continue  # not a supported CV format
            text = convert2txt.extract_text(path, suffix)
        # Re-joining on single spaces also normalises newlines and tabs
        # before the pattern is applied.
        head = ' '.join(text.split()[:word_limit])
        extracted_mobile_numbers[file] = pattern.findall(head)
    return extracted_mobile_numbers
def extract_emails(cv_dir, word_limit):
    """Find e-mail-like strings near the start of each CV in ``cv_dir``.

    Files ending in ``.pdf``, ``.doc`` or ``.docx`` are converted to text
    with ``convert2txt.extract_text``; ``.txt`` files are read as UTF-8.
    Every other directory entry is skipped. Only the first ``word_limit``
    whitespace-separated words of each document are searched.

    Args:
        cv_dir: Directory containing the CV files. A trailing path
            separator is optional.
        word_limit: Number of leading words of each document to search.

    Returns:
        A dict mapping each supported file name to the list of matches
        (possibly empty) returned by ``pattern.findall``.
    """
    # Deliberately loose: any run of word characters, dots or dashes on
    # both sides of an "@". Trailing punctuation such as a full stop is
    # therefore included in a match.
    pattern = re.compile(r'[\w\.-]+@[\w\.-]+')
    converted_suffixes = ('.pdf', '.doc', '.docx')
    extracted_emails = {}
    for file in os.listdir(cv_dir):
        # os.path.join works with or without a trailing separator on
        # cv_dir; plain ``cv_dir + file`` silently built a wrong path.
        path = os.path.join(cv_dir, file)
        if file.endswith('.txt'):
            with open(path, encoding='utf-8') as f:
                text = f.read()
        else:
            suffix = next(
                (s for s in converted_suffixes if file.endswith(s)), None
            )
            if suffix is None:
                continue  # not a supported CV format
            text = convert2txt.extract_text(path, suffix)
        head = ' '.join(text.split()[:word_limit])
        extracted_emails[file] = pattern.findall(head)
    return extracted_emails
def wordcloud_a_dir(cv_dir, wc_dir):
    """Pass the text of every supported CV in ``cv_dir`` to ``text_to_cloud``.

    Files ending in ``.pdf``, ``.doc`` or ``.docx`` are converted to text
    with ``convert2txt.extract_text``. ``.txt`` files are read as UTF-8
    and have their newline characters removed (not replaced by spaces),
    so words on adjacent lines are glued together. Every other directory
    entry is skipped.

    Args:
        cv_dir: Directory containing the CV files. A trailing path
            separator is optional.
        wc_dir: Forwarded unchanged as the third argument of
            ``text_to_cloud``; presumably the output directory for the
            generated clouds -- confirm against ``text_to_cloud``.

    Returns:
        None.
    """
    converted_suffixes = ('.pdf', '.doc', '.docx')
    for file in os.listdir(cv_dir):
        # os.path.join works with or without a trailing separator on
        # cv_dir; plain ``cv_dir + file`` silently built a wrong path.
        path = os.path.join(cv_dir, file)
        if file.endswith('.txt'):
            with open(path, encoding='utf-8') as f:
                data = f.read().replace('\n', '')
        else:
            suffix = next(
                (s for s in converted_suffixes if file.endswith(s)), None
            )
            if suffix is None:
                continue  # not a supported CV format
            data = convert2txt.extract_text(path, suffix)
        text_to_cloud(data, file, wc_dir)
def extract_names(cv_dir, word_limit):
    """Collect entity texts from the start of each CV in ``cv_dir``.

    Files ending in ``.pdf``, ``.doc`` or ``.docx`` are converted to text
    with ``convert2txt.extract_text``; ``.txt`` files are read as UTF-8.
    Every other directory entry is skipped. The first ``word_limit``
    whitespace-separated words are passed through ``preprocess.process``
    and then ``nlp``; the ``.text`` of every item in the result's
    ``.ents`` is kept. No filtering by entity type happens here, so the
    lists may contain more than person names.

    Args:
        cv_dir: Directory containing the CV files. A trailing path
            separator is optional.
        word_limit: Number of leading words of each document to analyse.

    Returns:
        A dict mapping each supported file name to a list of entity
        strings (possibly empty).
    """
    converted_suffixes = ('.pdf', '.doc', '.docx')
    extracted_names = {}
    for file in os.listdir(cv_dir):
        # os.path.join works with or without a trailing separator on
        # cv_dir; plain ``cv_dir + file`` silently built a wrong path.
        path = os.path.join(cv_dir, file)
        if file.endswith('.txt'):
            with open(path, encoding='utf-8') as f:
                text = f.read()
        else:
            suffix = next(
                (s for s in converted_suffixes if file.endswith(s)), None
            )
            if suffix is None:
                continue  # not a supported CV format
            text = convert2txt.extract_text(path, suffix)
        head = ' '.join(text.split()[:word_limit])
        # ``nlp`` is a module-level callable; presumably a spaCy pipeline
        # -- confirm where it is loaded.
        nlp_text = nlp(preprocess.process(head))
        extracted_names[file] = [e.text for e in nlp_text.ents]
    return extracted_names
def extract_info(cv_dir, word_limit):
    """Build a name/number/email summary for every supported CV in ``cv_dir``.

    Files ending in ``.pdf``, ``.doc`` or ``.docx`` are converted to text
    with ``convert2txt.extract_text``; ``.txt`` files are read as UTF-8.
    Every other directory entry is skipped.

    E-mails and phone numbers are taken from the full document text. The
    name is built from only the first ``word_limit`` whitespace-separated
    words: they go through ``preprocess.process`` and ``nlp``, and the
    ``.text`` of every item in the result's ``.ents`` is joined with
    single spaces.

    Args:
        cv_dir: Directory containing the CV files. A trailing path
            separator is optional.
        word_limit: Number of leading words of each document used for
            name detection.

    Returns:
        A list of dicts with the keys ``'filename'``, ``'name'``,
        ``'number'`` and ``'email'``, one per supported file, in
        ``os.listdir`` order.
    """
    # NOTE(review): this file also defines ``extract_info(cv_dir)``; the
    # later definition replaces the earlier one at import time -- confirm
    # which one callers are meant to get.
    converted_suffixes = ('.pdf', '.doc', '.docx')
    extracted_info = []
    for file in os.listdir(cv_dir):
        # os.path.join works with or without a trailing separator on
        # cv_dir; plain ``cv_dir + file`` silently built a wrong path.
        path = os.path.join(cv_dir, file)
        if file.endswith('.txt'):
            with open(path, encoding='utf-8') as f:
                text = f.read()
        else:
            suffix = next(
                (s for s in converted_suffixes if file.endswith(s)), None
            )
            if suffix is None:
                continue  # not a supported CV format
            text = convert2txt.extract_text(path, suffix)
        # NOTE(review): these helpers are called with the document text
        # only, while the ``extract_emails`` / ``extract_mobile_numbers``
        # visible in this file take ``(cv_dir, word_limit)``. Confirm that
        # text-based variants are the ones in scope here; otherwise these
        # calls raise TypeError.
        emails = extract_emails(text)
        numbers = extract_mobile_numbers(text)
        head = ' '.join(text.split()[:word_limit])
        nlp_text = nlp(preprocess.process(head))
        extracted_info.append({
            'filename': file,
            'name': ' '.join(e.text for e in nlp_text.ents),
            'number': numbers,
            'email': emails,
        })
    return extracted_info