Exemplo n.º 1
0
def extract_info(cv_dir):
    """Collect one information record per supported CV file in a directory.

    Entries whose name ends in ``.pdf``, ``.doc`` or ``.docx`` are converted
    with ``convert2txt.extract_text``; ``.txt`` entries are read directly.
    Every other entry is skipped. For each supported file a record is
    created with ``create_dic(file)`` and passed, together with the text,
    to ``get_info`` (whose return value is ignored -- presumably it fills
    the record in place; confirm against ``get_info``).

    Args:
        cv_dir: Directory to scan. A trailing path separator is optional.

    Returns:
        A list holding the ``create_dic`` result of every supported file,
        in ``os.listdir`` order.
    """
    extracted_info = []
    for file in os.listdir(cv_dir):
        # Join instead of concatenating so that a cv_dir without a trailing
        # separator still yields a valid path.
        path = os.path.join(cv_dir, file)
        converted_ext = next(
            (ext for ext in ('.pdf', '.doc', '.docx') if file.endswith(ext)),
            None,
        )
        if converted_ext is not None:
            text = convert2txt.extract_text(path, converted_ext)
        elif file.endswith('.txt'):
            # utf-8-sig drops a leading byte-order mark when present.
            with open(path, encoding='utf-8-sig') as f:
                text = f.read()
        else:
            continue
        result = create_dic(file)
        get_info(text, result)
        extracted_info.append(result)
    return extracted_info
Exemplo n.º 2
0
def extract_mobile_numbers(cv_dir, word_limit):
    """Find phone-number-like strings near the start of every supported CV.

    Entries whose name ends in ``.pdf``, ``.doc`` or ``.docx`` are converted
    with ``convert2txt.extract_text``; ``.txt`` entries are read directly.
    Every other entry is skipped. Only the first ``word_limit``
    whitespace-separated words of each text are searched.

    Args:
        cv_dir: Directory to scan. A trailing path separator is optional.
        word_limit: Number of leading words to search (used as a slice
            bound, so ``None`` searches the whole text).

    Returns:
        A dict mapping each supported file name to the list of matches
        found in it (an empty list when nothing matched).
    """
    pattern = re.compile(
        r"\(?\+?[8]{2}?0?\)?\0?-?0?[0-9]{3}-?[0-9]{3}-?[0-9]{4}|[0-9]{4}-?[0-9]{3}-?[0-9]{4}|[0-9]{5}-[0-9]{6}"
    )
    extracted_mobile_numbers = {}
    for file in os.listdir(cv_dir):
        # Join instead of concatenating so that a cv_dir without a trailing
        # separator still yields a valid path.
        path = os.path.join(cv_dir, file)
        converted_ext = next(
            (ext for ext in ('.pdf', '.doc', '.docx') if file.endswith(ext)),
            None,
        )
        if converted_ext is not None:
            text = convert2txt.extract_text(path, converted_ext)
        elif file.endswith('.txt'):
            # utf-8-sig drops a leading byte-order mark when present.
            with open(path, encoding='utf-8-sig') as f:
                text = f.read()
        else:
            continue
        # Normalise whitespace and keep only the leading words.
        head = ' '.join(text.split()[:word_limit])
        extracted_mobile_numbers[file] = pattern.findall(head)
    return extracted_mobile_numbers
Exemplo n.º 3
0
def extract_emails(cv_dir, word_limit):
    """Find e-mail-like strings near the start of every supported CV.

    Entries whose name ends in ``.pdf``, ``.doc`` or ``.docx`` are converted
    with ``convert2txt.extract_text``; ``.txt`` entries are read directly.
    Every other entry is skipped. Only the first ``word_limit``
    whitespace-separated words of each text are searched.

    Args:
        cv_dir: Directory to scan. A trailing path separator is optional.
        word_limit: Number of leading words to search (used as a slice
            bound, so ``None`` searches the whole text).

    Returns:
        A dict mapping each supported file name to the list of matches
        found in it (an empty list when nothing matched).
    """
    pattern = re.compile(r'[\w\.-]+@[\w\.-]+')
    extracted_emails = {}
    for file in os.listdir(cv_dir):
        # Join instead of concatenating so that a cv_dir without a trailing
        # separator still yields a valid path.
        path = os.path.join(cv_dir, file)
        converted_ext = next(
            (ext for ext in ('.pdf', '.doc', '.docx') if file.endswith(ext)),
            None,
        )
        if converted_ext is not None:
            text = convert2txt.extract_text(path, converted_ext)
        elif file.endswith('.txt'):
            # utf-8-sig drops a leading byte-order mark when present.
            with open(path, encoding='utf-8-sig') as f:
                text = f.read()
        else:
            continue
        # Normalise whitespace and keep only the leading words.
        head = ' '.join(text.split()[:word_limit])
        extracted_emails[file] = pattern.findall(head)
    return extracted_emails
Exemplo n.º 4
0
def wordcloud_a_dir(cv_dir, wc_dir):
    """Call ``text_to_cloud`` for every supported CV file in a directory.

    Entries whose name ends in ``.pdf``, ``.doc`` or ``.docx`` are converted
    with ``convert2txt.extract_text``; ``.txt`` entries are read directly
    with their newline characters removed. Every other entry is skipped.

    Args:
        cv_dir: Directory to scan. A trailing path separator is optional.
        wc_dir: Passed through unchanged as the third argument of
            ``text_to_cloud`` (presumably the output directory -- confirm
            against ``text_to_cloud``).

    Returns:
        None.
    """
    for file in os.listdir(cv_dir):
        # Join instead of concatenating so that a cv_dir without a trailing
        # separator still yields a valid path.
        path = os.path.join(cv_dir, file)
        converted_ext = next(
            (ext for ext in ('.pdf', '.doc', '.docx') if file.endswith(ext)),
            None,
        )
        if converted_ext is not None:
            data = convert2txt.extract_text(path, converted_ext)
        elif file.endswith('.txt'):
            # utf-8-sig drops a leading byte-order mark when present.
            with open(path, encoding='utf-8-sig') as f:
                # NOTE(review): newlines are removed rather than replaced by
                # a space, so words on adjacent lines are glued together.
                # Kept as in the original; confirm this is intended.
                data = f.read().replace('\n', '')
        else:
            continue
        text_to_cloud(data, file, wc_dir)
Exemplo n.º 5
0
def extract_names(cv_dir, word_limit):
    """Collect the entity texts found near the start of every supported CV.

    Entries whose name ends in ``.pdf``, ``.doc`` or ``.docx`` are converted
    with ``convert2txt.extract_text``; ``.txt`` entries are read directly.
    Every other entry is skipped. The first ``word_limit``
    whitespace-separated words of each text are run through
    ``preprocess.process`` and then ``nlp``; the ``.text`` of every item in
    the result's ``ents`` is collected.

    Args:
        cv_dir: Directory to scan. A trailing path separator is optional.
        word_limit: Number of leading words to analyse (used as a slice
            bound, so ``None`` analyses the whole text).

    Returns:
        A dict mapping each supported file name to the list of entity
        texts found in it.
    """
    extracted_names = {}
    for file in os.listdir(cv_dir):
        # Join instead of concatenating so that a cv_dir without a trailing
        # separator still yields a valid path.
        path = os.path.join(cv_dir, file)
        converted_ext = next(
            (ext for ext in ('.pdf', '.doc', '.docx') if file.endswith(ext)),
            None,
        )
        if converted_ext is not None:
            text = convert2txt.extract_text(path, converted_ext)
        elif file.endswith('.txt'):
            # utf-8-sig drops a leading byte-order mark when present.
            with open(path, encoding='utf-8-sig') as f:
                text = f.read()
        else:
            continue
        # Normalise whitespace and keep only the leading words.
        head = ' '.join(text.split()[:word_limit])
        nlp_text = nlp(preprocess.process(head))
        extracted_names[file] = [e.text for e in nlp_text.ents]
    return extracted_names
Exemplo n.º 6
0
def extract_info(cv_dir, word_limit):
    """Build a name / number / e-mail record for every supported CV file.

    Entries whose name ends in ``.pdf``, ``.doc`` or ``.docx`` are converted
    with ``convert2txt.extract_text``; ``.txt`` entries are read directly.
    Every other entry is skipped. E-mails and numbers come from calling
    ``extract_emails`` and ``extract_mobile_numbers`` on the full text. The
    name is built from the first ``word_limit`` words only: they are run
    through ``preprocess.process`` and ``nlp``, and the ``.text`` of every
    item in the result's ``ents`` is joined with single spaces.

    Args:
        cv_dir: Directory to scan. A trailing path separator is optional.
        word_limit: Number of leading words used for the name lookup (used
            as a slice bound, so ``None`` uses the whole text).

    Returns:
        A list of dicts with the keys ``'filename'``, ``'name'``,
        ``'number'`` and ``'email'``, one per supported file, in
        ``os.listdir`` order.
    """
    extracted_info = []
    for file in os.listdir(cv_dir):
        # Join instead of concatenating so that a cv_dir without a trailing
        # separator still yields a valid path.
        path = os.path.join(cv_dir, file)
        converted_ext = next(
            (ext for ext in ('.pdf', '.doc', '.docx') if file.endswith(ext)),
            None,
        )
        if converted_ext is not None:
            text = convert2txt.extract_text(path, converted_ext)
        elif file.endswith('.txt'):
            # utf-8-sig drops a leading byte-order mark when present.
            with open(path, encoding='utf-8-sig') as f:
                text = f.read()
        else:
            continue
        # Contact details are searched in the complete text; only the name
        # lookup below is limited to the leading words.
        emails = extract_emails(text)
        numbers = extract_mobile_numbers(text)
        head = ' '.join(text.split()[:word_limit])
        nlp_text = nlp(preprocess.process(head))
        extracted_info.append({
            'filename': file,
            'name': ' '.join(e.text for e in nlp_text.ents),
            'number': numbers,
            'email': emails,
        })
    return extracted_info