Example #1
import extract_text
import find_name
import overplay
import recognizeEmails
import recognizeName

def unbias(filename, outfile, email=False):
    # dump the resume's text so the recognizers can work on plain text
    filenametxt = "resume.txt"
    extract_text.extract(filename, filenametxt, "text")

    # locate the candidate's name and cover it with a red box overlay
    name = recognizeName.recognizeName(filenametxt)
    pos_size = find_name.find_name(filename, name)
    overplay.overlay(filename, outfile, "bigredbox.pdf", *pos_size)

    if email:
        # optionally redact every e-mail address in the already-redacted copy
        emails = recognizeEmails.recognizeEmails(filenametxt)
        print(emails)
        for e in emails:
            pos_size = find_name.find_name(filename, e)
            overplay.overlay(outfile, outfile, "bigredbox.pdf", *pos_size)
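
A hypothetical call, reusing the sample input and output names that appear in Example #5:

unbias("sample_resumes/sample2.pdf", "resume_unbiased.pdf", email=True)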
Example #2
import json
import cv2
import numpy
from flask import request  # assumption: this handler runs inside a Flask app
from PIL import Image
import demo
import extract_text

def post_pages():
    # decode the uploaded image from the multipart form field 'file'
    img = cv2.imdecode(
        numpy.frombuffer(request.files['file'].read(), numpy.uint8),
        cv2.IMREAD_UNCHANGED)
    bb = demo.main(img[:, :, ::-1])  # reverse channel order (OpenCV loads BGR)
    im_pil = Image.fromarray(img)
    total, text = extract_text.extract(im_pil, bb)
    return json.dumps({'status': 'SUCCESS', 'total': total, 'items': text})
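
The use of request.files implies this handler runs inside a Flask application; a minimal wiring sketch under that assumption (the '/pages' route path is a placeholder):

from flask import Flask

app = Flask(__name__)
app.add_url_rule('/pages', view_func=post_pages, methods=['POST'])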
Example #3
import json
import shutil

import config
import extract_text as et

def save_webpage_text():
    with open(config.reddit_json_file, 'r') as f:
        json_dict = json.loads(f.read())

    try:
        for i, (url, d) in enumerate(json_dict.items()):
            is_reddit_page = False
            is_continue = False
            cat = d['cat']
            url = url.lower()
            # skip entries that already have text from a previous run
            if 'text' in d:
                continue

            # retry support: skip entries that previously failed
            if d.get('webpage_status', '200') != '200':
                d['text'] = ''
                continue

            # default status; overwritten below on failures
            d['webpage_status'] = '200'

            if d['cat'] in config.reddit_skip_cat:
                d['webpage_status'] = 'skip_cat'
                d['text'] = ''
                continue

            for domain in config.reddit_skip_domain:
                if domain in url:
                    is_continue = True
                    d['webpage_status'] = 'skip_domain'
                    d['text'] = ''
                    break

            for ext in [
                    '.pdf', '.doc', '.docx', '.xls', '.jpg', '.jpeg', '.png',
                    '.bmp', '.gif'
            ]:
                if ext in url:
                    is_continue = True
                    d['webpage_status'] = 'skip_ext'
                    d['text'] = ''
                    break

            if is_continue:
                continue

            # relative URLs point at reddit's own pages
            if url.startswith('/'):
                is_reddit_page = True
                url = 'https://www.reddit.com' + url

            resp, err_msg = open_url(cat, url)
            if resp is None:
                d['webpage_status'] = str(err_msg)
                d['text'] = ''
                continue

            soup = parse_resp(resp)
            if soup is None:
                d['webpage_status'] = 'html_parse_error'
                d['text'] = ''
                continue

            if is_reddit_page:
                # prefer the self-post body, falling back to the first comment
                desc_el = soup.select_one('.expando .usertext-body')
                text = desc_el.text.strip() if desc_el else ''
                if not text:
                    first_cmt_el = soup.select_one(
                        '.nestedlisting .usertext-body')
                    text = first_cmt_el.text.strip() if first_cmt_el else ''
                print('==================reddit page======================')
            else:
                print('==================other page======================')
                text = et.extract(soup)[0].strip()
            print(text[:30], '...', text[-30:])
            d['text'] = text
            print('')

            if i > 0 and i % 10 == 0:
                shutil.copy(config.reddit_json_file,
                            config.reddit_json_file + '.bak')
                with open(config.reddit_json_file, 'w') as f:
                    f.write(json.dumps(json_dict))
    finally:
        shutil.copy(config.reddit_json_file, config.reddit_json_file + '.bak')
        with open(config.reddit_json_file, 'w') as f:
            f.write(json.dumps(json_dict))
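
The open_url and parse_resp helpers are not shown above; a minimal sketch of what they might look like, assuming the requests and BeautifulSoup libraries (the names come from the snippet, but the error handling here is a guess):

import requests
from bs4 import BeautifulSoup

def open_url(cat, url, timeout=10):
    # 'cat' is accepted only to match the call site above; a real
    # implementation might use it for per-category headers or logging
    try:
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()
        return resp, None
    except Exception as e:
        return None, e

def parse_resp(resp):
    try:
        return BeautifulSoup(resp.text, 'html.parser')
    except Exception:
        return None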
Example #4
import os
import zipfile
import extract_text as et
import sort_words as sw
import noOEBPS as no

# pick the first EPUB in the working directory (an EPUB is just a ZIP archive)
file = [i for i in os.listdir() if i.endswith(".epub")][0]
#file = "finding_a_friend_-_aurora_productions.epub"
base = os.path.splitext(file)[0]
os.rename(file, base + ".zip")

zip_ref = zipfile.ZipFile(base + ".zip", 'r')

bookname = "newbook"
newbook = "newbookedited"
#create a new empty folder
if not os.path.exists(bookname):
    os.makedirs(bookname)

zip_ref.extractall(bookname)
zip_ref.close()

# find book name, number of pages
pages = 14  # page count is hardcoded for this particular book
et.extract(bookname, base, pages, "booktext.txt")
sw.sortwords('booktext.txt')
no.edithtml("dictwords.txt", bookname, newbook)

#tonebooktext.txt
#booktext.txt
Example #5
import recognizeName
import recognizeEmails
import overplay
import find_name
import extract_text

filenamepdf = "sample_resumes/sample2.pdf"
filenametxt = "resume.txt"

# dump the resume's text so the recognizers can work on plain text
extract_text.extract(filenamepdf, filenametxt, "text")

# locate the candidate's name and cover it with a red box overlay
name = recognizeName.recognizeName(filenametxt)
print(name)
pos_size = find_name.find_name(filenamepdf, name)
overplay.overlay(filenamepdf, "bigredbox.pdf", *pos_size)

# redact every e-mail address in the redacted copy (resume_unbiased.pdf)
emails = recognizeEmails.recognizeEmails(filenametxt)
print(emails)
for e in emails:
    pos_size = find_name.find_name(filenamepdf, e)
    overplay.overlay("resume_unbiased.pdf", "bigredbox.pdf", *pos_size)
Example #6
def unbias(filename):
    # dump the resume's text so the recognizers can work on plain text
    filenametxt = "resume.txt"
    extract_text.extract(filename, filenametxt, "text")
    # locate the candidate's name and cover it with a red box overlay
    name = recognizeName.recognizeName(filenametxt)
    pos_size = find_name.find_name(filename, name)
    overplay.overlay(filename, "bigredbox.pdf", *pos_size)