import extract_text
import find_name
import overplay
import recognizeEmails
import recognizeName


def unbias(filename, outfile, email=False):
    # Dump the PDF's text so the recognizers can work on plain text.
    filenametxt = "resume.txt"
    extract_text.extract(filename, filenametxt, "text")
    # Locate the candidate's name and cover it with a red box overlay.
    name = recognizeName.recognizeName(filenametxt)
    pos_size = find_name.find_name(filename, name)
    overplay.overlay(filename, outfile, "bigredbox.pdf", *pos_size)
    if email:
        # Optionally redact e-mail addresses as well.
        emails = recognizeEmails.recognizeEmails(filenametxt)
        print(emails)
        for e in emails:
            pos_size = find_name.find_name(filename, e)
            overplay.overlay(outfile, outfile, "bigredbox.pdf", *pos_size)
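# Illustrative call, guarded so importing this module stays side-effect free.
# The paths below come from the standalone sample script further down; treat
# them as placeholders for your own files.
if __name__ == "__main__":
    unbias("sample_resumes/sample2.pdf", "resume_unbiased.pdf", email=True)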
import json

import cv2
import numpy
from PIL import Image
# Assumed: this runs inside a Flask app; `request.files` matches Flask's
# upload API, and the route registration lives elsewhere.
from flask import request

import demo
import extract_text


def post_pages():
    # Decode the uploaded image bytes (the 'file' form field) with OpenCV.
    img = cv2.imdecode(
        numpy.frombuffer(request.files['file'].read(), numpy.uint8),
        cv2.IMREAD_UNCHANGED)
    # Reverse the channel order (OpenCV decodes to BGR) for the detector.
    bb = demo.main(img[:, :, ::-1])
    im_pil = Image.fromarray(img)
    total, text = extract_text.extract(im_pil, bb)
    return json.dumps({'status': 'SUCCESS', 'total': total, 'items': text})
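# Hypothetical client-side sketch for exercising post_pages(); the host and
# route are assumptions, since the actual URL rule is not shown in this snippet.
import requests


def send_page_image(path, url="http://localhost:5000/pages"):
    # Upload the image under the 'file' form field that post_pages() reads.
    with open(path, "rb") as fh:
        resp = requests.post(url, files={"file": fh})
    # The view returns a JSON string: {'status': ..., 'total': ..., 'items': ...}
    return resp.json()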
import json
import shutil

import config  # project settings: file paths and skip lists
import extract_text as et  # assumed alias; generic page-text extractor

# `open_url(cat, url)` and `parse_resp(resp)` are helpers defined elsewhere in
# this project; parse_resp presumably returns a BeautifulSoup document, since
# the result supports .select_one().


def save_webpage_text():
    with open(config.reddit_json_file, 'r') as f:
        json_dict = json.loads(f.read())
    try:
        for i, (url, d) in enumerate(json_dict.items()):
            is_reddit_page = False
            is_continue = False
            cat = d['cat']
            url = url.lower()
            d['webpage_status'] = '200'
            if 'text' in d:
                continue
            # for retry
            if d['webpage_status'] != '200':
                d['text'] = ''
                continue
            if d['cat'] in config.reddit_skip_cat:
                d['webpage_status'] = 'skip_cat'
                d['text'] = ''
                continue
            for domain in config.reddit_skip_domain:
                if domain in url:
                    is_continue = True
                    d['webpage_status'] = 'skip_domain'
                    d['text'] = ''
                    break
            for ext in ['.pdf', '.doc', '.docx', '.xls', '.jpg', '.jpeg',
                        '.png', '.bmp', '.gif']:
                if ext in url:
                    is_continue = True
                    d['webpage_status'] = 'skip_ext'
                    d['text'] = ''
                    break
            if is_continue:
                continue
            # Relative URLs are reddit-internal links.
            if url[0] == '/':
                is_reddit_page = True
                url = 'https://www.reddit.com' + url
            resp, err_msg = open_url(cat, url)
            if resp is None:
                d['webpage_status'] = str(err_msg)
                d['text'] = ''
                continue
            soup = parse_resp(resp)
            if soup is None:
                d['webpage_status'] = 'html_parse_error'
                d['text'] = ''
                continue
            if is_reddit_page:
                # Reddit self-post: take the post body, or fall back to the
                # first comment if the body is empty.
                desc_el = soup.select_one('.expando .usertext-body')
                text = desc_el.text.strip() if desc_el else ''
                if not text:
                    first_cmt_el = soup.select_one(
                        '.nestedlisting .usertext-body')
                    text = first_cmt_el.text.strip() if first_cmt_el else ''
                print('==================reddit page======================')
            else:
                print('==================other page======================')
                text = et.extract(soup)[0].strip()
            print(text[:30], '...', text[-30:])
            d['text'] = text
            print('')
            # Periodically back up and persist progress.
            if i > 0 and i % 10 == 0:
                shutil.copy(config.reddit_json_file,
                            config.reddit_json_file + '.bak')
                with open(config.reddit_json_file, 'w') as f:
                    f.write(json.dumps(json_dict))
    finally:
        shutil.copy(config.reddit_json_file, config.reddit_json_file + '.bak')
        with open(config.reddit_json_file, 'w') as f:
            f.write(json.dumps(json_dict))
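# Minimal sketch of seeding config.reddit_json_file before a run; the URL key
# and 'cat' value are made-up placeholders, and save_webpage_text() fills in
# 'webpage_status' and 'text' itself.
if __name__ == "__main__":
    seed = {
        "/r/example/comments/abc123/example_post/": {"cat": "example"},
    }
    with open(config.reddit_json_file, "w") as f:
        f.write(json.dumps(seed))
    save_webpage_text()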
import os
import zipfile

import extract_text as et
import sort_words as sw
import noOEBPS as no

file = [i for i in os.listdir() if ("epub" in i)][0]
#file = "finding_a_friend_-_aurora_productions.epub"
base = os.path.splitext(file)[0]
os.rename(file, base + ".zip")
zip_ref = zipfile.ZipFile(base + ".zip", 'r')

bookname = "newbook"
newbook = "newbookedited"

# create a new empty folder
if not os.path.exists(bookname):
    os.makedirs(bookname)
zip_ref.extractall(bookname)
zip_ref.close()

# find book name, number of pages
pages = 14
et.extract(bookname, base, pages, "booktext.txt")
sw.sortwords('booktext.txt')
no.edithtml("dictwords.txt", bookname, newbook)

#tonebooktext.txt
#booktext.txt
import recognizeName
import recognizeEmails
import overplay
import find_name
import extract_text

filenamepdf = "sample_resumes/sample2.pdf"
filenametxt = "resume.txt"

extract_text.extract(filenamepdf, filenametxt, "text")
name = recognizeName.recognizeName(filenametxt)
print(name)
pos_size = find_name.find_name(filenamepdf, name)
overplay.overlay(filenamepdf, "bigredbox.pdf", *pos_size)

emails = recognizeEmails.recognizeEmails(filenametxt)
print(emails)
for e in emails:
    pos_size = find_name.find_name(filenamepdf, e)
    overplay.overlay("resume_unbiased.pdf", "bigredbox.pdf", *pos_size)
import extract_text
import find_name
import overplay
import recognizeName


def unbias(filename):
    # Extract the resume text, find the candidate's name, and cover it
    # with a red box overlay.
    filenametxt = "resume.txt"
    extract_text.extract(filename, filenametxt, "text")
    name = recognizeName.recognizeName(filenametxt)
    pos_size = find_name.find_name(filename, name)
    overplay.overlay(filename, "bigredbox.pdf", *pos_size)