def _strip_label(text, label, separators):
    """Remove a leading ``label`` plus the separator characters after it.

    ``str.lstrip`` treats its argument as a *set of characters*, so using it
    to drop a label also eats the start of any value that happens to begin
    with one of the label's characters.  This helper only strips when
    ``text`` really starts with the whole label.
    """
    if text.startswith(label):
        return text[len(label):].lstrip(separators)
    return text


def _clean_job_feature(text):
    """Strip the known label / suffix decorations from one job_request string."""
    text = text.strip()
    text = _strip_label(text, u'职位诱惑', u' :')
    text = _strip_label(text, u'发布时间', u':')
    # Drop the trailing marker as a whole suffix, not as a character set.
    suffix = u'发布'
    if text.endswith(suffix):
        text = text[:-len(suffix)]
    return text


def read_job_from_html(skill, html_file):
    """
    Read job and company info from a downloaded html file.

    :param skill: value stored on the job as ``skill_tag``.
    :param html_file: the downloaded html file (handed to ``read_all`` and
        ``created_on``); it contains job info, but sometimes the contents
        are empty.
    :return: ``None`` when the page has no ``dl.job_detail`` element,
        otherwise a ``(job, company)`` tuple.
    """
    html = read_all(html_file)
    soup = make_soup(html)
    detail = soup.find('dl', 'job_detail')
    # In some rare cases, e.g. the job is closed already, the job info is
    # missing from the page.
    if not detail:
        return None

    job = Job()
    job.job_id = int(soup.find('input', {'id': 'jobid'})['value'])
    job.skill_tag = skill

    log('*** JOB ***')
    title = detail.find('h1')
    log(title['title'])
    log(title.div.text)
    job.title = title['title']
    job.dept = title.div.text
    log('')

    request = detail.find('dd', 'job_request')
    main_features = []
    for s in request.stripped_strings:
        f = _clean_job_feature(s)
        log(f)
        main_features.append(f)
    # The positional mapping below relies on exactly seven strings.
    assert len(main_features) == 7
    job.salary = main_features[0]
    job.city = main_features[1]
    job.experience = main_features[2]
    job.education = main_features[3]
    job.full_time = main_features[4] == u'全职'
    job.benefits = main_features[5]
    job.published_date = get_published_date(main_features[6],
                                            created_on(html_file))
    log('')

    # The description is stored as the concatenated markup of its <p> tags.
    desc = detail.find('dd', 'job_bt').find_all('p')
    job.desc = ''.join([unicode(bt) for bt in desc])
    log(job.desc)

    log('\n*** COMPANY ***\n')
    company = Company()
    comp = soup.find('dl', 'job_company')
    url = comp.dt.a['href']
    # The company id is the first run of digits in the company link.
    pat = re.compile(r'(?P<comp_id>\d+)')
    m = re.search(pat, url)
    log(url)
    company.comp_id = int(m.group('comp_id'))
    job.comp_id = company.comp_id

    log(comp.dt.a.img['src'])
    log(comp.dt.a.div.h2.text.split()[0])
    company.logo = comp.dt.a.img['src']
    company.name = comp.dt.a.div.h2.text.split()[0]
    log('')

    comp_features = comp.dd
    features = []
    for li in comp_features.ul.find_all('li'):
        features.extend(li.stripped_strings)
    log(''.join(features))
    # Expected layout is three label/value pairs; values sit at odd indexes.
    if len(features) == 6:
        company.domain = features[1]
        company.size = features[3]
        company.url = features[5]
    else:
        print(u'features ex: ' + html_file)
    log('')

    stage_h = comp_features.h4
    stage_tags = stage_h.find_next_sibling('ul').find_all('li')
    stage = []
    for li in stage_tags:
        stage.extend(li.stripped_strings)
    log('\t'.join(stage))
    # Stage info is a flat list of label, value, label, value, ...
    if len(stage) % 2 == 0:
        for i in xrange(0, len(stage), 2):
            if stage[i] == u'目前阶段':
                company.cur_stage = stage[i + 1]
            elif stage[i] == u'投资机构':
                company.investor = stage[i + 1]
    else:
        print(u'stages ex: ' + html_file)
    log('')

    # address
    if comp_features.div:
        log(comp_features.div.text)
        company.address = comp_features.div.text

    return job, company
def _read_lines(path):
    """Return every line of ``path`` (newlines kept) and close the file."""
    with open(path) as handle:
        return handle.readlines()


def _add_sample_colleague(company_id, sample_avatar, **fields):
    """Build a confirmed colleague with the sample password and register it."""
    colleague = Colleagues(confirmed=1, **fields)
    colleague.set_password("aaa")
    create_sample_colleague({
        "company_id": company_id,
        "colleague": colleague,
        "sample_avatar": sample_avatar,
    })


def create_sample_company():
    """Populate the database with a sample company and its related records.

    Creates the company, three named colleagues plus 20 random ones, an
    admin record for the founder, the company logo file, two Idea Boxes and
    seven Ideas.

    Returns:
        The ``redirect(url_for("register_company"))`` response when the
        company or the admin cannot be committed; otherwise ``None``.
    """
    # instantiate Company:
    company = Company(name="Eric BLABLA KGB")
    company.set_founder_password("aaa")
    company.set_joining_password("bbb")

    # update database and query the ID of the new company:
    try:
        db.session.add(company)
        db.session.commit()
    except Exception:
        db.session.rollback()
        flash(
            "Any error occured when created the sample company registration. Please try again.",
            "error")
        return redirect(url_for("register_company"))

    registered_company = Company.query.filter_by(
        name="Eric BLABLA KGB").first()
    company_id = registered_company.id

    # instantiate Jhon Do:
    _add_sample_colleague(
        company_id,
        "john_do.jpg",
        user_name="jhon_do",
        email="*****@*****.**",
        first_name="Jhon",
        last_name="Do",
        position="Founder")

    # set the founder as Admin with full privileges:
    registered_colleague = Colleagues.query.filter_by(
        email="*****@*****.**").first()

    # instantiate Admins:
    admin = instatiate_admin(True)
    admin.colleague_id = registered_colleague.id

    try:
        db.session.add(admin)
        db.session.commit()
    except Exception:
        db.session.rollback()
        flash(
            "Any error occured when created sample admin registration. Please try again.",
            "error")
        return redirect(url_for("register_company"))

    # copy logo:
    location = "static/sample_logo/blabla.png"
    destination = f"static/logo/{registered_colleague.company_id}.png"
    shutil.copy2(location, destination)

    # update database:
    company.logo = "png"
    try:
        db.session.commit()
        print("Company logo copied.")
    except Exception:
        db.session.rollback()
        print("An error occured when copied logo.")

    # instantiate Jane Do:
    _add_sample_colleague(
        company_id,
        "jane_do.png",
        user_name="jane_do",
        email="*****@*****.**",
        first_name="Jane",
        last_name="Do",
        position="Co-Founder")

    # instantiate Do Do:
    _add_sample_colleague(
        company_id,
        "dodo.svg",
        user_name="dodo",
        email="*****@*****.**",
        first_name="Do",
        last_name="Do",
        position="dodo")

    # instantiate x more colleagues:
    x_more = 20
    usernames = _read_lines("fake_dataset/username.txt")
    emails = _read_lines("fake_dataset/fake_email.txt")
    first_names = _read_lines("fake_dataset/first_name.txt")
    last_names = _read_lines("fake_dataset/last_name.txt")
    positions = _read_lines("fake_dataset/position.txt")

    # NOTE(review): only user_name is stripped; the other fields keep the
    # trailing newline from readlines() - confirm that is intended.
    for _ in range(x_more):
        _add_sample_colleague(
            company_id,
            None,
            user_name=get_random_item(usernames).strip(),
            email=get_random_item(emails),
            first_name=get_random_item(first_names),
            last_name=get_random_item(last_names).lower().title(),
            position=get_random_item(positions))

    # create sample Idea Box:
    admin = Admins.query.filter(
        Admins.colleague_id == registered_colleague.id).first()

    for x in range(2):
        new_box = Boxes(
            name=lorem.sentence().replace(".", ""),
            description=lorem.paragraph(),
            close_at=str_to_date(
                add_day(str_to_date(today()), x).strftime('%Y-%m-%d')),
            admin_id=admin.id)
        try:
            print("Trying to add new Idea Box to the database...")
            db.session.add(new_box)
            db.session.commit()
        except SQLAlchemyError as e:
            # Not every SQLAlchemyError carries ``orig``; fall back to the
            # exception itself so the rollback below is always reached.
            error = str(getattr(e, "orig", e))
            print("**************************************")
            print(error)
            print("New Idea Box not created!")
            print("new_box.name: ", new_box.name)
            print("new_box.description: ", new_box.description)
            print("new_box.close_at: ", new_box.close_at)
            print("new_box.admin_id: ", new_box.admin_id)
            db.session.rollback()

    # create sample Idea:
    colleagues = Colleagues.query.filter(
        Colleagues.company_id == registered_company.id).all()
    # NOTE(review): Admins and Colleagues are selected without a join
    # condition, so each box row is repeated - verify this is intended.
    boxes = db.session.query(
        Boxes, Admins, Colleagues).filter(Boxes.admin_id == admin.id).all()

    for _ in range(7):
        colleague = get_random_item(colleagues)
        sign = [
            "incognito",
            colleague.user_name,
            colleague.first_name,
            colleague.fullname()
        ]
        idea = Ideas(
            idea=lorem.paragraph(),
            sign=get_random_item(sign),
            box_id=get_random_item(boxes).Boxes.id,
            colleague_id=colleague.id)
        db.session.add(idea)

    # All ideas are committed together in one transaction.
    try:
        db.session.commit()
    except Exception:
        db.session.rollback()

    print("The sample company registered successfully!")