def processJob(jobDetails):
    """Build a job record from one listing element and pass it to ``thisInstitution``.

    The record gets ``url`` (first anchor's href joined onto ``rootUrl``),
    ``language`` ('de'), ``title`` (text of the first ``<strong>``), an
    optional ``applicationDate`` parsed from a "Bewerbungsschluss:" phrase,
    and ``text`` (result of ``process_pdf`` on the url, or 'n/a' on failure).

    jobDetails -- presumably a BeautifulSoup Tag for one posting; TODO confirm
                  against the caller.

    Returns False when an error was reported through ``thisInstitution.error``;
    otherwise falls off the end and returns None.
    """
    # Bound before the try so the error handler can always hand over whatever
    # part of the record was filled in.
    job = {}
    try:
        job["url"] = urljoin(rootUrl, jobDetails.a['href'])
        job["language"] = 'de'

        # remove header details
        header = jobDetails.find(text=re.compile('Stellenausschreibung'))
        if header:
            header.extract()

        job["title"] = jobDetails.find('strong').text
        print(job['title'])

        appDateTxt = re.search(r'Bewerbungsschluss:(?: |\xa0)(\d{2}[.\w\s]+?\d{4})', jobDetails.get_text())
        if appDateTxt:
            # Try the long format first, then the numeric one. Only ordinary
            # errors trigger the fallback; KeyboardInterrupt/SystemExit must
            # not be swallowed here.
            try:
                appDate = text_to_date(appDateTxt.group(1), u'%d. %B %Y')
            except Exception:
                appDate = text_to_date(appDateTxt.group(1), u'%d.%m.%Y')
            job['applicationDate'] = appDate
            print(job['applicationDate'])

        # Best effort: a failing PDF extraction must not lose the posting.
        try:
            text = process_pdf(job['url'])
            job['text'] = text
        except Exception:
            job['text'] = 'n/a'

        thisInstitution.addRecord(job)  # data is recorded here, job is the only argument
    except Exception as e:
        print(e)
        # record the error with the shared code and continue on to the next url
        thisInstitution.error(e.message, job)
        return False
def processJob(jobDetails):
    """Build a job record from a table row or a div-style element and record it.

    Three cases are distinguished:

    * a ``tr`` without any anchor: only a message is printed;
    * a ``tr`` with an anchor: the url is ``rootUrl + u'/de/' + href``, the
      text comes from ``pdf.process_pdf`` and the title from the first cell;
    * anything else: the anchor's href is used as the url as-is, the linked
      page is fetched with ``thisInstitution.getSoup`` and the first ``div``
      inside the element with class 'fallback' becomes the text.

    ``applicationDate`` is set only when ``findDate`` returns a truthy value.

    jobDetails -- presumably a BeautifulSoup Tag; TODO confirm against caller.

    Returns False when an error was reported through ``thisInstitution.error``;
    otherwise falls off the end and returns None.
    """
    try:
        job = dict(language='de')
        is_row = jobDetails.name == 'tr'

        if is_row and not jobDetails.find('a'):  # handle table row
            # NOTE(review): this branch still reaches addRecord below with a
            # record that only contains 'language' -- confirm that is intended.
            print('job has no text application date is expired')
        elif is_row:
            job = dict(url=rootUrl + u'/de/' + jobDetails.a['href'], language='de')
            # Parse the date once instead of once for the test and once for
            # the assignment.
            app_date = findDate(jobDetails.text)
            if app_date:
                job['applicationDate'] = app_date
            job['text'] = pdf.process_pdf(job['url'])
            job['title'] = jobDetails.td.get_text()
        else:  # handle div element
            job = dict(url=jobDetails.a['href'], language='de')
            job['title'] = jobDetails.a.get_text()
            soup = thisInstitution.getSoup(job['url'])
            content = soup.find(class_='fallback')
            job['text'] = unicode(content.div)
            app_date = findDate(job['text'])
            if app_date:
                job['applicationDate'] = app_date

        thisInstitution.addRecord(job)  # data is recorded here, job is the only argument
    except Exception as e:
        # The error is reported with a fresh minimal record, as in the
        # original; any partially built fields are deliberately not passed on.
        job = dict(language='de')
        print(e)
        # record the error with the shared code and continue on to the next url
        thisInstitution.error(e.message, job)
        return False
def processJob(jobDetails):
    """Build a job record from a link element and pass it to ``thisInstitution``.

    The record gets ``url`` (href joined onto ``rootUrl``), ``title`` (the
    element's text), ``language`` ('de') and ``text`` (result of
    ``process_pdf``, or a pointer to the url when that fails). An
    ``applicationDate`` is taken from a "Bewerbungsfrist" string in the parent
    element when one can be parsed, and is overwritten by ``findDate`` on the
    text whenever that returns a truthy value.

    jobDetails -- presumably a BeautifulSoup ``<a>`` Tag; TODO confirm.

    Returns False when an error was reported through ``thisInstitution.error``;
    otherwise falls off the end and returns None.
    """
    job = {}
    try:
        job['url'] = urljoin(rootUrl, jobDetails['href'])
        job['title'] = jobDetails.get_text()
        print(job['title'])
        job["language"] = 'de'

        # Best effort: keep the posting even when the PDF cannot be processed.
        try:
            job['text'] = process_pdf(job['url'])
        except Exception:
            job['text'] = "Details for this job are available here " + job['url']

        appDateElm = jobDetails.parent.find(text=re.compile(r'Bewerbungsfrist'))
        if appDateElm:
            appDateTxt = re.search(r'\d{1,2}[.\w\s]+\d{4}', appDateElm)
            # The label can be present without a date next to it; in that case
            # skip this source instead of failing the whole job on None.group().
            if appDateTxt:
                try:
                    appDate = text_to_date(appDateTxt.group(0), u'%d.%m.%Y')
                except Exception:
                    appDate = text_to_date(appDateTxt.group(0), u'%d. %B %Y')
                job['applicationDate'] = appDate

        # A date found in the text takes precedence over the listing's date.
        text_date = findDate(job['text'])
        if text_date:
            job['applicationDate'] = text_date

        thisInstitution.addRecord(job)
    except Exception as e:
        print(e)
        # record the error with the shared code and continue on to the next url
        thisInstitution.error(e.message, job)
        return False
def processJob(jobDetails):
    """Turn one link element into a job record and record it.

    The record holds ``url`` (href joined onto ``rootUrl``), ``language``
    ('de'), ``title`` (the element's text) and ``text`` (whatever
    ``process_pdf`` returns for the url).

    jobDetails -- presumably a BeautifulSoup ``<a>`` Tag; TODO confirm.

    Returns False when an error was reported through ``thisInstitution.error``;
    otherwise falls off the end and returns None.
    """
    record = {}
    try:
        link_target = jobDetails['href']
        record["url"] = urljoin(rootUrl, link_target)
        record["language"] = 'de'
        record["title"] = jobDetails.get_text()
        record['text'] = process_pdf(record['url'])
        # data is recorded here, the record is the only argument
        thisInstitution.addRecord(record)
    except Exception as err:
        print(err)
        # record the error with the shared code and continue on to the next url
        thisInstitution.error(err.message, record)
        return False
def processJob(jobDetails):
    """Turn one listing element into a job record and record it.

    ``url`` (first anchor's href joined onto ``rootUrl``) and ``title`` (the
    element's text) are both stored UTF-8 encoded. ``text`` is the result of
    ``process_pdf`` when the url contains "pdf", otherwise u"n/a".

    jobDetails -- presumably a BeautifulSoup Tag; TODO confirm.

    Returns False when an error was reported through ``thisInstitution.error``;
    otherwise falls off the end and returns None.
    """
    record = {}
    try:
        absolute = urljoin(rootUrl, jobDetails.a["href"])
        record["url"] = absolute.encode("utf-8")
        record["title"] = jobDetails.text.encode("utf-8")
        print(record["title"])

        looks_like_pdf = re.search("pdf", record["url"])
        record["text"] = process_pdf(record["url"]) if looks_like_pdf else u"n/a"

        # data is recorded here, the record is the only argument
        thisInstitution.addRecord(record)
    except Exception as err:
        print(err)
        # record the error with the shared code and continue on to the next url
        thisInstitution.error(err.message, record)
        return False
def processJob(jobDetails):
    """Turn one listing element into a job record and record it.

    The record holds ``url`` (first anchor's href joined onto ``rootUrl``),
    ``language`` ('de'), ``title`` (text of the element with class
    'h3_title'), an optional ``applicationDate`` parsed from a
    "Bewerbungsschluss:" phrase with the format '%d.%m.%Y', and ``text``
    (result of ``process_pdf`` on the url).

    jobDetails -- presumably a BeautifulSoup Tag; TODO confirm.

    Returns False when an error was reported through ``thisInstitution.error``;
    otherwise falls off the end and returns None.
    """
    posting = {}
    try:
        posting["url"] = urljoin(rootUrl, jobDetails.a['href'])
        posting["language"] = 'de'

        title_node = jobDetails.find(class_='h3_title')
        posting["title"] = title_node.get_text()

        full_text = jobDetails.get_text()
        deadline = re.search(r'Bewerbungsschluss:\s*(\d{1,2}[.\w\s]+\d{4})', full_text)
        if deadline is not None:
            posting['applicationDate'] = text_to_date(deadline.group(1), u'%d.%m.%Y')

        posting['text'] = process_pdf(posting['url'])
        # data is recorded here, the record is the only argument
        thisInstitution.addRecord(posting)
    except Exception as err:
        print(err)
        # record the error with the shared code and continue on to the next url
        thisInstitution.error(err.message, posting)
        return False
def processJobC(jobDetails):
    """Turn one listing element into a job record and record it.

    The record holds ``url`` (first anchor's href joined onto ``rootUrl``),
    ``language`` ('de'), ``title`` (texts of all anchors joined by a space),
    ``text`` (result of ``process_pdf`` on the url) and, when a string
    containing "Bewerbungsschluss" is found, ``applicationDate`` as returned
    by ``find_date_in_text``.

    jobDetails -- presumably a BeautifulSoup Tag; TODO confirm.

    Returns False when an error was reported through ``thisInstitution.error``;
    otherwise falls off the end and returns None.
    """
    record = {}
    try:
        record["url"] = urljoin(rootUrl, jobDetails.a['href'])
        record["language"] = 'de'

        anchors = jobDetails.find_all('a')
        record["title"] = u' '.join(anchor.get_text() for anchor in anchors)
        print(record['title'])

        record['text'] = process_pdf(record['url'])

        deadline_node = jobDetails.find(text=re.compile('Bewerbungsschluss'))
        if deadline_node:
            record['applicationDate'] = find_date_in_text(deadline_node.string)

        # data is recorded here, the record is the only argument
        thisInstitution.addRecord(record)
    except Exception as err:
        print(err)
        # record the error with the shared code and continue on to the next url
        thisInstitution.error(err.message, record)
        return False
def processJobPdf(jobDetails):
    """Build a job record from a PDF link element and pass it to ``thisInstitution``.

    The record gets ``url`` (quoted href joined onto ``baseUrl``),
    ``language`` ('de'), ``title`` (the link text up to the first
    parenthesis), an optional ``applicationDate`` taken from an
    "Ausschreibungsfrist:" / "Deadline:" phrase in the link text, and
    ``text`` (result of ``process_pdf`` on the url).

    jobDetails -- presumably a BeautifulSoup ``<a>`` Tag; TODO confirm.

    Returns False when an error was reported through ``thisInstitution.error``;
    otherwise falls off the end and returns None.
    """
    job = {}
    try:
        job["url"] = urljoin(baseUrl, quote(jobDetails['href']))
        job["language"] = 'de'

        link_text = jobDetails.get_text()
        # NOTE(review): a link text that starts with a parenthesis has no
        # title match and is still reported as an error, as before.
        job["title"] = re.match(r'[^)(]+', link_text).group(0)
        print(job['title'])

        # The deadline is optional (see the truthiness check below), so a
        # link without a deadline phrase must not fail on None.group().
        deadline = re.search(r'(?:Ausschreibungsfrist|Deadline):[^)]+', link_text)
        app_date = find_date_in_text(deadline.group(0)) if deadline else None
        if app_date:
            job['applicationDate'] = app_date
            print(job['applicationDate'])

        job['text'] = process_pdf(job['url'])
        thisInstitution.addRecord(job)  # data is recorded here, job is the only argument
    except Exception as e:
        print(e)
        # record the error with the shared code and continue on to the next url
        thisInstitution.error(e.message, job)
        return False
def processJob(jobDetails):
    """Turn one table-row-like element into a job record and record it.

    The record holds ``url`` (first anchor's href, used as-is), ``language``
    ('de'), ``title`` (all strings of the first ``td`` joined by newlines),
    an optional ``applicationDate`` parsed with '%d.%m.%Y' from the string of
    the element's second child, and ``text`` (result of ``process_pdf``).

    jobDetails -- presumably a BeautifulSoup Tag; TODO confirm.

    Returns False when an error was reported through ``thisInstitution.error``;
    otherwise falls off the end and returns None.
    """
    record = {}
    try:
        record["url"] = jobDetails.a['href']
        record["language"] = 'de'

        title_parts = jobDetails.td.strings
        record["title"] = u'\n'.join(title_parts)
        print(record['title'])

        deadline_text = jobDetails.contents[1].string
        if deadline_text:
            record['applicationDate'] = text_to_date(deadline_text, u'%d.%m.%Y')
            print(record['applicationDate'])

        record['text'] = process_pdf(record['url'])
        # data is recorded here, the record is the only argument
        thisInstitution.addRecord(record)
    except Exception as err:
        print(err)
        # record the error with the shared code and continue on to the next url
        thisInstitution.error(err.message, record)
        return False
def processJob(jobDetails):
    """Record a job for an element that contains a PDF link; skip anything else.

    When an anchor whose href matches "pdf" is present, the record gets
    ``url`` (first anchor's href, used as-is), ``language`` ('de'), ``title``
    (first anchor's text), an optional ``applicationDate`` from
    ``find_date_in_text`` on the element's text, and ``text`` (result of
    ``process_pdf``). Elements without such a link are ignored silently.

    jobDetails -- presumably a BeautifulSoup Tag; TODO confirm.

    Returns False when an error was reported through ``thisInstitution.error``;
    otherwise falls off the end and returns None.
    """
    # Bound before the try: the lookup below can raise before any record is
    # built, and the handler would otherwise fail with an unbound ``job``
    # instead of reporting the original error.
    job = {}
    try:
        # only elements that include pdf links are job elements
        link = jobDetails.find('a', href=re.compile('pdf'))
        if link:
            # NOTE(review): url and title come from the element's first
            # anchor, which is not necessarily the matched pdf link -- confirm.
            job["url"] = jobDetails.a['href']
            job["language"] = 'de'
            job["title"] = jobDetails.a.get_text()
            print(job['title'])

            app_date = find_date_in_text(jobDetails.get_text())
            if app_date:
                job['applicationDate'] = app_date

            job['text'] = process_pdf(job['url'])
            thisInstitution.addRecord(job)  # data is recorded here, job is the only argument
    except Exception as e:
        print(e)
        # record the error with the shared code and continue on to the next url
        thisInstitution.error(e.message, job)
        return False
def processJob(jobDetails):
    """Build a job record from one table row and pass it to ``thisInstitution``.

    The record gets ``url`` (first anchor's href joined onto ``rootUrl``, with
    all whitespace removed), ``language`` ('de'), ``title`` (text of the
    second ``td``), ``text`` (result of ``process_pdf``, or a pointer to the
    url when that fails) and an optional ``applicationDate`` from
    ``find_date_in_text`` on the last ``td``.

    jobDetails -- presumably a BeautifulSoup ``<tr>`` Tag; TODO confirm.

    Returns False when an error was reported through ``thisInstitution.error``;
    otherwise falls off the end and returns None.
    """
    job = {}
    try:
        job["url"] = urljoin(rootUrl, jobDetails.a['href'])
        # Raw string: same pattern as before, without the invalid "\s" escape.
        job['url'] = re.sub(r'\s*', '', job['url'])
        job["language"] = 'de'

        contents = jobDetails.find_all('td')
        job["title"] = contents[1].get_text()
        print(job['title'])

        # Best effort: keep the posting even when the PDF cannot be processed,
        # but let KeyboardInterrupt/SystemExit through.
        try:
            job['text'] = process_pdf(job['url'])
        except Exception:
            job['text'] = "Job details are available here: " + job['url']

        app_date = find_date_in_text(contents[-1].get_text())
        if app_date:
            job['applicationDate'] = app_date

        thisInstitution.addRecord(job)  # data is recorded here, job is the only argument
    except Exception as e:
        print(e)
        # record the error with the shared code and continue on to the next url
        thisInstitution.error(e.message, job)
        return False
def processJob(jobDetails):
    """Build a job record from one table row and pass it to ``thisInstitution``.

    The record gets ``url`` (first anchor's href joined onto ``rootUrl``),
    ``language`` ('de'), ``title`` (text of the second ``td``), an optional
    ``applicationDate`` (fifth ``td``) and an optional ``publishDate`` (third
    ``td``), both obtained through ``find_date_in_text(..., 'de_DE')``, and
    ``text`` (result of ``process_pdf`` on the url).

    jobDetails -- presumably a BeautifulSoup ``<tr>`` Tag; TODO confirm.

    Returns False when an error was reported through ``thisInstitution.error``;
    otherwise falls off the end and returns None.
    """
    job = {}
    try:
        job["url"] = urljoin(rootUrl, jobDetails.find('a')['href'])
        job["language"] = 'de'

        # Collect the cells once instead of searching the row for every field.
        cells = jobDetails.find_all('td')
        job["title"] = cells[1].get_text()

        app_date = find_date_in_text(cells[4].get_text(), 'de_DE')
        if app_date:
            job['applicationDate'] = app_date

        publish_date = find_date_in_text(cells[2].get_text(), 'de_DE')
        if publish_date:
            job["publishDate"] = publish_date

        job['text'] = process_pdf(job['url'])
        thisInstitution.addRecord(job)  # data is recorded here, job is the only argument
    except Exception as e:
        print(e)
        # record the error with the shared code and continue on to the next url
        thisInstitution.error(e.message, job)
        return False