def scrapCPVO():
    print("#========================= CPVO SCRAPING =========================")
    # Database connection and agency retrieval
    cpvoData = cpvo.returnAgency('CPVO')
    cpvo_link = cpvoData['link'][0]
    cpvo_id = cpvoData['id'][0]
    html = urllib.request.urlopen(cpvo_link)
    soup = BeautifulSoup(html, "html.parser")  # Create the soup
    start = soup.find('table', attrs={'summary': 'Vacancies'})  # Find the jobs table
    Jobtable = start.findAll('tr')
    for child in Jobtable:
        # Skip the header row
        if child.find('th', attrs={'id': 'vacancy_title'}):
            continue
        jobTitle = child.td.a.string.strip()
        jobLink = child.td.a.get('href')
        jobCode = child.td.next_sibling.next_sibling.string.strip()
        jobType = data_format.typeOfPost(jobCode)
        # The deadline sits three cells to the right; the doubled
        # next_sibling hops step over the whitespace text nodes in between
        jobDeadline = data_format.dateFormatFull(
            child.td.next_sibling.next_sibling.next_sibling.next_sibling
            .next_sibling.next_sibling.string.strip())
        # logging.debug takes a format string, not print-style varargs
        logging.debug("%s %s %s %s %s", jobTitle, jobLink, jobCode, jobType,
                      jobDeadline)
        cpvo.persist(int(cpvo_id), str(jobTitle).strip(), '', '', jobCode,
                     jobDeadline, jobLink, '', jobType)
    print("#========================CPVO SCRAPING COMPLETE=================================")
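# All scrapers in this file lean on the shared `data_format` helpers, whose
# real implementations live elsewhere. The sketch below is only an assumption
# about their contract, kept in a comment so it cannot shadow the real module:
#
#   from datetime import datetime
#
#   def typeOfPost(code):
#       # Assumed mapping from a vacancy code/label to a contract type
#       code = str(code).upper()
#       if 'TA' in code:
#           return 'Temporary Agent'
#       if 'CA' in code:
#           return 'Contract Agent'
#       if 'SNE' in code or 'END' in code:
#           return 'Seconded National Expert'
#       return 'Other'
#
#   def dateFormatFull(raw):
#       # Assumed normalisation of a scraped deadline string to ISO format
#       return datetime.strptime(str(raw).strip(), '%d/%m/%Y').strftime('%Y-%m-%d')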
def scrapEMA():
    print("#========================= EMA SCRAPING =========================")
    # Database connection and agency retrieval
    emaData = ema.returnAgency('EMA')
    ema_link = emaData['link'][0]
    ema_id = emaData['id'][0]
    html = urllib.request.urlopen(ema_link)
    soup = BeautifulSoup(html, "html.parser")  # Create the soup
    start = soup.find('div', attrs={'class': 'main-col'})  # Find the jobs table
    Jobtable = start.find('table')
    # Iterate over the <tr> rows directly; Jobtable.children would also yield
    # whitespace text nodes, which break the .find() calls below
    for child in Jobtable.findAll('tr'):
        # Skip the header row
        if child.find('td', attrs={'colspan': 'top'}):
            continue
        jobTitle = child.td.string
        jobCode = child.td.next_sibling.string
        jobType = data_format.typeOfPost(jobCode)
        jobLink = ("http://www.ema.europa.eu/ema/" +
                   child.td.next_sibling.next_sibling.a.get('href'))
        jobDeadline = data_format.dateFormatFull(
            child.td.next_sibling.next_sibling.next_sibling.string)
        print(jobTitle, jobCode, jobType, jobLink, jobDeadline)
        ema.persist(int(ema_id), str(jobTitle).strip(), '', '', jobCode,
                    jobDeadline, jobLink, '', jobType)
    print("#========================EMA SCRAPING COMPLETE=================================")
def scrapSatCen():
    print("#========================= SatCen SCRAPING =========================")
    SatCenData = satcen.returnAgency('SATCEN')
    SatCen_link = SatCenData['link'][0]
    SatCen_id = SatCenData['id'][0]
    # Retrieve the list of jobs as a bs4 navigable string
    SatCen_source = urllib.request.urlopen(SatCen_link)
    soup = BeautifulSoup(SatCen_source, 'html.parser')
    # Convert to bytes, back to a string, then parse the body as JSON
    bytesEncoded = soup.encode('utf-8')
    stringDecoded = bytesEncoded.decode('utf-8')
    jobsdict = json.loads(stringDecoded)
    # Browse the dictionary and select the open, externally available positions
    for job in jobsdict:
        if job['Status'] == 'OPEN' and job['InternalOnly'] == False:
            link = 'https://apps.satcen.europa.eu/recruitment/#/vacancy/' + job['Id']
            # data_format, not the built-in format()
            print(job['Reference'], job['ExpireOn'][:10], job['Title'],
                  data_format.typeOfPost(job['TypePost']), job['WorkUnit'], link)
            satcen.persist(SatCen_id, job['Title'], job['Reference'],
                           job['WorkUnit'], '', job['ExpireOn'][:10], link, '',
                           data_format.typeOfPost(job['TypePost']))
    print("#========================SATCEN SCRAPING COMPLETE=================================")
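# The SatCen endpoint already serves JSON (scrapSatCen ends with json.loads on
# the page body), so the BeautifulSoup round-trip above is not strictly needed.
# A leaner equivalent, assuming the endpoint really returns plain JSON:
#
#   with urllib.request.urlopen(SatCen_link) as resp:
#       jobsdict = json.loads(resp.read().decode('utf-8'))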
def scrapEIOPA():
    print("#========================= EIOPA SCRAPING =========================")
    # Database connection and agency retrieval
    eiopaData = eiopa.returnAgency('EIOPA')
    eiopa_link = eiopaData['link'][0]
    eiopa_id = eiopaData['id'][0]
    html = urllib.request.urlopen(eiopa_link)
    soup = BeautifulSoup(html, "html.parser")  # Create the soup
    start = soup.find('table', attrs={'class': 'ms-rteTable-EIOPATable'})
    # Iterate over the <tr> rows directly; start.tbody would also yield
    # whitespace text nodes, which break the class lookup below
    for tr in start.tbody.findAll('tr'):
        # Skip the header row (guard against rows without a class attribute)
        if tr.get('class') and str(tr['class'][0]) == "ms-rteTableHeaderRow-EIOPATable":
            continue
        jobTitle = tr.th.next_sibling.a.string
        jobLink = "https://eiopa.europa.eu/" + tr.th.next_sibling.a.get('href')
        # The vacancy code is the URL segment before the first '%20'
        jobCode = str(re.match(r'(.*?)%20', jobLink).group(1)[29:])
        jobType = data_format.typeOfPost(jobCode)
        deadlinePosition = tr.td.next_sibling
        # Drop a leading non-word character (e.g. a stray space or bullet)
        if re.match(r'\w', jobTitle[0]) is None:
            jobTitle = jobTitle[1:]
        if len(deadlinePosition.contents[0].string) > 2:
            # Plain deadline cell
            jobDeadline = str(deadlinePosition.contents[0].string).strip()
            jobDeadline = data_format.dateFormatFull(jobDeadline[1:])
        else:
            # Extended deadline: take the last <strong> entry in the cell
            extendedDeadlines = deadlinePosition.findAll('strong')
            newDeadline = str(extendedDeadlines[-1].string.split(':')[1]).strip()
            jobDeadline = data_format.dateFormatFull(newDeadline)
        print(jobTitle, jobCode, jobType, jobDeadline)
        eiopa.persist(int(eiopa_id), jobTitle, '', '', jobCode, jobDeadline,
                      jobLink, '', jobType)
    print("#========================EIOPA SCRAPING COMPLETE=================================")
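# How the jobCode extraction in scrapEIOPA behaves on a hypothetical vacancy
# URL (illustrative only; the real EIOPA paths may differ, which would shift
# the hard-coded [29:] offset):
#
#   >>> link = "https://eiopa.europa.eu/docs/EIOPA-TA-18-001%20Senior%20Expert.pdf"
#   >>> re.match(r'(.*?)%20', link).group(1)[29:]
#   'EIOPA-TA-18-001'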
def scrapBBI():
    print("#========================= BBI SCRAPING =========================")
    # Database connection and agency retrieval
    bbiData = bbi.returnAgency('BBI')
    bbi_link = bbiData['link'][0]
    bbi_id = bbiData['id'][0]
    html = urllib.request.urlopen(bbi_link)
    soup = BeautifulSoup(html, "html.parser")  # Create the soup
    # Find the jobs table
    start = soup.find('tbody', attrs={'class': 'ui-datatable-data ui-widget-content'})
    Jobtable = start.findAll('tr')
    for child in Jobtable:
        # Walk the cells left to right
        jobCodeLocation = child.td.next_sibling
        jobTitleLocation = jobCodeLocation.next_sibling
        jobTypeLocation = jobTitleLocation.next_sibling
        jobGradeLocation = jobTypeLocation.next_sibling
        jobDeadlineLocation = jobGradeLocation.next_sibling
        jobLinkLocation = jobDeadlineLocation.next_sibling
        jobCode = jobCodeLocation.string
        jobTitle = jobTitleLocation.string
        jobType = data_format.typeOfPost(jobTypeLocation.string)
        jobGrade = data_format.typeOfGrade(jobGradeLocation.string)
        jobDeadline = data_format.dateFormatFull(jobDeadlineLocation.string[:10])
        jobLink = jobLinkLocation.a.get('href')
        print(jobCode, jobTitle, jobType, jobGrade, jobDeadline, jobLink)
        bbi.persist(int(bbi_id), str(jobTitle).strip(), '', '', jobCode,
                    jobDeadline, jobLink, '', jobType)
    print("#========================BBI SCRAPING COMPLETE=================================")
def scrapCDT():
    print("#========================= CDT SCRAPING =========================")
    # Database connection and agency retrieval
    cdtData = cdt.returnAgency('CDT')
    cdt_link = cdtData['link'][0]
    cdt_id = cdtData['id'][0]
    html = urllib.request.urlopen(cdt_link)
    soup = BeautifulSoup(html, "html.parser")  # Create the soup
    start = soup.findAll('div', attrs={'class': 'ms-rtestate-read ms-rte-wpbox'})
    for child in start:
        jobTitle = child.span.attrs['title']
        postType = jobTitle[:9].strip()
        for post in child.ul:
            try:
                job = post.find('h3')
                jobCode = job.a.string.strip()
                jobLink = "http://cdt.europa.eu" + job.a.get('href').replace(' ', '%20')
                jobTitle = post.find('p').div.span.font.string.strip()
                jobType = data_format.typeOfPost(postType)
                # 'SA' is a sentinel for posts with no published deadline
                jobDeadline = data_format.dateFormatFull('SA')
                print(jobCode, jobLink, jobTitle, jobType, jobDeadline)
                cdt.persist(int(cdt_id), str(jobTitle).strip(), '', '', jobCode,
                            jobDeadline, jobLink, '', jobType)
            except (AttributeError, TypeError):
                # Skip whitespace nodes and list items that are not vacancies;
                # a bare except here would also swallow real errors
                continue
    print("#========================CDT SCRAPING COMPLETE=================================")
def scrapF4E():
    print("#========================= F4E SCRAPING =========================")
    F4EData = F4E.returnAgency('F4E')
    F4E_link = F4EData['link'][0]
    F4E_id = F4EData['id'][0]
    html = urllib.request.urlopen(F4E_link)
    soup = BeautifulSoup(html, "html.parser")
    start = soup.findAll(attrs={"class": re.compile("^careersPurple2")})
    for contractType in start:
        deadline = jobTitle = jobCode = jobLink = ''
        contract = data_format.typeOfPost(contractType.a.string)
        jobInfo = contractType.next_sibling.next_sibling
        try:
            deadline = data_format.dateFormatFull(
                jobInfo.find(attrs={"class": "careersDate"}).span.string)
            jobTitle = jobInfo.find(attrs={"class": "careersTitle"}).string
            jobCode = jobInfo.find(attrs={"class": "pdf"}).string
            jobLink = ("http://fusionforenergy.europa.eu/careers/vacancies/" +
                       jobInfo.find(attrs={"class": "pdf"}).get("href"))
            print(deadline, jobTitle.strip(), jobCode, jobLink, contract)
            # NB: the argument order here differs from the other scrapers
            # (jobCode in the third slot rather than the fifth)
            F4E.persist(F4E_id, jobTitle.strip(), jobCode, '', '', deadline,
                        jobLink, '', contract)
        except AttributeError:
            # Sections without a vacancy block are skipped; a bare except
            # would also hide genuine parsing errors
            pass
    print("#========================F4E SCRAPING COMPLETE=================================")
def scrapEurojust():
    print("#========================= EUROJUST SCRAPING =========================")
    # Database connection and agency retrieval
    eurojustData = eurojust.returnAgency('EUROJUST')
    eurojust_link = eurojustData['link'][0]
    eurojust_id = eurojustData['id'][0]
    html = urllib.request.urlopen(eurojust_link)
    soup = BeautifulSoup(html, "html.parser")
    # Find the vacancy tables
    start = soup.findAll("table", attrs={"class": "vacancyAnnouncements2"})
    # Normal and alternating rows share the same structure, so one loop over
    # both classes replaces the two near-identical loops
    for table in start:
        for ad in table.findAll("tr", attrs={"class": ["vacancyAnnouncements2Row",
                                                       "vacancyAnnouncements2AlternatingRow"]}):
            title = jobType = deadline = deadlineFormatted = url = jobTitle = None
            for piece in ad.findAll("td"):
                if title is None:
                    # First cell: vacancy reference
                    title = piece.get_text()
                elif url is None:
                    # Second cell: link and job title
                    url = piece.find('a').get('href')
                    jobTitle = piece.get_text()
                elif deadline is None:
                    # Third cell: deadline (leading character stripped)
                    deadline = piece.get_text()[1:]
                    deadlineFormatted = data_format.dateFormatFull(
                        str(deadline).replace('/', ' '))
            print(jobTitle, deadlineFormatted)
            # The original mapped normal rows through typeOfGrade and
            # alternating rows through typeOfPost; typeOfPost matches the
            # other scrapers and is used for both here
            jobType = data_format.typeOfPost(title)
            # Insert job details in database
            eurojust.persist(int(eurojust_id), str(jobTitle).strip(), '', '',
                             str(title).strip(), deadlineFormatted,
                             str(url).strip(), '', jobType)
    print("#========================EUROJUST SCRAPING COMPLETE=================================")
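# Illustrative entry point (an assumption; not part of the original module).
# Running each scraper in its own try/except keeps one broken agency page or
# layout change from aborting the whole run.
if __name__ == '__main__':
    for scraper in (scrapCPVO, scrapEMA, scrapSatCen, scrapEIOPA, scrapBBI,
                    scrapCDT, scrapF4E, scrapEurojust):
        try:
            scraper()
        except Exception as exc:
            logging.error("%s failed: %s", scraper.__name__, exc)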