def scrapETF():
    """Scrape the ETF vacancies page and persist every listed job.

    The agency link and id come from ``etf.returnAgency('ETF')``. The job
    list is the ``<ul>`` nested inside the
    ``content_group piclist_content`` container; each entry is printed and
    stored through ``etf.persist`` with the job type fixed to ``'Other'``.
    """
    print("#========================= ETF SCRAPING =========================")
    # Database connection and agency retrieval
    etfData = etf.returnAgency('ETF')
    etf_link = etfData['link'][0]
    etf_id = etfData['id'][0]

    # Parse inside the context manager so the HTTP response is always closed.
    with urllib.request.urlopen(etf_link) as html:
        soup = BeautifulSoup(html, "html.parser")  # Create the soup

    start = soup.find('div', attrs={'class': 'content_group piclist_content'})
    # Find the jobs table
    Jobtable = start.find('div').div.ul
    for child in Jobtable.children:
        # Text nodes between list items (and items without a <div>) carry
        # no job data; skip them instead of failing on attribute access.
        entry = getattr(child, 'div', None)
        if entry is None:
            continue
        jobTitle = entry.h2.string
        # Encode spaces once so the printed and the stored link agree.
        jobLink = ("" + entry.p.a.get('href')).replace(' ', '%20')
        jobCode = entry.p.a.string
        # The deadline is taken from a fixed slice of the paragraph markup.
        jobDeadline = data_format.dateFormatFull(str(entry.p)[12:22])
        print(jobTitle, jobLink, jobCode, jobDeadline)
        etf.persist(int(etf_id), str(jobTitle).strip(), '', '', jobCode,
                    jobDeadline, jobLink, '', 'Other')
    print("#======================== ETF SCRAPING COMPLETE =================================")
# scrapEMA: looks up the EMA agency link/id via ema.returnAgency('EMA'), loads the
# page, walks the children of the first <table> inside the 'main-col' div and stores
# each row through ema.persist().
# NOTE(review): this copy of the body is incomplete - the right-hand sides of the
# jobTitle and jobCode assignments, the expression between `"" +` and `'href')`, and
# the argument of data_format.dateFormatFull( are missing. Restore them from version
# control before reformatting or running this function.
def scrapEMA(): print("#========================= EMA SCRAPING =========================") # Database connection and agency retrieval emaData = ema.returnAgency('EMA') ema_link = emaData['link'][0] ema_id = emaData['id'][0] html = urllib.request.urlopen(ema_link) soup = BeautifulSoup(html, "html.parser") # Create the soup start = soup.find('div',attrs={'class':'main-col'}) # Find the jobs table Jobtable = (start.find('table')) for child in Jobtable.children: if(child.find('td',attrs={'colspan':'top'})): continue jobTitle = jobCode = jobType = data_format.typeOfPost(jobCode) jobLink = "" +'href') jobDeadline = data_format.dateFormatFull( print(jobTitle,jobCode,jobType,jobLink,jobDeadline) ema.persist(int(ema_id), str(jobTitle).strip(), '', '', jobCode, jobDeadline, jobLink, '', jobType) print("#========================EMA SCRAPING COMPLETE=================================")
def scrapESMA():
    """Scrape the ESMA vacancies table and persist every row.

    The agency link and id come from ``esma.returnAgency('ESMA')``. For each
    ``<tr>`` of the table inside the ``search-page_main`` container the
    title, reference code and link are read from the row's cells; the
    deadline is built from the digits contained in the title text and the
    job type is derived from the reference code.
    """
    print("#========================= ESMA SCRAPING =========================")
    # Database connection and agency retrieval
    esmaData = esma.returnAgency('ESMA')
    esma_link = esmaData['link'][0]
    esma_id = esmaData['id'][0]

    # Parse inside the context manager so the HTTP response is always closed.
    with urllib.request.urlopen(esma_link) as html:
        soup = BeautifulSoup(html, "html.parser")  # Create the soup

    start = soup.find('div', attrs={'class': 'search-page_main'})
    # Find the jobs table
    Jobtable = start.table.tbody.findAll('tr')
    for child in Jobtable:
        titleSource = child.find('td', attrs={'class': 'esma_library-title'})
        jobCode = child.find('td', attrs={'class': 'esma_library-ref'}).string
        jobLink = titleSource.a.get('href')
        jobTitle = titleSource.string
        # Raw string: '\D' in a normal literal is an invalid escape sequence.
        jobDeadline = data_format.dateFormatFull(re.sub(r'\D', '', jobTitle))
        jobType = data_format.typeOfGrade(jobCode)
        print(jobTitle, jobCode, jobLink, jobDeadline, jobType)
        esma.persist(int(esma_id), str(jobTitle).strip(), '', '', jobCode,
                     jobDeadline, jobLink, '', jobType)
    print("#========================ESMA SCRAPING COMPLETE=================================")
def scrapEFSA():
    """Scrape the EFSA vacancies page and persist every listed job.

    The agency link and id come from ``efsa.returnAgency('EFSA')``. Every
    ``jlr_right_hldr`` block yields a title, a link and a department; the
    entry is stored through ``efsa.persist`` with the deadline field set to
    ``'SA'`` and the job type set to ``'Other'``.
    """
    print("#========================= EFSA SCRAPING =========================")
    # Database connection and agency retrieval
    efsaData = efsa.returnAgency('EFSA')
    efsa_link = efsaData['link'][0]
    efsa_id = efsaData['id'][0]

    # Parse inside the context manager so the HTTP response is always closed.
    with urllib.request.urlopen(efsa_link) as html:
        soup = BeautifulSoup(html, "html.parser")  # Create the soup

    start = soup.findAll('div', attrs={'class': 'jlr_right_hldr'})
    for child in start:
        jobTitle = child.p.string
        jobLink = child.p.a.get('href')
        deptHolder = child.find(
            'div', attrs={'class': 'jlr_content_half jlr_content_right'})
        # The department text sits three parse-order steps after the first
        # <span> of the holder's paragraph.
        jobDept = deptHolder.p.span.next_element.next_element.next_element.string
        print(jobTitle, jobLink, jobDept)
        efsa.persist(int(efsa_id), str(jobTitle).strip(), '', jobDept, '',
                     'SA', jobLink, '', 'Other')
    print("#======================== EFSA SCRAPING COMPLETE =================================")
# scrapGSA: looks up the GSA agency link/id via gsa.returnAgency('GSA'), then for each
# entry of the `pages` mapping loads gsa_link + "/" + <page path>, finds the first
# <tbody> and prints the first <td> of every row. Nothing is persisted here.
# NOTE(review): the right-hand side of `today =` is missing in this copy of the
# source, so the body cannot be reformatted or run as shown; `today` and `gsa_id`
# are not read anywhere else in the visible body.
def scrapGSA(): print("#========================= GSA SCRAPING =========================") # Database connection and agency retrieval gsaData = gsa.returnAgency('GSA') gsa_link = gsaData['link'][0] gsa_id = gsaData['id'][0] pages = { "CATA": "gsa/jobs-opportunities", "TR": "traineeship-listing", "SNE": "gsa-seconded-national-experts" } for pairs in pages: title = pairs.title().upper() page_link = (gsa_link + "/" + pages[title]) html = urllib.request.urlopen(page_link) soup = BeautifulSoup(html, "html.parser") # Create the soup start = soup.find('tbody') today = # Find the jobs table Jobtable = (start.findAll('tr')) for cell in Jobtable: td = cell.findAll('td') print(td[0])
# scrapCPVO: looks up the CPVO agency link/id via cpvo.returnAgency('CPVO'), loads the
# page, iterates the rows of the table whose summary is 'Vacancies' (skipping the row
# that holds the 'vacancy_title' header cell) and stores each row via cpvo.persist().
# NOTE(review): this copy of the body is incomplete - the right-hand sides of the
# jobTitle, jobLink and jobCode assignments are missing (only a dangling `'href')`
# remains) and data_format.dateFormatFull( has no argument. Restore them from version
# control before reformatting or running this function.
def scrapCPVO(): print("#========================= CPVO SCRAPING =========================") # Database connection and agency retrieval cpvoData = cpvo.returnAgency('CPVO') cpvo_link = cpvoData['link'][0] cpvo_id = cpvoData['id'][0] html = urllib.request.urlopen(cpvo_link) soup = BeautifulSoup(html, "html.parser") # Create the soup start = soup.find('table',attrs={'summary':'Vacancies'}) # Find the jobs table Jobtable = (start.findAll('tr')) for child in Jobtable: if(child.find('th',attrs={'id':'vacancy_title'})): continue #print (child) jobTitle = jobLink ='href') jobCode = jobType = data_format.typeOfPost(jobCode) jobDeadline = data_format.dateFormatFull( logging.debug (jobTitle,jobLink,jobCode,jobType,jobDeadline) cpvo.persist(int(cpvo_id), str(jobTitle).strip(), '', '', jobCode, jobDeadline, jobLink, '', jobType) print("#========================CPVO SCRAPING COMPLETE=================================")
def scrapESMA():
    """Scrape the ESMA vacancies table and persist every row.

    The agency link and id come from ``esma.returnAgency('ESMA')``. For each
    ``<tr>`` of the table inside the ``search-page_main`` container the
    title, reference code and link are read from the row's cells; the
    deadline is built from the digits contained in the title text and the
    job type is derived from the reference code.
    """
    print("#========================= ESMA SCRAPING =========================")
    # Database connection and agency retrieval
    esmaData = esma.returnAgency('ESMA')
    esma_link = esmaData['link'][0]
    esma_id = esmaData['id'][0]

    # Parse inside the context manager so the HTTP response is always closed.
    with urllib.request.urlopen(esma_link) as html:
        soup = BeautifulSoup(html, "html.parser")  # Create the soup

    start = soup.find('div', attrs={'class': 'search-page_main'})
    # Find the jobs table
    Jobtable = start.table.tbody.findAll('tr')
    for child in Jobtable:
        titleSource = child.find('td', attrs={'class': 'esma_library-title'})
        jobCode = child.find('td', attrs={'class': 'esma_library-ref'}).string
        jobLink = titleSource.a.get('href')
        jobTitle = titleSource.string
        # Raw string: '\D' in a normal literal is an invalid escape sequence.
        jobDeadline = data_format.dateFormatFull(re.sub(r'\D', '', jobTitle))
        jobType = data_format.typeOfGrade(jobCode)
        print(jobTitle, jobCode, jobLink, jobDeadline, jobType)
        esma.persist(int(esma_id), str(jobTitle).strip(), '', '', jobCode,
                     jobDeadline, jobLink, '', jobType)
    print("#========================ESMA SCRAPING COMPLETE=================================")
# scrapGSA: looks up the GSA agency link/id via gsa.returnAgency('GSA'), then for each
# entry of the `pages` mapping loads gsa_link + "/" + <page path>, finds the first
# <tbody> and prints the first <td> of every row. Nothing is persisted here.
# NOTE(review): the right-hand side of `today =` is missing in this copy of the
# source, so the body cannot be reformatted or run as shown; `today` and `gsa_id`
# are not read anywhere else in the visible body.
def scrapGSA(): print("#========================= GSA SCRAPING =========================") # Database connection and agency retrieval gsaData = gsa.returnAgency('GSA') gsa_link = gsaData['link'][0] gsa_id = gsaData['id'][0] pages = {"CATA": "gsa/jobs-opportunities", "TR": "traineeship-listing", "SNE": "gsa-seconded-national-experts"} for pairs in pages: title = pairs.title().upper() page_link = (gsa_link + "/" + pages[title]) html = urllib.request.urlopen(page_link) soup = BeautifulSoup(html, "html.parser") # Create the soup start = soup.find('tbody') today = # Find the jobs table Jobtable = (start.findAll('tr')) for cell in Jobtable: td = cell.findAll('td') print (td[0])
def scrapSatCen():
    """Scrape the SatCen job feed and persist every open, public position.

    The agency link and id come from ``satcen.returnAgency('SATCEN')``. The
    response is run through BeautifulSoup, re-encoded as UTF-8 text and
    decoded as JSON. Entries whose ``Status`` is ``'OPEN'`` and whose
    ``InternalOnly`` flag equals ``False`` are printed and stored through
    ``satcen.persist``.
    """
    print("#========================= SatCen SCRAPING =========================")
    SatCenData = satcen.returnAgency('SATCEN')
    SatCen_link = SatCenData['link'][0]
    SatCen_id = SatCenData['id'][0]

    # Parse inside the context manager so the HTTP response is always closed.
    with urllib.request.urlopen(SatCen_link) as SatCen_source:
        # Retrieve the list of jobs as bs4 navigable string
        soup = BeautifulSoup(SatCen_source, 'html.parser')

    # Convert to bytes
    bytesEncoded = soup.encode('utf-8')
    # Convert to string
    stringDecoded = bytesEncoded.decode('utf-8')
    # Convert to dictionary
    jobsdict = json.loads(stringDecoded)

    # Browse dictionary and select available positions
    for job in jobsdict:
        if (job['Status'] == 'OPEN') and (job['InternalOnly'] == False):
            link = '' + job['Id']
            # Compute the derived values once and reuse them for both the
            # log line and the database row.
            jobType = format.typeOfPost(job['TypePost'])
            jobDeadline = job['ExpireOn'][:10]
            print(job['Reference'], jobDeadline, job['Title'], jobType,
                  job['WorkUnit'], link)
            satcen.persist(SatCen_id, job['Title'], job['Reference'],
                           job['WorkUnit'], '', jobDeadline, link, '',
                           jobType)
    print("#========================SATCEN SCRAPING COMPLETE=================================")
def scrapEFSA():
    """Scrape the EFSA vacancies page and persist every listed job.

    The agency link and id come from ``efsa.returnAgency('EFSA')``. Every
    ``jlr_right_hldr`` block yields a title, a link and a department; the
    entry is stored through ``efsa.persist`` with the deadline field set to
    ``'SA'`` and the job type set to ``'Other'``.
    """
    print("#========================= EFSA SCRAPING =========================")
    # Database connection and agency retrieval
    efsaData = efsa.returnAgency('EFSA')
    efsa_link = efsaData['link'][0]
    efsa_id = efsaData['id'][0]

    # Parse inside the context manager so the HTTP response is always closed.
    with urllib.request.urlopen(efsa_link) as html:
        soup = BeautifulSoup(html, "html.parser")  # Create the soup

    start = soup.findAll('div', attrs={'class': 'jlr_right_hldr'})
    for child in start:
        jobTitle = child.p.string
        jobLink = child.p.a.get('href')
        deptHolder = child.find(
            'div', attrs={'class': 'jlr_content_half jlr_content_right'})
        # The department text sits three parse-order steps after the first
        # <span> of the holder's paragraph.
        jobDept = deptHolder.p.span.next_element.next_element.next_element.string
        print(jobTitle, jobLink, jobDept)
        efsa.persist(int(efsa_id), str(jobTitle).strip(), '', jobDept, '',
                     'SA', jobLink, '', 'Other')
    print("#======================== EFSA SCRAPING COMPLETE =================================")
def scrapETF():
    """Scrape the ETF vacancies page and persist every listed job.

    The agency link and id come from ``etf.returnAgency('ETF')``. The job
    list is the ``<ul>`` nested inside the
    ``content_group piclist_content`` container; each entry is printed and
    stored through ``etf.persist`` with the job type fixed to ``'Other'``.
    """
    print("#========================= ETF SCRAPING =========================")
    # Database connection and agency retrieval
    etfData = etf.returnAgency('ETF')
    etf_link = etfData['link'][0]
    etf_id = etfData['id'][0]

    # Parse inside the context manager so the HTTP response is always closed.
    with urllib.request.urlopen(etf_link) as html:
        soup = BeautifulSoup(html, "html.parser")  # Create the soup

    start = soup.find('div', attrs={'class': 'content_group piclist_content'})
    # Find the jobs table
    Jobtable = start.find('div').div.ul
    for child in Jobtable.children:
        # Text nodes between list items (and items without a <div>) carry
        # no job data; skip them instead of failing on attribute access.
        entry = getattr(child, 'div', None)
        if entry is None:
            continue
        jobTitle = entry.h2.string
        # Encode spaces once so the printed and the stored link agree.
        jobLink = ("" + entry.p.a.get('href')).replace(' ', '%20')
        jobCode = entry.p.a.string
        # The deadline is taken from a fixed slice of the paragraph markup.
        jobDeadline = data_format.dateFormatFull(str(entry.p)[12:22])
        print(jobTitle, jobLink, jobCode, jobDeadline)
        etf.persist(int(etf_id), str(jobTitle).strip(), '', '', jobCode,
                    jobDeadline, jobLink, '', 'Other')
    print("#======================== ETF SCRAPING COMPLETE =================================")
# scrapEIOPA: looks up the EIOPA agency link/id via eiopa.returnAgency('EIOPA'), loads
# the page and iterates the <tbody> of the 'ms-rteTable-EIOPATable' table, skipping
# rows whose first class is the header-row class. The job code is cut out of the link
# with a regex, the deadline is read either from the first content node or from the
# last <strong> element (text after ':'), and each row is stored via eiopa.persist().
# NOTE(review): this copy of the body is incomplete - the right-hand sides of the
# jobTitle and deadlinePosition assignments and the expression between `"" +` and
# `'href')` are missing. Restore them from version control before reformatting or
# running this function.
def scrapEIOPA(): print( "#========================= EIOPA SCRAPING =========================") # Database connection and agency retrieval eiopaData = eiopa.returnAgency('EIOPA') eiopa_link = eiopaData['link'][0] eiopa_id = eiopaData['id'][0] html = urllib.request.urlopen(eiopa_link) soup = BeautifulSoup(html, "html.parser") # Create the soup start = soup.find('table', attrs={'class': 'ms-rteTable-EIOPATable'}) for tr in start.tbody: if str(tr['class'][0]) == "ms-rteTableHeaderRow-EIOPATable": continue jobTitle = jobLink = "" +'href') jobCode = str(re.match(r'(.*?)%20', jobLink).group(1)[29:]) jobType = data_format.typeOfPost(jobCode) deadlinePosition = if re.match('\w', jobTitle[0]) is None: jobTitle = jobTitle[1:len(jobTitle)] if (len(deadlinePosition.contents[0].string) > 2): jobDeadline = deadlinePosition.contents[0].string jobDeadline = str(jobDeadline).strip() jobDeadline = data_format.dateFormatFull(jobDeadline[1:]) else: extendedDeadlines = deadlinePosition.findAll('strong') newDeadline = extendedDeadlines[len(extendedDeadlines) - 1].string newDeadline = str(newDeadline.split(':')[1]).strip() jobDeadline = data_format.dateFormatFull(newDeadline) print(jobTitle, jobCode, jobType, jobDeadline) eiopa.persist(int(eiopa_id), jobTitle, '', '', jobCode, jobDeadline, jobLink, '', jobType) print( "#========================EIOPA SCRAPING COMPLETE=================================" ) #scrapEIOPA()
# scrapEBA: looks up the EBA agency link/id via eba.returnAgency('EBA'), then for each
# entry of the `pages` mapping loads eba_link + "/" + <page path> and iterates the rows
# of the 'Tabular' table. Rows whose cells cannot be read are skipped by the bare
# except; rows with status "ongoing" and a date later than `today` are stored via
# eba.persist() with the page key as job type.
# NOTE(review): this copy of the body is incomplete - the right-hand side of
# `today =` and the argument of data_format.dateFormatFull( are missing. Restore them
# from version control before reformatting or running this function.
def scrapEBA(): print("#========================= EBA SCRAPING =========================") # Database connection and agency retrieval ebaData = eba.returnAgency('EBA') eba_link = ebaData['link'][0] eba_id = ebaData['id'][0] pages = {"CA": "contract-agents", "TA": "temporary-agents", "SNE": "national-experts-on-secondment"} for pairs in pages: title = pairs.title().upper() page_link = (eba_link + "/" + pages[title]) html = urllib.request.urlopen(page_link) soup = BeautifulSoup(html, "html.parser") # Create the soup start = soup.find('table', attrs={'class': 'Tabular'}) today = # Find the jobs table Jobtable = (start.findAll('tr')) for cell in Jobtable: td = cell.findAll('td') try: status = td[3].string.strip() rawDate = td[2].string searchDate = re.match(r'(.*)at', rawDate) date = data_format.dateFormatFull( except: continue if (today < date) and (status == "ongoing"): jobLink = eba_link[:24] + td[0].a.get('href') jobTitle = td[0].string jobCode = td[1].string.strip() jobDeadline = date jobType = title print(jobCode, jobTitle, jobType, jobDeadline, jobLink) eba.persist(int(eba_id), str(jobTitle).strip(), '', '', jobCode, jobDeadline, jobLink, '', jobType) else: continue print("#========================EBA SCRAPING COMPLETE=================================")
# scrapBBI: looks up the BBI agency link/id via bbi.returnAgency('BBI'), loads the
# page and iterates the rows of the 'ui-datatable-data ui-widget-content' <tbody>.
# Code, title, type, grade, deadline and link are read from consecutive siblings and
# each row is stored via bbi.persist() (jobGrade is printed but not persisted).
# NOTE(review): `jobCodeLocation` is read on the right-hand side of the very statement
# that first assigns it, so its initial value (presumably the row's first cell -
# confirm) is missing in this copy of the source. Restore it from version control
# before reformatting or running this function.
def scrapBBI(): print("#========================= BBI SCRAPING =========================") # Database connection and agency retrieval bbiData = bbi.returnAgency('BBI') bbi_link = bbiData['link'][0] bbi_id = bbiData['id'][0] html = urllib.request.urlopen(bbi_link) soup = BeautifulSoup(html, "html.parser") # Create the soup start = soup.find('tbody', attrs={'class': 'ui-datatable-data ui-widget-content'}) # Find the jobs table Jobtable = (start.findAll('tr')) for child in Jobtable: jobCodeLocation = jobTitleLocation = jobCodeLocation.next_sibling jobTypeLocation = jobTitleLocation.next_sibling jobGradeLocation = jobTypeLocation.next_sibling jobDeadlineLocation = jobGradeLocation.next_sibling jobLinkLocation = jobDeadlineLocation.next_sibling jobCode = jobCodeLocation.string jobTitle = jobTitleLocation.string jobType = data_format.typeOfPost(jobTypeLocation.string) jobGrade = data_format.typeOfGrade(jobGradeLocation.string) jobDeadline = data_format.dateFormatFull( jobDeadlineLocation.string[:10]) jobLink = jobLinkLocation.a.get('href') print(jobCode, jobTitle, jobType, jobGrade, jobDeadline, jobLink) bbi.persist(int(bbi_id), str(jobTitle).strip(), '', '', jobCode, jobDeadline, jobLink, '', jobType) print( "#========================BBI SCRAPING COMPLETE=================================" )
def scrapEACEA():
    """Scrape the EACEA vacancies table and persist every open position.

    The agency link and id come from ``eacea.returnAgency('EACEA')``. Each
    child of each table row is probed as a potential title cell: the status
    is read four siblings further on and the deadline two siblings further
    on. Children that do not have this shape are skipped. Titles containing
    ``'CA-FG'`` are stored with job type ``'CA'``, all others as ``'Other'``.
    """
    print("#========================= EACEA SCRAPING =========================")
    # Database connection and agency retrieval
    eaceaData = eacea.returnAgency('EACEA')
    eacea_link = eaceaData['link'][0]
    eacea_id = eaceaData['id'][0]

    # Parse inside the context manager so the HTTP response is always closed.
    with urllib.request.urlopen(eacea_link) as html:
        soup = BeautifulSoup(html, "html.parser")  # Create the soup

    start = soup.find('table', attrs={'class': 'table table-striped table-hover views-table cols-4'})
    # Find the jobs table
    Jobtable = start.find('tbody')
    rows = Jobtable.findAll('tr')
    for posts in rows:
        for tds in posts.children:
            try:
                status = tds.next_sibling.next_sibling.next_sibling.next_sibling.span.string
                if status != 'Open':
                    continue
                jobLink = "" + tds.a.get('href')
                jobTitle = tds.a.string
                deadline = tds.next_sibling.next_sibling.span.string
                jobDeadline = data_format.dateFormatFull(deadline[:10])
                # Membership test: `find(...) > 0` missed titles that start
                # with the marker (index 0).
                if 'CA-FG' in jobTitle:
                    jobType = "CA"
                else:
                    jobType = "Other"
                print(jobTitle, jobLink, jobType, jobDeadline)
                eacea.persist(int(eacea_id), str(jobTitle).strip(), '', '',
                              '', jobDeadline, jobLink, '', jobType)
            except Exception:
                # Best effort: most children are not title cells. Unlike a
                # bare except, this still lets KeyboardInterrupt/SystemExit
                # through.
                continue
    print("#========================EACEA SCRAPING COMPLETE=================================")
def scrapCDT():
    """Scrape the CDT vacancies page and persist every listed job.

    The agency link and id come from ``cdt.returnAgency('CDT')``. Every
    ``ms-rtestate-read ms-rte-wpbox`` block describes one post category: the
    first nine characters of its ``<span title=...>`` give the post type,
    and each child of its ``<ul>`` is tried as a job entry. Children that do
    not have the expected shape are skipped.
    """
    print("#========================= CDT SCRAPING =========================")
    # Database connection and agency retrieval
    cdtData = cdt.returnAgency('CDT')
    cdt_link = cdtData['link'][0]
    cdt_id = cdtData['id'][0]

    # Parse inside the context manager so the HTTP response is always closed.
    with urllib.request.urlopen(cdt_link) as html:
        soup = BeautifulSoup(html, "html.parser")  # Create the soup

    start = soup.findAll('div', attrs={'class': 'ms-rtestate-read ms-rte-wpbox'})
    for child in start:
        jobTitle = child.span.attrs['title']
        postType = jobTitle[:9].strip()
        for post in child.ul:
            try:
                job = post.find('h3')
                jobCode = job.a.string.strip()
                jobLink = "" + job.a.get('href').replace(' ', '%20')
                jobTitle = post.find('p').div.span.font.string.strip()
                jobType = data_format.typeOfPost(postType)
                jobDeadline = data_format.dateFormatFull('SA')
                print(jobCode, jobLink, jobTitle, jobType, jobDeadline)
                cdt.persist(int(cdt_id), str(jobTitle).strip(), '', '',
                            jobCode, jobDeadline, jobLink, '', jobType)
            except Exception:
                # Best effort: text nodes and malformed items are skipped.
                # Unlike a bare except, this still lets
                # KeyboardInterrupt/SystemExit through.
                continue
    print("#========================CDT SCRAPING COMPLETE=================================")
def scrapF4E():
    """Scrape the F4E careers page and persist every listed job.

    The agency link and id come from ``F4E.returnAgency('F4E')``. Every
    element whose class starts with ``careersPurple2`` names a contract
    type; the job details are read from the element two siblings further
    on. Entries whose details cannot be read are skipped.
    """
    print("#========================= F4E SCRAPING =========================")
    F4EData = F4E.returnAgency('F4E')
    F4E_link = F4EData['link'][0]
    F4E_id = F4EData['id'][0]

    # Parse inside the context manager so the HTTP response is always closed.
    with urllib.request.urlopen(F4E_link) as html:
        soup = BeautifulSoup(html, "html.parser")

    start = soup.findAll(attrs={"class": re.compile("^careersPurple2")})
    for contractType in start:
        contract = data_format.typeOfPost(contractType.a.string)
        jobInfo = contractType.next_sibling.next_sibling
        try:
            deadline = data_format.dateFormatFull(
                jobInfo.find(attrs={"class": "careersDate"}).span.string)
            jobTitle = jobInfo.find(attrs={"class": "careersTitle"}).string
            # The pdf anchor provides both the job code and the link.
            pdfAnchor = jobInfo.find(attrs={"class": "pdf"})
            jobCode = pdfAnchor.string
            jobLink = "" + pdfAnchor.get("href")
            print(deadline, jobTitle.strip(), jobCode, jobLink, contract)
            F4E.persist(F4E_id, jobTitle.strip(), jobCode, '', '', deadline,
                        jobLink, '', contract)
        except Exception:
            # Best effort: entries without the expected markup are skipped.
            # Unlike a bare except, this still lets
            # KeyboardInterrupt/SystemExit through.
            continue
    print("#========================F4E SCRAPING COMPLETE=================================")
# scrapBBI: looks up the BBI agency link/id via bbi.returnAgency('BBI'), loads the
# page and iterates the rows of the 'ui-datatable-data ui-widget-content' <tbody>.
# Code, title, type, grade, deadline and link are read from consecutive siblings and
# each row is stored via bbi.persist() (jobGrade is printed but not persisted).
# NOTE(review): `jobCodeLocation` is read on the right-hand side of the very statement
# that first assigns it, so its initial value (presumably the row's first cell -
# confirm) is missing in this copy of the source. Restore it from version control
# before reformatting or running this function.
def scrapBBI(): print("#========================= BBI SCRAPING =========================") # Database connection and agency retrieval bbiData = bbi.returnAgency('BBI') bbi_link = bbiData['link'][0] bbi_id = bbiData['id'][0] html = urllib.request.urlopen(bbi_link) soup = BeautifulSoup(html, "html.parser") # Create the soup start = soup.find('tbody',attrs={'class':'ui-datatable-data ui-widget-content'}) # Find the jobs table Jobtable = (start.findAll('tr')) for child in Jobtable: jobCodeLocation = jobTitleLocation = jobCodeLocation.next_sibling jobTypeLocation = jobTitleLocation.next_sibling jobGradeLocation = jobTypeLocation.next_sibling jobDeadlineLocation = jobGradeLocation.next_sibling jobLinkLocation = jobDeadlineLocation.next_sibling jobCode = jobCodeLocation.string jobTitle = jobTitleLocation.string jobType = data_format.typeOfPost(jobTypeLocation.string) jobGrade = data_format.typeOfGrade(jobGradeLocation.string) jobDeadline = data_format.dateFormatFull(jobDeadlineLocation.string[:10]) jobLink = jobLinkLocation.a.get('href') print (jobCode,jobTitle,jobType,jobGrade,jobDeadline,jobLink) bbi.persist(int(bbi_id), str(jobTitle).strip(), '', '', jobCode, jobDeadline, jobLink, '', jobType) print("#========================BBI SCRAPING COMPLETE=================================")
# scrapEMA: looks up the EMA agency link/id via ema.returnAgency('EMA'), loads the
# page, walks the children of the first <table> inside the 'main-col' div and stores
# each row through ema.persist().
# NOTE(review): this copy of the body is incomplete - the right-hand sides of the
# jobTitle and jobCode assignments, the expression between `"" +` and `'href')`, and
# the argument of data_format.dateFormatFull( are missing. Restore them from version
# control before reformatting or running this function.
def scrapEMA(): print("#========================= EMA SCRAPING =========================") # Database connection and agency retrieval emaData = ema.returnAgency('EMA') ema_link = emaData['link'][0] ema_id = emaData['id'][0] html = urllib.request.urlopen(ema_link) soup = BeautifulSoup(html, "html.parser") # Create the soup start = soup.find('div', attrs={'class': 'main-col'}) # Find the jobs table Jobtable = (start.find('table')) for child in Jobtable.children: if (child.find('td', attrs={'colspan': 'top'})): continue jobTitle = jobCode = jobType = data_format.typeOfPost(jobCode) jobLink = "" + 'href') jobDeadline = data_format.dateFormatFull( print(jobTitle, jobCode, jobType, jobLink, jobDeadline) ema.persist(int(ema_id), str(jobTitle).strip(), '', '', jobCode, jobDeadline, jobLink, '', jobType) print( "#========================EMA SCRAPING COMPLETE=================================" )
# scrapEDA: looks up the EDA agency link/id via eda.returnAgency('EDA') and requests
# the page with a browser User-Agent header. Every <h4> whose text is 'Temporary
# Agents', 'Contractual Agents' or 'Seconded National Experts' is mapped to job type
# TA / CA / SNE; each <li> link under it is fetched, and the post title, grade and
# publication end date are read by element id and stored via eda.persist().
# NOTE(review): this copy of the body is incomplete - the right-hand sides of
# `html =`, `ta_html =` and `deadline =` are missing. As shown, `deadline` would also
# be unbound at the persist call when the date conversion fails. Restore the missing
# expressions from version control before reformatting or running this function.
def scrapEDA(): print("#========================= EDA SCRAPING =========================") edaData = eda.returnAgency('EDA') eda_link = edaData['link'][0] eda_id = edaData['id'][0] user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36" headers = {'User-Agent': user_agent} data = '' data = data.encode('ascii') req = urllib.request.Request(eda_link, data, headers) #print(emsa_link) with urllib.request.urlopen(req) as response: html = soup = BeautifulSoup(html, "html.parser") # Iterate through the internal groups for post_type in soup.find_all("h4"): internal_type = post_type.contents[0].strip() if internal_type not in ('Temporary Agents', 'Contractual Agents', 'Seconded National Experts'): continue elif internal_type == 'Temporary Agents': job_type = 'TA' elif internal_type == 'Contractual Agents': job_type = 'CA' else: job_type = 'SNE' print(post_type.contents[0]) internals = post_type.next_element.next_element.next_element.find_all( "li") #try: # Iterate the URLs for each TA post for post in internals: ta_link = eda_link + post.find('a').get("href") ta_req = urllib.request.Request(ta_link, data, headers) with urllib.request.urlopen(ta_req) as response: ta_html = ta_soup = BeautifulSoup(ta_html, "html.parser") #Link print(ta_link) #Post post_title = ta_soup.findAll( attrs={"id": "cphMain_VacNotice_LabPost" })[0].contents[0].strip() print(post_title) #Grade post_grade = ta_soup.findAll( attrs={"id": "cphMain_VacNotice_LabGrade" })[0].contents[0].strip() print(post_grade) #Deadline post_deadline = ta_soup.findAll( attrs={"id": "cphMain_VacNotice_LabPublicationDateEnd"}) print(post_deadline[0].contents[0].strip() + "\n") # Convert date try: date_object = datetime.strptime( post_deadline[0].contents[0].strip(), '%d %B %Y') deadline = #print (deadline) except: print("could not modify " + str(post_deadline)) pass # Insert job details in database eda.persist(int(eda_id), str(post_title).strip(), 
str(post_grade), '', '', deadline, str(ta_link).strip(), '', job_type) #except: # pass print( "#========================EDA SCRAPING COMPLETE=================================" )
# scrapEPSO: looks up the EPSO link via epso.returnAgency('EPSO') and pages through
# the listing while a 'view-content' element is found. For every table row it reads
# title, grade, institution, link, deadline and contract type by CSS class, resolves
# the institution through epso.EPSOinstitution(), skips rows whose third lookup field
# equals 1, and stores the rest via epso.persist().
# NOTE(review): this copy of the body is incomplete - the `text =` assignments (only
# `'utf-8')` remains) and the call that produces `inst_code` (only its arguments
# remain) are truncated. Restore them from version control before reformatting or
# running this function.
# NOTE(review): `epso_link` is extended with str(page) on every pass, so page numbers
# appear to accumulate in the URL - confirm this is intended. `contract` and `i` are
# assigned but not read in the visible body.
def scrapEPSO(): print("#========================= EPSO SCRAPING =========================") epsoData = epso.returnAgency('EPSO') epso_link = epsoData['link'][0] html = urllib.request.urlopen(epso_link) text ='utf-8') soup = BeautifulSoup(text, "html.parser") #Initiate scrap start = soup.find(attrs={"class": "view-content"}) page = 0 while (start is not None): table = start.tbody.findAll("tr") for tr in table: # Retrieve job information print( tr.find(attrs={ "class": "views-field views-field-field-epso-locations" }).get_text()) jobTitle = tr.find(attrs={ "class": "views-field views-field-title-field" }).get_text() grade = tr.find(attrs={ "class": "views-field views-field-field-epso-grade" }).get_text() institute = tr.find(attrs={ "class": "views-field views-field-field-epso-institution-id" }).get_text() url = "" + tr.find( attrs={ "class": "views-field views-field-title-field" }).a.get("href") date_deadline = tr.find( attrs={ "class": "views-field views-field-field-epso-deadline" }).get_text() contract = tr.find(attrs={ "class": "views-field views-field-field-epso-type-of-contract" }).get_text() deadline = data_format.dateFormatFull( date_deadline.split("-")[0].strip()) # Extract the agency code try: inst_code ='\((.*?)\)', institute).groups()[0] except: inst_code = institute check_institute = epso.EPSOinstitution(inst_code) #print ("inst:" + check_institute) if check_institute[2] == 1: continue # Retrieve the agency's id from eu_institute inst_id = check_institute[0] # Retrieve the agency's type from eu_institute inst_type = check_institute[1] # Determine the grade jobType = data_format.typeOfGrade(grade) # Insert job details in database epso.persist(inst_id, jobTitle, str(grade).strip(), str(institute).strip(), '', deadline, str(url).strip(), inst_type, jobType) print(inst_id, jobTitle, str(grade).strip(), str(institute).strip(), '', deadline, str(url).strip(), inst_type, jobType) page = int(page) + 1 epso_link = epso_link + str(page) html = 
urllib.request.urlopen(epso_link) text ='utf-8') soup = BeautifulSoup(text, "html.parser") start = soup.find(attrs={"class": "view-content"}) i = 2 print( "#========================EPSO SCRAPING COMPLETE=================================" )
# scrapEMSA: looks up the EMSA agency link/id via emsa.returnAgency('EMSA') and
# requests the page with a browser User-Agent header. For every 'sectiontableentry'
# element it reads the code from the <th>, the link from the first <a>, the
# description from the first <td> and the deadline from the third <td>, parses the
# deadline as '%d.%m.%Y', derives the job type from code + description and stores the
# entry via emsa.persist().
# NOTE(review): this copy of the body is incomplete - the right-hand sides of `html =`
# and `deadline =` are missing. As shown, the except branch also reads `deadline`,
# which may be unbound at that point. Restore the missing expressions from version
# control before reformatting or running this function.
def scrapEMSA(): print("#========================= EMSA SCRAPING =========================") emsaData = emsa.returnAgency('EMSA') emsa_link = emsaData['link'][0] emsa_id = emsaData['id'][0] user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36" headers = {'User-Agent': user_agent} data = '' data = data.encode('ascii') req = urllib.request.Request(emsa_link, data, headers) with urllib.request.urlopen(req) as response: html = soup = BeautifulSoup(html, "html.parser") # Find the first ad start = soup.findAll(attrs={"class": "sectiontableentry"}) # Iterate through the tables for cell in start: ad_code = cell.find("th").get_text() print("Job Code:" + ad_code.strip()) ad_url = "" + cell.find('a').get('href') print("Job URL:" + ad_url) count = 0 for ad in cell.findAll("td"): if count == 0: ad_description = ad.get_text() print("description: " + ad_description) if count == 2: ad_deadline = ad.get_text() print("deadline: " + ad_deadline) count = count + 1 # Convert date try: date_object = datetime.strptime(ad_deadline, '%d.%m.%Y') deadline = #print (deadline) except: print("could not modify " + deadline) pass ad_raw = ad_code + " " + ad_description # Identify type jobType = data_format.typeOfGrade(ad_raw) print(jobType) # Insert job details in database emsa.persist(int(emsa_id), str(ad_description).strip(), '', '', str(ad_code).strip(), deadline, str(ad_url).strip(), '', jobType) print( "#========================EMSA SCRAPING COMPLETE=================================" )
# scrapEDA: looks up the EDA agency link/id via eda.returnAgency('EDA') and requests
# the page with a browser User-Agent header. Every <h4> whose text is 'Temporary
# Agents', 'Contractual Agents' or 'Seconded National Experts' is mapped to job type
# TA / CA / SNE; each <li> link under it is fetched, and the post title, grade and
# publication end date are read by element id and stored via eda.persist().
# NOTE(review): this copy of the body is incomplete - the right-hand sides of
# `html =`, `ta_html =` and `deadline =` are missing. As shown, `deadline` would also
# be unbound at the persist call when the date conversion fails. Restore the missing
# expressions from version control before reformatting or running this function.
def scrapEDA(): print("#========================= EDA SCRAPING =========================") edaData = eda.returnAgency('EDA') eda_link = edaData['link'][0] eda_id = edaData['id'][0] user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36" headers = { 'User-Agent' : user_agent } data = '' data = data.encode('ascii') req = urllib.request.Request(eda_link,data,headers) #print(emsa_link) with urllib.request.urlopen(req) as response: html = soup = BeautifulSoup(html, "html.parser") # Iterate through the internal groups for post_type in soup.find_all("h4"): internal_type = post_type.contents[0].strip() if internal_type not in ('Temporary Agents','Contractual Agents','Seconded National Experts'): continue elif internal_type == 'Temporary Agents': job_type = 'TA' elif internal_type == 'Contractual Agents': job_type = 'CA' else: job_type = 'SNE' print (post_type.contents[0]) internals = post_type.next_element.next_element.next_element.find_all("li") #try: # Iterate the URLs for each TA post for post in internals: ta_link = eda_link + post.find('a').get("href") ta_req = urllib.request.Request(ta_link,data,headers) with urllib.request.urlopen(ta_req) as response: ta_html = ta_soup = BeautifulSoup(ta_html, "html.parser") #Link print (ta_link) #Post post_title = ta_soup.findAll(attrs={"id":"cphMain_VacNotice_LabPost"})[0].contents[0].strip() print (post_title) #Grade post_grade = ta_soup.findAll(attrs={"id":"cphMain_VacNotice_LabGrade"})[0].contents[0].strip() print (post_grade) #Deadline post_deadline = ta_soup.findAll(attrs={"id":"cphMain_VacNotice_LabPublicationDateEnd"}) print (post_deadline[0].contents[0].strip()+ "\n") # Convert date try: date_object = datetime.strptime(post_deadline[0].contents[0].strip(), '%d %B %Y') deadline = #print (deadline) except: print ("could not modify " + str(post_deadline)) pass # Insert job details in database eda.persist(int(eda_id), str(post_title).strip(), 
str(post_grade), '', '', deadline, str(ta_link).strip(), '', job_type) #except: # pass print("#========================EDA SCRAPING COMPLETE=================================")
# scrapEUROPOL: looks up the EUROPOL agency link/id via europol.returnAgency('EUROPOL'),
# loads the page and iterates every element whose class starts with
# 'views-row views-row-'. Deadline, contract type, title, department, reference number
# and link are read per advert (adverts that fail to parse are skipped by the bare
# except), a job type is chosen from regex patterns applied to the reference number,
# and the advert is stored via europol.persist().
# The nested dateFormatFull() tries a fixed list of strptime formats and returns the
# ISO date string of the first match; when none matches it prints 'Bad Date:' and
# returns the input unchanged.
# NOTE(review): in this copy of the source the function name is missing from every
# regex call in the if/elif chain (each branch starts directly with the pattern
# string). Restore the calls from version control before reformatting or running
# this function.
def scrapEUROPOL(): print( "#========================= EUROPOL SCRAPING =========================" ) europolData = europol.returnAgency('EUROPOL') europol_link = europolData['link'][0] europol_id = europolData['id'][0] html = urllib.request.urlopen(europol_link) soup = BeautifulSoup(html, "html.parser") def dateFormatFull(inputDate): dnotz = None for form in [ '%d %b %Y', '%d %b %y', '%d %B %Y', '%d/%m/%Y', '%d.%m.%Y' ]: try: dnotz = datetime.strptime(inputDate, form).date() return str(dnotz) except: continue if dnotz is None: print('Bad Date:', inputDate) return str(inputDate) # Find all ads start = soup.findAll(attrs={"class": re.compile("^views-row views-row-")}) #print ("posts found " + str(len(start))) # Iterate through the divs for advert in start: try: deadline = advert.find(attrs={ "class": "views-field views-field-deadline" }).findAll('span')[1].get_text() deadlineFormatted = dateFormatFull(deadline) print("Deadline:", dateFormatFull(deadline)) print( "Contract Type:", advert.find(attrs={ "class": "views-field views-field-contract-type" }).find('span').get_text()) jobTitle = advert.find("a").get_text() print("Title:", jobTitle) dept = advert.find(attrs={ "class": "views-field views-field-department" }).find('span').get_text() print("Department:", dept) title = advert.find( attrs={ "class": "views-field views-field-reference-number" }).find('span').get_text() print("Reference Number:", title) url = "" + advert.find("a").get("href") print("Link:", url) except: continue if'(AD+\d{1,2}?|AD +\d{1,2}?|TA)', title) is not None: jobType = "AD" elif'(AST+\d{1,2}?|AST +\d{1,2}?)', title) is not None: jobType = "AST" elif'(FG+\d|FG+III|FG+IV|Function Groups|CA)', title) is not None: jobType = "CA" elif'(trainee)', title, re.IGNORECASE) is not None: jobType = "Trainee" elif'(SNE|Seconded)', title, re.IGNORECASE) is not None: jobType = "SNE" else: jobType = "Other" print(int(europol_id), str(jobTitle).strip(), '', str(dept).strip(), str(title).strip(), 
deadlineFormatted, str(url).strip(), '', jobType) europol.persist(int(europol_id), str(jobTitle).strip(), '', str(dept).strip(), str(title).strip(), deadlineFormatted, str(url).strip(), '', jobType) print( "#========================EUROPOL SCRAPING COMPLETE=================================" )
def _persistEurojustRows(table, rowClass, classify, agencyId):
    """Persist every vacancy row of *table* that carries CSS class *rowClass*.

    The first three cells of a row hold the reference, the linked job title
    and the deadline. *classify* maps the reference text to a job type.
    """
    for ad in table.findAll("tr", attrs={"class": rowClass}):
        cells = ad.findAll("td")
        if len(cells) < 3:
            # Incomplete row: skip it rather than persisting values left
            # over from the previous row.
            continue
        title = cells[0].get_text()
        url = cells[1].find('a').get('href')
        jobTitle = cells[1].get_text()
        # The first character of the deadline cell is dropped before parsing.
        deadline = cells[2].get_text()[1:]
        deadlineFormatted = data_format.dateFormatFull(
            str(deadline).replace('/', ' '))
        print(jobTitle, deadlineFormatted)
        jobType = classify(title)
        # Insert job details in database
        eurojust.persist(int(agencyId), str(jobTitle).strip(), '', '',
                         str(title).strip(), deadlineFormatted,
                         str(url).strip(), '', jobType)


def scrapEurojust():
    """Scrape the Eurojust vacancy tables and persist every listed job.

    The agency link and id come from ``eurojust.returnAgency('EUROJUST')``.
    Within each ``vacancyAnnouncements2`` table the plain rows are handled
    first, then the alternating rows.
    """
    print("#========================= EUROJUST SCRAPING =========================")
    # Database connection and agency retrieval
    eurojustData = eurojust.returnAgency('EUROJUST')
    eurojust_link = eurojustData['link'][0]
    eurojust_id = eurojustData['id'][0]

    # Parse inside the context manager so the HTTP response is always closed.
    with urllib.request.urlopen(eurojust_link) as html:
        soup = BeautifulSoup(html, "html.parser")

    # Iterate through the tables
    for table in soup.findAll("table", attrs={"class": "vacancyAnnouncements2"}):
        # NOTE(review): plain rows are classified with typeOfGrade and
        # alternating rows with typeOfPost, as before - confirm the
        # difference is intended.
        _persistEurojustRows(table, "vacancyAnnouncements2Row",
                             data_format.typeOfGrade, eurojust_id)
        _persistEurojustRows(table, "vacancyAnnouncements2AlternatingRow",
                             data_format.typeOfPost, eurojust_id)
    print("#========================EUROJUST SCRAPING COMPLETE=================================")
def scrapEPSO():
    """Scrape the paginated EPSO vacancy listing and persist every job found.

    The listing link comes from ``epso.returnAgency('EPSO')``. The first
    request uses the link as stored; every following request appends the page
    number (1, 2, ...) to that same stored link. Scraping stops at the first
    page that has no ``view-content`` element or no table body inside it.

    For each table row the title, grade, institution, link and deadline are
    read from the ``views-field-*`` cells. The institution is resolved through
    ``epso.EPSOinstitution``; rows whose result has ``1`` in its third field
    are skipped, the others are stored with ``epso.persist``.
    """
    print("#========================= EPSO SCRAPING =========================")
    epsoData = epso.returnAgency('EPSO')
    epso_link = epsoData['link'][0]

    # Initiate scrap
    page = 0
    page_link = epso_link
    while True:
        with urllib.request.urlopen(page_link) as response:
            text = response.read().decode('utf-8')
        soup = BeautifulSoup(text, "html.parser")
        start = soup.find(attrs={"class": "view-content"})
        if start is None or start.tbody is None:
            break

        for tr in start.tbody.findAll("tr"):
            # Retrieve job information
            print(tr.find(attrs={"class": "views-field views-field-field-epso-locations"}).get_text())
            title_cell = tr.find(attrs={"class": "views-field views-field-title-field"})
            jobTitle = title_cell.get_text()
            grade = tr.find(attrs={"class": "views-field views-field-field-epso-grade"}).get_text()
            institute = tr.find(attrs={"class": "views-field views-field-field-epso-institution-id"}).get_text()
            # The prefix literal is empty, so the href is stored as scraped.
            url = "" + title_cell.a.get("href")
            date_deadline = tr.find(attrs={"class": "views-field views-field-field-epso-deadline"}).get_text()
            # Only the text before the first "-" is passed to the date helper.
            deadline = data_format.dateFormatFull(date_deadline.split("-")[0].strip())

            # Extract the agency code: the text inside the first pair of
            # parentheses, or the whole institution text when there is none.
            code_match = re.search(r'\((.*?)\)', institute)
            inst_code = code_match.group(1) if code_match else institute

            check_institute = epso.EPSOinstitution(inst_code)
            if check_institute[2] == 1:
                continue
            # Retrieve the agency's id from eu_institute
            inst_id = check_institute[0]
            # Retrieve the agency's type from eu_institute
            inst_type = check_institute[1]
            # Determine the grade
            jobType = data_format.typeOfGrade(grade)

            # Insert job details in database
            epso.persist(inst_id, jobTitle, str(grade).strip(), str(institute).strip(), '',
                         deadline, str(url).strip(), inst_type, jobType)
            print(inst_id, jobTitle, str(grade).strip(), str(institute).strip(), '',
                  deadline, str(url).strip(), inst_type, jobType)

        # Build the next link from the untouched base link; appending to the
        # previous page's link would request "...1", "...12", "...123", etc.
        page += 1
        page_link = epso_link + str(page)
    print("#========================EPSO SCRAPING COMPLETE=================================")
def scrapEMSA():
    """Scrape the EMSA vacancy table and persist every advertised post.

    The agency link and id come from ``emsa.returnAgency('EMSA')``. The page
    is requested with a browser-like ``User-Agent`` header. Every element with
    class ``sectiontableentry`` is treated as one advert: its ``<th>`` is the
    job code, its first link is the job URL, its first ``<td>`` is the
    description and its third ``<td>`` is the deadline in ``dd.mm.YYYY`` form.
    Adverts with fewer than three ``<td>`` cells are reported and skipped.
    """
    print("#========================= EMSA SCRAPING =========================")
    emsaData = emsa.returnAgency('EMSA')
    emsa_link = emsaData['link'][0]
    emsa_id = emsaData['id'][0]

    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36"
    headers = {'User-Agent': user_agent}
    # NOTE(review): passing a body, even an empty one, makes urllib send a
    # POST rather than a GET. Kept as found - confirm the site requires it.
    data = ''.encode('ascii')
    req = urllib.request.Request(emsa_link, data, headers)
    with urllib.request.urlopen(req) as response:
        html = response.read()
    soup = BeautifulSoup(html, "html.parser")

    # Iterate through the adverts
    for cell in soup.findAll(attrs={"class": "sectiontableentry"}):
        ad_code = cell.find("th").get_text()
        print("Job Code:" + ad_code.strip())
        # The prefix literal is empty, so the href is stored as scraped.
        ad_url = "" + cell.find('a').get('href')
        print("Job URL:" + ad_url)

        columns = cell.findAll("td")
        # A short row used to inherit the description/deadline of the
        # previous advert (or fail on the first one); skip it instead.
        if len(columns) < 3:
            print("Skipping incomplete EMSA row: " + ad_code.strip())
            continue
        ad_description = columns[0].get_text()
        print("description: " + ad_description)
        ad_deadline = columns[2].get_text()
        print("deadline: " + ad_deadline)

        # Convert date
        # NOTE(review): the stored value is the ISO text of the parsed date;
        # confirm this is the representation emsa.persist expects.
        try:
            deadline = str(datetime.strptime(ad_deadline, '%d.%m.%Y').date())
        except ValueError:
            # Report the raw text and keep it unchanged as the deadline.
            print("could not modify " + ad_deadline)
            deadline = ad_deadline

        ad_raw = ad_code + " " + ad_description
        # Identify type
        jobType = data_format.typeOfGrade(ad_raw)
        print(jobType)

        # Insert job details in database
        emsa.persist(int(emsa_id), str(ad_description).strip(), '', '',
                     str(ad_code).strip(), deadline, str(ad_url).strip(), '', jobType)
    print("#========================EMSA SCRAPING COMPLETE=================================")
def scrapEUROPOL():
    """Scrape the Europol vacancy listing and persist every advertised post.

    The agency link and id come from ``europol.returnAgency('EUROPOL')``.
    Every element whose class starts with ``views-row views-row-`` is treated
    as one advert. Deadline, contract type, title, department, reference
    number and link are read from it, printed, and stored through
    ``europol.persist``. Adverts missing any of those pieces are reported and
    skipped. The job type is derived from the reference number by the first
    matching rule in ``job_type_rules``, falling back to ``"Other"``.
    """
    print("#========================= EUROPOL SCRAPING =========================")
    europolData = europol.returnAgency('EUROPOL')
    europol_link = europolData['link'][0]
    europol_id = europolData['id'][0]

    # Parse inside the context manager so the HTTP response is always closed.
    with urllib.request.urlopen(europol_link) as response:
        soup = BeautifulSoup(response, "html.parser")

    def dateFormatFull(inputDate):
        """Return inputDate as ISO date text, or unchanged text if unparseable."""
        for form in ('%d %b %Y', '%d %b %y', '%d %B %Y', '%d/%m/%Y', '%d.%m.%Y'):
            try:
                return str(datetime.strptime(inputDate, form).date())
            except (ValueError, TypeError):
                continue
        print('Bad Date:', inputDate)
        return str(inputDate)

    # Ordered (pattern, flags, job type) rules; the first match wins.
    job_type_rules = (
        (r'(AD+\d{1,2}?|AD +\d{1,2}?|TA)', 0, "AD"),
        (r'(AST+\d{1,2}?|AST +\d{1,2}?)', 0, "AST"),
        (r'(FG+\d|FG+III|FG+IV|Function Groups|CA)', 0, "CA"),
        (r'(trainee)', re.IGNORECASE, "Trainee"),
        (r'(SNE|Seconded)', re.IGNORECASE, "SNE"),
    )

    # Find all ads
    start = soup.findAll(attrs={"class": re.compile("^views-row views-row-")})

    # Iterate through the divs
    for advert in start:
        try:
            deadline = advert.find(attrs={"class": "views-field views-field-deadline"}).findAll('span')[1].get_text()
            contract = advert.find(attrs={"class": "views-field views-field-contract-type"}).find('span').get_text()
            jobTitle = advert.find("a").get_text()
            dept = advert.find(attrs={"class": "views-field views-field-department"}).find('span').get_text()
            title = advert.find(attrs={"class": "views-field views-field-reference-number"}).find('span').get_text()
            # The prefix literal is empty, so the href is stored as scraped.
            url = "" + advert.find("a").get("href")
        except (AttributeError, IndexError, TypeError):
            # A lookup returned None, the second deadline span is absent, or
            # the link has no href: this advert cannot be stored.
            print("Skipping incomplete Europol advert")
            continue

        deadlineFormatted = dateFormatFull(deadline)
        print("Deadline:", deadlineFormatted)
        print("Contract Type:", contract)
        print("Title:", jobTitle)
        print("Department:", dept)
        print("Reference Number:", title)
        print("Link:", url)

        jobType = "Other"
        for pattern, flags, label in job_type_rules:
            if re.search(pattern, title, flags) is not None:
                jobType = label
                break

        print(int(europol_id), str(jobTitle).strip(), '', str(dept).strip(), str(title).strip(),
              deadlineFormatted, str(url).strip(), '', jobType)
        europol.persist(int(europol_id), str(jobTitle).strip(), '', str(dept).strip(),
                        str(title).strip(), deadlineFormatted, str(url).strip(), '', jobType)
    print("#========================EUROPOL SCRAPING COMPLETE=================================")