Example #1
def scrapETF():

    print("#========================= ETF SCRAPING =========================")

    # Database connection and agency retrieval

    etfData = etf.returnAgency('ETF')
    etf_link = etfData['link'][0]
    etf_id = etfData['id'][0]

    html = urllib.request.urlopen(etf_link)
    soup = BeautifulSoup(html, "html.parser")

    # Create the soup
    start = soup.find('div',attrs={'class':'content_group piclist_content'})

    # Find the jobs table
    Jobtable = start.find('div').div.ul

    for child in Jobtable.children:

        jobTitle = child.div.h2.string
        jobLink = "http://www.etf.europa.eu" + child.div.p.a.get('href')
        jobCode = child.div.p.a.string
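        # The deadline is assumed to sit at a fixed offset in the serialised <p> tag (slice [12:22])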
        jobDeadline = data_format.dateFormatFull(str(child.div.p)[12:22])
        print (jobTitle, jobLink.replace(' ','%20'), jobCode, jobDeadline)
        etf.persist(int(etf_id), str(jobTitle).strip(), '', '', jobCode, jobDeadline, jobLink.replace(' ','%20'), '', 'Other')

    print("#======================== ETF SCRAPING COMPLETE =================================")
Example #2
def scrapEMA():

    print("#========================= EMA SCRAPING =========================")

    # Database connection and agency retrieval

    emaData = ema.returnAgency('EMA')
    ema_link = emaData['link'][0]
    ema_id = emaData['id'][0]

    html = urllib.request.urlopen(ema_link)
    soup = BeautifulSoup(html, "html.parser")

    # Create the soup
    start = soup.find('div',attrs={'class':'main-col'})

    # Find the jobs table
    Jobtable = (start.find('table'))

    for child in Jobtable.children:
        if(child.find('td',attrs={'colspan':'top'})):
            continue
        jobTitle = child.td.string
        jobCode = child.td.next_sibling.string
        jobType = data_format.typeOfPost(jobCode)
        jobLink = "http://www.ema.europa.eu/ema/" + child.td.next_sibling.next_sibling.a.get('href')
        jobDeadline = data_format.dateFormatFull(child.td.next_sibling.next_sibling.next_sibling.string)
        print(jobTitle,jobCode,jobType,jobLink,jobDeadline)
        ema.persist(int(ema_id), str(jobTitle).strip(), '', '', jobCode, jobDeadline, jobLink, '', jobType)

    print("#========================EMA SCRAPING COMPLETE=================================")
Example #3
def scrapESMA():

    print("#========================= ESMA SCRAPING =========================")

    # Database connection and agency retrieval

    esmaData = esma.returnAgency('ESMA')
    esma_link = esmaData['link'][0]
    esma_id = esmaData['id'][0]

    html = urllib.request.urlopen(esma_link)
    soup = BeautifulSoup(html, "html.parser")

    # Create the soup
    start = soup.find('div',attrs={'class':'search-page_main'})

    # Find the jobs table
    Jobtable = (start.table.tbody.findAll('tr'))

    for child in Jobtable:
        titleSource = child.find('td',attrs={'class':'esma_library-title'})
        jobCode = child.find('td',attrs={'class':'esma_library-ref'}).string
        jobLink = titleSource.a.get('href')
        jobTitle = titleSource.string
        jobDeadline = data_format.dateFormatFull(re.sub(r'\D', '', jobTitle))
        jobType = data_format.typeOfGrade(jobCode)
        print (jobTitle, jobCode, jobLink,jobDeadline,jobType)
        esma.persist(int(esma_id), str(jobTitle).strip(), '', '', jobCode, jobDeadline, jobLink, '', jobType)

    print("#========================ESMA SCRAPING COMPLETE=================================")
Example #4
def scrapEFSA():

    print("#========================= EFSA SCRAPING =========================")

    # Database connection and agency retrieval

    efsaData = efsa.returnAgency('EFSA')
    efsa_link = efsaData['link'][0]
    efsa_id = efsaData['id'][0]

    html = urllib.request.urlopen(efsa_link)
    soup = BeautifulSoup(html, "html.parser")

    # Create the soup
    start = soup.findAll('div', attrs={'class': 'jlr_right_hldr'})

    for child in start:

        jobTitle = child.p.string
        jobLink = child.p.a.get('href')
        jobDept = child.find(
            'div', attrs={
                'class': 'jlr_content_half jlr_content_right'
            }).p.span.next_element.next_element.next_element.string
        print(jobTitle, jobLink, jobDept)
        efsa.persist(int(efsa_id),
                     str(jobTitle).strip(), '', jobDept, '', 'SA', jobLink, '',
                     'Other')

    print(
        "#======================== EFSA SCRAPING COMPLETE ================================="
    )
Example #5
def scrapGSA():
    print("#========================= GSA SCRAPING =========================")

    # Database connection and agency retrieval

    gsaData = gsa.returnAgency('GSA')
    gsa_link = gsaData['link'][0]
    gsa_id = gsaData['id'][0]

    pages = {
        "CATA": "gsa/jobs-opportunities",
        "TR": "traineeship-listing",
        "SNE": "gsa-seconded-national-experts"
    }

    for pairs in pages:
        title = pairs.title().upper()
        page_link = (gsa_link + "/" + pages[title])

        html = urllib.request.urlopen(page_link)
        soup = BeautifulSoup(html, "html.parser")

        # Create the soup
        start = soup.find('tbody')
        today = datetime.datetime.today().date()

        # Find the jobs table
        Jobtable = (start.findAll('tr'))

        for cell in Jobtable:

            td = cell.findAll('td')
            print(td[0])
Example #6
def scrapCPVO():

    print("#========================= CPVO SCRAPING =========================")

    # Database connection and agency retrieval

    cpvoData = cpvo.returnAgency('CPVO')
    cpvo_link = cpvoData['link'][0]
    cpvo_id = cpvoData['id'][0]

    html = urllib.request.urlopen(cpvo_link)
    soup = BeautifulSoup(html, "html.parser")

    # Create the soup
    start = soup.find('table',attrs={'summary':'Vacancies'})

    # Find the jobs table
    Jobtable = (start.findAll('tr'))

    for child in Jobtable:
        if(child.find('th',attrs={'id':'vacancy_title'})):
            continue
        #print (child)
        jobTitle = child.td.a.string.strip()
        jobLink = child.td.a.get('href')
        jobCode = child.td.next_sibling.next_sibling.string.strip()
        jobType = data_format.typeOfPost(jobCode)
        jobDeadline = data_format.dateFormatFull(child.td.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.string.strip())

        logging.debug("%s %s %s %s %s", jobTitle, jobLink, jobCode, jobType, jobDeadline)
        cpvo.persist(int(cpvo_id), str(jobTitle).strip(), '', '', jobCode, jobDeadline, jobLink, '', jobType)

    print("#========================CPVO SCRAPING COMPLETE=================================")
Example #7
def scrapESMA():

    print("#========================= ESMA SCRAPING =========================")

    # Database connection and agency retrieval

    esmaData = esma.returnAgency('ESMA')
    esma_link = esmaData['link'][0]
    esma_id = esmaData['id'][0]

    html = urllib.request.urlopen(esma_link)
    soup = BeautifulSoup(html, "html.parser")

    # Create the soup
    start = soup.find('div', attrs={'class': 'search-page_main'})

    # Find the jobs table
    Jobtable = (start.table.tbody.findAll('tr'))

    for child in Jobtable:
        titleSource = child.find('td', attrs={'class': 'esma_library-title'})
        jobCode = child.find('td', attrs={'class': 'esma_library-ref'}).string
        jobLink = titleSource.a.get('href')
        jobTitle = titleSource.string
        jobDeadline = data_format.dateFormatFull(re.sub(r'\D', '', jobTitle))
        jobType = data_format.typeOfGrade(jobCode)
        print(jobTitle, jobCode, jobLink, jobDeadline, jobType)
        esma.persist(int(esma_id),
                     str(jobTitle).strip(), '', '', jobCode, jobDeadline,
                     jobLink, '', jobType)

    print(
        "#========================ESMA SCRAPING COMPLETE================================="
    )
Example #8
def scrapGSA():
    print("#========================= GSA SCRAPING =========================")

    # Database connection and agency retrieval

    gsaData = gsa.returnAgency('GSA')
    gsa_link = gsaData['link'][0]
    gsa_id = gsaData['id'][0]

    pages = {"CATA": "gsa/jobs-opportunities", "TR": "traineeship-listing", "SNE": "gsa-seconded-national-experts"}

    for pairs in pages:
        title = pairs.title().upper()
        page_link = (gsa_link + "/" + pages[title])


        html = urllib.request.urlopen(page_link)
        soup = BeautifulSoup(html, "html.parser")


        # Create the soup
        start = soup.find('tbody')
        today = datetime.datetime.today().date()


        # Find the jobs table
        Jobtable = (start.findAll('tr'))

        for cell in Jobtable:

            td = cell.findAll('td')
            print (td[0])
Example #9
def scrapSatCen():

    print("#========================= SatCen SCRAPING =========================")

    SatCenData = satcen.returnAgency('SATCEN')
    SatCen_link = SatCenData['link'][0]
    SatCen_id = SatCenData['id'][0]
    SatCen_source = urllib.request.urlopen(SatCen_link)

    # Retrieve the job listing and parse the response with BeautifulSoup
    soup = BeautifulSoup(SatCen_source, 'html.parser')

    # Convert to bytes
    bytesEncoded = soup.encode('utf-8')
    # Convert back to a string
    stringDecoded = bytesEncoded.decode('utf-8')
    # Convert to a list of job dictionaries
    jobsdict = json.loads(stringDecoded)
    # Browse the dictionary and select the available positions
    for job in jobsdict:
        if (job['Status']=='OPEN') and (job['InternalOnly'] == False):
            link = 'https://apps.satcen.europa.eu/recruitment/#/vacancy/'+job['Id']
            print(job['Reference'], job['ExpireOn'][:10],job['Title'],format.typeOfPost(job['TypePost']),job['WorkUnit'],link)
            satcen.persist(SatCen_id, job['Title'],job['Reference'],job['WorkUnit'],'', job['ExpireOn'][:10],link,'', format.typeOfPost(job['TypePost']))

    print("#========================SATCEN SCRAPING COMPLETE=================================")
Example #10
def scrapEFSA():

    print("#========================= EFSA SCRAPING =========================")

    # Database connection and agency retrieval

    efsaData = efsa.returnAgency('EFSA')
    efsa_link = efsaData['link'][0]
    efsa_id = efsaData['id'][0]

    html = urllib.request.urlopen(efsa_link)
    soup = BeautifulSoup(html, "html.parser")

    # Create the soup
    start = soup.findAll('div',attrs={'class':'jlr_right_hldr'})



    for child in start:

        jobTitle = child.p.string
        jobLink = child.p.a.get('href')
        jobDept = child.find('div',attrs={'class':'jlr_content_half jlr_content_right'}).p.span.next_element.next_element.next_element.string
        print (jobTitle, jobLink,jobDept)
        efsa.persist(int(efsa_id), str(jobTitle).strip(), '', jobDept, '', 'SA', jobLink, '', 'Other')

    print("#======================== EFSA SCRAPING COMPLETE =================================")
Example #11
def scrapETF():

    print("#========================= ETF SCRAPING =========================")

    # Database connection and agency retrieval

    etfData = etf.returnAgency('ETF')
    etf_link = etfData['link'][0]
    etf_id = etfData['id'][0]

    html = urllib.request.urlopen(etf_link)
    soup = BeautifulSoup(html, "html.parser")

    # Create the soup
    start = soup.find('div', attrs={'class': 'content_group piclist_content'})

    # Find the jobs table
    Jobtable = start.find('div').div.ul

    for child in Jobtable.children:

        jobTitle = child.div.h2.string
        jobLink = "http://www.etf.europa.eu" + child.div.p.a.get('href')
        jobCode = child.div.p.a.string
        jobDeadline = data_format.dateFormatFull(str(child.div.p)[12:22])
        print(jobTitle, jobLink.replace(' ', '%20'), jobCode, jobDeadline)
        etf.persist(int(etf_id),
                    str(jobTitle).strip(), '', '', jobCode, jobDeadline,
                    jobLink.replace(' ', '%20'), '', 'Other')

    print(
        "#======================== ETF SCRAPING COMPLETE ================================="
    )
Example #12
def scrapEIOPA():

    print(
        "#========================= EIOPA SCRAPING =========================")

    # Database connection and agency retrieval

    eiopaData = eiopa.returnAgency('EIOPA')
    eiopa_link = eiopaData['link'][0]
    eiopa_id = eiopaData['id'][0]

    html = urllib.request.urlopen(eiopa_link)
    soup = BeautifulSoup(html, "html.parser")

    # Create the soup
    start = soup.find('table', attrs={'class': 'ms-rteTable-EIOPATable'})

    for tr in start.tbody:
        if str(tr['class'][0]) == "ms-rteTableHeaderRow-EIOPATable":
            continue

        jobTitle = tr.th.next_sibling.a.string
        jobLink = "https://eiopa.europa.eu/" + tr.th.next_sibling.a.get('href')
        jobCode = str(re.match(r'(.*?)%20', jobLink).group(1)[29:])
        jobType = data_format.typeOfPost(jobCode)
        deadlinePosition = tr.td.next_sibling

        if re.match(r'\w', jobTitle[0]) is None:
            jobTitle = jobTitle[1:]

        if (len(deadlinePosition.contents[0].string) > 2):
            jobDeadline = deadlinePosition.contents[0].string
            jobDeadline = str(jobDeadline).strip()
            jobDeadline = data_format.dateFormatFull(jobDeadline[1:])

        else:
            extendedDeadlines = deadlinePosition.findAll('strong')
            newDeadline = extendedDeadlines[len(extendedDeadlines) - 1].string
            newDeadline = str(newDeadline.split(':')[1]).strip()
            jobDeadline = data_format.dateFormatFull(newDeadline)

        print(jobTitle, jobCode, jobType, jobDeadline)

        eiopa.persist(int(eiopa_id), jobTitle, '', '', jobCode, jobDeadline,
                      jobLink, '', jobType)

    print(
        "#========================EIOPA SCRAPING COMPLETE================================="
    )


#scrapEIOPA()
Example #13
def scrapEBA():
    print("#========================= EBA SCRAPING =========================")

    # Database connection and agency retrieval

    ebaData = eba.returnAgency('EBA')
    eba_link = ebaData['link'][0]
    eba_id = ebaData['id'][0]

    pages = {"CA": "contract-agents", "TA": "temporary-agents", "SNE": "national-experts-on-secondment"}

    for pairs in pages:
        title = pairs.title().upper()
        page_link = (eba_link + "/" + pages[title])


        html = urllib.request.urlopen(page_link)
        soup = BeautifulSoup(html, "html.parser")


        # Create the soup
        start = soup.find('table', attrs={'class': 'Tabular'})
        today = datetime.datetime.today().date()


        # Find the jobs table
        Jobtable = (start.findAll('tr'))

        for cell in Jobtable:

            td = cell.findAll('td')
            try:
                status = td[3].string.strip()
                rawDate = td[2].string
                searchDate = re.match(r'(.*)at', rawDate)
                date = data_format.dateFormatFull(searchDate.group(1).strip())
            except:
                continue

            if (today < date) and (status == "ongoing"):

                jobLink = eba_link[:24] + td[0].a.get('href')
                jobTitle = td[0].string
                jobCode = td[1].string.strip()
                jobDeadline = date
                jobType = title
                print(jobCode, jobTitle, jobType, jobDeadline, jobLink)
                eba.persist(int(eba_id), str(jobTitle).strip(), '', '', jobCode, jobDeadline, jobLink, '', jobType)
            else:
                continue

    print("#========================EBA SCRAPING COMPLETE=================================")
Example #14
def scrapBBI():

    print("#========================= BBI SCRAPING =========================")

    # Database connection and agency retrieval

    bbiData = bbi.returnAgency('BBI')
    bbi_link = bbiData['link'][0]
    bbi_id = bbiData['id'][0]

    html = urllib.request.urlopen(bbi_link)
    soup = BeautifulSoup(html, "html.parser")

    # Create the soup
    start = soup.find('tbody',
                      attrs={'class': 'ui-datatable-data ui-widget-content'})

    # Find the jobs table
    Jobtable = (start.findAll('tr'))

    for child in Jobtable:
        jobCodeLocation = child.td.next_sibling
        jobTitleLocation = jobCodeLocation.next_sibling
        jobTypeLocation = jobTitleLocation.next_sibling
        jobGradeLocation = jobTypeLocation.next_sibling
        jobDeadlineLocation = jobGradeLocation.next_sibling
        jobLinkLocation = jobDeadlineLocation.next_sibling

        jobCode = jobCodeLocation.string
        jobTitle = jobTitleLocation.string
        jobType = data_format.typeOfPost(jobTypeLocation.string)
        jobGrade = data_format.typeOfGrade(jobGradeLocation.string)
        jobDeadline = data_format.dateFormatFull(
            jobDeadlineLocation.string[:10])
        jobLink = jobLinkLocation.a.get('href')

        print(jobCode, jobTitle, jobType, jobGrade, jobDeadline, jobLink)

        bbi.persist(int(bbi_id),
                    str(jobTitle).strip(), '', '', jobCode, jobDeadline,
                    jobLink, '', jobType)

    print(
        "#========================BBI SCRAPING COMPLETE================================="
    )
Example #15
def scrapEACEA():

    print("#========================= EACEA SCRAPING =========================")

    # Database connection and agency retrieval

    eaceaData = eacea.returnAgency('EACEA')
    eacea_link = eaceaData['link'][0]
    eacea_id = eaceaData['id'][0]

    html = urllib.request.urlopen(eacea_link)
    soup = BeautifulSoup(html, "html.parser")

    # Create the soup
    start = soup.find('table',attrs={'class':'table table-striped table-hover views-table cols-4'})

    # Find the jobs table
    Jobtable = (start.find('tbody'))

    rows = Jobtable.findAll('tr')


    for posts in rows:
        columns = posts.children

        for tds in columns:
            try:
                status = tds.next_sibling.next_sibling.next_sibling.next_sibling.span.string
                if (status == 'Open'):
                    jobLink = "http://eacea.ec.europa.eu"+ tds.a.get('href')
                    jobTitle = tds.a.string
                    deadline = tds.next_sibling.next_sibling.span.string
                    jobDeadline = data_format.dateFormatFull(deadline[:10])
                    if 'CA-FG' in jobTitle:
                        jobType = "CA"
                    else:
                        jobType = "Other"
                    print(jobTitle, jobLink, jobType, jobDeadline)
                    eacea.persist(int(eacea_id), str(jobTitle).strip(), '', '', '', jobDeadline, jobLink, '', jobType)
                else:
                    continue
            except:
                continue

    print("#========================EACEA SCRAPING COMPLETE=================================")
Example #16
def scrapCDT():

    print("#========================= CDT SCRAPING =========================")

    # Database connection and agency retrieval

    cdtData = cdt.returnAgency('CDT')
    cdt_link = cdtData['link'][0]
    cdt_id = cdtData['id'][0]

    html = urllib.request.urlopen(cdt_link)
    soup = BeautifulSoup(html, "html.parser")

    # Create the soup
    start = soup.findAll('div',
                         attrs={'class': 'ms-rtestate-read ms-rte-wpbox'})

    #print (start[1])#.ul.li.p.div.span.font.string.strip())

    for child in start:

        jobTitle = child.span.attrs['title']
        postType = jobTitle[:9].strip()
        for post in child.ul:
            try:
                #print (child.ul)
                job = post.find('h3')
                jobCode = job.a.string.strip()
                jobLink = "http://cdt.europa.eu" + job.a.get('href').replace(
                    ' ', '%20')
                jobTitle = post.find('p').div.span.font.string.strip()
                jobType = data_format.typeOfPost(postType)
                jobDeadline = data_format.dateFormatFull('SA')
                print(jobCode, jobLink, jobTitle, jobType, jobDeadline)
                cdt.persist(int(cdt_id),
                            str(jobTitle).strip(), '', '', jobCode,
                            jobDeadline, jobLink, '', jobType)
            except:
                continue
    print(
        "#========================CDT SCRAPING COMPLETE================================="
    )
Example #17
def scrapF4E():

    print("#========================= F4E SCRAPING =========================")

    F4EData = F4E.returnAgency('F4E')
    F4E_link = F4EData['link'][0]
    F4E_id = F4EData['id'][0]

    html = urllib.request.urlopen(F4E_link)
    soup = BeautifulSoup(html, "html.parser")

    start = soup.findAll(attrs={"class": re.compile("^careersPurple2")})

    for contractType in start:

        deadline = jobTitle = jobCode = jobLink = ''
        contract = data_format.typeOfPost(contractType.a.string)

        jobInfo = contractType.next_sibling.next_sibling
        try:
            deadline = data_format.dateFormatFull(
                jobInfo.find(attrs={
                    "class": "careersDate"
                }).span.string)
            jobTitle = jobInfo.find(attrs={"class": "careersTitle"}).string
            jobCode = jobInfo.find(attrs={"class": "pdf"}).string
            jobLink = "http://fusionforenergy.europa.eu/careers/vacancies/" + jobInfo.find(
                attrs={
                    "class": "pdf"
                }).get("href")
            print(deadline, jobTitle.strip(), jobCode, jobLink, contract)
            F4E.persist(F4E_id, jobTitle.strip(), jobCode, '', '', deadline,
                        jobLink, '', contract)

        except:
            pass
    print(
        "#========================F4E SCRAPING COMPLETE================================="
    )
Example #18
def scrapBBI():

    print("#========================= BBI SCRAPING =========================")

    # Database connection and agency retrieval

    bbiData = bbi.returnAgency('BBI')
    bbi_link = bbiData['link'][0]
    bbi_id = bbiData['id'][0]

    html = urllib.request.urlopen(bbi_link)
    soup = BeautifulSoup(html, "html.parser")

    # Create the soup
    start = soup.find('tbody',attrs={'class':'ui-datatable-data ui-widget-content'})

    # Find the jobs table
    Jobtable = (start.findAll('tr'))

    for child in Jobtable:
        jobCodeLocation = child.td.next_sibling
        jobTitleLocation = jobCodeLocation.next_sibling
        jobTypeLocation = jobTitleLocation.next_sibling
        jobGradeLocation = jobTypeLocation.next_sibling
        jobDeadlineLocation = jobGradeLocation.next_sibling
        jobLinkLocation = jobDeadlineLocation.next_sibling

        jobCode = jobCodeLocation.string
        jobTitle = jobTitleLocation.string
        jobType = data_format.typeOfPost(jobTypeLocation.string)
        jobGrade = data_format.typeOfGrade(jobGradeLocation.string)
        jobDeadline = data_format.dateFormatFull(jobDeadlineLocation.string[:10])
        jobLink = jobLinkLocation.a.get('href')

        print (jobCode,jobTitle,jobType,jobGrade,jobDeadline,jobLink)

        bbi.persist(int(bbi_id), str(jobTitle).strip(), '', '', jobCode, jobDeadline, jobLink, '', jobType)

    print("#========================BBI SCRAPING COMPLETE=================================")
Example #19
def scrapEMA():

    print("#========================= EMA SCRAPING =========================")

    # Database connection and agency retrieval

    emaData = ema.returnAgency('EMA')
    ema_link = emaData['link'][0]
    ema_id = emaData['id'][0]

    html = urllib.request.urlopen(ema_link)
    soup = BeautifulSoup(html, "html.parser")

    # Create the soup
    start = soup.find('div', attrs={'class': 'main-col'})

    # Find the jobs table
    Jobtable = (start.find('table'))

    for child in Jobtable.children:
        if (child.find('td', attrs={'colspan': 'top'})):
            continue
        jobTitle = child.td.string
        jobCode = child.td.next_sibling.string
        jobType = data_format.typeOfPost(jobCode)
        jobLink = "http://www.ema.europa.eu/ema/" + child.td.next_sibling.next_sibling.a.get(
            'href')
        jobDeadline = data_format.dateFormatFull(
            child.td.next_sibling.next_sibling.next_sibling.string)
        print(jobTitle, jobCode, jobType, jobLink, jobDeadline)
        ema.persist(int(ema_id),
                    str(jobTitle).strip(), '', '', jobCode, jobDeadline,
                    jobLink, '', jobType)

    print(
        "#========================EMA SCRAPING COMPLETE================================="
    )
Example #20
def scrapEDA():

    print("#========================= EDA SCRAPING =========================")

    edaData = eda.returnAgency('EDA')

    eda_link = edaData['link'][0]
    eda_id = edaData['id'][0]

    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36"
    headers = {'User-Agent': user_agent}
    data = ''
    data = data.encode('ascii')

    req = urllib.request.Request(eda_link, data, headers)

    #print(emsa_link)

    with urllib.request.urlopen(req) as response:
        html = response.read()

    soup = BeautifulSoup(html, "html.parser")

    # Iterate through the internal groups
    for post_type in soup.find_all("h4"):

        internal_type = post_type.contents[0].strip()

        if internal_type not in ('Temporary Agents', 'Contractual Agents',
                                 'Seconded National Experts'):
            continue
        elif internal_type == 'Temporary Agents':
            job_type = 'TA'
        elif internal_type == 'Contractual Agents':
            job_type = 'CA'
        else:
            job_type = 'SNE'

        print(post_type.contents[0])

        internals = post_type.next_element.next_element.next_element.find_all(
            "li")

        #try:
        # Iterate the URLs for each TA post
        for post in internals:

            ta_link = eda_link + post.find('a').get("href")
            ta_req = urllib.request.Request(ta_link, data, headers)

            with urllib.request.urlopen(ta_req) as response:
                ta_html = response.read()

            ta_soup = BeautifulSoup(ta_html, "html.parser")

            #Link
            print(ta_link)

            #Post
            post_title = ta_soup.findAll(
                attrs={"id": "cphMain_VacNotice_LabPost"
                       })[0].contents[0].strip()
            print(post_title)

            #Grade
            post_grade = ta_soup.findAll(
                attrs={"id": "cphMain_VacNotice_LabGrade"
                       })[0].contents[0].strip()
            print(post_grade)

            #Deadline
            post_deadline = ta_soup.findAll(
                attrs={"id": "cphMain_VacNotice_LabPublicationDateEnd"})
            print(post_deadline[0].contents[0].strip() + "\n")
            # Convert date
            try:
                date_object = datetime.strptime(
                    post_deadline[0].contents[0].strip(), '%d %B %Y')
                deadline = date_object.date()
                #print (deadline)
            except:
                print("could not modify " + str(post_deadline))
                pass

            # Insert job details in database
            eda.persist(int(eda_id),
                        str(post_title).strip(), str(post_grade), '', '',
                        deadline,
                        str(ta_link).strip(), '', job_type)
    #except:
    #   pass

    print(
        "#========================EDA SCRAPING COMPLETE================================="
    )
Example #21
def scrapEPSO():

    print("#========================= EPSO SCRAPING =========================")

    epsoData = epso.returnAgency('EPSO')
    epso_link = epsoData['link'][0]

    html = urllib.request.urlopen(epso_link)
    text = html.read().decode('utf-8')
    soup = BeautifulSoup(text, "html.parser")

    #Initiate scrap
    start = soup.find(attrs={"class": "view-content"})
    page = 0

    while (start is not None):
        table = start.tbody.findAll("tr")
        for tr in table:
            # Retrieve job information
            print(
                tr.find(attrs={
                    "class": "views-field views-field-field-epso-locations"
                }).get_text())
            jobTitle = tr.find(attrs={
                "class": "views-field views-field-title-field"
            }).get_text()
            grade = tr.find(attrs={
                "class": "views-field views-field-field-epso-grade"
            }).get_text()
            institute = tr.find(attrs={
                "class":
                "views-field views-field-field-epso-institution-id"
            }).get_text()
            url = "https://epso.europa.eu" + tr.find(
                attrs={
                    "class": "views-field views-field-title-field"
                }).a.get("href")
            date_deadline = tr.find(
                attrs={
                    "class": "views-field views-field-field-epso-deadline"
                }).get_text()
            contract = tr.find(attrs={
                "class":
                "views-field views-field-field-epso-type-of-contract"
            }).get_text()
            deadline = data_format.dateFormatFull(
                date_deadline.split("-")[0].strip())

            # Extract the agency code
            try:
                inst_code = re.search(r'\((.*?)\)', institute).groups()[0]
            except:
                inst_code = institute

            check_institute = epso.EPSOinstitution(inst_code)

            #print ("inst:" + check_institute)

            if check_institute[2] == 1:
                continue

            # Retrieve the agency's id from eu_institute
            inst_id = check_institute[0]

            # Retrieve the agency's type from eu_institute
            inst_type = check_institute[1]

            # Determine the grade
            jobType = data_format.typeOfGrade(grade)

            # Insert job details in database
            epso.persist(inst_id, jobTitle,
                         str(grade).strip(),
                         str(institute).strip(), '', deadline,
                         str(url).strip(), inst_type, jobType)
            print(inst_id, jobTitle,
                  str(grade).strip(),
                  str(institute).strip(), '', deadline,
                  str(url).strip(), inst_type, jobType)

        page = int(page) + 1
        epso_link = epso_link + str(page)
        html = urllib.request.urlopen(epso_link)
        text = html.read().decode('utf-8')
        soup = BeautifulSoup(text, "html.parser")
        start = soup.find(attrs={"class": "view-content"})

        i = 2
    print(
        "#========================EPSO SCRAPING COMPLETE================================="
    )
Example #22
def scrapEMSA():

    print("#========================= EMSA SCRAPING =========================")

    emsaData = emsa.returnAgency('EMSA')
    emsa_link = emsaData['link'][0]
    emsa_id = emsaData['id'][0]

    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36"
    headers = {'User-Agent': user_agent}
    data = ''
    data = data.encode('ascii')

    req = urllib.request.Request(emsa_link, data, headers)

    with urllib.request.urlopen(req) as response:
        html = response.read()

    soup = BeautifulSoup(html, "html.parser")

    # Find the first ad
    start = soup.findAll(attrs={"class": "sectiontableentry"})

    # Iterate through the tables
    for cell in start:
        ad_code = cell.find("th").get_text()
        print("Job Code:" + ad_code.strip())
        ad_url = "http://www.emsa.europa.eu" + cell.find('a').get('href')
        print("Job URL:" + ad_url)
        count = 0
        for ad in cell.findAll("td"):
            if count == 0:
                ad_description = ad.get_text()
                print("description: " + ad_description)
            if count == 2:
                ad_deadline = ad.get_text()
                print("deadline: " + ad_deadline)
            count = count + 1

        # Convert date
        try:
            date_object = datetime.strptime(ad_deadline, '%d.%m.%Y')
            deadline = date_object.date()
            #print (deadline)
        except:
            print("could not modify " + ad_deadline)
            pass

        ad_raw = ad_code + " " + ad_description
        # Identify type
        jobType = data_format.typeOfGrade(ad_raw)

        print(jobType)

        # Insert job details in database
        emsa.persist(int(emsa_id),
                     str(ad_description).strip(), '', '',
                     str(ad_code).strip(), deadline,
                     str(ad_url).strip(), '', jobType)

    print(
        "#========================EMSA SCRAPING COMPLETE================================="
    )
Example #23
def scrapEDA():

    print("#========================= EDA SCRAPING =========================")

    edaData = eda.returnAgency('EDA')

    eda_link = edaData['link'][0]
    eda_id = edaData['id'][0]

    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36"
    headers = { 'User-Agent' : user_agent }
    data = ''
    data = data.encode('ascii')


    req = urllib.request.Request(eda_link,data,headers)

    #print(emsa_link)

    with urllib.request.urlopen(req) as response:
        html = response.read()

    soup = BeautifulSoup(html, "html.parser")

    # Iterate through the internal groups
    for post_type in soup.find_all("h4"):

        internal_type = post_type.contents[0].strip()

        if internal_type not in ('Temporary Agents','Contractual Agents','Seconded National Experts'):
            continue
        elif internal_type == 'Temporary Agents':
            job_type = 'TA'
        elif internal_type == 'Contractual Agents':
            job_type = 'CA'
        else:
            job_type = 'SNE'

        print (post_type.contents[0])

        internals = post_type.next_element.next_element.next_element.find_all("li")

    #try:
        # Iterate the URLs for each TA post
        for post in internals:

            ta_link = eda_link + post.find('a').get("href")
            ta_req = urllib.request.Request(ta_link,data,headers)

            with urllib.request.urlopen(ta_req) as response:
                ta_html = response.read()

            ta_soup = BeautifulSoup(ta_html, "html.parser")

            #Link
            print (ta_link)

            #Post
            post_title = ta_soup.findAll(attrs={"id":"cphMain_VacNotice_LabPost"})[0].contents[0].strip()
            print (post_title)

            #Grade
            post_grade = ta_soup.findAll(attrs={"id":"cphMain_VacNotice_LabGrade"})[0].contents[0].strip()
            print (post_grade)

            #Deadline
            post_deadline = ta_soup.findAll(attrs={"id":"cphMain_VacNotice_LabPublicationDateEnd"})
            print (post_deadline[0].contents[0].strip()+ "\n")
            # Convert date
            try:
                date_object = datetime.strptime(post_deadline[0].contents[0].strip(), '%d %B %Y')
                deadline = date_object.date()
                #print (deadline)
            except:
                print ("could not modify " + str(post_deadline))
                pass

            # Insert job details in database
            eda.persist(int(eda_id), str(post_title).strip(), str(post_grade), '', '', deadline, str(ta_link).strip(), '', job_type)
    #except:
     #   pass


    print("#========================EDA SCRAPING COMPLETE=================================")
Example #24
def scrapEUROPOL():

    print(
        "#========================= EUROPOL SCRAPING ========================="
    )

    europolData = europol.returnAgency('EUROPOL')

    europol_link = europolData['link'][0]
    europol_id = europolData['id'][0]

    html = urllib.request.urlopen(europol_link)
    soup = BeautifulSoup(html, "html.parser")

    def dateFormatFull(inputDate):
        dnotz = None
        for form in [
                '%d %b %Y', '%d %b %y', '%d %B %Y', '%d/%m/%Y', '%d.%m.%Y'
        ]:
            try:
                dnotz = datetime.strptime(inputDate, form).date()
                return str(dnotz)
            except:
                continue

        if dnotz is None:
            print('Bad Date:', inputDate)
            return str(inputDate)

    # Find all ads

    start = soup.findAll(attrs={"class": re.compile("^views-row views-row-")})

    #print ("posts found " + str(len(start)))

    # Iterate through the divs
    for advert in start:
        try:
            deadline = advert.find(attrs={
                "class": "views-field views-field-deadline"
            }).findAll('span')[1].get_text()
            deadlineFormatted = dateFormatFull(deadline)
            print("Deadline:", dateFormatFull(deadline))

            print(
                "Contract Type:",
                advert.find(attrs={
                    "class": "views-field views-field-contract-type"
                }).find('span').get_text())

            jobTitle = advert.find("a").get_text()
            print("Title:", jobTitle)

            dept = advert.find(attrs={
                "class": "views-field views-field-department"
            }).find('span').get_text()
            print("Department:", dept)

            title = advert.find(
                attrs={
                    "class": "views-field views-field-reference-number"
                }).find('span').get_text()
            print("Reference Number:", title)

            url = "http://www.europol.europa.eu" + advert.find("a").get("href")
            print("Link:", url)

        except:

            continue
        if re.search(r'(AD+\d{1,2}?|AD +\d{1,2}?|TA)', title) is not None:
            jobType = "AD"
        elif re.search(r'(AST+\d{1,2}?|AST +\d{1,2}?)', title) is not None:
            jobType = "AST"
        elif re.search(r'(FG+\d|FG+III|FG+IV|Function Groups|CA)',
                       title) is not None:
            jobType = "CA"
            jobType = "CA"
        elif re.search('(trainee)', title, re.IGNORECASE) is not None:
            jobType = "Trainee"
        elif re.search('(SNE|Seconded)', title, re.IGNORECASE) is not None:
            jobType = "SNE"
        else:
            jobType = "Other"

        print(int(europol_id),
              str(jobTitle).strip(), '',
              str(dept).strip(),
              str(title).strip(), deadlineFormatted,
              str(url).strip(), '', jobType)
        europol.persist(int(europol_id),
                        str(jobTitle).strip(), '',
                        str(dept).strip(),
                        str(title).strip(), deadlineFormatted,
                        str(url).strip(), '', jobType)

    print(
        "#========================EUROPOL SCRAPING COMPLETE================================="
    )
Example #25
def scrapEurojust():

    print("#========================= EUROJUST SCRAPING =========================")

    # Database connection and agency retrieval

    eurojustData = eurojust.returnAgency('EUROJUST')
    eurojust_link = eurojustData['link'][0]
    eurojust_id = eurojustData['id'][0]

    html = urllib.request.urlopen(eurojust_link)
    soup = BeautifulSoup(html, "html.parser")

    # Find the first ad
    start = soup.findAll("table",attrs={"class":"vacancyAnnouncements2"})


    # Iterate through the tables
    for table in start:
        for ad in table.findAll("tr",attrs={"class":"vacancyAnnouncements2Row"}):
            title = jobType = deadline = url = jobTitle = None

            for piece in ad.findAll("td"):
                if (title is None):
                    title = piece.get_text()
                    continue
                elif (url is None):
                    url = piece.find('a').get('href')
                    jobTitle = piece.get_text()
                    continue
                elif (deadline is None):
                    deadline = piece.get_text()[1:]
                    deadlineFormatted = data_format.dateFormatFull(str(deadline).replace('/',' '))
                    continue
                else:
                    pass

                print (jobTitle, deadlineFormatted)
                jobType = data_format.typeOfGrade(title)

                eurojust.persist(int(eurojust_id), str(jobTitle).strip(), '', '', str(title).strip(), deadlineFormatted, str(url).strip(), '', jobType)


        for ad in table.findAll("tr",attrs={"class" : "vacancyAnnouncements2AlternatingRow"}):
            title = deadline = url = jobTitle = jobType = None

            for piece in ad.findAll("td"):
                if (title is None):
                    title = piece.get_text()
                    continue
                elif (url is None):
                    url = piece.find('a').get('href')
                    jobTitle = piece.get_text()
                    continue
                elif (deadline is None):
                    deadline = piece.get_text()[1:]
                    deadlineFormatted = data_format.dateFormatFull(str(deadline).replace('/', ' '))
                    continue

                else:
                    pass
            print (jobTitle, deadlineFormatted)

            jobType = data_format.typeOfPost(title)

            # Insert job details in database
            eurojust.persist(int(eurojust_id), str(jobTitle).strip(), '', '', str(title).strip(), deadlineFormatted, str(url).strip(), '', jobType)

    print("#========================EUROJUST SCRAPING COMPLETE=================================")
Example #26
def scrapEPSO():

    print("#========================= EPSO SCRAPING =========================")

    epsoData = epso.returnAgency('EPSO')
    epso_link = epsoData['link'][0]

    html = urllib.request.urlopen(epso_link)
    text = html.read().decode('utf-8')
    soup = BeautifulSoup(text, "html.parser")

    #Initiate scrap
    start = soup.find(attrs={"class": "view-content"})
    page = 0

    while (start is not None):
        table = start.tbody.findAll("tr")
        for tr in table:
            # Retrieve job information
            print (tr.find(attrs={"class": "views-field views-field-field-epso-locations"}).get_text())
            jobTitle = tr.find(attrs={"class": "views-field views-field-title-field"}).get_text()
            grade = tr.find(attrs={"class": "views-field views-field-field-epso-grade"}).get_text()
            institute = tr.find(attrs={"class": "views-field views-field-field-epso-institution-id"}).get_text()
            url = "https://epso.europa.eu"+ tr.find(attrs={"class": "views-field views-field-title-field"}).a.get("href")
            date_deadline = tr.find(attrs={"class": "views-field views-field-field-epso-deadline"}).get_text()
            contract = tr.find(attrs={"class": "views-field views-field-field-epso-type-of-contract"}).get_text()
            deadline = data_format.dateFormatFull(date_deadline.split ("-")[0].strip())

            # Extract the agency code
            try:
                inst_code = re.search(r'\((.*?)\)', institute).groups()[0]
            except:
                inst_code = institute

            check_institute = epso.EPSOinstitution(inst_code)

            #print ("inst:" + check_institute)

            if check_institute[2] == 1:
                continue

            # Retrieve the agency's id from eu_institute
            inst_id = check_institute[0]

            # Retrieve the agency's type from eu_institute
            inst_type = check_institute[1]

            # Determine the grade
            jobType = data_format.typeOfGrade(grade)


            # Insert job details in database
            epso.persist(inst_id, jobTitle, str(grade).strip(), str(institute).strip(), '', deadline, str(url).strip(), inst_type, jobType)
            print (inst_id, jobTitle, str(grade).strip(), str(institute).strip(), '', deadline, str(url).strip(), inst_type, jobType)

        page = int(page) + 1
        epso_link = epso_link + str(page)
        html = urllib.request.urlopen(epso_link)
        text = html.read().decode('utf-8')
        soup = BeautifulSoup(text, "html.parser")
        start = soup.find(attrs={"class": "view-content"})

        i = 2
    print("#========================EPSO SCRAPING COMPLETE=================================")
Example #27
def scrapEMSA():

    print("#========================= EMSA SCRAPING =========================")

    emsaData = emsa.returnAgency('EMSA')
    emsa_link = emsaData['link'][0]
    emsa_id = emsaData['id'][0]

    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36"
    headers = { 'User-Agent' : user_agent }
    data = ''
    data = data.encode('ascii')


    req = urllib.request.Request(emsa_link,data,headers)

    with urllib.request.urlopen(req) as response:
        html = response.read()

    soup = BeautifulSoup(html, "html.parser")



    # Find the first ad
    start = soup.findAll(attrs={"class":"sectiontableentry"})


    # Iterate through the tables
    for cell in start:
        ad_code = cell.find("th").get_text()
        print ("Job Code:" + ad_code.strip())
        ad_url = "http://www.emsa.europa.eu"+cell.find('a').get('href')
        print ("Job URL:" + ad_url)
        count = 0
        for ad in cell.findAll("td"):
            if count == 0 :
                ad_description = ad.get_text()
                print ("description: " + ad_description)
            if count == 2 :
                ad_deadline = ad.get_text()
                print ("deadline: " + ad_deadline)
            count = count + 1

        # Convert date
        try:
            date_object = datetime.strptime(ad_deadline, '%d.%m.%Y')
            deadline = date_object.date()
            #print (deadline)
        except:
            print("could not modify " + ad_deadline)
            pass

        ad_raw = ad_code + " " + ad_description
        # Identify type
        jobType = data_format.typeOfGrade(ad_raw)

        print(jobType)

        # Insert job details in database
        emsa.persist(int(emsa_id), str(ad_description).strip(), '', '', str(ad_code).strip(), deadline, str(ad_url).strip(), '', jobType)

    print("#========================EMSA SCRAPING COMPLETE=================================")
Example #28
def scrapEUROPOL():

    print("#========================= EUROPOL SCRAPING =========================")

    europolData = europol.returnAgency('EUROPOL')


    europol_link = europolData['link'][0]
    europol_id = europolData['id'][0]

    html = urllib.request.urlopen(europol_link)
    soup = BeautifulSoup(html, "html.parser")

    def dateFormatFull (inputDate):
        dnotz = None
        for form in ['%d %b %Y', '%d %b %y',
        '%d %B %Y','%d/%m/%Y','%d.%m.%Y']:
            try:
                dnotz = datetime.strptime(inputDate, form).date()
                return str(dnotz)
            except:
                continue

        if dnotz is None :
            print ('Bad Date:',inputDate)
            return str(inputDate)

    # Find all ads

    start = soup.findAll(attrs={"class":re.compile("^views-row views-row-")})

    #print ("posts found " + str(len(start)))

    # Iterate through the divs
    for advert in start:
        try:
            deadline = advert.find(attrs={"class":"views-field views-field-deadline"}).findAll('span')[1].get_text()
            deadlineFormatted = dateFormatFull(deadline)
            print ("Deadline:",dateFormatFull(deadline))



            print("Contract Type:", advert.find(attrs={"class": "views-field views-field-contract-type"}).find('span').get_text())

            jobTitle = advert.find("a").get_text()
            print("Title:", jobTitle)

            dept = advert.find(attrs={"class": "views-field views-field-department"}).find('span').get_text()
            print("Department:", dept)

            title = advert.find(attrs={"class": "views-field views-field-reference-number"}).find('span').get_text()
            print("Reference Number:", title)

            url = "http://www.europol.europa.eu" + advert.find("a").get("href")
            print("Link:", url)

        except:

            continue
        if re.search(r'(AD+\d{1,2}?|AD +\d{1,2}?|TA)', title) is not None:
            jobType = "AD"
        elif re.search(r'(AST+\d{1,2}?|AST +\d{1,2}?)', title) is not None:
            jobType = "AST"
        elif re.search(r'(FG+\d|FG+III|FG+IV|Function Groups|CA)', title) is not None:
            jobType = "CA"
        elif re.search('(trainee)', title, re.IGNORECASE) is not None:
            jobType = "Trainee"
        elif re.search('(SNE|Seconded)', title, re.IGNORECASE) is not None:
            jobType = "SNE"
        else:
            jobType = "Other"

        print(int(europol_id), str(jobTitle).strip(), '', str(dept).strip(), str(title).strip(), deadlineFormatted, str(url).strip(), '', jobType)
        europol.persist(int(europol_id), str(jobTitle).strip(), '', str(dept).strip(), str(title).strip(), deadlineFormatted,str(url).strip(), '', jobType)

    print("#========================EUROPOL SCRAPING COMPLETE=================================")