Example #1
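All of the examples on this page assume the same header: the standard-library modules they call, BeautifulSoup, and the project-local agency wrappers (each exposing returnAgency() and persist()) together with the data_format helpers. A minimal sketch, assuming the project modules are importable under exactly these names:

import json
import logging
import re
import urllib.request

from bs4 import BeautifulSoup

# Project-local modules (names taken from the examples below): one database
# wrapper per agency plus shared date and post-type formatters.
import data_format
import bbi, cdt, cpvo, eiopa, ema, eurojust, satcen
import F4E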
def scrapCPVO():

    print("#========================= CPVO SCRAPING =========================")

    # Database connection and agency retrieval
    cpvoData = cpvo.returnAgency('CPVO')
    cpvo_link = cpvoData['link'][0]
    cpvo_id = cpvoData['id'][0]

    # Fetch the page and create the soup
    html = urllib.request.urlopen(cpvo_link)
    soup = BeautifulSoup(html, "html.parser")

    # Find the vacancies table and its rows
    start = soup.find('table', attrs={'summary': 'Vacancies'})
    Jobtable = start.findAll('tr')

    for child in Jobtable:
        # Skip the header row
        if child.find('th', attrs={'id': 'vacancy_title'}):
            continue
        # Cells, in order: linked title, vacancy code, ..., deadline in the fourth cell
        cells = child.findAll('td')
        jobTitle = cells[0].a.string.strip()
        jobLink = cells[0].a.get('href')
        jobCode = cells[1].string.strip()
        jobType = data_format.typeOfPost(jobCode)
        jobDeadline = data_format.dateFormatFull(cells[3].string.strip())

        logging.debug("%s %s %s %s %s", jobTitle, jobLink, jobCode, jobType, jobDeadline)
        cpvo.persist(int(cpvo_id), str(jobTitle).strip(), '', '', jobCode, jobDeadline, jobLink, '', jobType)

    print("#========================CPVO SCRAPING COMPLETE=================================")
Example #2
def scrapEMA():

    print("#========================= EMA SCRAPING =========================")

    # Database connection and agency retrieval
    emaData = ema.returnAgency('EMA')
    ema_link = emaData['link'][0]
    ema_id = emaData['id'][0]

    # Fetch the page and create the soup
    html = urllib.request.urlopen(ema_link)
    soup = BeautifulSoup(html, "html.parser")

    # Find the jobs table inside the main column
    start = soup.find('div', attrs={'class': 'main-col'})
    Jobtable = start.find('table')

    for child in Jobtable.findAll('tr'):
        # Skip the header row
        if child.find('td', attrs={'colspan': 'top'}):
            continue
        # Cells, in order: title, vacancy code, linked announcement, deadline
        cells = child.findAll('td')
        jobTitle = cells[0].string
        jobCode = cells[1].string
        jobType = data_format.typeOfPost(jobCode)
        jobLink = "http://www.ema.europa.eu/ema/" + cells[2].a.get('href')
        jobDeadline = data_format.dateFormatFull(cells[3].string)
        print(jobTitle, jobCode, jobType, jobLink, jobDeadline)
        ema.persist(int(ema_id), str(jobTitle).strip(), '', '', jobCode, jobDeadline, jobLink, '', jobType)

    print("#========================EMA SCRAPING COMPLETE=================================")
Example #3
def scrapSatCen():

    print("#========================= SatCen SCRAPING =========================")

    # Database connection and agency retrieval
    SatCenData = satcen.returnAgency('SATCEN')
    SatCen_link = SatCenData['link'][0]
    SatCen_id = SatCenData['id'][0]

    # The SatCen recruitment endpoint returns JSON, so parse it directly
    SatCen_source = urllib.request.urlopen(SatCen_link)
    jobsdict = json.loads(SatCen_source.read().decode('utf-8'))

    # Browse the job list and select the publicly available open positions
    for job in jobsdict:
        if job['Status'] == 'OPEN' and not job['InternalOnly']:
            link = 'https://apps.satcen.europa.eu/recruitment/#/vacancy/' + job['Id']
            jobType = data_format.typeOfPost(job['TypePost'])
            print(job['Reference'], job['ExpireOn'][:10], job['Title'], jobType, job['WorkUnit'], link)
            satcen.persist(SatCen_id, job['Title'], job['Reference'], job['WorkUnit'], '',
                           job['ExpireOn'][:10], link, '', jobType)

    print("#========================SATCEN SCRAPING COMPLETE=================================")
Example #4
def scrapEIOPA():

    print("#========================= EIOPA SCRAPING =========================")

    # Database connection and agency retrieval
    eiopaData = eiopa.returnAgency('EIOPA')
    eiopa_link = eiopaData['link'][0]
    eiopa_id = eiopaData['id'][0]

    # Fetch the page and create the soup
    html = urllib.request.urlopen(eiopa_link)
    soup = BeautifulSoup(html, "html.parser")

    # Find the vacancies table
    start = soup.find('table', attrs={'class': 'ms-rteTable-EIOPATable'})

    for tr in start.tbody.findAll('tr'):
        # Skip the header row
        if str(tr['class'][0]) == "ms-rteTableHeaderRow-EIOPATable":
            continue

        jobTitle = tr.th.next_sibling.a.string
        jobLink = "https://eiopa.europa.eu/" + tr.th.next_sibling.a.get('href')
        # The vacancy code sits in the link between the site prefix and the first '%20'
        jobCode = str(re.match(r'(.*?)%20', jobLink).group(1)[29:])
        jobType = data_format.typeOfPost(jobCode)
        deadlinePosition = tr.td.next_sibling

        # Drop a leading non-word character (e.g. a stray space) from the title
        if re.match(r'\w', jobTitle[0]) is None:
            jobTitle = jobTitle[1:]

        if len(deadlinePosition.contents[0].string) > 2:
            # The deadline is given directly in the cell
            jobDeadline = str(deadlinePosition.contents[0].string).strip()
            jobDeadline = data_format.dateFormatFull(jobDeadline[1:])
        else:
            # The deadline was extended: take the last <strong> entry in the cell
            extendedDeadlines = deadlinePosition.findAll('strong')
            newDeadline = str(extendedDeadlines[-1].string.split(':')[1]).strip()
            jobDeadline = data_format.dateFormatFull(newDeadline)

        print(jobTitle, jobCode, jobType, jobDeadline)

        eiopa.persist(int(eiopa_id), jobTitle, '', '', jobCode, jobDeadline,
                      jobLink, '', jobType)

    print("#========================EIOPA SCRAPING COMPLETE=================================")


#scrapEIOPA()
Example #5
def scrapBBI():

    print("#========================= BBI SCRAPING =========================")

    # Database connection and agency retrieval
    bbiData = bbi.returnAgency('BBI')
    bbi_link = bbiData['link'][0]
    bbi_id = bbiData['id'][0]

    # Fetch the page and create the soup
    html = urllib.request.urlopen(bbi_link)
    soup = BeautifulSoup(html, "html.parser")

    # Find the body of the jobs table and its rows
    start = soup.find('tbody',
                      attrs={'class': 'ui-datatable-data ui-widget-content'})
    Jobtable = start.findAll('tr')

    for child in Jobtable:
        # Cells, in order: (unused), code, title, type, grade, deadline, link
        cells = child.findAll('td')

        jobCode = cells[1].string
        jobTitle = cells[2].string
        jobType = data_format.typeOfPost(cells[3].string)
        jobGrade = data_format.typeOfGrade(cells[4].string)
        jobDeadline = data_format.dateFormatFull(cells[5].string[:10])
        jobLink = cells[6].a.get('href')

        print(jobCode, jobTitle, jobType, jobGrade, jobDeadline, jobLink)

        bbi.persist(int(bbi_id),
                    str(jobTitle).strip(), '', '', jobCode, jobDeadline,
                    jobLink, '', jobType)

    print("#========================BBI SCRAPING COMPLETE=================================")
Example #6
def scrapCDT():

    print("#========================= CDT SCRAPING =========================")

    # Database connection and agency retrieval
    cdtData = cdt.returnAgency('CDT')
    cdt_link = cdtData['link'][0]
    cdt_id = cdtData['id'][0]

    # Fetch the page and create the soup
    html = urllib.request.urlopen(cdt_link)
    soup = BeautifulSoup(html, "html.parser")

    # Each web-part box holds the vacancies for one type of post
    start = soup.findAll('div',
                         attrs={'class': 'ms-rtestate-read ms-rte-wpbox'})

    for child in start:
        # The box heading carries the type of post in its title attribute
        postType = child.span.attrs['title'][:9].strip()

        for post in child.ul:
            try:
                job = post.find('h3')
                jobCode = job.a.string.strip()
                jobLink = "http://cdt.europa.eu" + job.a.get('href').replace(' ', '%20')
                jobTitle = post.find('p').div.span.font.string.strip()
                jobType = data_format.typeOfPost(postType)
                jobDeadline = data_format.dateFormatFull('SA')
                print(jobCode, jobLink, jobTitle, jobType, jobDeadline)
                cdt.persist(int(cdt_id),
                            str(jobTitle).strip(), '', '', jobCode,
                            jobDeadline, jobLink, '', jobType)
            except (AttributeError, TypeError):
                # Skip list items that do not follow the expected markup
                continue

    print("#========================CDT SCRAPING COMPLETE=================================")
Example #7
def scrapF4E():

    print("#========================= F4E SCRAPING =========================")

    # Database connection and agency retrieval
    F4EData = F4E.returnAgency('F4E')
    F4E_link = F4EData['link'][0]
    F4E_id = F4EData['id'][0]

    # Fetch the page and create the soup
    html = urllib.request.urlopen(F4E_link)
    soup = BeautifulSoup(html, "html.parser")

    # Each contract-type header introduces a block of vacancies
    start = soup.findAll(attrs={"class": re.compile("^careersPurple2")})

    for contractType in start:

        deadline = jobTitle = jobCode = jobLink = ''
        contract = data_format.typeOfPost(contractType.a.string)

        jobInfo = contractType.next_sibling.next_sibling
        try:
            deadline = data_format.dateFormatFull(
                jobInfo.find(attrs={"class": "careersDate"}).span.string)
            jobTitle = jobInfo.find(attrs={"class": "careersTitle"}).string
            jobCode = jobInfo.find(attrs={"class": "pdf"}).string
            jobLink = ("http://fusionforenergy.europa.eu/careers/vacancies/" +
                       jobInfo.find(attrs={"class": "pdf"}).get("href"))
            print(deadline, jobTitle.strip(), jobCode, jobLink, contract)
            F4E.persist(F4E_id, jobTitle.strip(), jobCode, '', '', deadline,
                        jobLink, '', contract)
        except AttributeError:
            # Skip headers that are not followed by a complete vacancy block
            pass

    print("#========================F4E SCRAPING COMPLETE=================================")
Example #8
def scrapEurojust():

    print("#========================= EUROJUST SCRAPING =========================")

    # Database connection and agency retrieval
    eurojustData = eurojust.returnAgency('EUROJUST')
    eurojust_link = eurojustData['link'][0]
    eurojust_id = eurojustData['id'][0]

    # Fetch the page and create the soup
    html = urllib.request.urlopen(eurojust_link)
    soup = BeautifulSoup(html, "html.parser")

    # Find the vacancy tables
    start = soup.findAll("table", attrs={"class": "vacancyAnnouncements2"})

    # Iterate through the tables; rows alternate between two CSS classes
    for table in start:
        for ad in table.findAll("tr", attrs={"class": "vacancyAnnouncements2Row"}):
            title = jobType = deadline = url = jobTitle = None

            # Cells, in order: vacancy code, linked title, deadline
            for piece in ad.findAll("td"):
                if title is None:
                    title = piece.get_text()
                elif url is None:
                    url = piece.find('a').get('href')
                    jobTitle = piece.get_text()
                elif deadline is None:
                    deadline = piece.get_text()[1:]
                    deadlineFormatted = data_format.dateFormatFull(str(deadline).replace('/', ' '))

            print(jobTitle, deadlineFormatted)
            jobType = data_format.typeOfGrade(title)

            # Insert job details in the database
            eurojust.persist(int(eurojust_id), str(jobTitle).strip(), '', '', str(title).strip(),
                             deadlineFormatted, str(url).strip(), '', jobType)

        for ad in table.findAll("tr", attrs={"class": "vacancyAnnouncements2AlternatingRow"}):
            title = deadline = url = jobTitle = jobType = None

            for piece in ad.findAll("td"):
                if title is None:
                    title = piece.get_text()
                elif url is None:
                    url = piece.find('a').get('href')
                    jobTitle = piece.get_text()
                elif deadline is None:
                    deadline = piece.get_text()[1:]
                    deadlineFormatted = data_format.dateFormatFull(str(deadline).replace('/', ' '))

            print(jobTitle, deadlineFormatted)
            jobType = data_format.typeOfPost(title)

            # Insert job details in the database
            eurojust.persist(int(eurojust_id), str(jobTitle).strip(), '', '', str(title).strip(),
                             deadlineFormatted, str(url).strip(), '', jobType)

    print("#========================EUROJUST SCRAPING COMPLETE=================================")