Python process_pdf 예제들, shared.pdf.process_pdf Python 예제들

예제 #1

0

파일 보기

파일: universityOfRegensburg.py 프로젝트: ayat-ra/scraping

def processJob(jobDetails):
    """Build a job record from one listing element and store it.

    The record receives ``url``, ``language``, ``title``, an optional
    ``applicationDate`` and ``text`` (the result of ``process_pdf`` or
    ``'n/a'`` when the PDF cannot be processed) and is handed to
    ``thisInstitution.addRecord``.

    Args:
        jobDetails: listing element; presumably a BeautifulSoup Tag holding
            the job link and a ``<strong>`` title -- confirm against caller.

    Returns:
        ``False`` when any step fails (the error is reported through
        ``thisInstitution.error``); ``None`` otherwise.
    """
    try:
        job = {}
        job["url"] = urljoin(rootUrl, jobDetails.a['href'])
        job["language"] = 'de'
        # Drop the 'Stellenausschreibung' header node before reading the element.
        header = jobDetails.find(text=re.compile('Stellenausschreibung'))
        if header:
            header.extract()
        job["title"] = jobDetails.find('strong').text
        print(job['title'])
        appDateTxt = re.search(r'Bewerbungsschluss:(?: |\xa0)(\d{2}[.\w\s]+?\d{4})', jobDetails.get_text())
        if appDateTxt:
            # Two date notations are tried: "01. Januar 2016" first, then "01.01.2016".
            try:
                appDate = text_to_date(appDateTxt.group(1), u'%d. %B %Y')
            except Exception:
                appDate = text_to_date(appDateTxt.group(1), u'%d.%m.%Y')
            job['applicationDate'] = appDate
            print(job['applicationDate'])
        # PDF extraction is best-effort; `except Exception` (rather than a bare
        # except) lets KeyboardInterrupt/SystemExit propagate.
        try:
            job['text'] = process_pdf(job['url'])
        except Exception:
            job['text'] = 'n/a'

        thisInstitution.addRecord(job)  # data is recorded here, job is the only argument
    except Exception as e:
        print(e)
        # record the error with the shared code and continue on to the next url
        thisInstitution.error(e.message, job)
        return False

예제 #2

0

파일 보기

파일: universityOfKiel.py 프로젝트: ayat-ra/scraping

def processJob(jobDetails):
    """Build a job record from a table row or a div listing and store it.

    Table rows (``jobDetails.name == 'tr'``) that contain a link point to a
    PDF that is read with ``pdf.process_pdf``; any other element is treated
    as a div whose link leads to an HTML page fetched through
    ``thisInstitution.getSoup``.

    Args:
        jobDetails: listing element; presumably a BeautifulSoup Tag --
            confirm against caller.

    Returns:
        ``False`` when any step fails (the error is reported through
        ``thisInstitution.error``); ``None`` otherwise.
    """
    try:
        job = dict(language='de')
        if jobDetails.name == 'tr' and not jobDetails.find('a'):  # table row without link
            # NOTE(review): this row still reaches addRecord below with only
            # 'language' set -- confirm that this is intended.
            print('job has no text application date is expired')
        elif jobDetails.name == 'tr':  # table row with a link
            job['url'] = rootUrl + u'/de/' + jobDetails.a['href']
            # Look the date up once and reuse the result.
            app_date = findDate(jobDetails.text)
            if app_date:
                job['applicationDate'] = app_date
            job['text'] = pdf.process_pdf(job['url'])
            job['title'] = jobDetails.td.get_text()
        else:  # handle div element
            job['url'] = jobDetails.a['href']
            job['title'] = jobDetails.a.get_text()
            soup = thisInstitution.getSoup(job['url'])
            content = soup.find(class_='fallback')
            job['text'] = unicode(content.div)
            app_date = findDate(job['text'])
            if app_date:
                job['applicationDate'] = app_date

        thisInstitution.addRecord(job)  # data is recorded here, job is the only argument
    except Exception as e:
        # Only the language is reported with the error; the partial record is discarded.
        job = dict(language='de')
        print(e)
        # record the error with the shared code and continue on to the next url
        thisInstitution.error(e.message, job)
        return False

예제 #3

0

파일 보기

파일: universityOfLuebeck.py 프로젝트: ayat-ra/scraping

def processJob(jobDetails):
    """Build a job record from one link element and store it.

    The record receives ``url``, ``title``, ``language``, ``text`` (the
    result of ``process_pdf`` or a pointer to the URL when the PDF cannot be
    processed) and, when one can be determined, ``applicationDate``. A date
    found by ``findDate`` in the job text overrides the one parsed from the
    'Bewerbungsfrist' text next to the link.

    Args:
        jobDetails: link element; presumably a BeautifulSoup ``<a>`` Tag --
            confirm against caller.

    Returns:
        ``False`` when any step fails (the error is reported through
        ``thisInstitution.error``); ``None`` otherwise.
    """
    try:
        job = {}
        job['url'] = urljoin(rootUrl, jobDetails['href'])
        job['title'] = jobDetails.get_text()
        print(job['title'])
        job["language"] = 'de'
        # PDF extraction is best-effort; fall back to a pointer to the document.
        try:
            job['text'] = process_pdf(job['url'])
        except Exception:
            job['text'] = "Details for this job are available here " + job['url']
        appDateElm = jobDetails.parent.find(text=re.compile(r'Bewerbungsfrist'))
        if appDateElm:
            appDateTxt = re.search(r'\d{1,2}[.\w\s]+\d{4}', appDateElm)
            # The deadline text may not contain a parsable date; skip it in
            # that case instead of failing the whole job on a None match.
            if appDateTxt:
                try:
                    appDate = text_to_date(appDateTxt.group(0), u'%d.%m.%Y')
                except Exception:
                    appDate = text_to_date(appDateTxt.group(0), u'%d. %B %Y')
                job['applicationDate'] = appDate
        # Look the date up once and reuse the result.
        text_date = findDate(job['text'])
        if text_date:
            job['applicationDate'] = text_date

        thisInstitution.addRecord(job)
    except Exception as e:
        print(e)
        # record the error with the shared code and continue on to the next url
        thisInstitution.error(e.message, job)
        return False

예제 #4

0

파일 보기

파일: universityOfVechta.py 프로젝트: ayat-ra/scraping

def processJob(jobDetails):
    """Turn one link element into a job record and store it.

    The record carries ``url``, ``language``, ``title`` and ``text`` (the
    result of ``process_pdf`` for the URL) and is passed to
    ``thisInstitution.addRecord``.

    Args:
        jobDetails: link element; presumably a BeautifulSoup ``<a>`` Tag --
            confirm against caller.

    Returns:
        ``False`` when any step fails (the error is reported through
        ``thisInstitution.error``); ``None`` otherwise.
    """
    record = {}
    try:
        href = jobDetails['href']
        record["url"] = urljoin(rootUrl, href)
        record["language"] = 'de'
        record["title"] = jobDetails.get_text()
        pdf_text = process_pdf(record['url'])
        record['text'] = pdf_text
        # The record is the only argument the shared storage call needs.
        thisInstitution.addRecord(record)
    except Exception as err:
        print(err)
        # Report the failure through the shared code; the caller moves on.
        thisInstitution.error(err.message, record)
        return False
    return None

예제 #5

0

파일 보기

파일: universityOfDuisburgEssen.py 프로젝트: ayat-ra/scraping

def processJob(jobDetails):
    """Turn one listing element into a job record and store it.

    The record carries the UTF-8 encoded ``url`` and ``title`` plus
    ``text``: the result of ``process_pdf`` when the URL contains "pdf",
    otherwise ``u"n/a"``.

    Args:
        jobDetails: listing element; presumably a BeautifulSoup Tag with a
            nested link -- confirm against caller.

    Returns:
        ``False`` when any step fails (the error is reported through
        ``thisInstitution.error``); ``None`` otherwise.
    """
    record = {}
    try:
        absolute_url = urljoin(rootUrl, jobDetails.a["href"])
        record["url"] = absolute_url.encode("utf-8")
        record["title"] = jobDetails.text.encode("utf-8")
        print(record["title"])
        # Only links that mention "pdf" are run through the PDF extractor.
        record["text"] = process_pdf(record["url"]) if re.search("pdf", record["url"]) else u"n/a"
        # The record is the only argument the shared storage call needs.
        thisInstitution.addRecord(record)
    except Exception as err:
        print(err)
        # Report the failure through the shared code; the caller moves on.
        thisInstitution.error(err.message, record)
        return False
    return None

예제 #6

0

파일 보기

파일: universityOfHof.py 프로젝트: ayat-ra/scraping

def processJob(jobDetails):
    """Turn one listing element into a job record and store it.

    The record carries ``url``, ``language``, ``title`` (text of the
    ``h3_title`` element), ``text`` (the result of ``process_pdf``) and,
    when a 'Bewerbungsschluss:' date is present, ``applicationDate``.

    Args:
        jobDetails: listing element; presumably a BeautifulSoup Tag --
            confirm against caller.

    Returns:
        ``False`` when any step fails (the error is reported through
        ``thisInstitution.error``); ``None`` otherwise.
    """
    record = {}
    try:
        record["url"] = urljoin(rootUrl, jobDetails.a['href'])
        record["language"] = 'de'
        title_node = jobDetails.find(class_='h3_title')
        record["title"] = title_node.get_text()
        deadline = re.search(r'Bewerbungsschluss:\s*(\d{1,2}[.\w\s]+\d{4})', jobDetails.get_text())
        if deadline is not None:
            # The deadline is expected in numeric day.month.year notation.
            record['applicationDate'] = text_to_date(deadline.group(1), u'%d.%m.%Y')
        record['text'] = process_pdf(record['url'])
        # The record is the only argument the shared storage call needs.
        thisInstitution.addRecord(record)
    except Exception as err:
        print(err)
        # Report the failure through the shared code; the caller moves on.
        thisInstitution.error(err.message, record)
        return False
    return None

예제 #7

0

파일 보기

파일: universityOfHeilbronn.py 프로젝트: ayat-ra/scraping

def processJobC(jobDetails):
    """Turn one listing element into a job record and store it.

    The title is the space-joined text of every link in the element; the
    job text is the result of ``process_pdf`` for the first link. When a
    'Bewerbungsschluss' text node exists, ``applicationDate`` is set to
    whatever ``find_date_in_text`` returns for it.

    Args:
        jobDetails: listing element; presumably a BeautifulSoup Tag --
            confirm against caller.

    Returns:
        ``False`` when any step fails (the error is reported through
        ``thisInstitution.error``); ``None`` otherwise.
    """
    record = {}
    try:
        record["url"] = urljoin(rootUrl, jobDetails.a['href'])
        record["language"] = 'de'
        record["title"] = u' '.join(anchor.get_text() for anchor in jobDetails.find_all('a'))
        print(record['title'])
        record['text'] = process_pdf(record['url'])
        deadline_node = jobDetails.find(text=re.compile('Bewerbungsschluss'))
        if deadline_node:
            record['applicationDate'] = find_date_in_text(deadline_node.string)
        # The record is the only argument the shared storage call needs.
        thisInstitution.addRecord(record)
    except Exception as err:
        print(err)
        # Report the failure through the shared code; the caller moves on.
        thisInstitution.error(err.message, record)
        return False
    return None

예제 #8

0

파일 보기

파일: universityOfPotsdam.py 프로젝트: ayat-ra/scraping

def processJobPdf(jobDetails):
    """Build a job record from a PDF link element and store it.

    The title is the link text up to the first parenthesis. The application
    date is taken from an 'Ausschreibungsfrist:' or 'Deadline:' annotation in
    the link text when one is present. The job text is the result of
    ``process_pdf`` for the quoted, absolute URL.

    Args:
        jobDetails: link element; presumably a BeautifulSoup ``<a>`` Tag --
            confirm against caller.

    Returns:
        ``False`` when any step fails (the error is reported through
        ``thisInstitution.error``); ``None`` otherwise.
    """
    try:
        job = {}
        job["url"] = urljoin(baseUrl, quote(jobDetails['href']))
        job["language"] = 'de'
        link_text = jobDetails.get_text()
        # The pattern cannot match when the text is empty or starts with a
        # parenthesis; use the whole link text then instead of failing.
        title_match = re.match(r'[^)(]+', link_text)
        job["title"] = title_match.group(0) if title_match else link_text
        print(job['title'])
        # The deadline annotation is optional; a link without it is still a job.
        deadline = re.search(r'(?:Ausschreibungsfrist|Deadline):[^)]+', link_text)
        app_date = find_date_in_text(deadline.group(0)) if deadline else None
        if app_date:
            job['applicationDate'] = app_date
            print(job['applicationDate'])
        job['text'] = process_pdf(job['url'])

        thisInstitution.addRecord(job)  # data is recorded here, job is the only argument
    except Exception as e:
        print(e)
        # record the error with the shared code and continue on to the next url
        thisInstitution.error(e.message, job)
        return False

예제 #9

0

파일 보기

파일: universityOfPaderborn.py 프로젝트: ayat-ra/scraping

def processJob(jobDetails):
    """Turn one listing element into a job record and store it.

    The title is the newline-joined strings of the first ``td``; the
    application date is parsed from the string of the element's second
    child when that string is non-empty; the job text is the result of
    ``process_pdf`` for the link target.

    Args:
        jobDetails: listing element; presumably a BeautifulSoup table-row
            Tag -- confirm against caller.

    Returns:
        ``False`` when any step fails (the error is reported through
        ``thisInstitution.error``); ``None`` otherwise.
    """
    record = {}
    try:
        record["url"] = jobDetails.a['href']
        record["language"] = 'de'
        title_parts = jobDetails.td.strings
        record["title"] = u'\n'.join(title_parts)
        print(record['title'])
        deadline_text = jobDetails.contents[1].string
        if deadline_text:
            record['applicationDate'] = text_to_date(deadline_text, u'%d.%m.%Y')
            print(record['applicationDate'])
        record['text'] = process_pdf(record['url'])
        # The record is the only argument the shared storage call needs.
        thisInstitution.addRecord(record)
    except Exception as err:
        print(err)
        # Report the failure through the shared code; the caller moves on.
        thisInstitution.error(err.message, record)
        return False
    return None

예제 #10

0

파일 보기

파일: universityOfKoblenzLandau.py 프로젝트: ayat-ra/scraping

def processJob(jobDetails):
    """Build a job record from an element that links to a PDF and store it.

    Elements without a link whose ``href`` contains "pdf" are ignored. For
    the others the record receives ``url``, ``language``, ``title``, an
    optional ``applicationDate`` and ``text`` (the result of
    ``process_pdf``).

    Args:
        jobDetails: candidate element; presumably a BeautifulSoup Tag --
            confirm against caller.

    Returns:
        ``False`` when any step fails (the error is reported through
        ``thisInstitution.error``); ``None`` otherwise, including when the
        element is not a job element.
    """
    # Bound before the try block so the error handler can always reference
    # it, even when the failure happens before any field is filled in.
    job = {}
    try:
        # only elements that include pdf links are job elements
        link = jobDetails.find('a', href=re.compile('pdf'))
        if link:
            # NOTE(review): url and title come from the element's first link,
            # which is not necessarily the pdf link matched above -- confirm.
            job["url"] = jobDetails.a['href']
            job["language"] = 'de'
            job["title"] = jobDetails.a.get_text()
            print(job['title'])
            app_date = find_date_in_text(jobDetails.get_text())
            if app_date:
                job['applicationDate'] = app_date
            job['text'] = process_pdf(job['url'])

            thisInstitution.addRecord(job)  # data is recorded here, job is the only argument
    except Exception as e:
        print(e)
        # record the error with the shared code and continue on to the next url
        thisInstitution.error(e.message, job)
        return False

예제 #11

0

파일 보기

파일: universityOfTrier.py 프로젝트: ayat-ra/scraping

def processJob(jobDetails):
    """Build a job record from one table row and store it.

    The URL is the row's first link made absolute with all whitespace
    removed. The title is the text of the second ``td``; the application
    date is looked up in the last ``td``. The job text is the result of
    ``process_pdf``, or a pointer to the URL when the PDF cannot be
    processed.

    Args:
        jobDetails: listing element; presumably a BeautifulSoup table-row
            Tag -- confirm against caller.

    Returns:
        ``False`` when any step fails (the error is reported through
        ``thisInstitution.error``); ``None`` otherwise.
    """
    try:
        job = {}
        job["url"] = urljoin(rootUrl, jobDetails.a['href'])
        # Strip whitespace from the URL. A raw string avoids the invalid
        # '\s' escape, and '+' avoids matching the empty string.
        job['url'] = re.sub(r'\s+', '', job['url'])
        job["language"] = 'de'
        contents = jobDetails.find_all('td')
        job["title"] = contents[1].get_text()
        print(job['title'])
        # PDF extraction is best-effort; fall back to a pointer to the document.
        try:
            job['text'] = process_pdf(job['url'])
        except Exception:
            job['text'] = "Job details are available here: " + job['url']
        app_date = find_date_in_text(contents[-1].get_text())
        if app_date:
            job['applicationDate'] = app_date

        thisInstitution.addRecord(job)  # data is recorded here, job is the only argument
    except Exception as e:
        print(e)
        # record the error with the shared code and continue on to the next url
        thisInstitution.error(e.message, job)
        return False

예제 #12

0

파일 보기

파일: universityOfIlmenau.py 프로젝트: ayat-ra/scraping

def processJob(jobDetails):
    """Build a job record from one table row and store it.

    Reads the title from the second ``td``, the publish date from the third
    and the application date from the fifth. The job text is the result of
    ``process_pdf`` for the row's first link.

    Args:
        jobDetails: listing element; presumably a BeautifulSoup table-row
            Tag with at least five cells -- confirm against caller.

    Returns:
        ``False`` when any step fails (the error is reported through
        ``thisInstitution.error``); ``None`` otherwise.
    """
    try:
        job = {}
        job["url"] = urljoin(rootUrl, jobDetails.find('a')['href'])

        job["language"] = 'de'
        # Collect the cells once instead of re-scanning the row for each field.
        cells = jobDetails.find_all('td')
        job["title"] = cells[1].get_text()
        app_date = find_date_in_text(cells[4].get_text(), 'de_DE')
        if app_date:
            job['applicationDate'] = app_date

        publish_date = find_date_in_text(cells[2].get_text(), 'de_DE')
        if publish_date:
            job["publishDate"] = publish_date

        job['text'] = process_pdf(job['url'])

        thisInstitution.addRecord(job)  # data is recorded here, job is the only argument
    except Exception as e:
        print(e)
        # record the error with the shared code and continue on to the next url
        thisInstitution.error(e.message, job)
        return False