def ScrapeTotalJobs(url):
    """Scrape a TotalJobs advert page and return its standard job data.

    The page is downloaded, its newlines are removed, and a fixed series of
    regular expressions is applied in order. After each successful match the
    page text is truncated to what follows the captured value, so later
    fields are only searched for after earlier ones.

    Args:
        url: Address of the job advert page to download.

    Returns:
        A dictionary with the keys 'company', 'title', 'location',
        'salary_min' and 'salary_max', or an empty dictionary when the page
        reports that the job is no longer available or its expiry field
        reads 'Expired' or 'Recently'.

    Raises:
        Any exception raised by ``requests.get`` (including a timeout) or by
        the salary conversion is propagated to the caller.
    """
    # Regular expressions used to extract job data from the page. The
    # insertion order matters because the page text is consumed as each
    # field is found.
    field_patterns = {
        'removed': 'expiry-message">.*?<p>(.*?)</p>',
        'title': '<h1 class="brand-font">(.*?)</h1>',
        'location': '<li class="location icon">(.*?)</li>',
        'commute': 'locationText">.*?<ul>.*?<li>(.*?)</li',
        'salary': '<li class="salary icon">.*?<div>(.*?)</div>',
        'company': '<li class="company icon">.*?"View jobs">(.*?)</a>.*?</li>',
        'job_type': '<li class="job-type icon">.*?<div>(.*?)</div>',
        'expiry': '<li class="date-posted icon">.*?<span>(.*?)</span>',
    }

    # Request the page and flatten it to a single line so that '.' in the
    # patterns can span what were originally separate lines.
    response = requests.get(url, timeout=(2, 5))
    content = response.text.replace('\n', '')

    # Every web field defaults to an empty string when it is not found.
    web_data = dict.fromkeys(field_patterns, '')
    for field, pattern in field_patterns.items():
        match = re.search(pattern, content)
        if match:
            web_data[field] = match.group(1).strip()
            # Continue searching only after the value just captured.
            content = content[match.end(1):]

    # Return an empty dictionary of data if the job has expired.
    if web_data['removed'] == (
            'The job you are looking for is no longer available.'):
        return {}
    if web_data['expiry'] in ('Expired', 'Recently'):
        return {}

    standard_data = {
        'company': web_data['company'],
        'title': web_data['title'],
    }

    # Standard location value needs to be derived from one of two web
    # values: the location field when present, otherwise the commute field.
    metric_pattern = 'class="engagement-metric">(.*?)<'
    if web_data['location']:
        location_parts = re.findall(metric_pattern, web_data['location'])
        if not location_parts:
            location_parts = re.findall('<div>(.*?)<', web_data['location'])
    else:
        location_parts = re.findall(metric_pattern, web_data['commute'])
        if not location_parts:
            location_parts = web_data['commute'].split(',')

    # Always provide the 'location' key, even when no parts were found, so
    # the result has the same shape on every successful scrape.
    standard_data['location'] = ','.join(location_parts).lstrip(',')

    # Standard salary values need deriving from the web value.
    # NOTE(review): Interface.Extractsalarydata is defined elsewhere; it is
    # assumed to return a sequence whose first two items convert to int.
    salary_values = Interface.Extractsalarydata(web_data['salary'])
    standard_data['salary_min'] = int(salary_values[0])
    standard_data['salary_max'] = int(salary_values[1])

    return standard_data
def ScrapeIndeed(url):
    """Scrape an Indeed advert page and return its standard job data.

    The page is downloaded and scanned line by line for the job title and
    for lists of attribute keys (taken from icon class names) and attribute
    values. Keys and values are paired positionally and then mapped onto the
    standard field names.

    Args:
        url: Address of the job advert page to download.

    Returns:
        A dictionary with the keys 'company', 'title', 'location',
        'salary_min' and 'salary_max', or an empty dictionary when the page
        carries the alert 'This job has expired on Indeed'.

    Raises:
        Any exception raised by ``requests.get`` (including a timeout) or by
        the salary conversion is propagated to the caller.
    """
    # Mapping from each standard field to the raw attribute keys whose
    # values are joined to produce it.
    conversion = {
        'company': ['company'],
        'title': ['title'],
        'location': ['location'],
        'salary_min': ['salary'],
        'salary_max': ['salary'],
    }

    # Regular expressions applied to each line of the page.
    title_re = 'jobsearch-JobInfoHeader-title">(.*?)<'
    expired_re = 'icl-Alert-headline">(.*?)</'
    # NOTE: 'icl-u-xs-m[rt]--xs">(.*?)<' may work as the value pattern on
    # some Indeed job adverts, however the display code looks too unstable
    # to warrant any changes.
    value_re = '"jobsearch-JobMetadataHeader-iconLabel">(.*?)<'
    key_re = 'icl-IconFunctional--(.*?) icl'

    # Retrieve web text; the timeout stops a stalled server from blocking
    # the caller indefinitely.
    response = requests.get(url, timeout=(2, 5))

    engine_data = {}
    attribute_keys = []
    attribute_values = []

    for line in response.text.split('\n'):
        match = re.search(expired_re, line)
        if match and match.group(1) == 'This job has expired on Indeed':
            return {}
        match = re.search(title_re, line)
        if match:
            engine_data['title'] = match.group(1)
        # The last line containing keys (or values) wins, replacing any
        # list captured from an earlier line.
        found = re.findall(key_re, line)
        if found:
            attribute_keys = found
        found = re.findall(value_re, line)
        if found:
            attribute_values = found

    # Pair keys with values by position. zip stops at the shorter list, so
    # a page with fewer values than keys no longer raises IndexError.
    engine_data.update(zip(attribute_keys, attribute_values))

    # Convert raw job data to standard job data.
    processed = {}
    for standard_key, engine_keys in conversion.items():
        joined = ','.join(
            engine_data[key] for key in engine_keys if key in engine_data)
        processed[standard_key] = joined.rstrip(',')

    # Convert salary strings. Both salary fields hold the same raw text at
    # this point, so only one of them needs to be parsed.
    # NOTE(review): Interface.Extractsalarydata is defined elsewhere; it is
    # assumed to return a sequence whose first two items convert to int.
    salary_values = Interface.Extractsalarydata(processed['salary_min'])
    processed['salary_min'] = int(salary_values[0])
    processed['salary_max'] = int(salary_values[1])

    return processed
def ScrapeReed(url):
    """Scrape a Reed advert page and return its standard job data.

    The page is downloaded and scanned line by line for the job title, the
    company name and labelled attribute spans (``data-qa="...MobileLbl"``).
    The attributes are then mapped onto the standard field names.

    Args:
        url: Address of the job advert page to download.

    Returns:
        A dictionary with the keys 'company', 'title', 'location',
        'salary_min' and 'salary_max', or an empty dictionary when any line
        of the page contains 'The following job is no longer available'.

    Raises:
        Any exception raised by ``requests.get`` (including a timeout) or by
        the salary conversion is propagated to the caller.
    """
    # Mapping from each standard field to the raw attribute keys whose
    # values are joined to produce it.
    conversion = {
        'company': ['company'],
        'title': ['title'],
        'location': ['locality', 'region'],
        'salary_min': ['salary'],
        'salary_max': ['salary'],
    }

    # Regular expressions applied to each line of the page. Raw strings are
    # used so that '\s' reaches the regex engine without relying on an
    # invalid string escape.
    title_re = r'<meta itemprop="title" content="(.*?)" />'
    company_re = r'<span itemprop="name">(.*?)<'
    # Group 1 is the attribute key, group 2 the attribute value.
    attribute_re = r'<span\s*data-qa="(.*?)MobileLbl"\s*>(.*?)<'
    expired_re = 'The following job is no longer available'

    # Retrieve web text; the timeout stops a stalled server from blocking
    # the caller indefinitely.
    response = requests.get(url, timeout=(2, 5))

    engine_data = {}
    attributes = {}

    for line in response.text.split('\n'):
        # Return with no data if job is expired.
        if re.search(expired_re, line):
            return {}
        match = re.search(title_re, line)
        if match:
            engine_data['title'] = match.group(1)
        match = re.search(company_re, line)
        if match:
            engine_data['company'] = match.group(1)
        # Take key and value from the same match so they can never drift
        # out of step when a label appears on a line without its value.
        match = re.search(attribute_re, line)
        if match:
            attributes[match.group(1)] = match.group(2)

    # Labelled attributes are applied last, so they override a title or
    # company found earlier under the same key.
    engine_data.update(attributes)

    # Convert raw job data to standard job data.
    processed = {}
    for standard_key, engine_keys in conversion.items():
        joined = ','.join(
            engine_data[key] for key in engine_keys if key in engine_data)
        processed[standard_key] = joined.rstrip(',')

    # Convert salary strings. Both salary fields hold the same raw text at
    # this point, so only one of them needs to be parsed.
    # NOTE(review): Interface.Extractsalarydata is defined elsewhere; it is
    # assumed to return a sequence whose first two items convert to int.
    salary_values = Interface.Extractsalarydata(processed['salary_min'])
    processed['salary_min'] = int(salary_values[0])
    processed['salary_max'] = int(salary_values[1])

    return processed
def ScrapeCVLibrary(url):
    """Scrape a CV-Library advert page and return its standard job data.

    The page is downloaded and the first line containing a
    ``dataLayer.push({...})`` call is located. The text between the braces
    is split on commas into ``key:value`` attributes, which are then mapped
    onto the standard field names.

    Args:
        url: Address of the job advert page to download.

    Returns:
        A dictionary with the keys 'company', 'title', 'location',
        'salary_min' and 'salary_max', or an empty dictionary when the
        response has no 'Vary' header or no ``dataLayer.push`` payload is
        found in the page.

    Raises:
        Any exception raised by ``requests.get`` (including a timeout) or by
        the salary conversion is propagated to the caller.
    """
    # Mapping from each standard field to the raw attribute keys whose
    # values are joined to produce it.
    conversion = {
        'company': ['JOB_COMPANY_NAME'],
        'title': ['JOB_TITLE'],
        'location': ['JOB_TOWN', 'JOB_COUNTY'],
        'salary_min': ['JOB_SALARY'],
        'salary_max': ['JOB_SALARY'],
    }

    # Raw string so the escaped regex metacharacters are not treated as
    # (invalid) string escapes.
    attributes_re = r'dataLayer\.push\(\{(.*?)\}\)'

    # Retrieve web text; the timeout stops a stalled server from blocking
    # the caller indefinitely.
    response = requests.get(url, timeout=(2, 5))

    # Return no data if the job has expired. The absence of a 'Vary'
    # response header is used as the expiry signal.
    # NOTE(review): why a missing header indicates expiry is not visible
    # here - confirm against the site's behaviour.
    if 'Vary' not in response.headers:
        return {}

    # Search the web text for the first line holding the job attributes.
    match = None
    for line in response.text.split('\n'):
        match = re.search(attributes_re, line)
        if match:
            break

    # Without the attribute payload there is nothing to extract; return no
    # data instead of failing on a missing match.
    if match is None:
        return {}

    # Convert engine job attributes into key/value pairs.
    engine_data = {}
    for attribute in match.group(1).replace('"', '').split(','):
        # It's possible that data doesn't follow <key>:<value> format.
        if ':' in attribute:
            # Split on the first colon only so values that themselves
            # contain a colon are kept whole.
            key, _, value = attribute.partition(':')
            engine_data[key] = value

    # Convert raw job data to standard job data.
    processed = {}
    for standard_key, engine_keys in conversion.items():
        joined = ','.join(
            engine_data[key] for key in engine_keys if key in engine_data)
        processed[standard_key] = joined.rstrip(',')

    # Convert salary strings. Both salary fields hold the same raw text at
    # this point, so only one of them needs to be parsed.
    # NOTE(review): Interface.Extractsalarydata is defined elsewhere; it is
    # assumed to return a sequence whose first two items convert to int.
    salary_values = Interface.Extractsalarydata(processed['salary_min'])
    processed['salary_min'] = int(salary_values[0])
    processed['salary_max'] = int(salary_values[1])

    return processed