Example #1
def scrape_image(name, url, csv, image_credit=''):
    url = url.strip()
    if not url:
        return

    print "scraping %s" % name
    parts = urlparse.urlparse(url)
    if not parts.scheme:
        url = 'http://%s' % url
        parts = urlparse.urlparse(url)
    resp = requests.get(url)
    mime_type = resp.headers['content-type']
    if not mime_type.startswith('image/'):
        # not a direct image link -- dispatch to a host-specific scraper
        scrape_func = VALID_ENDPOINTS.get(parts.netloc, None)
        if scrape_func is None:
            raise ScraperException("Cannot scrape image from %s" %
                                   parts.netloc)
        image_url = scrape_func(resp)
        url = make_abs_url(url, image_url)
    # direct image link, but in a format we don't accept
    elif '.%s' % mime_type.split('/')[1] not in ACCEPTED_IMAGE_EXTENSIONS:
        raise ScraperException("Unsupported image format at %s" % url)

    csv.write('profileimages.csv', {
        'name': name,
        'image_url': url,
        'image_credit': image_credit
    })
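These snippets assume some module-level setup that isn't shown. Here is a minimal sketch of the imports, exception class and helper they rely on (Python 2, matching the snippets); the names come from the code above, but the extension list is an illustrative assumption:

import re
import urllib
import urlparse
from datetime import datetime

import requests
from lxml import html


class ScraperException(Exception):
    pass


# image formats we are willing to store (assumed values)
ACCEPTED_IMAGE_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.gif']


def make_abs_url(base_url, url):
    # resolve a possibly-relative scraped src against the page it came from
    return urlparse.urljoin(base_url, url)

Note also that the csv argument to scrape_image is evidently a custom writer object exposing write(filename, row_dict), not the standard-library csv module.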
Example #2
def get_absolute_url(url):
    if not url.startswith(ENDPOINT_URL):
        parts = urlparse.urlparse(url)
        if parts.scheme == 'https':
            raise ScraperException("Who's Who does not accept https connections")
        elif parts.netloc:
            raise ScraperException("'%s' is not a Who's Who URL" % url)
        # rebuild the URL on the canonical scheme and host, keeping
        # the path, params, query and fragment
        return urlparse.urlunparse(['http', 'whoswho.co.za'] + list(parts[2:]))
    return url
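A quick illustration of the paths through get_absolute_url, assuming ENDPOINT_URL = 'http://whoswho.co.za' (the profile paths are made up):

get_absolute_url('http://whoswho.co.za/profile/some-person-1234')
# -> returned unchanged

get_absolute_url('/profile/some-person-1234')
# -> 'http://whoswho.co.za/profile/some-person-1234'

get_absolute_url('https://whoswho.co.za/profile/some-person-1234')
# -> ScraperException: Who's Who does not accept https connections

get_absolute_url('http://example.com/profile')
# -> ScraperException: 'http://example.com/profile' is not a Who's Who URL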
Example #3
def _scrape_from_parliament(resp):
    root = html.fromstring(resp.text)
    # selects the first image after the brownHeading el
    # NOTE: should we rather not scrape this at all?
    img_el = root.xpath("//*[@class='brownHeading']//following::*//img")
    if len(img_el) > 0:
        return img_el[0].get('src')
    raise ScraperException("Image not found at %s" % resp.url)
Example #4
def _scrape_from_whoswho(resp):
    root = html.fromstring(resp.text)
    pic_el = root.get_element_by_id('profile-pic', None)
    if pic_el is not None:
        pic_el = pic_el.xpath('a[1]/img')
        if pic_el:
            return pic_el[0].get('src')
    raise ScraperException("Image not found at %s" % resp.url)
Example #5
def _scrape_from_pa(resp):
    root = html.fromstring(resp.text)
    pic_el = root.find_class('profile-pic')
    if pic_el:
        pic_el = pic_el[0].xpath('img[1]')
        if pic_el:
            return pic_el[0].get('src')
    raise ScraperException("Image not found at %s" % resp.url)
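Each of these host-specific scrapers has the same contract: it receives the requests Response for a (non-image) profile page and returns the src of the profile image, which may be relative. A usage sketch with an invented URL:

resp = requests.get('http://www.pa.org.za/person/some-politician/')  # hypothetical URL
image_url = _scrape_from_pa(resp)  # e.g. '/media_root/images/profile.jpg'
image_url = make_abs_url(resp.url, image_url)

scrape_image performs this resolution itself via make_abs_url, so the individual scrapers never have to care whether a site uses absolute or relative srcs.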
Example #6
def _scrape_from_wikipedia(resp):
    match = re.search(r'(#mediaviewer|wiki)/(?P<filename>File:.*)$', resp.url)
    if match:
        filename = match.group('filename')
    else:
        root = html.fromstring(resp.text)
        vcard_el = root.find_class('vcard')
        if not vcard_el:
            raise ScraperException("Image not found at %s" % resp.url)
        image_el = vcard_el[0].find_class('image')
        if not image_el:
            raise ScraperException("Image not found at %s" % resp.url)
        filename = image_el[0].get('href')
        filename = filename[filename.index('File:'):]
    # use the Wikimedia API to get the file url at a reasonable size
    params = WIKI_PARAMS.copy()
    params['titles'] = filename.replace('_', ' ')
    response = requests.get(WIKI_ENDPOINT_URL, params=params)
    data = response.json()['query']
    if 'pages' not in data or len(data['pages']) == 0:
        raise ScraperException("Image not found at %s" % resp.url)
    # the query names a single file, so there is a single page entry
    return data['pages'].values()[0]['imageinfo'][0]['thumburl']
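The WIKI_* constants aren't shown, but the way _scrape_from_wikipedia reads the response (pages, then imageinfo, then thumburl) matches a standard MediaWiki imageinfo query. A sketch; the endpoint and thumbnail width are assumptions:

WIKI_ENDPOINT_URL = 'http://en.wikipedia.org/w/api.php'
WIKI_PARAMS = {
    'action': 'query',
    'prop': 'imageinfo',
    'iiprop': 'url',
    'iiurlwidth': 250,  # requesting a thumbnail width makes the API return thumburl
    'format': 'json',
}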
Example #7
def _scrape_from_google(resp):
    parts = urlparse.urlparse(resp.url)
    if parts.path == '/imgres':
        params = dict(urlparse.parse_qsl(parts.query, True))
        if 'imgurl' in params:
            return params['imgurl']
    elif parts.path == '/search':
        if parts.fragment.strip() != '':
            params = dict(urlparse.parse_qsl(parts.fragment, True))
            if 'imgrc' in params:
                # imgrc packs several ';'-separated fields; the image url
                # sits 4th from the end
                url = urllib.unquote(params['imgrc']).split(';')[-4]
                return urllib.unquote(url)
    raise ScraperException("Image not found at %s" % resp.url)
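With all five scrapers in view, the VALID_ENDPOINTS table consulted by scrape_image would map hostnames to them. A sketch; the exact hostnames are assumptions inferred from the function names:

VALID_ENDPOINTS = {
    'www.parliament.gov.za': _scrape_from_parliament,
    'whoswho.co.za': _scrape_from_whoswho,
    'www.whoswho.co.za': _scrape_from_whoswho,
    'www.pa.org.za': _scrape_from_pa,
    'en.wikipedia.org': _scrape_from_wikipedia,
    'www.google.com': _scrape_from_google,
    'www.google.co.za': _scrape_from_google,
}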
Example #8
def parse_content(content):
    # TODO: achievements sections
    data = {
        'related_profiles': [],
        'professional_details': [],
        'activities': [],
        'education': [],
    }
    root = html.fromstring(content)

    # basic info
    basic_el = root.xpath("//*[@itemtype='http://schema.org/Person'][1]")
    if len(basic_el) == 0:
        raise ScraperException("Content doesn't appear to be a person's profile")
    basic_el = basic_el[0]
    display_name = first_or_empty(basic_el.xpath("*[@itemprop='name'][1]/text()"))
    full_name = first_or_empty(basic_el.xpath("*[@itemprop='name']/following-sibling::p[1]/em/text()"))
    job_title = first_or_empty(basic_el.xpath("*[@itemprop='jobTitle'][1]/text()"))
    bio = first_or_empty(basic_el.xpath("*[@id='contact_info']/preceding-sibling::p[1]/text()"))
    data['basic_info'] = {
        'display_name': display_name,
        'full_name': full_name,
        'job_title': job_title,
        'bio': bio
    }
    # date of birth
    birth_node = basic_el.xpath("p[contains(., 'Born')][1]")
    if birth_node:
        birth_node = birth_node[0]
        birth_date = birth_node.xpath('a[1]/text()')
        if birth_date:
            data['basic_info']['birth_date'] = datetime.strptime(
                birth_date[0],
                DATE_FORMAT
            )
        birth_town = birth_node.find_class('locality')
        if birth_town:
            if birth_town[0].xpath('a'):
                data['basic_info']['birth_town'] = birth_town[0].xpath('a[1]/text()')[0]
            elif birth_town[0].text:
                text = birth_town[0].text.strip()
                if text.startswith('in '):
                    text = text[3:]
                data['basic_info']['birth_town'] = text
        birth_country = birth_node.xpath("*[@itemprop='nationality'][1]/text()")
        if birth_country:
            data['basic_info']['country'] = birth_country[0]

    # professional info
    prof_el = root.get_element_by_id('professional-details', None)
    if prof_el is not None:
        current = None
        for el in prof_el:
            if el.tag == 'h2':
                # the first h2 heads the current roles; any later h2
                # starts a past-roles section
                if current is None:
                    current = True
                else:
                    current = False
            elif el.tag == 'div' and current is not None and \
                    not el.get('class', ''):
                role_parts = [s.strip() for s in 
                              el.xpath('h6/br/preceding-sibling::text()[1]')[0]
                                .split('|')
                              if s.strip() != '']
                date_parts = [s.strip() for s in 
                              el.xpath('h6/br/following-sibling::text()[1]')[0]
                                .split('|')
                              if s.strip() != '']
                role_data = {
                    'role_name': first_or_empty(role_parts),
                    'status': 'active' if current else 'inactive'
                }
                # get start and end year
                if date_parts:
                    date_parts = R_YEAR_RANGE.match(date_parts[-1])
                    if date_parts:
                        role_data['role_start_year'] = int(date_parts.group('start'))
                        if date_parts.group('current'):
                            assert current
                        elif date_parts.group('end'):
                            role_data['role_end_year'] = int(date_parts.group('end'))
                        elif not current:
                            role_data['role_end_year'] = role_data['role_start_year']
                # get organization info
                org_el = el.xpath('h6/a[last()]')
                if len(org_el) > 0:
                    org_el = org_el[0]
                    role_data['organization_name'] = org_el.text
                    role_data['organization_url'] = org_el.get('href', None)
                    if role_data['organization_url']:
                        role_data['organization_url'] = '%s%s' % (
                            ENDPOINT_URL.rstrip('/'),
                            role_data['organization_url']
                        )
                # the organization doesn't have a url
                # use 2nd last piece of plain text
                elif len(role_parts) > 2:
                    role_data['organization_name'] = role_parts[-2]
                else:
                    continue
                data['professional_details'].append(role_data)

    # education info
    edu_el = root.get_element_by_id('education', None)
    if edu_el is not None:
        level = None
        for el in edu_el.xpath("h1[1]/following-sibling::node()"):
            if not isinstance(el, html.HtmlElement):
                continue
            if el.tag == 'h2':
                level = el.text.lower()
                continue
            elif el.tag != 'div' or el.get('class', None) == 'clear':
                continue
            # parse secondary education (single line)
            if level == 'secondary':
                org_parts = el.xpath('h6[1]/text()')[0]
                org_parts = [s.strip() for s in org_parts.split(',')]
                place = ', '.join(org_parts[1:])
                edu_data = {
                    'organization_name': org_parts[0],
                    'level': level,
                    'place': place,
                }
                match = re.match(r'.*(?P<year>\d{4})$', place)
                if match:
                    edu_data['year_awarded'] = int(match.group('year'))
                    edu_data['status'] = 'complete'
                    # drop the trailing year (and separators) from the place
                    edu_data['place'] = place[:match.start('year')].rstrip(' ,')
            # parse tertiary education (complex tags)
            elif level == 'tertiary':
                edu_data = {'level': level}
                org_name = el.xpath('h6[1]/a')
                if org_name:
                    edu_data['organization_name'] = org_name[0].text
                else:
                    org_name = el.xpath('h6[1]/text()')
                    if org_name:
                        edu_data['organization_name'] = org_name[0]
                try:
                    date_parts = el.xpath('p[1]/text()')[0] \
                                   .split('|')[-1] \
                                   .strip()
                except IndexError:
                    continue
                if date_parts.startswith('Awarded in ') or \
                        date_parts.startswith('Completed '):
                    edu_data['year_awarded'] = int(date_parts[-4:])
                    if date_parts.startswith('Completed '):
                        edu_data['status'] = 'complete'
                    qualification = el.xpath('p[2]/text()')
                    if qualification:
                        edu_data['qualification'] = qualification[0]
                else:
                    date_parts = R_YEAR_RANGE.match(date_parts)
                    if date_parts:
                        qualification = el.xpath('p[2]/text()')
                        if qualification:
                            edu_data['qualification'] = qualification[0]
                        edu_data['start_year'] = int(date_parts.group('start'))
                        if date_parts.group('current'):
                            edu_data['status'] = 'in progress'
                        elif date_parts.group('end'):
                            edu_data['status'] = 'complete'
                            edu_data['year_awarded'] = int(date_parts.group('end'))
                    else:
                        edu_data['qualification'] = date_parts
            else:
                # no recognised level heading seen for this entry; skip it
                continue
            data['education'].append(edu_data)

    # activities info
    activity_el = root.get_element_by_id('activities', None)
    if activity_el is not None:
        # only doing memberships
        for el in activity_el.xpath("h2[.='Memberships']/following-sibling::node()"):
            if not isinstance(el, html.HtmlElement):
                continue
            if el.tag != 'div' or el.get('class', None) == 'clear':
                break
            org_name = el.xpath('h6[1]/text()')[0]
            role_data = {'organization_name': org_name}
            role_parts = el.xpath('p[1]/em')[0].text
            if role_parts:
                role_parts = role_parts.split(',')
                role_data['role_name'] = role_parts[0].strip()
                if len(role_parts) == 2:
                    date_parts = R_YEAR_RANGE.match(role_parts[1].strip())
                    if date_parts:
                        role_data['role_start_year'] = int(date_parts.group('start'))
                        if date_parts.group('current'):
                            role_data['status'] = 'active'
                        elif date_parts.group('end'):
                            role_data['status'] = 'inactive'
                            role_data['role_end_year'] = int(date_parts.group('end'))
            data['activities'].append(role_data)

    # related profile info
    related_el = root.get_element_by_id('related', None)
    if related_el is not None:
        for el in related_el.find_class('item'):
            a_el = el.xpath('a')[0]
            related_data = {'url': a_el.get('href')}
            img_el = a_el.xpath('img')
            if len(img_el) > 0:
                img_el = img_el[0]
                related_data['image_url'] = img_el.get('src')
                related_data['title'] = img_el.get('title')
            data['related_profiles'].append(related_data)

    return data
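parse_content leans on a few helpers that aren't shown. The named groups of R_YEAR_RANGE (start, end, current) and the behaviour of first_or_empty follow directly from the call sites; the DATE_FORMAT and ENDPOINT_URL values below are assumptions:

ENDPOINT_URL = 'http://whoswho.co.za'

# e.g. strptime('23 May 1961', DATE_FORMAT); assumed format
DATE_FORMAT = '%d %B %Y'

# matches '1994 - 2004', '1994 - current' or a bare '1994'
R_YEAR_RANGE = re.compile(
    r'(?P<start>\d{4})\s*(?:-\s*(?:(?P<current>current|present)|(?P<end>\d{4})))?'
)


def first_or_empty(results):
    # xpath text queries return lists; take the first hit or fall back to ''
    return results[0] if results else ''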