Example #1
def scrape_collection_files(topic, url):
    assets = json.loads(downloader.read(url))['data']
    images = []
    for asset in assets:
        if asset['attributes']['extension'] == 'png':
            images.append({
                'url':
                asset['attributes']['thumbnail_url'].replace(
                    'element.png', '*****@*****.**'),
                'caption':
                asset['attributes']['name']
            })

        elif asset['attributes']['extension'] == 'mp4':
            video_data = json.loads(
                downloader.read(FILE_STORAGE_URL.format(id=asset['id'])))
            video = video_data['data'][0]['attributes']
            topic.add_child(
                nodes.VideoNode(source_id=video['url'],
                                title=asset['attributes']['name'],
                                license=LICENSE,
                                files=[
                                    files.VideoFile(video['url']),
                                    files.ThumbnailFile(video['thumbnail_url'])
                                ]))
        else:
            LOGGER.warning('Unable to add {} from {}'.format(
                asset['attributes']['extension'], url))

    # Add images to slideshow node
    if len(images):
        topic.add_child(create_slideshow(images, url, topic.title, 'English'))
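A minimal call-site sketch for the helper above. The topic node and section key are hypothetical placeholders; ENGLISH_ASSETS_URL and collection_key follow the pattern used in the English-collection example later in this listing.

# Hypothetical usage: attach one section's PNG slides and MP4 videos to a topic node.
section_topic = nodes.TopicNode(source_id='some-section-key', title='Some section')
asset_url = ENGLISH_ASSETS_URL.format(collection=collection_key, section='some-section-key')
scrape_collection_files(section_topic, asset_url)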
def scrape_resource(url, topic):
    resource = BeautifulSoup(downloader.read(url), 'html5lib')
    LOGGER.info('      {}'.format(resource.find('h2').text))

    filepath = download_resource(resource.find('div', {'class': 'decargas'}).find('a')['href'])
    license = None
    author = ''
    for data_section in resource.find('div', {'class': 'datos_generales'}).find_all('h4'):
        if 'Licencia' in data_section.text:
            try:
                license = LICENSE_MAP[data_section.find_next_sibling('p').text](copyright_holder="Ceibal")
            except KeyError as e:
                LOGGER.error(str(e))
                # Fall back to an instantiated CC BY license rather than the bare class
                license = licenses.CC_BYLicense(copyright_holder="Ceibal")
        elif 'Autor' in data_section.text:
            author = data_section.find_next_sibling('p').text
    if filepath:
        thumbnail = resource.find('div', {'class': 'img-recurso'}).find('img')['src']
        if thumbnail.endswith('.gif'):
            thumbnail = os.path.sep.join([DOWNLOAD_DIRECTORY, thumbnail.split('/')[-1].replace('.gif', '.png')])
            with open(thumbnail, 'wb') as fobj:
                fobj.write(downloader.read(resource.find('div', {'class': 'img-recurso'}).find('img')['src']))

        topic.add_child(nodes.HTML5AppNode(
            title=resource.find('h2').text,
            source_id=url,
            license=license,
            author=author,
            description=resource.find('form').find_all('p')[1].text,
            thumbnail=thumbnail,
            tags=[tag.text[:30] for tag in resource.find_all('a', {'class': 'tags'})],
            files=[files.HTMLZipFile(path=filepath)],
        ))
Example #3
def scrape_multilanguage_slideshows(channel):
    LOGGER.info('Scraping multi-language content...')
    contents = BeautifulSoup(downloader.read(SLIDESHOWS_URL), 'html5lib')
    collection_key = get_collection_key(contents)

    languages_selection = contents.find('div', {
        'class': 'asset-list'
    }).find('div')
    language_list = json.loads(
        languages_selection['data-react-props'])['sections']

    for language in language_list:
        asset_url = SLIDESHOW_ASSETS_URL.format(
            collection='qac6i4-foozd4-68u325', section=language['section_key'])
        slide_data = json.loads(downloader.read(asset_url))['data']
        if LANGUAGE_MAP[language['name']]:
            translated_name = languages.getlang(LANGUAGE_MAP[language['name']]).native_name
        else:
            translated_name = language['name']
        LOGGER.info('    {}'.format(translated_name.encode('utf-8')))

        slides = [{
            'url':
            slide['attributes']['thumbnail_url'].replace(
                'element.png', '*****@*****.**')
        } for slide in slide_data]
        if len(slides):
            channel.add_child(
                create_slideshow(slides, asset_url, translated_name,
                                 language['name']))
Example #4
def scrape_student_resources():
    """
    Scrape student resources from the main page http://edsitement.neh.gov/student-resources
    """
    STUDENT_RESOURCES_URL = urljoin(BASE_URL, "student-resources/")
    subject_ids = [25, 21, 22, 23]
    levels = ["Student Resources"]
    for subject in subject_ids[STUDENT_RESOURCE_SUBJECT_INIT:STUDENT_RESOURCE_SUBJECT_END]:
        params_url = "all?grade=All&subject={}&type=All".format(subject)
        page_url = urljoin(STUDENT_RESOURCES_URL, params_url)
        LOGGER.info("Scrapping: " + page_url)
        page_contents = downloader.read(page_url, session=sess)
        page = BeautifulSoup(page_contents, 'html.parser')
        resource_links = page.find_all(lambda tag: tag.name == "a" and tag.findParent("h3"))
        for link in resource_links[STUDENT_RESOURCE_INIT:STUDENT_RESOURCE_END]:
            time.sleep(TIME_SLEEP)
            if link["href"].rfind("/student-resource/") != -1:
                student_resource_url = urljoin(BASE_URL, link["href"])
                try:
                    page_contents = downloader.read(student_resource_url, session=sess)
                except requests.exceptions.HTTPError as e:
                    LOGGER.info("Error: {}".format(e))
                    continue  # skip this resource instead of re-parsing stale page contents
                page = BeautifulSoup(page_contents, 'html.parser')
                topic_name = student_resource_url.split("/")[-1]
                student_resource = StudentResourceIndex(page,
                    filename="/tmp/student-resource-"+topic_name+".zip",
                    levels=levels)
                student_resource.to_file()
Example #5
    def download_content(self):
        self.load_tree_data()
        assert self.channel_tree

        def get_filename(url):
            return url.split('/')[-1].split('?')[0]

        for lang in self.channel_tree:
            for class_name in self.channel_tree[lang]:
                for subject in self.channel_tree[lang][class_name]:
                    for item in self.channel_tree[lang][class_name][subject]['items']:
                        url = get_column(item, 'url')
                        if url:
                            url = url.replace('?dl=0', '?dl=1')
                            filename = get_filename(url)
                            download_path = os.path.join(self.ARCHIVE_DIR, lang, class_name, subject, filename)
                            os.makedirs(os.path.dirname(download_path), exist_ok=True)
                            if not os.path.exists(download_path):
                                content = downloader.read(url)
                                with open(download_path, 'wb') as f:
                                    f.write(content)
                            item['file'] = download_path

                        icon = get_column(item, 'icon')
                        if icon:
                            icon = icon.replace('?dl=0', '?dl=1')
                            icon_filename = get_filename(icon)
                            icon_path = os.path.join(self.ARCHIVE_DIR, lang, class_name, subject, icon_filename)
                            content = downloader.read(icon)
                            with open(icon_path, 'wb') as f:
                                f.write(content)
                            item['thumbnail'] = icon_path
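For reference, the nested loops above expect a tree shaped roughly like the sketch below; the keys and column names are illustrative assumptions inferred from the loop structure, not taken from the source data.

# Illustrative shape of self.channel_tree as download_content() traverses it.
channel_tree = {
    'en': {                                     # language
        'Class 1': {                            # class name
            'Science': {                        # subject
                'items': [
                    {'url': 'https://example.com/file.pdf?dl=0',      # read via get_column(item, 'url')
                     'icon': 'https://example.com/icon.png?dl=0'},    # read via get_column(item, 'icon')
                ],
            },
        },
    },
}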
Example #6
    def get_scraper(self):
        from pages import DEFAULT_PAGE_HANDLERS
        for handler in (DEFAULT_PAGE_HANDLERS + self.extra_scrapers):
            if handler.test(self.link):
                return handler

        downloader.read(self.link) # Will raise an error if this is broken
        raise UnscrapableSourceException
def scrape_resource_list(url, topic):
    resource_list_page = BeautifulSoup(downloader.read(url), 'html5lib')

    # Go through pages, omitting Previous and Next buttons
    for page in range(len(resource_list_page.find_all('a', {'class': 'page-link'})[1:-1])):
        # Use page numbers instead of the urls, since the links on the site are broken
        resource_list = BeautifulSoup(downloader.read("{}&page={}".format(url, page + 1)), 'html5lib')
        for resource in resource_list.find_all('a', {'class': 'card-link'}):
            resource_file = scrape_resource(resource['href'], topic)
Example #8
 def get_scraper(self):
     scraper = guess_scraper(self.link,
                             scrapers=self.extra_scrapers,
                             locale=self.locale,
                             triaged=self.triaged,
                             zipper=self.zipper)
     if not scraper:
         downloader.read(self.link)  # Will raise an error if this is broken
         raise UnscrapableSourceException
     return scraper
Example #9
def get_thumbnail(url):
    filename, _ext = os.path.splitext(os.path.basename(url))
    img_path = os.path.sep.join(
        [THUMBNAILS_DIRECTORY, "{}.png".format(filename)])
    svg_path = os.path.sep.join(
        [THUMBNAILS_DIRECTORY, "{}.svg".format(filename)])

    # This thumbnail gets converted with an error, so download it separately for now
    if "US_history" in filename:
        return files.ThumbnailFile(path="US_history.png")

    # Copy pngs to local storage
    if url.endswith("png"):
        with open(img_path, 'wb') as pngobj:
            pngobj.write(downloader.read(url))

    elif url.endswith("svg"):
        with open(svg_path, 'wb') as svgobj:
            # renderPM doesn't read <style> tags, so add style to individual elements
            svg_contents = BeautifulSoup(downloader.read(url), 'html.parser')
            svg_contents = BeautifulSoup(
                svg_contents.find('svg').prettify(), 'html.parser')
            if svg_contents.find('style'):
                sheet = cssutils.parseString(svg_contents.find('style').string)
                for rule in sheet:
                    rectangles = svg_contents.find_all(
                        'rect', {'class': rule.selectorText.lstrip('.')})
                    paths = svg_contents.find_all(
                        'path', {'class': rule.selectorText.lstrip('.')})
                    polygons = svg_contents.find_all(
                        'polygon', {'class': rule.selectorText.lstrip('.')})
                    for el in rectangles + paths + polygons:
                        el['style'] = ""
                        for prop in rule.style:
                            el['style'] += "{}:{};".format(
                                prop.name, prop.value)

            # BeautifulSoup lowercases some attribute names, so restore their original casing
            autocorrected_fields = ["baseProfile", "viewBox"]
            svg = svg_contents.find('svg')
            for field in autocorrected_fields:
                if svg.get(field.lower()):
                    svg[field] = svg[field.lower()]
                    del svg[field.lower()]

            svgobj.write(svg_contents.renderContents())
        drawing = svg2rlg(svg_path)
        renderPM.drawToFile(drawing, img_path)

    else:
        # Unrecognized extension: drop into the debugger to inspect the url
        import pdb
        pdb.set_trace()

    return files.ThumbnailFile(path=img_path)
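The SVG branch above relies on svglib and reportlab for rasterization; in isolation, the conversion step looks roughly like this sketch (the file paths are placeholders).

from svglib.svglib import svg2rlg
from reportlab.graphics import renderPM

# Rasterize a local SVG to PNG, mirroring the last step of get_thumbnail().
drawing = svg2rlg('thumbnail.svg')
renderPM.drawToFile(drawing, 'thumbnail.png', fmt='PNG')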
def read(url):
    """ Read contents from url
        Args:
            url (str): url to read
        Returns contents from url
    """
    return downloader.read(format_url(url))
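A short usage sketch of the wrapper above; the URL and output filename are placeholders.

# Fetch raw bytes through the shared downloader and cache them locally.
contents = read('https://example.com/page.html')
with open('page.html', 'wb') as fobj:
    fobj.write(contents)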
Example #11
    def process(self):
        if 'fonts' in self.link:  # Omit google fonts
            self.tag.decompose()
            return

        # Parse urls in css (using parseString because it is much faster than parseUrl)
        style_sheet = downloader.read(self.link).decode('utf-8-sig',
                                                        errors='ignore')
        sheet = cssutils.parseString(style_sheet)
        for css_url in cssutils.getUrls(sheet):
            if not css_url.startswith('data:image') and not css_url.startswith(
                    'data:application'):
                try:
                    style_sheet = style_sheet.replace(
                        css_url,
                        os.path.basename(
                            self.write_url(css_url,
                                           url=self.link,
                                           default_ext='.png')))
                except BROKEN_EXCEPTIONS as e:
                    LOGGER.warning(
                        'Unable to download stylesheet url at {} ({})'.format(
                            self.url, str(e)))

        self.tag[self.attribute] = self.format_url(
            self.write_contents(self.get_filename(self.link), style_sheet))
        return self.tag[self.attribute]
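The cssutils calls above can be exercised on their own; a minimal sketch with a made-up stylesheet string:

import cssutils

# Collect the url(...) references from a stylesheet, as the scraper above does.
css_text = ".hero { background: url('img/banner.png'); }"
sheet = cssutils.parseString(css_text)
for css_url in cssutils.getUrls(sheet):
    print(css_url)  # -> img/banner.png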
Example #12
 def _download_file(self, write_to_path):
     video_id = self.url.split('#')[1]
     with open(write_to_path, 'wb') as fobj:
         fobj.write(
             downloader.read(
                 'https://www.wevideo.com/api/2/media/{}/content'.format(
                     video_id)))
Example #13
    def process(self):
        # Using html.parser as it is better at handling special characters
        contents = BeautifulSoup(downloader.read(self.url, loadjs=self.loadjs),
                                 'html.parser')

        # If a main area is specified, replace body contents with main area
        if self.main_area_selector:
            body = self.create_tag('body')
            body.append(contents.find(*self.main_area_selector))
            contents.body.replaceWith(body)

        # Remove any items to omit
        for item in self.omit_list:
            for element in contents.find_all(*item):
                element.decompose()

        self.preprocess(contents)

        # Scrape tags
        for tag_class in (self.extra_tags + COMMON_TAGS):
            for tag in contents.find_all(*tag_class.selector):
                scraper = tag_class(tag,
                                    self.url,
                                    zipper=self.zipper,
                                    scrape_subpages=self.scrape_subpages,
                                    triaged=self.triaged,
                                    locale=self.locale,
                                    extra_scrapers=self.scrapers,
                                    color=self.color)
                scraper.scrape()

        self.postprocess(contents)
        return contents.prettify(formatter="minimal").encode(
            'utf-8-sig', 'ignore')
Example #14
 def process(self):
     # Read URL and generate slideshow html
     contents = BeautifulSoup(downloader.read(self.url, loadjs=self.loadjs), 'html.parser')
     images = []
     for img in contents.find_all(*self.img_selector):
         images.append(self.write_url(img[self.img_attr], directory="slides"))
     return self.generate_slideshow(images)
 def run(self, limit_page=1, page_number=1):
     total_items = None
     counter = 0
     try:
         page_contents = downloader.read(self.resource_url, loadjs=False)
     except requests.exceptions.HTTPError as e:
         LOGGER.info("Error: {}".format(e))
     else:
         page = BeautifulSoup(page_contents, 'html.parser')
         states = page.find("div", class_=["lm-filter-course"])
         states_tree = self.get_state_lang(states)
         subjects = page.find("div", class_=["lm-filter-subject"])
         subjects_tree = self.get_subjects(subjects)
         levels = page.find("div", class_=["lm-filter-level"])
         levels_tree = self.get_levels(levels)
         pages_params = self.build_page_params(states_tree, subjects_tree,
                                               levels_tree)
         for page_params in pages_params:
             url = self.build_url(
                 page_params["course_tid"], page_params["subject_tid"],
                 page_params.get("educational_level_tid", None))
             yield dict(url=url,
                        subject_name=page_params["subject_name"],
                        state_lang=page_params["state_lang"],
                        level_name=page_params.get("level_name", None))
             LOGGER.info("CRAWLING : URL {}".format(url))
             time.sleep(TIME_SLEEP)
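Because run() is a generator of crawl parameters, a caller would consume it roughly as in the sketch below; the crawler instance is a hypothetical stand-in for whatever class owns run().

# Hypothetical driver loop over the yielded page descriptors.
crawler = ResourceCrawler()  # assumed owner of run(); not defined in this listing
for page_params in crawler.run():
    LOGGER.info('{} / {} -> {}'.format(page_params['state_lang'],
                                       page_params['subject_name'],
                                       page_params['url']))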
Example #16
 def postprocess(self, contents):
     for script in contents.find_all('script'):
         if script.string:
             script.string = script.text.replace(
                 'background="HalfBakedBG.gif"', '')
             for match in re.finditer(
                     r'(?:src)=(?:\'|\")([^\'\"]+)(?:\'|\")', script.string,
                     re.MULTILINE):
                 img_filename = match.group(1).split('?')[0].split(
                     '/')[-1][-20:]
                 script.string = script.text.replace(
                     match.group(1),
                     self.write_url(match.group(1), directory="webimg"))
             for match in re.finditer(
                     r"onclick=\\(?:'|\")parent\.location\s*=\s*(?:'|\")([^'\"]+)(?:'|\")",
                     script.string, re.MULTILINE):
                 page_filename = 'recursostic-{}'.format(
                     match.group(1).split('?')[0].split('/')[-1])
                 page = BeautifulSoup(
                     downloader.read(self.get_relative_url(match.group(1))),
                     'html5lib')
                 page_link = RecursosticScraper(
                     self.get_relative_url(match.group(1)),
                     zipper=self.zipper,
                     locale=self.locale).to_zip()
                 script.string = script.text.replace(
                     match.group(1), page_link)
Example #17
 def to_file(self, description, filepath):
     try:
         page_contents = downloader.read(self.resource_url, session=sess)
     except requests.exceptions.HTTPError as e:
         LOGGER.info("Error: {}".format(e))
         return None
     else:
         metadata_dict = {"description": description,
             "language": "en",
             "license": licenses.CC_BY,
             "copyright_holder": "National Endowment for the Humanities",
             "author": "",
             "source_id": self.resource_url}
         page = BeautifulSoup(page_contents, 'html.parser')
         LOGGER.info("COPYRIGHT {}".format(has_copyright(page)))
         content = page.find("div", id="content")
         if self.swf_content(content):
             return
         files = self.remove_external_links(content)
         images = self.find_local_images(content)
         for file_ in files:
             metadata_files = metadata_dict.copy()
             metadata_files["source_id"] = file_
             self.add_resources_files(file_, metadata_files)
         #for img in images:
         #    self.add_resources_files(img)
          self.write('<html><head><meta charset="UTF-8"></head><body>' +
                     str(content) + '</body></html>', filepath)
         return metadata_dict
Example #18
def scrape_channel(channel):
    # Read from Categorias dropdown menu
    page = BeautifulSoup(downloader.read(BASE_URL), 'html5lib')
    dropdown = page.find('a', {'id': 'btn-categorias'}).find_next_sibling('ul')

    # Go through dropdown and generate topics and subtopics
    for category_list in dropdown.find_all('li', {'class': 'has-children'}):

        # Parse categories
        for category in category_list.find_all('li', {'class': 'has-children'}):
            # Add this topic to channel when scraping entire channel
            category_name = category.find('a').text
            topic = nodes.TopicNode(title=category_name, source_id=get_source_id(category_name))
            channel.add_child(topic)
            LOGGER.info(topic.title)

            # Parse subcategories
            for subcategory in category.find_all('li'):
                if not subcategory.attrs.get('class') or 'go-back' not in subcategory.attrs['class']:
                    # Get rid of this check to scrape entire site
                    subcategory_name = subcategory.find('a').text
                    subcategory_link = subcategory.find('a')['href']
                    LOGGER.info('  {}'.format(subcategory_name))
                    subtopic = nodes.TopicNode(title=subcategory_name, source_id=get_source_id(subcategory_link))
                    topic.add_child(subtopic)

                    # Parse resources
                    scrape_subcategory(subcategory_link, subtopic)
Example #19
def scrape_english_collection(channel):
    LOGGER.info('Scraping English collection...')
    english_topic = nodes.TopicNode(source_id=ENGLISH_COLLECTION_URL,
                                    title="English")
    channel.add_child(english_topic)

    contents = BeautifulSoup(downloader.read(ENGLISH_COLLECTION_URL),
                             'html5lib')
    collection_key = get_collection_key(contents)

    topic_selection = contents.find('div', {'class': 'asset-list'}).find('div')
    topic_list = [
        t for t in json.loads(topic_selection['data-react-props'])['sections']
        if t['id'] not in EXCLUDED_TOPIC_IDS
    ]

    for topic in topic_list:
        LOGGER.info('    {}'.format(topic['name'].encode('utf-8')))
        topic_node = nodes.TopicNode(source_id=topic['section_key'],
                                     title=topic['name'])
        english_topic.add_child(topic_node)

        # Scrape items in the topic
        url = ENGLISH_ASSETS_URL.format(collection=collection_key,
                                        section=topic['section_key'])
        scrape_collection_files(topic_node, url)
Example #20
 def _download_file(self, write_to_path):
     audio_id = re.search(r'(?:player_ek_)([^_]+)(?:_2_1\.html)',
                          self.url).group(1)
     with open(write_to_path, 'wb') as fobj:
         fobj.write(
             downloader.read(
                 'http://www.ivoox.com/listenembeded_mn_{}_1.m4a?source=EMBEDEDHTML5'
                 .format(audio_id)))
Example #21
 def write_url(self, url, filename, directory=None):
     """ write_url: Write contents from url to filename in zip
         Args:
             url: (str) url to file to download
             filename: (str) name of file in zip
             directory: (str) directory in zipfile to write file to (optional)
         Returns: path to file in zip
     """
     return self.write_contents(filename, read(url), directory=directory)
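A usage sketch for write_url, as it might be called from inside a scraper method; the URL and filenames are placeholders.

# Download a remote image and store it in the zip under img/logo.png,
# getting back the path of the file inside the zip.
zip_path = self.write_url('https://example.com/logo.png', 'logo.png', directory='img')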
Example #22
def save_thumbnail(url, save_as, sess):
    THUMB_DATA_DIR = build_path([DATA_DIR, 'thumbnail'])
    filepath = os.path.join(THUMB_DATA_DIR, save_as)
    try:
        document = downloader.read(url, loadjs=False, session=sess)
    except requests.exceptions.ConnectionError as e:
        return None
    else:
        with open(filepath, 'wb') as f:
            f.write(document)
            return filepath
Example #23
def scrape_subcategory(link, topic):
    url = "{}{}".format(BASE_URL, link.lstrip("/"))
    resource_page = BeautifulSoup(downloader.read(url), 'html5lib')

    # Skip "All" category
    for resource_filter in resource_page.find('div', {'class': 'menu-filtro'}).find_all('a')[1:]:
        LOGGER.info('    {}'.format(resource_filter.text))
        source_id = get_source_id('{}/{}'.format(topic.title, resource_filter.text))
        filter_topic = nodes.TopicNode(title=resource_filter.text, source_id=source_id)
        scrape_resource_list(url + resource_filter['href'], filter_topic)
        topic.add_child(filter_topic)
Example #24
def get_available_languages():
    contents = BeautifulSoup(
        downloader.read(BASE_URL.format(language='en', endpoint='')),
        'html5lib')
    languages = []
    for lang in contents.find('ul', {
            'class': 'sf-lang-selector'
    }).findAll('li'):
        languages.append(
            re.search(r"openLinkWithTranslation\('([^\']+)'\)",
                      lang.find('a')['onclick']).group(1))
    return languages
Example #25
    def preprocess(self, contents):
        # Hide certain elements from the page
        style_tag = self.create_tag('style')
        style_tag.string = '.genially-view-logo { pointer-events: none;} .genially-view-navigation-actions,'\
            ' .genially-view-navigation-actions-toggle-button{display: none !important; pointer-events:none;}'
        contents.head.append(style_tag)

        # Prefetch API response and replace script content accordingly
        genial_id = self.url.split('/')[-1]
        response = requests.get(
            'https://view.genial.ly/api/view/{}'.format(genial_id))
        for script in contents.find_all('script'):
            if script.get('src') and 'main' in script['src']:
                script_contents = downloader.read(
                    self.get_relative_url(script['src'])).decode('utf-8')
                genial_data = json.loads(response.content)

                if len(genial_data['Videos']) or len(genial_data['Audios']):
                    LOGGER.error(
                        'Unhandled genial.ly video or audio at {}'.format(self.url))

                if genial_data['Genially']['ImageRender']:
                    genial_data['Genially']['ImageRender'] = self.write_url(
                        genial_data['Genially']['ImageRender'],
                        directory='webimg')
                for image in genial_data['Images']:
                    image['Source'] = self.write_url(image['Source'],
                                                     directory='webimg')
                for slide in genial_data['Slides']:
                    slide['Background'] = self.write_url(slide['Background'],
                                                         directory='webimg')
                for code in genial_data['Contents']:
                    code_contents = BeautifulSoup(code['HtmlCode'],
                                                  'html.parser')
                    for img in code_contents.find_all('img'):
                        try:
                            img['src'] = self.write_url(img['src'],
                                                        directory='webimg')
                        except (requests.exceptions.HTTPError,
                                requests.exceptions.ConnectionError) as e:
                            LOGGER.warning(
                                "Error processing genial.ly at {} ({})".format(
                                    self.url, str(e)))
                    code['HtmlCode'] = code_contents.prettify()
                script_contents = script_contents.replace(
                    'r.a.get(c).then(function(e){return n(e.data)})',
                    'n({})'.format(json.dumps(genial_data)))
                script['class'] = ['skip-scrape']
                script['src'] = self.write_contents(
                    'genial-{}-embed.js'.format(genial_id),
                    script_contents,
                    directory="js")
Example #26
 def write_url(self, url, filename, directory=None):
     """ write_url: Write contents from url to filename in zip
         Args:
             url: (str) url to file to download
             filename: (str) name of file in zip
             directory: (str) directory in zipfile to write file to (optional)
         Returns: path to file in zip
     """
     filepath = "{}/{}".format(directory.rstrip("/"),
                               filename) if directory else filename
     if not self.contains(filepath):
         self._write_to_zipfile(filepath, read(url))
     return filepath
Example #27
    def scrape_video_page(self, url, title):
        """ Creates a video topic with all the videos on the page """
        IGNORED_VIDEOS = ['google', 'facebook']
        VIDEO_SCRAPERS = [who.WHOWebVideoScraper, who.WHOVideoScraper]

        video_topic = nodes.TopicNode(source_id=url, title=title)
        contents = BeautifulSoup(downloader.read(url), 'html.parser')

        # Scrape youtube embeds
        # e.g. https://www.who.int/emergencies/diseases/novel-coronavirus-2019/advice-for-public/videos
        for iframe in contents.findAll('iframe'):
            if not any(
                [test for test in IGNORED_VIDEOS if test in iframe['src']]):
                header = iframe.find_parent('div', {
                    'class': 'sf_colsIn'
                }).find('div', {
                    'class': 'section-heading'
                }).text.strip()
                LOGGER.info('      - Downloading {}'.format(
                    header.encode('utf-8')))
                scraper = guess_scraper(iframe['src'], scrapers=VIDEO_SCRAPERS
                                        )  # Might be native or youtube video
                video_node = scraper.to_contentnode(header,
                                                    license=LICENSE,
                                                    directory="videos")
                video_topic.add_child(video_node)

        # Scrape native videos
        # e.g. https://www.who.int/zh/emergencies/diseases/novel-coronavirus-2019/advice-for-public/videos
        for video in contents.findAll('div',
                                      {'class': 'sf-multimedia-item__video'}):
            header = video.find('h3').text.strip()
            LOGGER.info('      - Downloading {}'.format(
                header.encode('utf-8')))
            video_matches = re.search(r"\(\s*\"(.+)\"\,\s*\"(.+)\"\)",
                                      video.find('a')['onclick'])

            # Embedded youtube videos here refer to playlists, so skip them
            if 'YoutubeVideo' == video_matches.group(1):
                continue

            scraper = who.WHOVideoScraper(video_matches.group(2))
            video_node = scraper.to_contentnode(header,
                                                license=LICENSE,
                                                directory="videos")
            video_topic.add_child(video_node)

        return video_topic
Example #28
    def add_file(self,
                 path,
                 title,
                 download_url,
                 write_data=True,
                 ext=None,
                 license=None,
                 copyright_holder=None,
                 **node_data):
        """ add_file: Creates file in csv and writes file to zip
            Args:
                path: (str) where in zip to write file
                title: (str) content's title
                download_url: (str) url or local path to download from
                write_data: (boolean) indicates whether to add as a csv entry (optional)
                ext: (str) extension to use for file
                license (str): content's license
                copyright_holder (str): holder of content's license (required except for PUBLIC_DOMAIN)
                license_description (str): description of content's license (optional)
                source_id: (str) content's original id (optional)
                description: (str) description of content (optional)
                author (str): who created the content (optional)
                language (str): language of content (optional)
                thumbnail (str):  path to thumbnail in zip (optional)
            Returns: path to file in zip
        """
        if write_data:
            assert license, "Files must have a license"
            if not copyright_holder or copyright_holder.strip() == '':
                copyright_holder = None
            assert license in NO_COPYRIGHT_HOLDER_REQUIRED or copyright_holder, "Licenses must have a copyright holder if they are not public domain"

        self._parse_path(path)
        if not ext:
            _name, ext = os.path.splitext(download_url or "")
            ext = ext.lower()  # normalize to lowercase extensions inside zip archive
        filepath = "{}/{}{}".format(path, title, ext)
        if download_url and filepath:
            self._write_to_zip(filepath, read(download_url))
            if write_data:
                self._commit(filepath,
                             title,
                             license=license,
                             copyright_holder=copyright_holder,
                             **node_data)
            return filepath
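Given the documented arguments, a call to add_file might look like the sketch below; the writer instance and all values are placeholders.

# Hypothetical usage: download a PDF into the zip and record its csv metadata row.
zip_path = writer.add_file(
    'Some Topic/Sub Topic',                # where in the zip to write the file
    'Sample worksheet',                    # content title
    'https://example.com/worksheet.pdf',   # download url (or local path)
    license=licenses.CC_BY,                # required because write_data defaults to True
    copyright_holder='Example Publisher',
)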
Example #29
    def open(self, update=False):
        """
        Opens the PDF file for reading, downloading it first if necessary.
        """
        filename = os.path.basename(self.source_path)
        folder, _ext = os.path.splitext(filename)
        self.path = os.path.sep.join([self.directory, folder, filename])
        if not os.path.exists(os.path.dirname(self.path)):
            os.makedirs(os.path.dirname(self.path))

        # Download full pdf if it hasn't already been downloaded
        if not os.path.isfile(self.path) or update:
            with open(self.path, "wb") as fobj:
                fobj.write(read(self.source_path))

        self.file = open(self.path, 'rb')
        self.pdf = CustomPDFReader(self.file)
    def open(self):
        """
        Opens the specified PDF file for editing. If the path is a URL, it will first download the file.
        """
        filename = os.path.basename(self.download_url)
        folder, _ext = os.path.splitext(filename)
        self.path = os.path.sep.join([self.directory, folder, filename])
        if not os.path.exists(os.path.dirname(self.path)):
            os.makedirs(os.path.dirname(self.path))

        # Download full pdf if it hasn't already been downloaded
        if not os.path.isfile(self.path):
            with open(self.path, "wb") as fobj:
                fobj.write(read(self.download_url))

        self.file = open(self.path, 'rb')
        self.pdf = CustomPDFReader(self.file)