def _process(self, page_no, outfile, text):
        print('Processing: {} {}'.format(self.reference, page_no))

        image = Image.objects.descendant_of(
            self.catalogue_entry).get(
                reference__endswith=page_no)

        if not self.textonly:
            temp_filename = self._temp_filename()

            # Write the accumulated pages out to a temporary PDF file
            with open(temp_filename, 'wb') as out:
                outfile.write(out)

            # Create a Document
            with open(temp_filename, 'rb') as f:
                document = Document()
                document.title = 'Transcript: {}'.format(image.reference)
                document.file.save('Transcript {}.pdf'.format(
                    image.reference), File(f), save=True)
                document.save()

                image.transcription = text
                image.transcription_pdf = document
                image.save()

            # Delete the temporary file!
            os.remove(temp_filename)
        else:
            image.transcription = text
            image.save()
Example #2
    def setUp(self):
        self.document = Document(title="Test document")
        self.document_without_file = Document(title="Document without file")
        self.document.file.save('example.txt',
                                ContentFile("A boring example document"))
        self.image = CFGOVImage.objects.create(title='test',
                                               file=get_test_image_file())
        self.rendition = self.image.get_rendition('original')

        CACHE_PURGED_URLS[:] = []
Example #3
    def handle(self, *args, **options):
        self.reference = options['reference']
        self.file_path = options['file_path']

        self.input_file = PdfFileReader(open(self.file_path, "rb"))
        self.catalogue_entry = CatalogueEntry.objects.get(title=self.reference)
        
        
        transcription_page = RichTextPage.objects.get(
            slug='transcriptions')

        trans_page = TranscriptionPage()
        trans_page.title = self.reference
        trans_page.page = self.catalogue_entry

        with open(self.file_path, 'rb') as f:
            document = Document()
            document.title = 'Transcript: {}'.format(self.reference)
            document.file.save('Transcript {}.pdf'.format(
                self.reference), File(f), save=True)
            document.save()

            trans_page.transcription_pdf = document
        transcription_page.add_child(instance=trans_page)
        trans_page.save()

        num_of_pages = self.input_file.getNumPages()

        res_page_no = None
        res_outfile = None
        res_text = None

        for p in range(num_of_pages):
            page = self.input_file.getPage(p)
            page_text = page.extractText()

            m = re.findall(r"\[f\.\s?([0-9]+)([r,v]+)\]", page_text)

            if m:
                if res_page_no is not None:
                    self._process(res_page_no, res_outfile, res_text)

                res_page_no = "{}{}".format(m[0][0].zfill(3), m[0][1])
                res_outfile = PdfFileWriter()
                res_outfile.addPage(page)
                res_text = page_text

            else:
                # Continuation page: append it to the current folio's PDF and text
                res_outfile.addPage(page)
                res_text = '{} {}'.format(res_text, page_text)

            # range() is zero-based, so the last page is num_of_pages - 1
            if p == num_of_pages - 1 and res_page_no is not None:
                self._process(res_page_no, res_outfile, res_text)
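
The handle() method above reads options['reference'] and options['file_path'], so the command presumably declares matching arguments elsewhere. A minimal sketch of what that add_arguments() could look like, with names inferred from handle() and help text assumed:

    def add_arguments(self, parser):
        # Hypothetical sketch: argument names taken from the options handle() reads above
        parser.add_argument('reference', help='Title of the CatalogueEntry to attach transcriptions to')
        parser.add_argument('file_path', help='Path to the transcription PDF to split into folio pages')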
Example #4
def parse_media_blocks(media_urls):
    media_blocks = []

    for url in media_urls.split(", "):
        domain = urlparse(url).netloc

        if domain in ["vimeo.com", "www.youtube.com"]:
            embed = get_embed(url)
            embed_tuple = ("embed", embed)
            media_blocks.append(embed_tuple)
        else:
            # The default should be to fetch a PDF or image file (i.e. from westernfriend.org)
            response = requests.get(url)
            content_type = response.headers["content-type"]
            file_name = url.split("/")[-1]
            file_bytes = BytesIO(response.content)

            if content_type == "application/pdf":
                # Create file
                document_file = File(file_bytes, name=file_name)

                document = Document(
                    title=file_name,
                    file=document_file,
                )

                document.save()

                document_link_block = ("document", document)

                media_blocks.append(document_link_block)
            elif content_type in ["image/jpeg", "image/png"]:
                # create image
                image_file = ImageFile(file_bytes, name=file_name)

                image = Image(
                    title=file_name,
                    file=image_file,
                )

                image.save()

                image_block = ("image", image)

                media_blocks.append(image_block)
            else:
                print(url)
                print(content_type)
                print("-----")

    return media_blocks
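
The (block_type, value) tuples that parse_media_blocks() returns are in the form Wagtail StreamField values accept, so a caller could assign the list straight to a page's stream field. A hypothetical usage sketch (ArticlePage, its body field and parent_page are assumed names, not from the original code):

article = ArticlePage(title="Imported article")
article.body = parse_media_blocks("https://vimeo.com/12345, https://example.org/report.pdf")
parent_page.add_child(instance=article)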
Example #5
def download_document(url, document_filename):
    """
    Takes the attached document URL from the old database API,
    retrieves the document, then saves it with a new
    filename and attaches it to the post
    """
    if url:
        document_location = os.path.join('home/management/api/documents',
                                         document_filename)
        urllib.urlretrieve(url, document_location)
        document = Document(
            title=document_filename,
            file=File(open(document_location, 'rb'), name=document_filename[:90]),
        )
        document.save()
        return document.id
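
download_document() returns the saved Document's id rather than the instance, so a caller would typically store that id on the page being imported. A hypothetical usage sketch (attachment_url and the post's document field are assumed for illustration):

doc_id = download_document(attachment_url, 'annual-report.pdf')
if doc_id:
    post.document_id = doc_id
    post.save()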
Example #6
def dummy_wagtail_doc(request):
    if not Collection.objects.exists():  # pragma: no cover
        Collection.add_root()

    doc = Document(title='hello')
    doc.file.save('foo.txt', ContentFile('foo', 'foo.txt'))
    doc.save()
    doc = Document.objects.get(pk=doc.pk)  # Reload to ensure the upload took

    def nuke():
        try:  # Try cleaning up so `/var/media` isn't full of foo
            doc.file.delete()
            doc.delete()
        except:  # pragma: no cover
            pass

    request.addfinalizer(nuke)
    return doc
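
dummy_wagtail_doc() takes the pytest request object and registers a finalizer, so it is presumably exposed as a pytest fixture. A hypothetical test consuming it, assuming the fixture name matches the function name:

def test_dummy_doc_has_uploaded_file(dummy_wagtail_doc):
    # The fixture reloads the Document after saving, so the file should be persisted
    assert dummy_wagtail_doc.file.read() == b'foo'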
Example #8
    def parse_results(self):
        media_files = self.results

        for r in media_files:
            sub_site = r.get('source')
            collection_name = SOURCES[sub_site]
            collection = Collection.objects.get(name=collection_name)
            source_url = r.get('source_url')
            media_type = r.get('media_type')
            media_name = source_url.split('/')[-1]
            response = requests.get(source_url)
            title = r.get('title')  # if the title is blank it causes an error
            if not title:
                title = 'No title was available'
            if response:

                if media_type == 'file':  # save to documents

                    media_file = File(BytesIO(response.content),
                                      name=media_name)
                    file = Document(title=title,
                                    file=media_file,
                                    collection=collection)
                    file.save()
                    file.created_at = r.get('date')
                    file.save()

                elif media_type == 'image':  # save to images

                    image_file = ImageFile(BytesIO(response.content),
                                           name=media_name)
                    image = Image(title=title,
                                  file=image_file,
                                  collection=collection)
                    image.save()
                    image.created_at = r.get('date')
                    image.save()

            else:
                sys.stdout.write(
                    '⚠️ Got no response. Error has been logged to importer/log/import_media_files.txt\n'
                )
                with open('importer/log/import_media_files.txt',
                          'a') as the_file:
                    the_file.write('{}\n'.format(r))

        if self.next:
            time.sleep(self.sleep_between_fetches)
            self.fetch_url(self.next)
            self.parse_results()
        return Document.objects.count() + Image.objects.count(), 0
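
parse_results() above maps each record's source sub-site to a Wagtail Collection through a SOURCES dict. A hypothetical sketch of that mapping, with keys and collection names assumed purely for illustration:

SOURCES = {
    'publications': 'Publications',
    'news': 'News media',
}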
Example #9
    def make_documents(self):

        if self.document['type_of_publication'] == 'heading':
            return self.create_heading(self.document['heading_text'])

        elif self.document['type_of_publication'] == 'document':
            document = self.document['document']
            if document:
                # let's fetch the file here; it saves cluttering the block builder
                response = requests.get(document['url'])
                if response:
                    media_file = File(
                        BytesIO(response.content),
                        name=document['filename']
                    )
                    file = Document(
                        title=document['title'],
                        file=media_file,
                        collection=self.collection
                    )
                    file.save()
                    file.created_at = make_aware(
                        dateutil.parser.parse(document['date']))
                    file.save()
                    return self.create_document_type(file, document, self.document)
                else:
                    with open('importer/log/make_documents_list_errors.txt', 'a') as the_file:
                        the_file.write('{}: {}\n'.format(
                            self.publication, self.publication.id))

        elif self.document['type_of_publication'] == 'documentlink':
            # pass
            # document = self.data['document']
            return self.create_link_type(self.document)

        elif self.document['type_of_publication'] == 'audiovideo':
            # pass
            # document = self.data['document']
            return self.create_embed_type(self.document)

        elif self.document['type_of_publication'] == 'freetext':
            return self.create_free_text(self.document)

        # return self.stream_value

        """
Example #10
    def prepare_links(self, link, page_path, page):

        path_list = page_path.split('/')  # a list of path segments
        # the first segment is always '' so let's remove it
        del path_list[0]
        if not path_list[-1]:  # and remove the last one if it's empty
            del path_list[-1]

        page_path = '/' + '/'.join(path_list) + '/'  # a string of the path

        # some links are anchors
        # is_anchor_link = False
        # if '#' in path_list[-1]:
        #     is_anchor_link = True

        page_path_live = 'https://www.england.nhs.uk' + page_path
        # print(page_path)

        home_page = Page.objects.filter(title='Home')[0]

        if not path_list:
            # home page
            page_link = self.make_page_link(link.text, home_page.id,
                                            home_page.title)
            self.change_links.append([link, page_link])

        elif (path_list and path_list[0] == 'publication') or (
                len(path_list) >= 2 and path_list[1] == 'publication'):
            # find the source url for the publication; ours are all in sub-sites but the links are not
            try:
                publication = Publication.objects.get(wp_link=page_path_live)
                page_link = self.make_page_link(link.text, publication.id,
                                                publication.title)
                self.change_links.append([link, page_link])
            except:
                with open('importer/log/parse_stream_fields_url_errors.txt',
                          'a') as log:
                    log.write('{} | {} | {}\n'.format(link, page_path, page))

        elif path_list and path_list[0] == 'news':
            # find the source url for the news post; ours are all in sub-sites
            try:
                post = Post.objects.get(wp_link=page_path_live)
                page_link = self.make_page_link(link.text, post.id, post.title)
                self.change_links.append([link, page_link])
            except:
                with open('importer/log/parse_stream_fields_url_errors.txt',
                          'a') as log:
                    log.write('{} | {} | {}\n'.format(link, page_path, page))

        elif path_list and path_list[0] == 'blog':
            # find source url for blogs
            # print(page_path_live)
            try:
                blog = Blog.objects.get(wp_link=page_path_live)
                page_link = self.make_page_link(link.text, blog.id, blog.title)
                self.change_links.append([link, page_link])
            except:
                with open('importer/log/parse_stream_fields_url_errors.txt',
                          'a') as log:
                    log.write('{} | {} | {}\n'.format(link, page_path, page))

        elif (path_list and path_list[0] == 'wp-content') or (
                len(path_list) >= 2 and path_list[1] == 'wp-content'):
            # because sometimes these are subsite links
            # a file link; these aren't in self.urls
            """ the problem here is we can't link to a page within a document using #page=2 """
            if '#' in path_list[-1]:
                page_path = page_path.split('#')[0]
            document_id = None
            file = 'documents/' + path_list[-1]
            # print(file)

            try:
                document = Document.objects.get(file=file)
                document_id = document.id

            except Document.DoesNotExist:
                with open('importer/log/media_document_not_found.txt',
                          'a') as the_file:
                    the_file.write('{} | Linked from: {}\n'.format(
                        page_path, page))
                collection_root = Collection.get_first_root_node()
                remote_file = requests.get(page_path_live)
                media_file = File(BytesIO(remote_file.content),
                                  name=path_list[-1])
                file = Document(title=path_list[-1],
                                file=media_file,
                                collection=collection_root)
                file.save()
                pass

            if document_id:
                document_link = self.make_document_link(
                    link.text, document_id, path_list[-1])
                self.change_links.append([link, document_link])

        elif page_path in self.url_map_keys and page_path not in SKIP_ANCHOR_URLS:
            page_link = self.make_page_link(link.text,
                                            self.url_map[page_path]['id'],
                                            self.url_map[page_path]['title'])
            self.change_links.append([link, page_link])

        else:
            # print('using live')
            response = requests.get('https://www.england.nhs.uk' + page_path)
            url = ''
            is_post = False
            if response:
                url = response.url.split('/')
                del url[-1]
                del url[:3]
                # some urls link to news items starting with 2010/09, which needs to be removed to find the url
                if url[0].isdigit() and url[1].isdigit() and not url[2].isdigit():  # is post
                    try:
                        page = Post.objects.get(wp_link=response.url)
                        id = page.id
                        title = page.title
                        page_link = self.make_page_link(link.text, id, title)
                        self.change_links.append([link, page_link])
                    except Post.DoesNotExist:
                        pass
                elif path_list[0].isdigit() and path_list[1].isdigit() and path_list[2].isdigit():
                    try:
                        blog = Blog.objects.get(wp_link=response.url)
                        id = blog.id
                        title = blog.title
                        page_link = self.make_page_link(link.text, id, title)
                        self.change_links.append([link, page_link])
                    except Blog.DoesNotExist:
                        pass
                else:  # is page
                    # print('could be a page')
                    actual_url = '/' + '/'.join(
                        url) + '/'  # a string of the path
                    if actual_url in self.url_map_keys:
                        id = self.url_map[actual_url]['id']
                        title = self.url_map[actual_url]['title']
                        page_link = self.make_page_link(link.text, id, title)
                        self.change_links.append([link, page_link])
                    else:
                        print('leaving the link alone')

            else:
                # print('not found')
                with open('importer/log/parse_stream_fields_url_errors.txt',
                          'a') as log:
                    log.write('{} | {}\n'.format(page_path, page))
Example #11
    def handle(self, *args, **options):
        def wrap_p(text):
            return "<p>{}</p>".format(text)

        MISSING_VALUES = ["", " ", ".", "NA", "N/A", "N.A."]

        # Box Auth
        config = JWTAuth.from_settings_file(
            os.path.join(settings.BASE_DIR, 'box_config.json'))
        client = Client(config)

        def recurse_items(folder_items, box_items):
            for item in folder_items:
                if type(item) is Folder:
                    sub_folder_items = client.folder(
                        folder_id=item.id).get_items()
                    box_items = recurse_items(sub_folder_items, box_items)
                else:
                    box_items[item.name.lower()] = item.id
            return box_items

        box_items = {}
        folder_items = client.folder(folder_id="93089112686").get_items()
        box_items = recurse_items(folder_items, box_items)

        # Create IA if not already present
        dataset_listing = DataSetListing.objects.live().first()
        if not dataset_listing:
            data_section = DataSectionPage.objects.live().first()
            if not data_section:
                home_page = HomePage.objects.live().first()
                data_section = DataSectionPage(title="Data")
                home_page.add_child(instance=data_section)
                data_section.save_revision().publish()
            dataset_listing = DataSetListing(title="Datasets")
            data_section.add_child(instance=dataset_listing)
            dataset_listing.save_revision().publish()

        # Fetch data and parse
        source_csv_url = "https://docs.google.com/spreadsheets/d/1pDbdncnm1TF41kJJX2WjZ2Wq9juOvUqU/export?format=csv&id=1pDbdncnm1TF41kJJX2WjZ2Wq9juOvUqU&gid=2086173829"
        dataset_csv_url = "https://docs.google.com/spreadsheets/d/1pDbdncnm1TF41kJJX2WjZ2Wq9juOvUqU/export?format=csv&id=1pDbdncnm1TF41kJJX2WjZ2Wq9juOvUqU&gid=1736754230"

        source_response = requests.get(source_csv_url)
        source_response.encoding = 'utf-8'
        source_text = source_response.iter_lines(decode_unicode=True)
        dataset_response = requests.get(dataset_csv_url)
        dataset_response.encoding = 'utf-8'
        dataset_text = dataset_response.iter_lines(decode_unicode=True)

        # Data sources
        """
        (Pdb) source_dict.keys()
        dict_keys(['Source ID', 'Source title', 'Organisation ', 'Long description of the data source', 'Date of access', 'Link to the source', 'Geography information', 'Keyword search', 'Internal notes', 'Analyst that worked on the data', 'Licence', 'Check', 'Signed-off and ready?'])
        """
        skip = True
        source_reader = csv.DictReader(source_text)
        for source_dict in source_reader:
            if skip:
                skip = False
            else:
                source_check = DataSource.objects.filter(
                    source_id=source_dict['Source ID'])
                if not source_check and source_dict[
                        'Source title'] not in MISSING_VALUES and source_dict[
                            'Signed-off and ready?'].lower() == "yes":
                    print("source: ", source_dict['Source title'])
                    if type(source_dict['Date of access']) is not datetime:
                        try:
                            date_of_access = datetime.strptime(
                                source_dict['Date of access'], "%d/%m/%Y")
                        except (ValueError, TypeError) as e:
                            date_of_access = None
                    else:
                        date_of_access = source_dict['Date of access']

                    # try:
                    #     tag_list = [tag.strip() for tag in source_dict['Keyword search'].split(",") if len(tag.strip()) < 100 and len(tag.strip()) > 0]
                    # except AttributeError:
                    #     tag_list = []
                    new_source = DataSource(
                        source_id=source_dict['Source ID'],
                        title=source_dict['Source title'],
                        organisation=source_dict['Organisation '],
                        description=source_dict[
                            'Long description of the data source'],
                        date_of_access=date_of_access,
                        link_to_data=source_dict['Link to the source'],
                        geography=source_dict['Geography information'],
                        internal_notes=source_dict['Internal notes'],
                        licence=source_dict['Licence'])
                    # new_source.topics.add(*tag_list)

                    # Authors
                    author_names = source_dict[
                        'Analyst that worked on the data']
                    authors = []
                    if author_names not in MISSING_VALUES:
                        author_names_list = [
                            author.strip()
                            for author in author_names.split(",")
                        ]
                        for author_name in author_names_list:
                            internal_author_page_qs = TeamMemberPage.objects.filter(
                                name=author_name)
                            if internal_author_page_qs:
                                author_obj = {
                                    "type": "internal_author",
                                    "value": internal_author_page_qs.first().pk
                                }
                            else:
                                author_obj = {
                                    "type": "external_author",
                                    "value": {
                                        "name": author_name,
                                        "title": "",
                                        "photograph": None,
                                        "page": ""
                                    }
                                }
                            authors.append(author_obj)
                    if authors:
                        new_source.authors = json.dumps(authors)
                    new_source.save()

        # Datasets
        """
        (Pdb) dataset_dict.keys()
        dict_keys(['Dataset ID', 'What is the title of the data set?', 'What DI publication is this dataset associated with?', 'What is a long description of the data set?', 'Release date?', 'Geography information', 'Geographic coding', 'Unit', 'Keyword search', 'Internal notes', 'Analyst that worked on the data', 'Licence', 'Suggested citation', 'Source 1', 'Source 2 (optional)', 'Source 3 (optional)', 'Source 4 (optional)', 'Source 5 (optional)', 'Source 6 (optional)', 'Source 7 (optional)', 'Source 8 (optional)', 'Source 9 (optional)', 'Done', 'File location Excel', 'File name Excel', 'File location csv', 'File name csv', 'File notes', 'Signed-off and ready?'])
        """
        source_keys = [
            'Source 1', 'Source 2 (optional)', 'Source 3 (optional)',
            'Source 4 (optional)', 'Source 5 (optional)',
            'Source 6 (optional)', 'Source 7 (optional)',
            'Source 8 (optional)', 'Source 9 (optional)'
        ]
        skip = True
        dataset_reader = csv.DictReader(dataset_text)
        for dataset_dict in dataset_reader:
            if skip:
                skip = False
            else:
                dataset_check = DatasetPage.objects.filter(
                    dataset_id=dataset_dict['Dataset ID'])
                if not dataset_check and dataset_dict[
                        'What is the title of the data set?'] not in MISSING_VALUES and dataset_dict[
                            'Signed-off and ready?'].lower() == "yes":
                    print("Dataset: ",
                          dataset_dict['What is the title of the data set?'])
                    if type(dataset_dict['Release date?']) is not datetime:
                        try:
                            release_date = datetime.strptime(
                                dataset_dict['Release date?'], "%d/%m/%Y")
                        except (ValueError, TypeError) as e:
                            release_date = datetime.now()
                    else:
                        release_date = dataset_dict['Release date?']

                    meta_json = []
                    if dataset_dict[
                            'What is a long description of the data set?'] not in MISSING_VALUES:
                        meta_json.append({
                            "type":
                            "description",
                            "value":
                            wrap_p(dataset_dict[
                                'What is a long description of the data set?'])
                        })
                    if dataset_dict[
                            'Geography information'] not in MISSING_VALUES:
                        meta_json.append({
                            "type":
                            "geography",
                            "value":
                            wrap_p(dataset_dict['Geography information'])
                        })
                    if dataset_dict['Geographic coding'] not in MISSING_VALUES:
                        meta_json.append({
                            "type":
                            "geographic_coding",
                            "value":
                            wrap_p(dataset_dict['Geographic coding'])
                        })
                    if dataset_dict['Unit'] not in MISSING_VALUES:
                        meta_json.append({
                            "type": "unit",
                            "value": wrap_p(dataset_dict['Unit'])
                        })
                    if dataset_dict['Internal notes'] not in MISSING_VALUES:
                        meta_json.append({
                            "type":
                            "internal_notes",
                            "value":
                            wrap_p(dataset_dict['Internal notes'])
                        })
                    if dataset_dict['Licence'] not in MISSING_VALUES:
                        meta_json.append({
                            "type":
                            "licence",
                            "value":
                            wrap_p(dataset_dict['Licence'])
                        })
                    if dataset_dict[
                            'Suggested citation'] not in MISSING_VALUES:
                        meta_json.append({
                            "type":
                            "citation",
                            "value":
                            wrap_p(dataset_dict['Suggested citation'])
                        })

                    new_dataset = DatasetPage(
                        title=dataset_dict[
                            'What is the title of the data set?'],
                        dataset_id=dataset_dict['Dataset ID'],
                        dataset_title=dataset_dict[
                            'What is the title of the data set?'],
                        release_date=release_date,
                        meta_data=json.dumps(meta_json))

                    # try:
                    #     tag_list = [tag.strip() for tag in dataset_dict['Keyword search'].split(",") if len(tag.strip()) < 100 and len(tag.strip()) > 0]
                    # except AttributeError:
                    #     tag_list = []
                    # new_dataset.topics.add(*tag_list)

                    dataset_listing.add_child(instance=new_dataset)

                    # Authors
                    author_names = dataset_dict[
                        'Analyst that worked on the data']
                    authors = []
                    if author_names not in MISSING_VALUES:
                        author_names_list = [
                            author.strip()
                            for author in author_names.split(",")
                        ]
                        for author_name in author_names_list:
                            internal_author_page_qs = TeamMemberPage.objects.filter(
                                name=author_name)
                            if internal_author_page_qs:
                                author_obj = {
                                    "type": "internal_author",
                                    "value": internal_author_page_qs.first().pk
                                }
                            else:
                                author_obj = {
                                    "type": "external_author",
                                    "value": {
                                        "name": author_name,
                                        "title": "",
                                        "photograph": None,
                                        "page": ""
                                    }
                                }
                            authors.append(author_obj)
                    if authors:
                        new_dataset.authors = json.dumps(authors)

                    new_dataset.save_revision().publish()

                    if dataset_dict[
                            'What DI publication is this dataset associated with?'] not in MISSING_VALUES:
                        pub_titles = [
                            pub_title.strip() for pub_title in dataset_dict[
                                'What DI publication is this dataset associated with?']
                            .split("|")
                        ]
                        for pub_title in pub_titles:
                            pub_check = Page.objects.filter(
                                title=pub_title).live()
                            if pub_check:
                                pub_page = pub_check.first().specific
                                if isinstance(pub_page, PublicationPage):
                                    PublicationPageDataset(
                                        item=pub_page,
                                        dataset=new_dataset).save()
                                elif isinstance(pub_page,
                                                PublicationSummaryPage):
                                    PublicationSummaryPageDataset(
                                        item=pub_page,
                                        dataset=new_dataset).save()
                                elif isinstance(pub_page,
                                                PublicationChapterPage):
                                    PublicationChapterPageDataset(
                                        item=pub_page,
                                        dataset=new_dataset).save()
                                elif isinstance(pub_page,
                                                PublicationAppendixPage):
                                    PublicationAppendixPageDataset(
                                        item=pub_page,
                                        dataset=new_dataset).save()
                                elif isinstance(pub_page,
                                                LegacyPublicationPage):
                                    LegacyPublicationPageDataset(
                                        item=pub_page,
                                        dataset=new_dataset).save()
                                elif isinstance(pub_page,
                                                ShortPublicationPage):
                                    ShortPublicationPageDataset(
                                        item=pub_page,
                                        dataset=new_dataset).save()

                    for source_key in source_keys:
                        key_val = dataset_dict[source_key]
                        if key_val not in MISSING_VALUES:
                            try:
                                related_datasource = DataSource.objects.get(
                                    title=key_val)
                                DataSetSource(
                                    page=new_dataset,
                                    source=related_datasource).save()
                            except DataSource.DoesNotExist:
                                pass

                    if dataset_dict["File name Excel"] not in MISSING_VALUES:
                        item_name = dataset_dict["File name Excel"].lower(
                        ) + ".xlsx"
                        try:
                            item_id = box_items[item_name]
                            f = BytesIO()
                            client.file(item_id).download_to(f)
                            doc = Document(
                                title=dataset_dict["File name Excel"])
                            doc.file.save(item_name, File(f), save=True)
                            doc.save()
                            download = DatasetDownloads(
                                page=new_dataset,
                                title=dataset_dict["File name Excel"],
                                file=doc)
                            download.save()
                        except KeyError:
                            self.stdout.write(
                                self.style.WARNING(item_name + " not found."))

                    if dataset_dict["File name csv"] not in MISSING_VALUES:
                        item_name = dataset_dict["File name csv"].lower(
                        ) + ".csv"
                        try:
                            item_id = box_items[item_name]
                            f = BytesIO()
                            client.file(item_id).download_to(f)
                            doc = Document(title=dataset_dict["File name csv"])
                            doc.file.save(item_name, File(f), save=True)
                            doc.save()
                            download = DatasetDownloads(
                                page=new_dataset,
                                title=dataset_dict["File name csv"],
                                file=doc)
                            download.save()
                        except KeyError:
                            self.stdout.write(
                                self.style.WARNING(item_name + " not found."))

        self.stdout.write(self.style.SUCCESS('Called successfully'))