def _process(self, page_no, outfile, text):
    print('Processing: {} {}'.format(self.reference, page_no))
    image = Image.objects.descendant_of(
        self.catalogue_entry).get(
            reference__endswith=page_no)
    if not self.textonly:
        temp_filename = self._temp_filename()
        with open(temp_filename, 'wb') as out:
            outfile.write(out)
        # Create a Document
        with open(temp_filename, 'rb') as f:
            document = Document()
            document.title = 'Transcript: {}'.format(image.reference)
            document.file.save('Transcript {}.pdf'.format(
                image.reference), File(f), save=True)
            document.save()
            image.transcription = text
            image.transcription_pdf = document
            image.save()
        # Delete the temporary file!
        os.remove(temp_filename)
    else:
        image.transcription = text
        image.save()
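# _process assumes a _temp_filename() helper on the command class; a
# minimal sketch using the standard library (the real helper may differ):
import os
import tempfile

def _temp_filename(self):
    # Reserve a scratch file and return its path; _process reopens it
    # for writing and later removes it.
    fd, path = tempfile.mkstemp(suffix='.pdf')
    os.close(fd)
    return path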
def setUp(self):
    self.document = Document(title="Test document")
    self.document_without_file = Document(title="Document without file")
    self.document.file.save('example.txt',
                            ContentFile("A boring example document"))
    self.image = CFGOVImage.objects.create(title='test',
                                           file=get_test_image_file())
    self.rendition = self.image.get_rendition('original')
    CACHE_PURGED_URLS[:] = []
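# An illustrative test built on this setUp; it assumes the project's
# cache-invalidation signal appends purged URLs to CACHE_PURGED_URLS,
# which the setUp above resets between tests.
def test_deleting_document_purges_cached_url(self):
    url = self.document.file.url
    self.document.delete()
    self.assertIn(url, CACHE_PURGED_URLS)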
def handle(self, *args, **options):
    self.reference = options['reference']
    self.file_path = options['file_path']
    self.input_file = PdfFileReader(open(self.file_path, "rb"))
    self.catalogue_entry = CatalogueEntry.objects.get(title=self.reference)
    transcription_page = RichTextPage.objects.get(slug='transcriptions')
    trans_page = TranscriptionPage()
    trans_page.title = self.reference
    trans_page.page = self.catalogue_entry
    with open(self.file_path, 'rb') as f:
        document = Document()
        document.title = 'Transcript: {}'.format(self.reference)
        document.file.save('Transcript {}.pdf'.format(
            self.reference), File(f), save=True)
        document.save()
    trans_page.transcription_pdf = document
    transcription_page.add_child(instance=trans_page)
    trans_page.save()

    num_of_pages = self.input_file.getNumPages()
    res_page_no = None
    res_outfile = None
    res_text = None
    for p in range(num_of_pages):
        page = self.input_file.getPage(p)
        page_text = page.extractText()
        # Folio markers look like "[f. 123r]" or "[f.123v]"
        m = re.findall(r"\[f\.\s?([0-9]+)([rv]+)\]", page_text)
        if m:
            # A new folio starts: flush the pages collected so far
            if res_page_no is not None:
                self._process(res_page_no, res_outfile, res_text)
            res_page_no = "{}{}".format(m[0][0].zfill(3), m[0][1])
            res_outfile = PdfFileWriter()
            res_outfile.addPage(page)
            res_text = page_text
        else:
            # Continuation of the current folio: append the page
            res_outfile.addPage(page)
            res_text = '{} {}'.format(res_text, page_text)
        # Flush the final folio after the last page
        if p == num_of_pages - 1 and res_page_no is not None:
            self._process(res_page_no, res_outfile, res_text)
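# handle() reads options['reference'] and options['file_path']; a minimal
# add_arguments sketch wiring those up (the command name in the usage
# line is hypothetical):
def add_arguments(self, parser):
    parser.add_argument('reference', type=str)
    parser.add_argument('file_path', type=str)

# Usage: python manage.py import_transcription <reference> <file_path>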
def parse_media_blocks(media_urls):
    media_blocks = []

    for url in media_urls.split(", "):
        domain = urlparse(url).netloc

        if domain in ["vimeo.com", "www.youtube.com"]:
            embed = get_embed(url)
            embed_tuple = ("embed", embed)
            media_blocks.append(embed_tuple)
        else:
            # The default should be to fetch a PDF or image file
            # (i.e. from westernfriend.org)
            response = requests.get(url)
            content_type = response.headers["content-type"]
            file_name = url.split("/")[-1]
            file_bytes = BytesIO(response.content)

            if content_type == "application/pdf":
                # Create file
                document_file = File(file_bytes, name=file_name)
                document = Document(
                    title=file_name,
                    file=document_file,
                )
                document.save()
                document_link_block = ("document", document)
                media_blocks.append(document_link_block)
            elif content_type in ["image/jpeg", "image/png"]:
                # Create image
                image_file = ImageFile(file_bytes, name=file_name)
                image = Image(
                    title=file_name,
                    file=image_file,
                )
                image.save()
                image_block = ("image", image)
                media_blocks.append(image_block)
            else:
                # Unhandled content type: log it and move on
                print(url)
                print(content_type)
                print("-----")

    return media_blocks
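# Usage sketch for parse_media_blocks; the URLs are illustrative. The
# function expects a single comma-plus-space separated string and returns
# (block_type, value) tuples suitable for a StreamField value.
blocks = parse_media_blocks(
    "https://vimeo.com/123456, https://example.org/files/notes.pdf")
# -> [("embed", <Embed>), ("document", <Document: notes.pdf>)]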
def download_document(url, document_filename):
    """
    Takes the attached document URL from the old database API, retrieves
    the document, then saves it with a new filename and attaches it to
    the post.
    """
    if url:
        document_location = os.path.join('home/management/api/documents',
                                         document_filename)
        urllib.request.urlretrieve(url, document_location)
        with open(document_location, 'rb') as f:
            document = Document(
                title=document_filename,
                file=File(f, name=document_filename[:90]),
            )
            document.save()
        return document.id
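# Hypothetical call: fetch an attachment listed by the old API and keep
# the returned Document pk for linking to the imported post.
document_id = download_document(
    'https://example.org/old-api/files/annual-report.pdf',
    'annual-report.pdf')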
def dummy_wagtail_doc(request):
    if not Collection.objects.exists():  # pragma: no cover
        Collection.add_root()
    doc = Document(title='hello')
    doc.file.save('foo.txt', ContentFile('foo', 'foo.txt'))
    doc.save()
    doc = Document.objects.get(pk=doc.pk)  # Reload to ensure the upload took

    def nuke():
        try:
            # Try cleaning up so `/var/media` isn't full of foo
            doc.file.delete()
            doc.delete()
        except Exception:  # pragma: no cover
            pass

    request.addfinalizer(nuke)
    return doc
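# dummy_wagtail_doc takes the pytest `request` object, so it is presumably
# registered as a fixture; a sketch of that registration and a test that
# consumes it (fixture name and test body are illustrative):
import pytest

@pytest.fixture
def wagtail_doc(request):
    return dummy_wagtail_doc(request)

def test_document_file_contents(wagtail_doc):
    assert wagtail_doc.file.read() == b'foo'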
def parse_results(self):
    media_files = self.results
    for r in media_files:
        sub_site = r.get('source')
        collection_name = SOURCES[sub_site]
        collection = Collection.objects.get(name=collection_name)
        source_url = r.get('source_url')
        media_type = r.get('media_type')
        media_name = source_url.split('/')[-1]
        response = requests.get(source_url)
        title = r.get('title')
        # If the title is blank it causes an error
        if not title:
            title = 'No title was available'
        if response:
            if media_type == 'file':
                # Save to documents
                media_file = File(BytesIO(response.content), name=media_name)
                file = Document(title=title, file=media_file,
                                collection=collection)
                file.save()
                file.created_at = r.get('date')
                file.save()
            elif media_type == 'image':
                # Save to images
                image_file = ImageFile(BytesIO(response.content),
                                       name=media_name)
                image = Image(title=title, file=image_file,
                              collection=collection)
                image.save()
                image.created_at = r.get('date')
                image.save()
        else:
            sys.stdout.write(
                '⚠️ Got no response. Error has been logged to '
                'importer/log/import_media_files.txt\n'
            )
            with open('importer/log/import_media_files.txt', 'a') as the_file:
                the_file.write('{}\n'.format(r))

    if self.next:
        time.sleep(self.sleep_between_fetches)
        self.fetch_url(self.next)
        self.parse_results()

    return Document.objects.count() + Image.objects.count(), 0
def make_documents(self):
    if self.document['type_of_publication'] == 'heading':
        return self.create_heading(self.document['heading_text'])

    elif self.document['type_of_publication'] == 'document':
        document = self.document['document']
        if document:
            # Let's get the file here; saves cluttering the block builder
            response = requests.get(document['url'])
            if response:
                media_file = File(
                    BytesIO(response.content),
                    name=document['filename']
                )
                file = Document(
                    title=document['title'],
                    file=media_file,
                    collection=self.collection
                )
                file.save()
                file.created_at = make_aware(
                    dateutil.parser.parse(document['date']))
                file.save()
                return self.create_document_type(file, document,
                                                 self.document)
        else:
            with open('importer/log/make_documents_list_errors.txt',
                      'a') as the_file:
                the_file.write('{}: {}\n'.format(
                    self.publication, self.publication.id))

    elif self.document['type_of_publication'] == 'documentlink':
        return self.create_link_type(self.document)

    elif self.document['type_of_publication'] == 'audiovideo':
        return self.create_embed_type(self.document)

    elif self.document['type_of_publication'] == 'freetext':
        return self.create_free_text(self.document)
def prepare_links(self, link, page_path, page):
    path_list = page_path.split('/')  # a list of path segments
    # The first segment is always '' so remove it
    del path_list[0]
    if not path_list[-1]:
        # And remove the last one if it is empty
        del path_list[-1]
    page_path = '/' + '/'.join(path_list) + '/'  # a string of the path

    page_path_live = 'https://www.england.nhs.uk' + page_path
    home_page = Page.objects.filter(title='Home')[0]

    if not path_list:
        # Home page
        page_link = self.make_page_link(link.text, home_page.id,
                                        home_page.title)
        self.change_links.append([link, page_link])

    elif (path_list and path_list[0] == 'publication') or (
            len(path_list) >= 2 and path_list[1] == 'publication'):
        # Find the source URL for publications; ours are all in sub
        # sites but the links are not
        try:
            publication = Publication.objects.get(wp_link=page_path_live)
            page_link = self.make_page_link(link.text, publication.id,
                                            publication.title)
            self.change_links.append([link, page_link])
        except Exception:
            with open('importer/log/parse_stream_fields_url_errors.txt',
                      'a') as log:
                log.write('{} | {} | {}\n'.format(link, page_path, page))

    elif path_list and path_list[0] == 'news':
        # Find the source URL for news; ours are all in sub sites
        try:
            post = Post.objects.get(wp_link=page_path_live)
            page_link = self.make_page_link(link.text, post.id, post.title)
            self.change_links.append([link, page_link])
        except Exception:
            with open('importer/log/parse_stream_fields_url_errors.txt',
                      'a') as log:
                log.write('{} | {} | {}\n'.format(link, page_path, page))

    elif path_list and path_list[0] == 'blog':
        # Find the source URL for blogs
        try:
            blog = Blog.objects.get(wp_link=page_path_live)
            page_link = self.make_page_link(link.text, blog.id, blog.title)
            self.change_links.append([link, page_link])
        except Exception:
            with open('importer/log/parse_stream_fields_url_errors.txt',
                      'a') as log:
                log.write('{} | {} | {}\n'.format(link, page_path, page))

    elif (path_list and path_list[0] == 'wp-content') or (
            len(path_list) >= 2 and path_list[1] == 'wp-content'):
        # Because sometimes they are sub-site links. These are file
        # links and aren't in self.urls.
        # Problem: we can't link to a page within a document using
        # #page=2, so strip any anchor.
        if '#' in path_list[-1]:
            page_path = page_path.split('#')[0]
        document_id = None
        file = 'documents/' + path_list[-1]
        try:
            document = Document.objects.get(file=file)
            document_id = document.id
        except Document.DoesNotExist:
            with open('importer/log/media_document_not_found.txt',
                      'a') as the_file:
                the_file.write('{} | Linked from: {}\n'.format(
                    page_path, page))
            # Fetch the missing file and save it as a document
            collection_root = Collection.get_first_root_node()
            remote_file = requests.get(page_path_live)
            media_file = File(BytesIO(remote_file.content),
                              name=path_list[-1])
            file = Document(title=path_list[-1], file=media_file,
                            collection=collection_root)
            file.save()
        if document_id:
            document_link = self.make_document_link(
                link.text, document_id, path_list[-1])
            self.change_links.append([link, document_link])

    elif page_path in self.url_map_keys and page_path not in SKIP_ANCHOR_URLS:
        page_link = self.make_page_link(link.text,
                                        self.url_map[page_path]['id'],
                                        self.url_map[page_path]['title'])
        self.change_links.append([link, page_link])

    else:
        # Fall back to resolving the link against the live site
        response = requests.get('https://www.england.nhs.uk' + page_path)
        if response:
            url = response.url.split('/')
            del url[-1]
            del url[:3]
            # Some URLs link to news items whose paths start with e.g.
            # 2010/09; those segments need removing to find the URL
            if url[0].isdigit() and url[1].isdigit() \
                    and not url[2].isdigit():
                # Is a post
                try:
                    post = Post.objects.get(wp_link=response.url)
                    page_link = self.make_page_link(link.text, post.id,
                                                    post.title)
                    self.change_links.append([link, page_link])
                except Post.DoesNotExist:
                    pass
            elif path_list[0].isdigit() and path_list[1].isdigit() \
                    and path_list[2].isdigit():
                # Is a blog
                try:
                    blog = Blog.objects.get(wp_link=response.url)
                    page_link = self.make_page_link(link.text, blog.id,
                                                    blog.title)
                    self.change_links.append([link, page_link])
                except Blog.DoesNotExist:
                    pass
            else:
                # Is a page
                actual_url = '/' + '/'.join(url) + '/'  # a string of the path
                if actual_url in self.url_map_keys:
                    page_link = self.make_page_link(
                        link.text,
                        self.url_map[actual_url]['id'],
                        self.url_map[actual_url]['title'])
                    self.change_links.append([link, page_link])
                else:
                    print('leaving the link alone')
        else:
            # Not found on the live site either
            with open('importer/log/parse_stream_fields_url_errors.txt',
                      'a') as log:
                log.write('{} | {}\n'.format(page_path, page))
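# prepare_links assumes make_page_link/make_document_link helpers; a
# sketch based on Wagtail's rich text conventions, where internal links
# are stored as <a linktype="..." id="..."> elements (the real helpers
# may differ):
def make_page_link(self, text, page_id, title):
    return '<a linktype="page" id="{}">{}</a>'.format(page_id, text or title)

def make_document_link(self, text, document_id, title):
    return '<a linktype="document" id="{}">{}</a>'.format(
        document_id, text or title)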
def handle(self, *args, **options):

    def wrap_p(text):
        return "<p>{}</p>".format(text)

    MISSING_VALUES = ["", " ", ".", "NA", "N/A", "N.A."]

    # Box auth
    config = JWTAuth.from_settings_file(
        os.path.join(settings.BASE_DIR, 'box_config.json'))
    client = Client(config)

    def recurse_items(folder_items, box_items):
        # Walk the Box folder tree, mapping lower-cased file names to ids
        for item in folder_items:
            if type(item) is Folder:
                sub_folder_items = client.folder(
                    folder_id=item.id).get_items()
                box_items = recurse_items(sub_folder_items, box_items)
            else:
                box_items[item.name.lower()] = item.id
        return box_items

    box_items = {}
    folder_items = client.folder(folder_id="93089112686").get_items()
    box_items = recurse_items(folder_items, box_items)

    # Create the IA if not already present
    dataset_listing = DataSetListing.objects.live().first()
    if not dataset_listing:
        data_section = DataSectionPage.objects.live().first()
        if not data_section:
            home_page = HomePage.objects.live().first()
            data_section = DataSectionPage(title="Data")
            home_page.add_child(instance=data_section)
            data_section.save_revision().publish()
        dataset_listing = DataSetListing(title="Datasets")
        data_section.add_child(instance=dataset_listing)
        dataset_listing.save_revision().publish()

    # Fetch data and parse
    source_csv_url = "https://docs.google.com/spreadsheets/d/1pDbdncnm1TF41kJJX2WjZ2Wq9juOvUqU/export?format=csv&id=1pDbdncnm1TF41kJJX2WjZ2Wq9juOvUqU&gid=2086173829"
    dataset_csv_url = "https://docs.google.com/spreadsheets/d/1pDbdncnm1TF41kJJX2WjZ2Wq9juOvUqU/export?format=csv&id=1pDbdncnm1TF41kJJX2WjZ2Wq9juOvUqU&gid=1736754230"
    source_response = requests.get(source_csv_url)
    source_response.encoding = 'utf-8'
    source_text = source_response.iter_lines(decode_unicode=True)
    dataset_response = requests.get(dataset_csv_url)
    dataset_response.encoding = 'utf-8'
    dataset_text = dataset_response.iter_lines(decode_unicode=True)

    # Data sources
    """
    (Pdb) source_dict.keys()
    dict_keys(['Source ID', 'Source title', 'Organisation ',
               'Long description of the data source', 'Date of access',
               'Link to the source', 'Geography information',
               'Keyword search', 'Internal notes',
               'Analyst that worked on the data', 'Licence', 'Check',
               'Signed-off and ready?'])
    """
    skip = True  # the first row after the header is a sub-header
    source_reader = csv.DictReader(source_text)
    for source_dict in source_reader:
        if skip:
            skip = False
            continue
        source_check = DataSource.objects.filter(
            source_id=source_dict['Source ID'])
        if not source_check and source_dict[
                'Source title'] not in MISSING_VALUES and source_dict[
                'Signed-off and ready?'].lower() == "yes":
            print("source: ", source_dict['Source title'])
            if type(source_dict['Date of access']) is not datetime:
                try:
                    date_of_access = datetime.strptime(
                        source_dict['Date of access'], "%d/%m/%Y")
                except (ValueError, TypeError):
                    date_of_access = None
            else:
                date_of_access = source_dict['Date of access']
            # try:
            #     tag_list = [tag.strip() for tag in
            #                 source_dict['Keyword search'].split(",")
            #                 if 0 < len(tag.strip()) < 100]
            # except AttributeError:
            #     tag_list = []
            new_source = DataSource(
                source_id=source_dict['Source ID'],
                title=source_dict['Source title'],
                organisation=source_dict['Organisation '],
                description=source_dict[
                    'Long description of the data source'],
                date_of_access=date_of_access,
                link_to_data=source_dict['Link to the source'],
                geography=source_dict['Geography information'],
                internal_notes=source_dict['Internal notes'],
                licence=source_dict['Licence'])
            # new_source.topics.add(*tag_list)

            # Authors
            author_names = source_dict['Analyst that worked on the data']
            authors = []
            if author_names not in MISSING_VALUES:
                author_names_list = [
                    author.strip() for author in author_names.split(",")
                ]
                for author_name in author_names_list:
                    internal_author_page_qs = TeamMemberPage.objects.filter(
                        name=author_name)
                    if internal_author_page_qs:
                        author_obj = {
                            "type": "internal_author",
                            "value": internal_author_page_qs.first().pk
                        }
                    else:
                        author_obj = {
                            "type": "external_author",
                            "value": {
                                "name": author_name,
                                "title": "",
                                "photograph": None,
                                "page": ""
                            }
                        }
                    authors.append(author_obj)
            if authors:
                new_source.authors = json.dumps(authors)
            new_source.save()

    # Datasets
    """
    (Pdb) dataset_dict.keys()
    dict_keys(['Dataset ID', 'What is the title of the data set?',
               'What DI publication is this dataset associated with?',
               'What is a long description of the data set?',
               'Release date?', 'Geography information',
               'Geographic coding', 'Unit', 'Keyword search',
               'Internal notes', 'Analyst that worked on the data',
               'Licence', 'Suggested citation', 'Source 1',
               'Source 2 (optional)', 'Source 3 (optional)',
               'Source 4 (optional)', 'Source 5 (optional)',
               'Source 6 (optional)', 'Source 7 (optional)',
               'Source 8 (optional)', 'Source 9 (optional)', 'Done',
               'File location Excel', 'File name Excel',
               'File location csv', 'File name csv', 'File notes',
               'Signed-off and ready?'])
    """
    source_keys = [
        'Source 1', 'Source 2 (optional)', 'Source 3 (optional)',
        'Source 4 (optional)', 'Source 5 (optional)', 'Source 6 (optional)',
        'Source 7 (optional)', 'Source 8 (optional)', 'Source 9 (optional)'
    ]
    skip = True  # the first row after the header is a sub-header
    dataset_reader = csv.DictReader(dataset_text)
    for dataset_dict in dataset_reader:
        if skip:
            skip = False
            continue
        dataset_check = DatasetPage.objects.filter(
            dataset_id=dataset_dict['Dataset ID'])
        if not dataset_check and dataset_dict[
                'What is the title of the data set?'] not in MISSING_VALUES \
                and dataset_dict['Signed-off and ready?'].lower() == "yes":
            print("Dataset: ",
                  dataset_dict['What is the title of the data set?'])
            if type(dataset_dict['Release date?']) is not datetime:
                try:
                    release_date = datetime.strptime(
                        dataset_dict['Release date?'], "%d/%m/%Y")
                except (ValueError, TypeError):
                    release_date = datetime.now()
            else:
                release_date = dataset_dict['Release date?']

            meta_json = []
            if dataset_dict['What is a long description of the data set?'] \
                    not in MISSING_VALUES:
                meta_json.append({
                    "type": "description",
                    "value": wrap_p(dataset_dict[
                        'What is a long description of the data set?'])
                })
            if dataset_dict['Geography information'] not in MISSING_VALUES:
                meta_json.append({
                    "type": "geography",
                    "value": wrap_p(dataset_dict['Geography information'])
                })
            if dataset_dict['Geographic coding'] not in MISSING_VALUES:
                meta_json.append({
                    "type": "geographic_coding",
                    "value": wrap_p(dataset_dict['Geographic coding'])
                })
            if dataset_dict['Unit'] not in MISSING_VALUES:
                meta_json.append({
                    "type": "unit",
                    "value": wrap_p(dataset_dict['Unit'])
                })
            if dataset_dict['Internal notes'] not in MISSING_VALUES:
                meta_json.append({
                    "type": "internal_notes",
                    "value": wrap_p(dataset_dict['Internal notes'])
                })
            if dataset_dict['Licence'] not in MISSING_VALUES:
                meta_json.append({
                    "type": "licence",
                    "value": wrap_p(dataset_dict['Licence'])
                })
            if dataset_dict['Suggested citation'] not in MISSING_VALUES:
                meta_json.append({
                    "type": "citation",
                    "value": wrap_p(dataset_dict['Suggested citation'])
                })

            new_dataset = DatasetPage(
                title=dataset_dict['What is the title of the data set?'],
                dataset_id=dataset_dict['Dataset ID'],
                dataset_title=dataset_dict[
                    'What is the title of the data set?'],
                release_date=release_date,
                meta_data=json.dumps(meta_json))
            # try:
            #     tag_list = [tag.strip() for tag in
            #                 dataset_dict['Keyword search'].split(",")
            #                 if 0 < len(tag.strip()) < 100]
            # except AttributeError:
            #     tag_list = []
            # new_dataset.topics.add(*tag_list)
            dataset_listing.add_child(instance=new_dataset)

            # Authors
            author_names = dataset_dict['Analyst that worked on the data']
            authors = []
            if author_names not in MISSING_VALUES:
                author_names_list = [
                    author.strip() for author in author_names.split(",")
                ]
                for author_name in author_names_list:
                    internal_author_page_qs = TeamMemberPage.objects.filter(
                        name=author_name)
                    if internal_author_page_qs:
                        author_obj = {
                            "type": "internal_author",
                            "value": internal_author_page_qs.first().pk
                        }
                    else:
                        author_obj = {
                            "type": "external_author",
                            "value": {
                                "name": author_name,
                                "title": "",
                                "photograph": None,
                                "page": ""
                            }
                        }
                    authors.append(author_obj)
            if authors:
                new_dataset.authors = json.dumps(authors)
            new_dataset.save_revision().publish()

            # Link the dataset to its associated publication pages
            if dataset_dict[
                    'What DI publication is this dataset associated with?'] \
                    not in MISSING_VALUES:
                pub_titles = [
                    pub_title.strip() for pub_title in dataset_dict[
                        'What DI publication is this dataset associated with?'
                    ].split("|")
                ]
                for pub_title in pub_titles:
                    pub_check = Page.objects.filter(title=pub_title).live()
                    if pub_check:
                        pub_page = pub_check.first().specific
                        if isinstance(pub_page, PublicationPage):
                            PublicationPageDataset(
                                item=pub_page, dataset=new_dataset).save()
                        elif isinstance(pub_page, PublicationSummaryPage):
                            PublicationSummaryPageDataset(
                                item=pub_page, dataset=new_dataset).save()
                        elif isinstance(pub_page, PublicationChapterPage):
                            PublicationChapterPageDataset(
                                item=pub_page, dataset=new_dataset).save()
                        elif isinstance(pub_page, PublicationAppendixPage):
                            PublicationAppendixPageDataset(
                                item=pub_page, dataset=new_dataset).save()
                        elif isinstance(pub_page, LegacyPublicationPage):
                            LegacyPublicationPageDataset(
                                item=pub_page, dataset=new_dataset).save()
                        elif isinstance(pub_page, ShortPublicationPage):
                            ShortPublicationPageDataset(
                                item=pub_page, dataset=new_dataset).save()

            # Link the dataset to its data sources
            for source_key in source_keys:
                key_val = dataset_dict[source_key]
                if key_val not in MISSING_VALUES:
                    try:
                        related_datasource = DataSource.objects.get(
                            title=key_val)
                        DataSetSource(page=new_dataset,
                                      source=related_datasource).save()
                    except DataSource.DoesNotExist:
                        pass

            # Attach the Excel download from Box
            if dataset_dict["File name Excel"] not in MISSING_VALUES:
                item_name = dataset_dict["File name Excel"].lower() + ".xlsx"
                try:
                    item_id = box_items[item_name]
                    f = BytesIO()
                    client.file(item_id).download_to(f)
                    doc = Document(title=dataset_dict["File name Excel"])
                    doc.file.save(item_name, File(f), save=True)
                    doc.save()
                    download = DatasetDownloads(
                        page=new_dataset,
                        title=dataset_dict["File name Excel"],
                        file=doc)
                    download.save()
                except KeyError:
                    self.stdout.write(
                        self.style.WARNING(item_name + " not found."))

            # Attach the CSV download from Box
            if dataset_dict["File name csv"] not in MISSING_VALUES:
                item_name = dataset_dict["File name csv"].lower() + ".csv"
                try:
                    item_id = box_items[item_name]
                    f = BytesIO()
                    client.file(item_id).download_to(f)
                    doc = Document(title=dataset_dict["File name csv"])
                    doc.file.save(item_name, File(f), save=True)
                    doc.save()
                    download = DatasetDownloads(
                        page=new_dataset,
                        title=dataset_dict["File name csv"],
                        file=doc)
                    download.save()
                except KeyError:
                    self.stdout.write(
                        self.style.WARNING(item_name + " not found."))

    self.stdout.write(self.style.SUCCESS('Called successfully'))