def post_process_report(self):
    # import data
    df = pd.read_csv(filepath_or_buffer='data/non_html_page_report.csv')

    # explode so we have one attachment for each row
    df['attachment_path'] = df['attachment_path'].apply(ast.literal_eval)
    df_long = df.explode(column='attachment_path').copy()

    # extract file extensions
    df_long['attachment_ext'] = df_long['attachment_path'].apply(
        lambda x: extract_from_path(data=x, part='ext'))

    # un-nest so we can easily replace blanks
    df_long['attachment_ext'] = df_long['attachment_ext'].apply(
        lambda x: ''.join(x))
    df_long['attachment_ext'] = df_long['attachment_ext'].replace(
        to_replace='', value=np.nan)

    # remove non-attachment and empty rows
    df_long = df_long.dropna(subset=['attachment_path', 'attachment_ext'],
                             how='any', axis=0)

    # filter for after Sep 2018 for Specialist and Travel Advice publishers
    df_long['first_published_at'] = df_long['first_published_at'].astype(
        'datetime64[ns]')
    cond_one = (df_long['publishing_app'] == 'specialist-publisher') & (
        df_long['first_published_at'] > '2018-09-30')
    cond_two = (df_long['publishing_app'] == 'travel-advice-publisher') & (
        df_long['first_published_at'] > '2018-09-30')
    cond_three = df_long['publishing_app'].isin(
        ['publisher', 'service-manual-publisher'])
    df_long = df_long[cond_one | cond_two | cond_three].copy()

    # export three sets of files:
    #   i. all data in one file
    #  ii. sample data in one file (for viewing purposes)
    # iii. all data split by publishing application (for viewing purposes)

    # i.
    df_long.to_csv(
        path_or_buf='data/inaccessible_nonhtml_reports/full.csv',
        index=False)

    # ii.
    df_long.sample(n=10000, random_state=42).to_csv(
        path_or_buf='data/inaccessible_nonhtml_reports/sample.csv',
        index=False)

    # iii.
    df_long = df_long.set_index('publishing_app')
    for key in df_long.index.unique():
        # double brackets keep a DataFrame even when only one row matches
        df_long.loc[[key]].to_csv(
            'data/inaccessible_nonhtml_reports/{}.csv'.format(key),
            index=False, header=True)
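# A minimal, self-contained sketch of the explode step above on toy data
# (column names match the report; the values and paths are made up):
import pandas as pd

toy = pd.DataFrame({
    'base_path': ['/a'],
    'attachment_path': [['/uploads/f.pdf', '/guidance', '/uploads/h.csv']]
})
toy_long = toy.explode(column='attachment_path')
print(toy_long['attachment_path'].tolist())
# ['/uploads/f.pdf', '/guidance', '/uploads/h.csv'] -- one attachment per row,
# with '/guidance' later dropped because it yields a blank extension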
def count_attachment_from_html(text: str) -> dict:
    """
    Counts the attachments linked from a GOV.UK webpage by looking at the
    href attribute of its anchor tags.

    Very similar to extract_links_from_html() but returns more results.

    Example:
        government/publications/measles-mumps-and-rubella-lab-confirmed-cases-in-england-2019

    Reference:
        - `src/helpers/preprocess_text.py`

    :param text: String of the HTML code to extract attachments from.
    :return: Dictionary of counts per attachment extension.
    """
    try:
        soup = BeautifulSoup(text, 'html5lib')
        links = [
            link.get('href') for link in soup.find_all(name='a', href=True)
        ]

        # extract extensions
        attachments = extract_from_path(data=links, part='ext')

        # take valid attachments only
        attachments = [x for x in attachments if x in ATTACHMENTS]

        # take unique html attachments
        attachments_html = [html for html in links if html.startswith('/')]
        attachments_html = list(set(attachments_html))

        # count repeated attachment elements in list
        attachment_counts = dict(Counter(attachments))

        # add html counts
        html_count = len(attachments_html)

        # cast 0 to None to be consistent with other attachments
        if html_count == 0:
            html_count = None
        attachment_counts.update({'.html': html_count})

        return attachment_counts
    except Exception as e:
        print("error @count_attachment_from_html", e)
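# A self-contained walk-through of the same counting logic on a toy page,
# with os.path.splitext standing in for extract_from_path and the ATTACHMENTS
# set reduced to two extensions (values are made up for illustration):
import os
from collections import Counter
from bs4 import BeautifulSoup

toy_html = ('<a href="/gov/uploads/report.pdf">report</a>'
            '<a href="/gov/uploads/data.csv">data</a>'
            '<a href="/guidance/how-to">guidance</a>')
toy_soup = BeautifulSoup(toy_html, 'html5lib')
toy_links = [a.get('href') for a in toy_soup.find_all(name='a', href=True)]
toy_exts = [os.path.splitext(link)[1] for link in toy_links]
toy_counts = dict(Counter(ext for ext in toy_exts if ext in {'.pdf', '.csv'}))
# unique root-relative links are treated as HTML attachments, as above
toy_counts['.html'] = len({link for link in toy_links if link.startswith('/')}) or None
print(toy_counts)  # {'.pdf': 1, '.csv': 1, '.html': 3}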
def process_page(self, content_item, html):
    content_item['primary_publishing_organisation'] = extract_subtext(
        text=content_item['organisations'],
        key='primary_publishing_organisation',
        index=1)

    # ignore pages from publishing apps we do not report on
    publishers = [
        "publisher", "service-manual-publisher", "specialist-publisher",
        "travel-advice-publisher"
    ]
    if content_item['publishing_app'] not in publishers:
        return []

    # ignore pages with no details to search
    if pd.isna(content_item['details']):
        return []

    # dots are escaped so they match literally rather than any character
    attachments = (r"\.chm|\.csv|\.diff|\.doc|\.docx|\.dot|\.dxf|\.eps|" +
                   r"\.gif|\.gml|\.ics|\.jpg|\.kml|\.odp|\.ods|\.odt|\.pdf|" +
                   r"\.png|\.ppt|\.pptx|\.ps|\.rdf|\.ris|\.rtf|\.sch|\.txt|" +
                   r"\.vcf|\.wsdl|\.xls|\.xlsm|\.xlsx|\.xlt|\.xml|\.xsd|\.xslt|" +
                   r"\.zip")
    if not any(
            re.findall(pattern=attachments, string=content_item['details'])):
        return []

    # extract attachment urls
    # each method gives different results, so we need all three to capture
    # the different ways attachments can appear on a webpage
    content_item['attachment_url_one'] = extract_links_from_html(
        text=content_item['details'])
    content_item['attachment_url_two'] = self.extract_attachment(
        text=content_item['details'], element='url')
    content_item['attachment_url_three'] = self.extract_attachment_smart(
        text=content_item['details'])

    # combine the three lists
    content_item['attachment_path'] = content_item['attachment_url_one'] \
        + content_item['attachment_url_two'] \
        + content_item['attachment_url_three']

    # remove duplicates while preserving order
    content_item['attachment_path'] = list(
        dict.fromkeys(content_item['attachment_path']))

    # extract file extension from attachment url
    content_item['attachment_ext'] = extract_from_path(
        data=content_item['attachment_path'], part='ext')

    # return only pages with attachments by ignoring empty lists
    if not content_item['attachment_ext']:
        return []
    return [
        content_item['base_path'],
        content_item['primary_publishing_organisation'],
        content_item['publishing_app'],
        content_item['document_type'],
        content_item['first_published_at'],
        content_item['attachment_path']
    ]
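# Why the dots in the pattern above are escaped: an unescaped '.' matches any
# character, so '.pdf' would also match strings like 'xpdf'. A quick check:
import re

text = 'see /uploads/report.pdf and the xpdf tool'
print(re.findall(pattern='.pdf', string=text))    # ['.pdf', 'xpdf'] -- false positive
print(re.findall(pattern=r'\.pdf', string=text))  # ['.pdf']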
df = pd.read_csv(
    filepath_or_buffer=...,  # path elided in the original snippet
    names=list(CONTENT_STORE_HEADER.keys()),
    dtype=CONTENT_STORE_HEADER,
    parse_dates=CONTENT_STORE_DATE)

# drop empty rows
df_process = df.dropna(subset=['details'])

# take one page
test = df[df["base_path"] == "/government/publications/screening-tests-for-you-and-your-baby"]
test = test['details'].iloc[0]
test = BeautifulSoup(test, features='lxml')

# get page links
page_links = [link.get('href') for link in test.find_all('a', href=True)]
page_attachments = extract_from_path(data=page_links, part='ext')
page_html = [html for html in page_links if html.startswith('/')]

# get valid attachments only
page_attachments = [x for x in page_attachments if x in ATTACHMENTS]

# get unique elements
page_html = list(set(page_html))

# add html links
page_attachments.extend(page_html)

# count repeated attachment elements in list
attachment_counts = dict(Counter(page_attachments))

# add html counts
attachment_counts.update({'html': len(page_html)})

# try using existing function
test = df[df["base_path"] == "/government/publications/success-profiles"]
test = test['details'].iloc[0]
from src.helpers.preprocess_text import extract_from_path
import pandas as pd
import numpy as np
import ast

# import data
df = pd.read_csv(filepath_or_buffer='data/non_html_page_report.csv')

# explode so we have one attachment for each row
df['attachment_path'] = df['attachment_path'].apply(ast.literal_eval)
df_long = df.explode(column='attachment_path').copy()

# extract file extensions
df_long['attachment_ext'] = df_long['attachment_path'].apply(
    lambda x: extract_from_path(data=x, part='ext'))

# un-nest so we can easily replace blanks
df_long['attachment_ext'] = df_long['attachment_ext'].apply(lambda x: ''.join(x))
df_long['attachment_ext'] = df_long['attachment_ext'].replace(to_replace='', value=np.nan)

# remove non-attachment and empty rows
df_long = df_long.dropna(subset=['attachment_path', 'attachment_ext'], how='any', axis=0)

# filter for after Sep 2018 for Specialist and Travel Advice publishers
df_long['first_published_at'] = df_long['first_published_at'].astype('datetime64[ns]')
cond_one = (df_long['publishing_app'] == 'specialist-publisher') & (df_long['first_published_at'] > '2018-09-30')
cond_two = (df_long['publishing_app'] == 'travel-advice-publisher') & (df_long['first_published_at'] > '2018-09-30')
cond_three = df_long['publishing_app'].isin(['publisher', 'service-manual-publisher'])
df_long = df_long[cond_one | cond_two | cond_three].copy()
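# extract_from_path() lives in src/helpers/preprocess_text.py and is not shown
# here. A hypothetical minimal stand-in, assuming it wraps os.path.splitext and
# accepts either a single path or a list (consistent with how it is called here
# and in process_page):
import os

def extract_from_path(data, part='ext'):
    # stand-in sketch: only the 'ext' part used in this report is implemented
    paths = [data] if isinstance(data, str) else data
    if part == 'ext':
        return [os.path.splitext(path)[1] for path in paths]
    raise ValueError(f"unsupported part: {part}")

print(extract_from_path('/uploads/report.pdf'))            # ['.pdf']
print(extract_from_path(['/uploads/a.csv', '/guidance']))  # ['.csv', '']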