def get_records(count=1):
    """Query the FOIA search endpoint for a batch of records.

    Issues a single GET request against 'Search/SubmitSimpleQuery' for the
    'Clinton_Email' collection, passes the response text through
    clean_timestamps, and decodes the result as JSON.

    Keyword Arguments:
        count (int): Value sent as the 'limit' query parameter, i.e. the
            number of records requested.

    Returns:
        dict: The decoded JSON payload of the search response.
    """
    # Fixed part of the query: which collection to search, matching anything.
    params = {
        'collectionMatch': 'Clinton_Email',
        'searchText': '*',
    }

    # Every optional filter is sent with the literal string 'false'.
    filter_names = (
        'beginDate',
        'endDate',
        'postedBeginDate',
        'postedEndDate',
        'caseNumber',
    )
    params.update(dict.fromkeys(filter_names, 'false'))

    # Paging: first page, starting at offset 0, capped at `count` results.
    params.update(page=1, start=0, limit=count)

    # The server's SSL certificate does not pass verification, so
    # verification is switched off for this request.
    response = requests.get(
        get_foia_url('Search/SubmitSimpleQuery'),
        params=params,
        verify=False,
    )

    return json.loads(clean_timestamps(response.text))
def download(email):
    """Fetch the PDF behind one email record and attach its extracted text.

    Records whose 'from' value is not in INTERESTING_SENDERS are skipped.
    For all others, the PDF referenced by 'pdfLink' is downloaded, handed
    to save_and_extract, and the resulting text is passed to get_body.

    The dictionary passed in is modified in place: 'docDate', 'postedDate'
    and 'pdfLink' are removed, and 'sent', 'pdf_posted', 'pdf_link',
    'document_id', 'pdf_path', 'body' and 'is_redacted' are added.

    Arguments:
        email (dict): Metadata for a single email. For example,

            {
                'from': 'H',
                'pdfLink': 'DOCUMENTS/HRCEmail_August_Web/IPS-0128/DOC_0C05775316/C05775316.pdf',
                'docDate': 1277956800000,
                'documentClass': 'Clinton_Email_August_Release',
                'messageNumber': '',
                'to': 'preines',
                'caseNumber': 'F-2014-20439',
                'subject': 'TEST',
                'originalLink': None,
                'postedDate': 1440993600000
            }

    Returns:
        dict: The same dictionary that was passed in, with the renamed and
            added keys described above. None is returned instead when the
            sender is not in INTERESTING_SENDERS.
    """
    sender = email['from']
    if sender not in INTERESTING_SENDERS:
        return None

    # TODO: The metadata timestamps carry only a date, while the emails
    # themselves include both date and time. Extract the latter.
    doc_stamp = email.pop('docDate')
    email['sent'] = datetime_from_timestamp(doc_stamp)
    posted_stamp = email.pop('postedDate')
    email['pdf_posted'] = datetime_from_timestamp(posted_stamp)

    # TODO: Skip the download when the email already exists on disk, and
    # return None so that no duplicate record reaches the database.
    pdf_url = get_foia_url(email.pop('pdfLink'))
    email['pdf_link'] = pdf_url

    # The server's SSL certificate does not pass verification, so
    # verification is switched off for this request.
    pdf_bytes = requests.get(pdf_url, verify=False).content

    # The name derived from the URL doubles as the document identifier.
    document_id = get_filename(pdf_url)
    email['document_id'] = document_id

    email['pdf_path'], extracted_text = save_and_extract(document_id, pdf_bytes)
    email['body'], email['is_redacted'] = get_body(extracted_text)

    return email