def parse(self, response):
    """Yield a Meeting for each PDF attachment that mentions the commission.

    The response body is parsed as a raw email message. Every attachment
    with content type ``application/pdf`` is converted to text with
    ``self._parse_pdf_text`` and searched for a "Senior Citizens Commission"
    heading followed by the text up to the next blank line.

    Args:
        response: Object whose ``body`` attribute holds the raw email bytes
            and whose ``url`` attribute is recorded as the meeting source.

    Yields:
        One Meeting per matching PDF for which ``self._parse_times`` returns
        a truthy start value.

    Raises:
        ValueError: If none of the PDF attachments contain the heading.
    """
    msg = BytesParser(policy=default).parsebytes(response.body)
    pdf_list = [
        a
        for a in msg.iter_attachments()
        if a.get_content_type() == "application/pdf"
    ]

    # List of tuples of (filename, matched notice text)
    match_list = []
    for pdf_obj in pdf_list:
        pdf_text = self._parse_pdf_text(pdf_obj.get_payload(decode=True))
        meeting_match = re.search(
            r"Senior Citizens\s+Commission\n.*?(?=\n\n)",
            pdf_text,
            flags=re.I | re.M | re.DOTALL,
        )
        if meeting_match:
            match_list.append((pdf_obj.get_filename(), meeting_match.group()))

    if not match_list:
        raise ValueError("Meeting not found in {} PDFs".format(len(pdf_list)))

    for pdf_name, meeting_str in match_list:
        # Take the year from the PDF that actually produced this match (not
        # from the first PDF in the message). get_filename() returns None
        # when the attachment has no filename, so fall back to "".
        year_match = re.search(r"\d{4}", pdf_name or "")
        year_str = year_match.group() if year_match else None

        start, end = self._parse_times(meeting_str, year_str)
        if not start:
            # Skip only this notice; later matches may still be parseable.
            continue

        meeting = Meeting(
            title="Senior Citizens Commission",
            description="",
            classification=COMMISSION,
            start=start,
            end=end,
            all_day=False,
            time_notes="",
            location=self._parse_location(meeting_str),
            links=[],
            source=response.url,
        )
        meeting["status"] = self._get_status(meeting, text=meeting_str)
        meeting["id"] = self._get_id(meeting)
        yield meeting
def parse(self, response):
    """Yield meetings built from an email's docx attachment and body text.

    The response body is parsed as a raw email message. If any attachment
    has ".docx" in its filename, the first such attachment is decoded and
    passed to ``self._parse_docx``. The results of
    ``self._parse_email_text`` on the message are appended after it, and
    the combined items are handed to ``self._parse_meetings``.

    Args:
        response: Object whose ``body`` attribute holds the raw email bytes.

    Yields:
        Whatever ``self._parse_meetings`` yields for the collected items.
    """
    msg = BytesParser(policy=default).parsebytes(response.body)
    # get_filename() returns None for attachments without a filename, so
    # substitute "" to keep the substring test from raising TypeError.
    docx_list = [
        a
        for a in msg.iter_attachments()
        if ".docx" in (a.get_filename() or "")
    ]

    items = []
    if docx_list:
        items.extend(self._parse_docx(docx_list[0].get_payload(decode=True)))
    items.extend(self._parse_email_text(msg))
    yield from self._parse_meetings(items)
def parse(self, response):
    """Yield one parsed detail item from an email's first PDF or its text.

    The response body is parsed as a raw email message. When at least one
    attachment has content type ``application/pdf``, the first one is
    decoded and converted with ``self._parse_pdf_text``; otherwise the text
    comes from ``self._parse_email_text``. The resulting text is passed to
    ``self._parse_detail`` and its return value is yielded.
    """
    message = BytesParser(policy=default).parsebytes(response.body)
    pdf_parts = [
        part
        for part in message.iter_attachments()
        if part.get_content_type() == "application/pdf"
    ]

    if not pdf_parts:
        # No PDF attached: fall back to the email text itself.
        text = self._parse_email_text(message)
    else:
        first_pdf = pdf_parts[0]
        text = self._parse_pdf_text(first_pdf.get_payload(decode=True))

    yield self._parse_detail(text)
from email import policy from email.parser import BytesParser myfiles = [ '20140217-0121.eml.1c2dffd0', '20140217-0314.eml.14bac63d', '20140218-0722.eml.00fe7528', '20140219-0541.eml.74741be1', '20140219-0543.eml.1c20938f', '20140219-0608.eml.02af7d91', '20140219-0612.eml.0d9a2c0b', '20140224-2004.eml.6f36a877', '20140225-1702.eml.39a4225b' ] for filename in myfiles: msg = BytesParser(policy=policy.default).parse(open(filename, 'rb')) print('Processing %s' % (filename, )) for attachment in msg.iter_attachments(): fn = attachment.get_filename() print('Attachment filename is %s' % (fn, )) if fn: extension = os.path.splitext(attachment.get_filename())[1] else: extension = mimetypes.guess_extension( attachment.get_content_type()) f = io.BytesIO() data = attachment.get_content() with open(fn, 'wb') as f: if isinstance(data, str): # data is a string f.write(data.encode('utf-8')) else: # data is bytes