def parse_and_save(mbox_files, nntp=False):
    """Parse the mbox archives to extract the required information.

    Opens each local mbox specified by mbox_files and extracts the required
    information that is then saved to a database.
    """
    # Connect to the database.
    try:
        conn = psycopg2.connect(database=DATABASE['name'],
                                port=DATABASE['defaultport'])
    except psycopg2.OperationalError:
        conn = psycopg2.connect(database=DATABASE['name'],
                                port=DATABASE['port'])
    cur = conn.cursor()

    current_lists = []
    is_spam = False
    for url, files in mbox_files.iteritems():
        mbox_file = mailbox.mbox(files)

        # Name of the mailing list and project.
        mbox_name = os.path.basename(files)
        mailing_list = os.path.basename(files).split('.')[0]
        project = mailing_list.rsplit('-', 2)[0]

        logging.info("Parsing '%s'" % mailing_list)

        # last_f_date is used when a message has an invalid date; in that
        # case, the date of the previous message is used instead.
        last_f_date = ''
        # archive_date of the previous message; used to build a substitute
        # Message-ID when the header is missing.
        archive_date = ''
        for key, message in mbox_file.iteritems():
            # The 'From' field value returns a string of the format:
            #     email-address (Name)
            # from which the sender's name and email address are extracted.
            # Note that this is not considered as SPAM because if the 'From'
            # header is missing, it doesn't make sense to process the other
            # headers.
            from_field = message['From']
            if from_field is None:
                continue

            # The Message-ID is used to check for errors.
            msg_id_raw = message['Message-ID']
            if msg_id_raw is None:
                logging.warning('No Message-ID found, setting default ID')
                # Create a Message-ID:
                #   sha1(archive_date + project) @ teammetrics-spam.debian.org
                domain_str = '@teammetrics-spam.debian.org'
                hash_obj = hashlib.sha1()
                hash_string = str(archive_date) + project
                hash_obj.update(hash_string)
                msg_id = hash_obj.hexdigest() + domain_str
                is_spam = True
            else:
                is_spam = False
                msg_id = msg_id_raw.strip('<>')

            # Set the debug message.
            debug_msg = ("\tMessage-ID %s of '%s' project in mbox file '%s'"
                         % (msg_id, project, mbox_name))
            if is_spam:
                logging.info(debug_msg)

            # Get the name for two possible cases of formatting of the
            # 'From' header:
            #     John Doe <john@example.org>
            #     john@example.org (John Doe)
            if from_field.endswith('>'):
                # Get the position of < and > to parse the email.
                email_start_pos = from_field.find("<")
                email_end_pos = from_field.find(">")
                email_raw = from_field[email_start_pos+1:email_end_pos]
                email_addr = email_raw.replace(' at ', '@')

                name_raw = from_field[:email_start_pos-1].strip()
                name = name_raw.strip("""'"<>""")
            # For the second case.
            elif from_field.endswith(')'):
                # Get the position of ( and ) to parse the name.
                name_start_pos = from_field.find("(")
                name_end_pos = from_field.find(")")
                name_raw = from_field[name_start_pos+1: name_end_pos]
                name = name_raw.strip("""'"<>""")

                email_raw = from_field[:name_start_pos-1]
                email_addr = email_raw.replace(' at ', '@')
            # For any other case, it's better to skip since we need the name.
            else:
                logging.error("No proper formatting for 'Name' found in %s"
                              % msg_id)
                continue

            # Resolve the encodings but don't skip the message yet; let it
            # go through the SPAM checker.
            try:
                decoded_name = email.header.decode_header(name_raw)
            except ValueError as detail:
                logging.warning("Invalid 'Name' encoding: %s\n%s"
                                % (detail, debug_msg))
                # Fall back to the undecoded name so the message still
                # reaches the SPAM checker.
                decoded_name = [(name, None)]

            try:
                name = u" ".join([unicode(text,
                                          charset or chardet.detect(text)['encoding'])
                                  for text, charset in decoded_name])
            except TypeError:
                logging.error("Unable to detect 'Name' encoding for: %s"
                              % msg_id)
                continue
            except (UnicodeDecodeError, LookupError) as detail:
                logging.error("Unable to decode 'Name': %s\n%s"
                              % (detail, debug_msg))

            if name.endswith('alioth.debian.org'):
                name = name.split()[0]

            # The date the message was sent.
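            # Illustrative example (header value assumed, not taken from an
            # archive): email.utils.parsedate('Tue, 01 Feb 2011 12:34:56 +0000')
            # returns (2011, 2, 1, 12, 34, 56, 0, 1, -1); the first four
            # elements feed datetime.datetime() below.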
            get_date = message['Date']
            parsed_date = email.utils.parsedate(get_date)

            # Some messages have faulty Date headers. Use the date of the
            # previous message (last_f_date) in such cases and, if even that
            # fails, skip the message.
            try:
                format_date = datetime.datetime(*parsed_date[:4])
            except (ValueError, TypeError) as detail:
                if last_f_date:
                    format_date = last_f_date
                else:
                    logging.error("Invalid 'Date' header: %s\n%s"
                                  % (detail, debug_msg))
                    continue

            try:
                archive_date = format_date.strftime("%Y-%m-%d")
            except ValueError as detail:
                logging.error("Unable to parse 'Date' header: %s\n%s"
                              % (detail, debug_msg))
                continue

            try:
                raw_subject = ' '.join(message['Subject'].split())
            except AttributeError as detail:
                logging.error("Invalid 'Subject' header: %s\n%s"
                              % (detail, debug_msg))
                raw_subject = ''

            try:
                decoded_subject = email.header.decode_header(raw_subject)
            except ValueError as detail:
                logging.warning("Invalid 'Subject' encoding: %s" % detail)
                # Fall back to the undecoded subject.
                decoded_subject = [(raw_subject, None)]
            except email.errors.HeaderParseError as detail:
                logging.warning("Unable to parse 'Subject' header: %s\n%s"
                                % (detail, debug_msg))
                decoded_subject = [(raw_subject, None)]

            try:
                subject = u" ".join([unicode(text,
                                             charset or chardet.detect(text)['encoding'])
                                     for text, charset in decoded_subject])
            except (LookupError, TypeError) as detail:
                logging.error("Unable to detect 'Subject' encoding for %s: %s"
                              % (msg_id, detail))
                continue
            except UnicodeDecodeError as detail:
                logging.warning("Unable to decode 'Subject': %s\n%s"
                                % (detail, debug_msg))

            # Get the message payload.
            if message.is_multipart():
                # We are interested only in the plain text parts.
                msg_text_parts = [part for part in
                                  email.Iterators.typed_subpart_iterator(message,
                                                                         'text',
                                                                         'plain')]
                msg_body = []
                for part in msg_text_parts:
                    try:
                        msg_body.append(unicode(part.get_payload(decode=True),
                                                chardet.detect(part.get_payload())['encoding'],
                                                "replace"))
                    except (LookupError, TypeError) as detail:
                        logging.error("Unable to detect payload encoding for %s: %s"
                                      % (msg_id, detail))
                        continue
                payload = u"\n".join(msg_body).strip()
            else:
                try:
                    payload = unicode(message.get_payload(decode=True),
                                      chardet.detect(message.get_payload())['encoding'],
                                      "replace")
                except (LookupError, TypeError) as detail:
                    logging.error("Unable to detect payload encoding for %s: %s"
                                  % (msg_id, detail))
                    continue

            is_spam_filter = False
            name, subject, reason, spam = spamfilter.check_spam(name, subject)
            # If the message had no Message-ID, report that as the reason.
            if is_spam:
                reason = 'No Message-ID found'
            # If the message is spam, set the is_spam_filter flag.
            if spam:
                is_spam_filter = True
                logging.warning('Spam detected for %s. Reason: %s'
                                % (msg_id, reason))

            today_raw = datetime.date.today()
            today_date = today_raw.strftime("%Y-%m-%d")

            # The lines in the message body excluding blank lines.
            msg_blank_raw = [line.strip() for line in payload.splitlines() if line]
            msg_blank = [line for line in msg_blank_raw if line]
            msg_blank_len = len(msg_blank)

            # The lines in the message body excluding blank lines AND
            # quotes (lines starting with >).
            msg_quotes = [line for line in msg_blank
                          if not line.startswith('>')]
            msg_quotes_len = len(msg_quotes)

            # The number of characters in the message body.
            msg_raw_len = len(''.join(element for element in msg_blank))

            # The lines in the message body excluding blank lines AND
            # quotes, up to the signature marker (--).
            try:
                msg_sig_len = len(msg_quotes[:msg_quotes.index('--')])
            except ValueError:
                msg_sig_len = msg_blank_len

            # The netloc from the mailing list URL.
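            # For example (URL shown for illustration only):
            #     urlparse.urlparse('http://lists.alioth.debian.org/pipermail/foo-devel/')
            # has netloc 'lists.alioth.debian.org', which is stored as 'domain'.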
            netloc = urlparse.urlparse(url).netloc

            # Save the required information to the database.
            try:
                cur.execute(
                    """INSERT INTO listarchives
                    (project, domain, name, email_addr, subject, message_id,
                     archive_date, today_date, msg_raw_len, msg_no_blank_len,
                     msg_no_quotes_len, msg_no_sig_len, is_spam)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);""",
                    (project, netloc, name, email_addr, subject, msg_id,
                     archive_date, today_date, msg_raw_len, msg_blank_len,
                     msg_quotes_len, msg_sig_len, is_spam_filter)
                )
            except psycopg2.DataError as detail:
                conn.rollback()
                logging.error(detail)
                logging.error(debug_msg)
                continue
            except psycopg2.IntegrityError as detail:
                # It happens that the very same message hits a mailing list
                # twice, for instance because it concerns two different bugs
                # and the BTS sends a copy for each bug separately.
                conn.rollback()
                logging.info('Message-ID %s already in database, skipping'
                             % msg_id)
                continue
            conn.commit()

            # Save the date for later use.
            last_f_date = format_date

        current_lists.append(mbox_name)

    logging.info('Updating names')
    updatenames.update_names(conn, cur)

    cur.close()
    conn.close()

    # nntp is True when parse_and_save is being called by nntpstat.
    if not nntp:
        # Write the checksums of the downloaded mbox archives.
        if current_lists:
            write_parsed_lists(current_lists)

        # Remove the extracted mbox archives (in plain text).
        logging.info('Cleaning up extracted mbox archives')
        for each_mbox in mbox_files.itervalues():
            os.remove(each_mbox)

    logging.info('Quit')
    sys.exit()
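
# A minimal usage sketch of parse_and_save() (the URL and path below are
# hypothetical, for illustration only):
#
#     mbox_files = {
#         'http://lists.alioth.debian.org/pipermail/foo-devel/':
#             '/var/cache/teammetrics/foo-devel.mbox',
#     }
#     parse_and_save(mbox_files)
#
# Note that parse_and_save() calls sys.exit() when it finishes, so it is
# meant to be the last step of a run.
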
def main():
    conf_info, total_lists = liststat.get_configuration(liststat.CONF_FILE_PATH,
                                                        pipermail=False)
    counter = 0
    skipped_messages = 0
    fetched_messages = 0
    for names, lists in conf_info.iteritems():
        for lst in lists:
            list_fetched_messages = 0
            lst_name = lst.rsplit('/')[-1]

            # In consecutive runs, the already parsed messages are skipped
            # without even being fetched. Everything is set to type Unicode
            # because that is what BeautifulSoup returns.
            config_data = tuple(unicode(ele) for ele in read_config(lst_name))
            if config_data:
                c_year = config_data[0]
                c_month = config_data[1]
                c_message = config_data[2]
                year_month_flag = message_flag = True
            else:
                year_month_flag = message_flag = False

            logging.info('\tList %d of %d' % (counter+1, total_lists))
            logging.info("Fetching '%s'" % lst_name)

            try:
                url_read = urllib2.urlopen(lst)
            except urllib2.HTTPError as detail:
                logging.error('Invalid list name, skipping')
                counter += 1
                continue

            # Get the links to the archives.
            soup = BeautifulSoup(url_read)
            all_links = soup.findAll('a', href=re.compile('threads.html'))
            links = [tag['href'] for tag in all_links]

            if year_month_flag:
                logging.info('Last run was on %s-%s/%s'
                             % (c_year, c_month, c_message))
                last_link = unicode('{0}/{1}-{0}{2}/threads.html'.format(c_year,
                                                                         lst_name,
                                                                         c_month))
                links = links[links.index(last_link):]
                year_month_flag = False

            all_months = soup.body.findAll('ul')[1].findAll('li')
            start = all_months[0].text.split(None, 1)[0]
            end = all_months[-1].text.split(None, 1)[0]
            logging.info('List archives are from %s to %s' % (start, end))

            for link in links:
                # Get the year and month for which the messages are to be fetched.
                month_url = '{0}{1}/{2}'.format(BASE_URL, lst_name, link)
                year_month = link.split('/')[-2].rsplit('-')[-1]
                year = year_month[:-2]
                month = year_month[-2:]

                try:
                    month_read = urllib2.urlopen(month_url)
                except urllib2.URLError as detail:
                    logging.error('Skipping month %s: unable to connect to lists.d.o'
                                  % link)
                    logging.error('%s' % detail)
                    continue

                soup = BeautifulSoup(month_read)
                messages = []

                # There are multiple pages in an archive, check for them.
                all_pages_month = check_next_page(month_url)
                if all_pages_month:
                    for each_month in all_pages_month:
                        page_soup = BeautifulSoup(urllib2.urlopen(each_month))
                        messages.extend(fetch_message_links(page_soup))
                else:
                    messages.extend(fetch_message_links(soup))

                if message_flag:
                    upto_messages = [unicode('msg{0:05}.html'.format(e))
                                     for e in range(int(c_message[3:].strip('.html'))+1)]
                    messages = list(set(messages) - set(upto_messages))
                    message_flag = False

                # Sort the list before starting so that messages are fetched in
                # order and match up with upto_messages.
                messages.sort()

                for message in messages:
                    # Construct the message URL.
                    message_url = '{0}{1}/{2}/{3}/{4}'.format(BASE_URL, lst_name,
                                                              year, month, message)
                    try:
                        message_read = urllib2.urlopen(message_url)
                    except urllib2.URLError as detail:
                        logging.error('Skipping message: unable to connect to lists.d.o')
                        skipped_messages += 1
                        continue

                    soup = BeautifulSoup(message_read)

                    # Now we are at a single message, so parse it.
                    body = soup.body.ul
                    all_elements = body.findAll('li')

                    # Fetch the text of all elements in FIELDS.
                    all_elements_text = [element.text for element in all_elements
                                         if element.text.startswith(FIELDS)]

                    # Create a mapping of field to value.
                    fields = {}
                    for element in all_elements_text:
                        field, value = element.split(':', 1)
                        fields[field.strip()] = value.strip()

                    # 'From' field.
                    # If the 'From' field is missing, skip the message because
                    # there is nothing to attribute it to.
                    if 'From' not in fields:
                        continue

                    # Name, Email parsing starts here.
                    # Format the 'From' field to return the name and email
                    # address, e.g.:
                    #     Foo Bar <foo@example.org>
                    name_email = fields.get('From')
                    try:
                        if name_email.endswith(')'):
                            email_raw, name_raw = name_email.split('(', 1)
                            name = name_raw.strip('()')
                            email = email_raw
                        else:
                            name_raw, email_raw = name_email.strip().rsplit(None, 1)
                            # Name.
                            if name_raw.startswith('"') or name_raw.endswith('"'):
                                name = name_raw.replace('"', '')
                            else:
                                name = name_raw
                            # Email.
                            if email_raw.startswith('<') and email_raw.endswith('>'):
                                email = email_raw.replace('<', '').replace('>', '')
                            else:
                                email = email_raw
                    except ValueError:
                        # The name is the same as the email address.
                        name = email = name_email.replace('<', '').replace('>', '')

                    # Some names have the form: LastName, FirstName.
                    if ',' in name:
                        name = ' '.join(e for e in
                                        reversed(name.split())).replace(',', '').strip()
                    name = HTMLParser.HTMLParser().unescape(name).strip()

                    # Subject field.
                    subject = fields.get('Subject', '')
                    subject = HTMLParser.HTMLParser().unescape(subject)

                    # Date field.
                    date = fields.get('Date')
                    if date is not None:
                        # Parse the date and fetch the day the message was sent.
                        day_find = re.findall(r'\d{1,2}', date)
                        if day_find:
                            day = day_find[0]
                        else:
                            # The day can't be parsed, so default to the middle
                            # of the month.
                            day = '15'
                    # If there is no 'Date' field, do the same.
                    else:
                        day = '15'

                    final_day = day
                    final_month = month
                    final_year = year
                    final_date = '{0}-{1}-{2}'.format(final_year, final_month,
                                                      final_day)

                    # Before storing the date, ensure that it is proper. If not,
                    # this is usually due to the issue of the last day of a given
                    # month being counted in the next. So default the day to 1.
                    try:
                        time.strptime(final_date, '%Y-%m-%d')
                    except ValueError:
                        final_date = '{0}-{1}-1'.format(final_year, final_month)

                    today_raw = datetime.date.today()
                    today_date = today_raw.strftime('%Y-%m-%d')

                    # Message-id field.
                    # If no Message-id field is found, generate a substitute one.
                    message_id = fields.get('Message-id',
                                            u'{0}-{1}-{2}@spam.lists.debian.org'.format(
                                                name.replace(' ', ''),
                                                final_month, final_day))
                    message_id = message_id.replace('<', '').replace('>', '')

                    # In-reply-to and References fields.
                    in_reply_to = fields.get('In-reply-to', '')
                    in_reply_to = HTMLParser.HTMLParser().unescape(in_reply_to)

                    references = HTMLParser.HTMLParser().unescape(
                        fields.get('References', ''))
                    if '><' in references:
                        references = make_multiple_lines(references)

                    is_spam = False
                    # Run it through the spam filter.
                    name, subject, reason, spam = spamfilter.check_spam(name, subject)
                    # If the message is spam, set the is_spam flag.
                    if spam:
                        is_spam = True
                        logging.warning('Possible spam: %s. Reason: %s'
                                        % (message_id, reason))

                    # Now parse the message body, starting at the comment
                    # X-Body-of-Message and continuing till X-Body-of-Message-End.
                    # This handles both plain text as well as HTML formatted
                    # messages.
                    start_message = soup.find(text=lambda e: isinstance(e, Comment)
                                              and e == u'X-Body-of-Message')
                    body = []
                    for e in start_message.findAllNext(text=True):
                        if e == u'X-Body-of-Message-End':
                            break
                        body.append(e)

                    # Extra formatting that helps frame the mbox structure properly.
                    if body[-1] == u'\n' and '\n' not in body[-2]:
                        body.append(u'\n\n')

                    body = ''.join(HTMLParser.HTMLParser().unescape(e) for e in body)

                    updated_date = nntpstat.asctime_update(date, message_id)
                    if updated_date is None:
                        logging.error('Unable to decode date, skipping message %s'
                                      % message_id)
                        continue

                    mbox_name = '{0}.{1}{2}'.format(lst_name, year, month)
                    create_mbox(lst_name, mbox_name, name, email, date,
                                updated_date, subject, message_id, body,
                                in_reply_to, references)

                    list_fetched_messages += 1
                    fetched_messages += 1

                if messages:
                    write_config(lst_name, final_year, final_month, message)

            logging.info("Finished processing '%s' (%s messages)"
                         % (lst_name, list_fetched_messages))
            counter += 1

    if fetched_messages:
        logging.info('Fetched %s messages in the current run' % fetched_messages)
    else:
        logging.info('No messages were fetched in the current run')
    if skipped_messages:
        logging.info('Skipped %s messages in the current run' % skipped_messages)

    logging.info('Quit')
    sys.exit()
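
# The fetcher above (and the database-backed variant that follows) parses the
# 'From' field of an archived message, which comes in one of two shapes
# (examples are illustrative):
#
#     Foo Bar <foo@example.org>     -> name 'Foo Bar', email 'foo@example.org'
#     foo@example.org (Foo Bar)     -> name 'Foo Bar', email 'foo@example.org'
#
# A rough sketch of that logic, should it ever be factored out into a shared
# helper (hypothetical function, not part of the current code):
#
#     def split_from_field(name_email):
#         if name_email.endswith(')'):
#             email_raw, name_raw = name_email.split('(', 1)
#             return name_raw.strip('() '), email_raw.strip()
#         name_raw, email_raw = name_email.strip().rsplit(None, 1)
#         return name_raw.strip('"'), email_raw.strip('<>')
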
def main(conn, cur):
    conf_info, total_lists = liststat.get_configuration(liststat.CONF_FILE_PATH,
                                                        pipermail=False)
    counter = 0
    skipped_messages = 0
    fetched_messages = 0
    did_not_run = True
    for names, lists in conf_info.iteritems():
        for lst in lists:
            list_fetched_messages = 0
            lst_name = lst.rsplit('/')[-1]

            # In consecutive runs, the already parsed messages are skipped
            # without even being fetched. Everything is set to type Unicode
            # because that is what BeautifulSoup returns.
            config_data = tuple(unicode(ele) for ele in read_config(lst_name))
            if config_data:
                check_year = config_data[0]
                check_month = config_data[1]
                check_message = config_data[2]
                year_month_flag = message_flag = True
            else:
                year_month_flag = message_flag = False

            logging.info('\tList %d of %d' % (counter+1, total_lists))
            logging.info("Fetching '%s'" % lst_name)

            try:
                url_read = urllib2.urlopen(lst)
            except urllib2.HTTPError as detail:
                logging.error('Invalid list name, skipping')
                counter += 1
                continue

            # Get the links to the archives.
            soup = BeautifulSoup(url_read)
            all_links = soup.findAll('a', href=re.compile('threads.html'))
            links = [tag['href'] for tag in all_links]

            if year_month_flag:
                logging.info('Last run was on %s-%s/%s'
                             % (check_year, check_month, check_message))
                last_link = unicode('{0}/{1}-{0}{2}/threads.html'.format(check_year,
                                                                         lst_name,
                                                                         check_month))
                links = links[links.index(last_link):]
                year_month_flag = False

            all_months = soup.body.findAll('ul')[1].findAll('li')
            start = all_months[0].text.split(None, 1)[0]
            end = all_months[-1].text.split(None, 1)[0]
            logging.info('List archives are from %s to %s' % (start, end))

            for link in links:
                # Get the year and month for which the messages are to be fetched.
                month_url = '{0}{1}/{2}'.format(BASE_URL, lst_name, link)
                year_month = link.split('/')[-2].rsplit('-')[-1]
                year = year_month[:-2]
                month = year_month[-2:]

                try:
                    month_read = urllib2.urlopen(month_url)
                except urllib2.URLError as detail:
                    logging.error('Skipping month %s: unable to connect to lists.d.o'
                                  % link)
                    logging.error('%s' % detail)
                    continue

                soup = BeautifulSoup(month_read)
                messages = []

                # There are multiple pages in an archive, check for them.
                all_pages_month = check_next_page(month_url)
                if all_pages_month:
                    for each_month in all_pages_month:
                        page_soup = BeautifulSoup(urllib2.urlopen(each_month))
                        messages.extend(fetch_message_links(page_soup))
                else:
                    messages.extend(fetch_message_links(soup))

                if message_flag:
                    upto_messages = [unicode('msg{0:05}.html'.format(e))
                                     for e in range(int(check_message[3:].strip('.html'))+1)]
                    messages = list(set(messages) - set(upto_messages))
                    message_flag = False

                # Sort the list so that messages are fetched in the proper order.
                messages.sort()

                for message in messages:
                    # Construct the message URL.
                    message_url = '{0}{1}/{2}/{3}/{4}'.format(BASE_URL, lst_name,
                                                              year, month, message)
                    try:
                        message_read = urllib2.urlopen(message_url)
                    except urllib2.URLError as detail:
                        logging.error('Skipping message: unable to connect to lists.d.o')
                        skipped_messages += 1
                        continue

                    # Set even if a single message is fetched.
                    did_not_run = False

                    soup = BeautifulSoup(message_read)

                    # Now we are at a single message, so parse it.
                    body = soup.body.ul
                    all_elements = body.findAll('li')

                    # Fetch the text of all elements in FIELDS.
                    all_elements_text = [element.text for element in all_elements
                                         if element.text.startswith(FIELDS)]

                    # Create a mapping of field to value.
                    fields = {}
                    for element in all_elements_text:
                        field, value = element.split(':', 1)
                        fields[field.strip()] = value.strip()

                    # 'From' field.
                    # If the 'From' field is missing, just skip the message.
                    if 'From' not in fields:
                        continue

                    # Name, Email parsing starts here.
                    # Format the 'From' field to return the name and email
                    # address, e.g.:
                    #     Foo Bar <foo@example.org>
                    name_email = fields.get('From')
                    try:
                        if name_email.endswith(')'):
                            email_raw, name_raw = name_email.split('(', 1)
                            name = name_raw.strip('()')
                            email = email_raw
                        else:
                            name_raw, email_raw = name_email.strip().rsplit(None, 1)
                            # Name.
                            if name_raw.startswith('"') or name_raw.endswith('"'):
                                name = name_raw.replace('"', '')
                            else:
                                name = name_raw
                            # Email.
                            if email_raw.startswith('<') and email_raw.endswith('>'):
                                email = email_raw.replace('<', '').replace('>', '')
                            else:
                                email = email_raw
                    except ValueError:
                        # The name is the same as the email address.
                        name = email = name_email.replace('<', '').replace('>', '')

                    # Some names have the form: LastName, FirstName.
                    if ',' in name:
                        name = ' '.join(e for e in
                                        reversed(name.split())).replace(',', '').strip()
                    name = HTMLParser.HTMLParser().unescape(name).strip()

                    # Subject field.
                    subject = fields.get('Subject', '')
                    subject = HTMLParser.HTMLParser().unescape(subject)

                    # Date field.
                    date = fields.get('Date')
                    if date is not None:
                        # Parse the date and fetch the day the message was sent.
                        day_find = re.findall(r'\d{1,2}', date)
                        if day_find:
                            day = day_find[0]
                        else:
                            # The day can't be parsed, so default to the middle
                            # of the month.
                            day = '15'
                    # If there is no 'Date' field, do the same.
                    else:
                        day = '15'

                    final_day = day
                    final_month = month
                    final_year = year
                    final_date = '{0}-{1}-{2}'.format(final_year, final_month,
                                                      final_day)

                    # Before storing the date, ensure that it is proper. If not,
                    # this is usually due to the issue of the last day of a given
                    # month being counted in the next. So default the day to 1.
                    try:
                        time.strptime(final_date, '%Y-%m-%d')
                    except ValueError:
                        final_date = '{0}-{1}-1'.format(final_year, final_month)

                    today_raw = datetime.date.today()
                    today_date = today_raw.strftime('%Y-%m-%d')

                    # Message-id field.
                    # If no Message-id field is found, generate a substitute one.
                    message_id = fields.get('Message-id',
                                            u'{0}-{1}-{2}@spam.lists.debian.org'.format(
                                                name.replace(' ', ''),
                                                final_month, final_day))
                    message_id = message_id.replace('<', '').replace('>', '')

                    is_spam = False
                    # Run it through the spam filter.
                    name, subject, reason, spam = spamfilter.check_spam(name, subject)
                    # If the message is spam, set the is_spam flag.
                    if spam:
                        is_spam = True
                        logging.warning('Possible spam: %s. Reason: %s'
                                        % (message_id, reason))

                    # Now populate the 'listarchives' table.
                    try:
                        cur.execute(
                            """INSERT INTO listarchives
                            (project, domain, name, email_addr, subject, message_id,
                             archive_date, today_date, is_spam)
                            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s);""",
                            (lst_name, 'lists.debian.org', name, email, subject,
                             message_id, final_date, today_date, is_spam)
                        )
                    except psycopg2.DataError as detail:
                        conn.rollback()
                        logging.error(detail)
                        continue
                    except psycopg2.IntegrityError:
                        # The message is already in the database.
                        conn.rollback()
                        continue
                    conn.commit()

                    list_fetched_messages += 1
                    fetched_messages += 1

                if messages:
                    write_config(lst_name, final_year, final_month, message)

            logging.info("Finished processing '%s' (%s messages)"
                         % (lst_name, list_fetched_messages))
            counter += 1

    if fetched_messages:
        logging.info('Fetched %s messages in the current run' % fetched_messages)
    else:
        logging.info('No messages were fetched in the current run')
    if skipped_messages:
        logging.info('Skipped %s messages in the current run' % skipped_messages)

    if not did_not_run:
        logging.info('Updating names')
        updatenames.update_names(conn, cur)

    logging.info('Quit')
    sys.exit()
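
# A minimal driver sketch (assumed wiring; the DATABASE mapping is the same
# one used by parse_and_save() above and is only a guess here):
#
#     if __name__ == '__main__':
#         conn = psycopg2.connect(database=DATABASE['name'],
#                                 port=DATABASE['port'])
#         cur = conn.cursor()
#         main(conn, cur)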