def __retrieve_from_gmane(self, mailing_list):
    """Download mboxes from gmane interface.

    Fetches pages of up to GMANE_LIMIT messages, starting from the
    last message count already known for this list, until Gmane
    returns an empty page (which means the last message was reached).

    :param mailing_list: mailing list descriptor; its ``alias``,
        ``location`` and ``compressed_dir`` attributes are used.
    :return: list of MBoxArchive objects for the downloaded files.
    """
    gmane_url = GMANE_DOWNLOAD_URL + mailing_list.alias
    from_msg = self.__get_gmane_total_count(mailing_list.location,
                                            gmane_url)
    archives = []
    while True:
        to_msg = from_msg + GMANE_LIMIT
        url = gmane_url + '/' + str(from_msg) + '/' + str(to_msg)
        arch_url = gmane_url + '/' + str(from_msg)
        filename = os.path.join(mailing_list.compressed_dir,
                                str(from_msg))

        self.__print_output('Retrieving %s...' % url)
        retrieve_remote_file(url, filename,
                             self.web_user, self.web_password)

        # Check whether we have read the last message.
        # In Gmane, an empty page means we reached the last msg.
        # Check the on-disk size instead of reading the whole mbox
        # into memory in text mode, which wastes memory and can fail
        # decoding non-UTF-8 bytes in the archive.
        if not os.path.getsize(filename):
            break

        from_msg = to_msg
        archives.append(MBoxArchive(filename, arch_url))
    return archives
def __retrieve_from_gmane(self, mailing_list):
    """Download mboxes from gmane interface.

    Fetches pages of up to GMANE_LIMIT messages, starting from the
    last message count already known for this list, until Gmane
    returns an empty page (which means the last message was reached).

    :param mailing_list: mailing list descriptor; its ``alias``,
        ``location`` and ``compressed_dir`` attributes are used.
    :return: list of MBoxArchive objects for the downloaded files.
    """
    gmane_url = GMANE_DOWNLOAD_URL + mailing_list.alias
    from_msg = self.__get_gmane_total_count(mailing_list.location,
                                            gmane_url)
    archives = []
    while True:
        to_msg = from_msg + GMANE_LIMIT
        url = gmane_url + '/' + str(from_msg) + '/' + str(to_msg)
        arch_url = gmane_url + '/' + str(from_msg)
        filename = os.path.join(mailing_list.compressed_dir,
                                str(from_msg))

        self.__print_output('Retrieving %s...' % url)
        # Only the size is needed; discard the file object returned
        # by retrieve_remote_file (it was previously bound to an
        # unused local).
        _, size = retrieve_remote_file(url, filename,
                                       self.web_user,
                                       self.web_password)

        # Check whether we have read the last message.
        # In Gmane, an empty page means we reached the last msg.
        if not size:
            break

        from_msg = to_msg
        archives.append(MBoxArchive(filename, arch_url))
    return archives
def __retrieve_from_mailman(self, mailing_list):
    """Download mboxes from mailman interface"""
    # Get all the links listed in the URL
    #
    # The archives are usually retrieved in descending
    # chronological order (newest archives are always
    # shown on the top of the archives). Reverse the list
    # to analyze in chronological order.
    parser = MyHTMLParser(mailing_list.location,
                          self.web_user, self.web_password)
    mbox_links = parser.get_mboxes_links(self.force)

    archives = []
    for link in mbox_links:
        destfilename = os.path.join(mailing_list.compressed_dir,
                                    os.path.basename(link))
        try:
            # The current month is always re-fetched; any other
            # month is skipped when a local copy already exists.
            this_month = find_current_month(link)
            fetch = True
            if this_month:
                self.__print_output('Current month detected: '
                                    'Found substring %s in URL %s...'
                                    % (this_month, link))
            elif os.path.exists(destfilename):
                self.__print_output('Already downloaded %s' % link)
                fetch = False
            if fetch:
                self.__print_output('Retrieving %s...' % link)
                retrieve_remote_file(link, destfilename,
                                     self.web_user, self.web_password)
        except IOError:
            self.__print_output("Unknown URL: " + link + ". Skipping.")
            continue
        archives.append(MBoxArchive(destfilename, link))
    return archives
def __retrieve_from_mailman(self, mailing_list):
    """Download mboxes from mailman interface"""
    # Get all the links listed in the URL
    #
    # The archives are usually retrieved in descending
    # chronological order (newest archives are always
    # shown on the top of the archives). Reverse the list
    # to analyze in chronological order.
    html_parser = MyHTMLParser(mailing_list.location,
                               self.web_user, self.web_password)
    links = html_parser.get_mboxes_links(self.force)

    archives = []
    for link in links:
        local_path = os.path.join(mailing_list.compressed_dir,
                                  os.path.basename(link))
        try:
            # The current month must always be re-fetched; other
            # months only when no local copy exists yet.
            month = find_current_month(link)
            if month:
                self.__print_output('Current month detected: '
                                    'Found substring %s in URL %s...'
                                    % (month, link))
            if month or not os.path.exists(local_path):
                self.__print_output('Retrieving %s...' % link)
                retrieve_remote_file(link, local_path,
                                     self.web_user, self.web_password)
            else:
                self.__print_output('Already downloaded %s' % link)
        except IOError:
            self.__print_output("Unknown URL: " + link + ". Skipping.")
            continue
        archives.append(MBoxArchive(local_path, link))
    return archives
def __analyze_remote(self, url):
    """Download the archives from the remote url, stores and parses them.

    Downloads every mbox linked from *url* into the local compressed
    directory, marks them as visited in the database, uncompresses
    them when needed, and hands the resulting files (in chronological
    order) to __analyze_list_of_files.

    :param url: base URL of the remote archive index.
    :return: whatever __analyze_list_of_files returns.
    """
    # Check directories to store the archives
    target = re.sub('^(http|ftp)[s]{0,1}://', '', url)
    compressed_dir = os.path.join(self.COMPRESSED_DIR, target)
    mbox_dir = os.path.join(self.MBOX_DIR, target)
    if not os.path.exists(compressed_dir):
        os.makedirs(compressed_dir)
    if not os.path.exists(mbox_dir):
        os.makedirs(mbox_dir)

    # If the file is for the current month (MailMan filename
    # YYYY-MMM.txt.gz) don't mark as visited, and download again.
    # Assuming this is run daily, it's better to take yesterday's
    # date, to ensure we get all of last month's email when the
    # month rolls over.
    yesterday = datetime.datetime.today() + datetime.timedelta(days=-1)
    this_month = yesterday.strftime(mailmanfmt)

    # Get all the links listed in the URL
    htmlparser = MyHTMLParser(url, self.web_user, self.web_password)
    links = htmlparser.get_mboxes_links()

    filepaths = []
    for link in links:
        basename = os.path.basename(link)
        destfilename = os.path.join(compressed_dir, basename)

        # If the URL is for the current month, always retrieve.
        # Otherwise, check visited status & local files first
        if this_month in link:
            self.__print_output('Found substring %s in URL %s...'
                                % (this_month, link))
            self.__print_output('Retrieving %s...' % link)
            retrieve_remote_file(link, destfilename,
                                 self.web_user, self.web_password)
        elif os.path.exists(destfilename):
            # Check if already downloaded
            self.__print_output('Already downloaded %s' % link)
        else:
            self.__print_output('Retrieving %s...' % link)
            retrieve_remote_file(link, destfilename,
                                 self.web_user, self.web_password)
        filepaths.append((link, destfilename))

    files_to_analyze = {}
    url_list = []
    for link, filepath in filepaths:
        # Check if already analyzed
        status = self.db.check_compressed_file(filepath)

        # If the file is for the current month, reimport
        current_month = this_month in filepath
        if current_month:
            self.__print_output('Found substring %s in URL %s...'
                                % (this_month, filepath))

        # If already visited, ignore, unless it's for the current month
        if status == self.db.VISITED and not current_month:
            self.__print_output('Already analyzed %s' % filepath)
            continue

        # If not, set visited
        # (before uncompressing, otherwise the db will point
        # towards the uncompressed temporary file)
        today = datetime.datetime.today().strftime(datetimefmt)
        self.db.set_visited_url(link, url, today, self.db.NEW)

        # Check if compressed
        extension = check_compressed_file(filepath)
        if extension:
            # If compressed, uncompress and get the raw filepaths.
            # uncompress_file returns a list with the path to all the
            # uncompressed files (for instance, a tar file may contain
            # more than one file). Use a fresh name here: the original
            # code rebound 'filepaths' while iterating over it.
            uncompressed = uncompress_file(filepath, extension,
                                           mbox_dir)
            files_to_analyze.setdefault(link, []).extend(uncompressed)
        else:
            # File was not uncompressed, so there is only
            # one file to append
            files_to_analyze.setdefault(link, []).append(filepath)
        url_list.append(link)

    # The archives are usually retrieved in descending
    # chronological order (because the newest archives are always
    # shown on the top of the archives), so reverse the list of
    # URLs to analyze the files in chronological order.
    url_list.reverse()
    return self.__analyze_list_of_files(url, url_list,
                                        files_to_analyze)