示例#1
0
class DataCollector():
    """Extract URLs from chat messages and append them to a links log.

    On construction the log file named by ``BotConfig().get_link_path()`` is
    opened in append mode and stamped with the current date.  Each call to
    ``extract_links`` then appends one line per accepted URL in the form
    ``<chan> - <user>: <url> > <page title>``.

    If the log cannot be opened, ``link_log`` is ``None`` and the collector
    becomes a no-op instead of failing later with ``AttributeError``.
    """

    # Matches URLs that start with a scheme or "www." (so not a bare
    # "google.com"); originally taken from regexlib.com.
    #
    # The pattern is assembled from raw-string pieces on purpose: it is
    # compiled with re.VERBOSE, and whitespace *inside* a character class is
    # NOT ignored in verbose mode.  Breaking a line in the middle of a class
    # would make the host part match spaces and newlines.
    # In the trailing path class, "/-~" is a range from "/" through "~", so
    # query-string characters such as "?", "=" and ":" are accepted there.
    pattern = (
        r"(((http|ftp|https|ftps|sftp)://)|(www\.))+"
        r"(([a-zA-Z0-9\._-]+\.[a-zA-Z]{2,6})"
        r"|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))"
        r"(/[a-zA-Z0-9\&%_\./-~-]*)?"
    )

    # Links containing any of these keywords are NOT written to the links log.
    filter_pat = "(codepad|pastebin|pocoo)"

    # Open log handle, or None when no log is available.
    link_log = None

    def __init__(self):
        """Load the bot configuration and open the links log for appending."""
        self.config = BotConfig()
        self.link_log = None
        try:
            # The log is opened with UTF-8 encoding because unicode data is
            # written to it at times -- specifically the em dash (\u2014),
            # which represents an underscore in the username.
            self.link_log = codecs.open(self.config.get_link_path(), "a", encoding="utf-8")
            self.link_log.write(self.get_date() + "\n")
        except IOError:
            # If the open itself failed, link_log is still None and logging
            # is disabled; report it rather than silently dropping the error.
            import sys
            sys.stderr.write("links log can't be opened.\n")

    def extract_links(self, user, chan, msg):
        """Log every URL found in ``msg`` that passes the keyword filter.

        Nothing is logged for messages sent by "ChanServ" or when no links
        log is open.  For each accepted URL the page title is looked up with
        ``get_title`` and the log is flushed after the line is written.
        """
        # Both conditions are independent of the individual match, so bail
        # out before doing any regex work or title lookups.
        if self.link_log is None or user == "ChanServ":
            return

        # re.VERBOSE is kept so that an overriding multi-line pattern still
        # compiles; the default pattern contains no whitespace.
        flags = re.IGNORECASE | re.VERBOSE
        for match in re.finditer(self.pattern, msg.strip(), flags):
            # match.group() is the full URL text matched by the regex
            link = match.group()
            if re.search(self.filter_pat, link, re.IGNORECASE):
                continue
            title = self.get_title(link)
            self.link_log.write("%s - %s: %s > %s\n" % (chan, user, link, title))
            self.link_log.flush()

    def close(self):
        """Close the links log if one is open."""
        if self.link_log is not None:
            self.link_log.close()

    def get_date(self):
        """Return the current local date, e.g. ``Sunday, August 01, 2010``."""
        return datetime.now().strftime("%A, %B %d, %Y")

    def get_title(self, url):
        """Return the text of the ``<title>`` tag at ``url``.

        The lookup is best-effort: any failure (unreachable URL, page without
        a title, parse error) yields the string ``"No Title"``.
        """
        # NOTE(review): urllib.urlopen is the Python 2 API; under Python 3 the
        # attribute does not exist and this method always returns "No Title".
        try:
            response = urllib.urlopen(url)
            try:
                source = response.read()
            finally:
                # release the connection even if reading fails
                response.close()
            return BeautifulSoup(source).title.text
        except Exception:
            # Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit still propagate.
            return "No Title"