def search_emails(self):
    """Crawl the selected URLs and record the e-mail addresses found.

    Builds a ``webcrawler`` over ``self.select_urls()`` (using
    ``self._regexps``, ``self.depth`` and ``self.proxy``) and runs it.
    Whatever ends up in the crawler's ``matches`` is normalised and stored
    in ``self.emails`` as ``{address: 'first_name last_name'}``; the value
    is ``''`` when no usable name accompanies the address.

    The post-processing lives in a ``finally`` clause so that matches
    collected before a cancel (or any other exception) are still recorded.
    The exception itself is not swallowed: it propagates to the caller
    once the matches have been processed.
    """
    # Strings that must not appear in a 'first_name last_name' candidate.
    forbidden_substrings = ('escribi', 'resumes', 'write', 'email', 'cv ',
                            'comments', 'e-mail', 'contact', 'support',
                            'him ', 'please', ' from')
    # Endings that disqualify a 'first_name last_name' candidate.
    forbidden_suffixes = (' to', ' list', ' lists', ' as', ' a')

    # Bind the name before the try block: if select_urls() or the crawler
    # constructor raises, the finally clause must not fail with a NameError
    # that would hide the real error.
    wc = None
    try:
        urls = self.select_urls()
        wc = webcrawler(urls, self._regexps, True, self.depth, self.proxy)
        wc.crawl()
    finally:
        matches = wc.matches if wc is not None else ()
        for email in matches:
            # Lowercase first, since the DB is case insensitive.
            email = email.lower()
            # Undo common obfuscations: ' at ' / ' dot ' spellings, bold
            # tags and the HTML entity for '@'.
            email = email.replace(' at ', '@').replace(' dot ', '.')
            email = email.replace('<b>', '').replace('</b>', '')
            email = email.replace('&#64;', '@')

            # More than one '@' cannot be a single address.
            if email.count('@') > 1:
                continue

            tokens = email.split(' ')
            names = []
            if len(tokens) > 1:
                # Layout is 'first_name last_name address...': the first two
                # tokens are the candidate name, the rest is the address.
                # NOTE(review): a match with exactly two tokens yields an
                # empty address here -- confirm the regexps never produce it.
                names = tokens[:2]
                email = ' '.join(tokens[2:])
                full_name = ' '.join(names)
                rejected = (
                    any(bad in full_name for bad in forbidden_substrings)
                    or full_name.endswith(forbidden_suffixes)
                    or re.search(r'[0-9]', full_name) is not None  # no numbers
                )
                if rejected:
                    names = []

            # Drop a single trailing dot and any leftover '&gt' fragment.
            if email.endswith('.'):
                email = email[:-1]
            email = email.replace('&gt', '')

            # Addresses containing '...' are only kept when they can be
            # repaired by fixEmail().
            if '...' in email:
                if not self.canFixEmail(email, names):
                    continue
                email = self.fixEmail(email, names)

            # Never overwrite a name that is already known for this address.
            if email not in self.emails or self.emails[email] == '':
                self.emails[email] = ' '.join(names)
Пример #2
0
import data_load
import searcher
import indexer
import webcrawler

# Pipeline driver: the four project modules are invoked strictly in this
# order. The return values of traverser() and webcrawler() are discarded.
# NOTE(review): what each stage reads or writes is not visible here --
# check the respective modules before reordering anything.
PROCESS_DATA_ARGS = ("raw_data.pickle", "webdata.pickle")
SEARCH_ARG = "fortune_shelve"

data_load.traverser()
webcrawler.webcrawler()
# The result is kept in ``d`` but is not used again in this script.
d = indexer.process_data(*PROCESS_DATA_ARGS)

searcher.search(SEARCH_ARG)
Пример #3
0
import data_load
import searcher
import indexer
import webcrawler


# Script body: call the loader, the crawler, the indexer and the searcher,
# one after another. Only the indexer's return value is kept (in ``d``).
# NOTE(review): the stages presumably communicate through the two pickle
# names passed to the indexer -- confirm in the modules themselves.
pickle_names = ["raw_data.pickle", "webdata.pickle"]
search_argument = "fortune_shelve"

data_load.traverser()
webcrawler.webcrawler()
d = indexer.process_data(pickle_names[0], pickle_names[1])

searcher.search(search_argument)