Пример #1
0
        def find_email(responce, contact_link):
            """Return an e-mail address found in the visible text of a response.

            The markup in ``responce.body`` is parsed, ``<script>`` and
            ``<style>`` elements are removed, and the remaining text is scanned
            line by line. Only addresses ending in one of the TLDs
            com/org/de/edu/gov/uk and whose length is strictly between 5 and 75
            characters are accepted.

            Args:
                responce: Object whose ``body`` attribute holds the page markup.
                contact_link: Not used for extraction; kept so existing call
                    sites remain valid. When ``None`` is returned, the caller
                    can schedule a request to this link and retry on that page.

            Returns:
                The first address on the last line that contains an acceptable
                one, or ``None`` when the page contains none.
            """
            # Raw string so ``\.`` is a regex escape rather than an invalid
            # string escape. The TLD group is non-capturing so the match is the
            # whole address, and the local part / domain accept the punctuation
            # that real addresses use (dots, hyphens, sub-domains).
            email_pattern = re.compile(
                r'[A-Za-z0-9._%+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)*'
                r'\.(?:com|org|de|edu|gov|uk)\b')

            # Parse the response that was actually passed in.
            soup = BeautifulSoup(responce.body, 'lxml')
            # Drop non-visible content so addresses inside JS/CSS are ignored.
            for tag in soup(["script", "style"]):
                tag.extract()

            # Initialised up front so the function returns None instead of
            # failing on an unbound name when nothing matches.
            email = None
            for line in soup.get_text().split('\n'):
                match = email_pattern.search(line)
                if match and 5 < len(match.group(0)) < 75:
                    email = match.group(0)

            return email
Пример #2
0
        def find_email(contact_link):
            """Download ``contact_link`` and return an e-mail address from it.

            The page is fetched with ``requests.get``, ``<script>`` and
            ``<style>`` elements are removed, and the remaining text is scanned
            line by line. Only addresses ending in one of the TLDs
            com/org/de/edu/gov/uk and whose length is strictly between 5 and 75
            characters are accepted.

            Args:
                contact_link: URL of the page to download and scan.

            Returns:
                The first address on the last line that contains an acceptable
                one, or ``None`` when the page contains none.

            Raises:
                Any exception raised by ``requests.get`` propagates unchanged.
            """
            # NOTE(review): no timeout is passed, so a stalled server blocks
            # this call indefinitely; consider adding one.
            r = requests.get(contact_link)

            # Raw string so ``\.`` is a regex escape rather than an invalid
            # string escape. The local part and domain accept the punctuation
            # that real addresses use, so "john.doe@mail.example.com" is
            # returned whole instead of being truncated or missed.
            email_pattern = re.compile(
                r'[A-Za-z0-9._%+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)*'
                r'\.(?:com|org|de|edu|gov|uk)\b')

            soup = BeautifulSoup(r.text, 'lxml')
            # Drop non-visible content so addresses inside JS/CSS are ignored.
            for tag in soup(["script", "style"]):
                tag.extract()

            # Initialised up front: a page without any address yields None
            # without relying on exceptions for control flow.
            email = None
            for line in soup.get_text().split('\n'):
                match = email_pattern.search(line)
                if match and 5 < len(match.group(0)) < 75:
                    email = match.group(0)

            return email