def find_email(responce, contact_link):
    """Return an e-mail address found in the text of a response, or ``None``.

    The markup in ``responce.body`` is parsed with BeautifulSoup, ``<script>``
    and ``<style>`` elements are removed, and the remaining text is scanned
    line by line. Only the first match on each line is considered, and it is
    kept only if it is 6 to 74 characters long. When several lines qualify,
    the address from the last one is returned.

    NOTE(review): this module appears to define ``find_email`` more than
    once; a later definition would shadow this one -- confirm which is meant
    to be used.

    Args:
        responce: Object exposing the page markup as ``.body``. The spelling
            of the parameter name is kept so existing callers keep working.
        contact_link: Not used by this function; kept for backward
            compatibility. Following the link when no address is found is
            left to the caller.

    Returns:
        The matched address as a string, or ``None`` if no line contains a
        qualifying address.
    """
    # Raw string avoids the invalid "\." escape. The TLD alternation is
    # non-capturing so a match yields the whole address rather than just the
    # top-level domain.
    email_pattern = re.compile(
        r'[A-Za-z0-9]*@[A-Za-z0-9]*\.(?:com|org|de|edu|gov|uk)')

    soup = BeautifulSoup(responce.body, 'lxml')
    # Script and style contents are not visible text; drop them before
    # extracting the page text.
    for tag in soup(["script", "style"]):
        tag.extract()

    email = None
    for line in soup.get_text().split('\n'):
        match = email_pattern.search(line)
        if match is None:
            continue
        candidate = match.group(0)
        # Length bounds filter out fragments such as "@a.de" and runaway
        # matches.
        if 5 < len(candidate) < 75:
            email = candidate
    return email
def find_email(contact_link, *, timeout=10):
    """Fetch ``contact_link`` and return an e-mail address found on the page.

    The page is downloaded with ``requests``, parsed with BeautifulSoup,
    ``<script>`` and ``<style>`` elements are removed, and the remaining text
    is scanned line by line. Only the first match on each line is considered,
    and it is kept only if it is 6 to 74 characters long. When several lines
    qualify, the address from the last one is returned.

    Args:
        contact_link: URL of the page to scan. An empty or ``None`` link
            yields ``None`` without making a request.
        timeout: Seconds to wait for the server, passed straight to
            ``requests.get`` so a stalled host cannot hang the caller.

    Returns:
        The matched address as a string, or ``None`` if the link is empty or
        no line contains a qualifying address.

    Raises:
        requests.RequestException: If the page cannot be fetched (including
            on timeout).
    """
    if not contact_link:
        return None

    # Raw string avoids the invalid "\." escape; the TLD alternation does not
    # need to capture because the whole match is used.
    email_pattern = re.compile(
        r'[A-Za-z0-9]*@[A-Za-z0-9]*\.(?:com|org|de|edu|gov|uk)')

    page = requests.get(contact_link, timeout=timeout)
    soup = BeautifulSoup(page.text, 'lxml')
    # Script and style contents are not visible text; drop them before
    # extracting the page text.
    for tag in soup(["script", "style"]):
        tag.extract()

    email = None
    for line in soup.get_text().split('\n'):
        match = email_pattern.search(line)
        if match is None:
            continue
        candidate = match.group(0)
        # Length bounds filter out fragments such as "@a.de" and runaway
        # matches.
        if 5 < len(candidate) < 75:
            email = candidate
    return email