def get_dom(url):
    """Download ``url`` and wrap the page content in a ``Document``.

    The page is fetched with ``URL(url).download(timeout=120, cached=False)``.

    Args:
        url: Address of the article to download.

    Returns:
        A ``Document`` built from the downloaded content, or ``None`` when
        the download raises ``URLError`` or ``HTTP404NotFound``.
    """
    try:
        s_content = URL(url).download(timeout=120, cached=False)
    except (URLError, HTTP404NotFound):
        # Single-argument call form prints the same text on Python 2 and 3.
        print("Error downloading article")
        return None

    # for AJE compatibility
    try:
        s_content = s_content.decode("unicode_escape")
    except UnicodeError:
        # Best effort: UnicodeError is the parent of both UnicodeEncodeError
        # (the case handled originally) and UnicodeDecodeError (raised for
        # malformed or truncated escapes). Keep the undecoded content.
        pass
    return Document(s_content)
def get_dom(url):
    """Fetch the page at ``url`` and return it wrapped in a ``Document``.

    The download is performed by ``URL(url).download(timeout=120,
    cached=False)``.

    Args:
        url: Address of the article to download.

    Returns:
        ``Document(s_content)`` for the downloaded content, or ``None`` if
        the download raises ``URLError`` or ``HTTP404NotFound``.
    """
    try:
        s_content = URL(url).download(timeout=120, cached=False)
    except (URLError, HTTP404NotFound):
        # print(...) with one argument behaves the same on Python 2 and 3,
        # unlike the Python 2-only print statement.
        print("Error downloading article")
        return None

    # for AJE compatibility
    try:
        s_content = s_content.decode('unicode_escape')
    except UnicodeError:
        # Decoding is best effort. UnicodeError covers the originally handled
        # UnicodeEncodeError as well as UnicodeDecodeError from malformed
        # escape sequences; on failure the content is used undecoded.
        pass
    return Document(s_content)
# Run the same Twitter search three times. The id of the last tweet printed
# is handed back as `start` on the following request.
index = None
for _ in range(3):
    for tweet in twitter.search('artificial intelligence', start=index, count=3):
        print(tweet.text)
        index = tweet.id

# ### Converting HTML Data to Plain Text

from pattern.web import URL, plaintext

article_url = URL(
    'https://stackabuse.com/python-for-nlp-introduction-to-the-textblob-library/'
)
html_content = article_url.download()
# The downloaded content is decoded as UTF-8 before being passed to plaintext().
cleaned_page = plaintext(html_content.decode('utf-8'))
print(cleaned_page)

# ### Parsing PDF Documents

# #### Using Pattern PDF module (doesn't work)

# # This doesn't work
# from pattern.web import URL, PDF

# pdf_doc = URL('http://demo.clab.cs.cmu.edu/NLP/syllabus_f18.pdf').download()
# # pdf_doc2 = URL('https://courses.cs.ut.ee/LTAT.01.001/2020_spring/uploads/Main/Lecture1_Introduction.pdf').download()
# print(PDF(pdf_doc2.decode('utf-8')))

# #### Using PyPDF2 library [4]
from pattern.web import download

page_html = download('https://en.wikipedia.org/wiki/Artificial_intelligence', unicode=True)

# You can also download files from web pages (for example, images) using the URL class:
from pattern.web import URL, extension

page_url = URL(
    'https://upload.wikimedia.org/wikipedia/commons/f/f1/RougeOr_football.jpg')
# The output name is 'football' plus whatever extension() returns for the
# URL's page. The context manager closes the file even if the download or
# the write raises.
with open('football' + extension(page_url.page), 'wb') as image_file:
    image_file.write(page_url.download())

#%% Finding URLs within Text
# You can use the find_urls function to extract URLs from text strings. Here is an example:
from pattern.web import find_urls

print(find_urls('To search anything, go to www.google.com', unique=True))

#%% Parsing PDF Documents
# The Pattern library contains a PDF object that can be used to parse a PDF
# document. PDF (Portable Document Format) is a cross-platform file format
# which contains images, text, and fonts in a stand-alone document.
from pattern.web import URL, PDF

pdf_doc = URL('http://demo.clab.cs.cmu.edu/NLP/syllabus_f18.pdf').download()
pdf_doc  # bare expression: has no effect when run as a script
# NOTE(review): PDF data is normally binary, so decoding it as UTF-8 may raise
# UnicodeDecodeError. Pattern's documentation passes the downloaded data to
# PDF() directly -- confirm which form works with the installed Pattern version.
print(PDF(pdf_doc.decode('utf-8')))

#%% Clearing Cache
from pattern.web import cache

cache.clear()