Пример #1
0
def get_dom(url):
    """Download *url* and wrap the page content in a ``Document``.

    The page is fetched uncached with a 120-second timeout.  The content is
    then run through the ``unicode_escape`` codec (needed for AJE pages);
    if that decoding is not possible the downloaded content is used as-is.

    Args:
        url: Address of the article to fetch.

    Returns:
        A ``Document`` built from the page content, or ``None`` when the
        download fails with ``URLError`` or ``HTTP404NotFound``.
    """
    try:
        s_content = URL(url).download(timeout=120, cached=False)
    except (URLError, HTTP404NotFound):
        # Single-argument call form prints the same text on Python 2 and 3.
        print("Error downloading article")
        return None

    # for AJE compatibility
    try:
        s_content = s_content.decode("unicode_escape")
    except UnicodeError:
        # Best effort only.  UnicodeError is the common base class of
        # UnicodeEncodeError (implicit ASCII encoding of a unicode object on
        # Python 2) and UnicodeDecodeError (malformed or truncated escape
        # sequences in the page), so either failure keeps the raw content.
        pass

    return Document(s_content)
Пример #2
0
def get_dom(url):
    """Fetch the page at *url* and return it as a ``Document``.

    The download is uncached and uses a 120-second timeout.  Afterwards the
    content is decoded with the ``unicode_escape`` codec (required for AJE
    pages); when decoding fails the downloaded content is kept unchanged.

    Args:
        url: Address of the article to fetch.

    Returns:
        A ``Document`` wrapping the page content, or ``None`` if the
        download raised ``URLError`` or ``HTTP404NotFound``.
    """
    try:
        s_content = URL(url).download(timeout=120, cached=False)
    except (URLError, HTTP404NotFound):
        # Call form with one argument behaves identically on Python 2 and 3.
        print("Error downloading article")
        return None

    #for AJE compatibility
    try:
        s_content = s_content.decode('unicode_escape')
    except UnicodeError:
        # Deliberate best effort: catch both UnicodeEncodeError and
        # UnicodeDecodeError (e.g. a truncated "\u" escape in the page)
        # through their shared base class and fall back to the raw content.
        pass

    return Document(s_content)
Пример #3
0
# Run the same search three times, three tweets per request.  ``index`` holds
# the id of the most recently printed tweet and is passed as ``start`` to the
# next search call.
index = None
for _ in range(3):
    batch = twitter.search('artificial intelligence', start=index, count=3)
    for tweet in batch:
        print(tweet.text)
        index = tweet.id

# ### Converting HTML Data to Plain Text

from pattern.web import URL, plaintext

# Download the article, decode the downloaded content as UTF-8, pass it
# through plaintext(), and print the result.
article_url = 'https://stackabuse.com/python-for-nlp-introduction-to-the-textblob-library/'
html_content = URL(article_url).download()
html_text = html_content.decode('utf-8')
cleaned_page = plaintext(html_text)
print(cleaned_page)

# ### Parsing PDF Documents

# #### Using Pattern PDF module (doesn't work)

# # This doesn't work
# from pattern.web import URL, PDF

# pdf_doc = URL('http://demo.clab.cs.cmu.edu/NLP/syllabus_f18.pdf').download()
# # pdf_doc2 = URL('https://courses.cs.ut.ee/LTAT.01.001/2020_spring/uploads/Main/Lecture1_Introduction.pdf').download()
# print(PDF(pdf_doc2.decode('utf-8')))

# #### Using PyPDF2 library [4]
Пример #4
0
from pattern.web import download
# Fetch the Wikipedia article on artificial intelligence.
# unicode=True is forwarded to download() (presumably so the page comes back
# as text rather than raw bytes -- confirm against the pattern.web docs).
wiki_url = 'https://en.wikipedia.org/wiki/Artificial_intelligence'
page_html = download(wiki_url, unicode=True)

#You can also download files from web pages (for example, images) using the URL object's download() method:

from pattern.web import URL, extension
# Save the remote image next to the script as 'football' plus the extension
# that extension() derives from the URL's page component.
page_url = URL(
    'https://upload.wikimedia.org/wikipedia/commons/f/f1/RougeOr_football.jpg')
# Download before opening the target so a failed download does not leave an
# empty file behind.
image_data = page_url.download()
# The context manager closes the file even if the write raises.
with open('football' + extension(page_url.page), 'wb') as image_file:
    image_file.write(image_data)

#%% Finding URLs within Text
#You can use the find_urls function to extract URLs from text strings. Here is an example:
from pattern.web import find_urls
# Extract the URLs found in the sample sentence (with unique=True) and print
# whatever find_urls() returns.
sample_text = 'To search anything, go to www.google.com'
found_urls = find_urls(sample_text, unique=True)
print(found_urls)

#%% Parsing PDF Documents
#The Pattern library contains a PDF object that can be used to parse a PDF document. PDF (Portable Document Format) is a cross-platform file format that bundles images, text, and fonts in a stand-alone document.

from pattern.web import URL, PDF
# Download the PDF from the given URL.
pdf_doc = URL('http://demo.clab.cs.cmu.edu/NLP/syllabus_f18.pdf').download()
# Bare expression: it only produces visible output when run interactively
# (e.g. in a notebook cell or REPL); in a plain script it does nothing.
pdf_doc
# Decode the download as UTF-8, hand the text to PDF(), and print the result.
# NOTE(review): raw PDF data is often not valid UTF-8, so this decode may
# raise UnicodeDecodeError -- confirm what input PDF() expects in the
# installed pattern version.
print(PDF(pdf_doc.decode('utf-8')))

#%% Clearing Cache
from pattern.web import cache
# Empty the pattern.web cache object (presumably the locally stored results
# of earlier downloads -- confirm against the pattern.web documentation).
cache.clear()