Example #1
from pattern import web
from pattern.web import URL, Element

url = URL('https://docs.google.com/spreadsheets/d/1J2I40hglES63YZHROcOL3oAjDPqiiKLRPE_ikAWsR-Q/pubhtml?gid=1267634591').read()
dom = Element(url)
dom = dom.by_tag('tbody')[0]

# date: get the date from the header
date = dom.by_class('s0')[1].content


# places: read the place from each class='s4' element inside <td>
places = []
for ix in dom.by_class('s4'):
    places.append(ix.content)

try:
    reading_row = [4, 10, 16, 22]
    pol_reading = []

    for row in reading_row:
        reading = dom.by_tag('tr')[row]
        reading = reading.by_tag('td')
        for i in reading:
            if len(i) >= 1:
                pol_reading.append(i.content)


    pol_updated_row = [5, 11, 17, 23]
    pol_updated = []
Example #2
File: 07-dom.py Project: Grepsy/cityment
import os, sys
sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.web import URL, Document, plaintext
from pattern.web import NODE, TEXT, COMMENT, ELEMENT, DOCUMENT

# The web module has a number of convenient search engines,
# but often you will need to handle the HTML in web pages of your interest manually.
# The Document object can be used for this, similar to the Javascript DOM.

# For example:
url = URL("http://www.reddit.com/top/")
dom = Document(url.download(cached=True))
for e in dom.get_elements_by_tagname("div.entry")[:5]:  # Top 5 reddit entries.
    for a in e.get_elements_by_tagname("a.title")[:1]:  # First <a class="title"> in entry.
        print plaintext(a.content)
        print a.attributes["href"]
        print

# The Document object is a tree of Element and Text objects.
# All objects inherit from Node, Document also inherits from Element.

# Node.type          => NODE, TEXT, COMMENT, ELEMENT, DOCUMENT
# Node.parent        => Parent Node object.
# Node.children      => List of child Node objects.
# Node.next          => Next Node in Node.parent.children.
# Node.previous      => Previous Node in Node.parent.children.

# Document.head      => Element with tag name "head".
# Document.body      => Element with tag name "body".
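
# A tiny illustration of the attributes listed above (not part of the original example),
# reusing the dom parsed earlier; only a sketch of how the tree can be walked.
print dom.type == DOCUMENT       # the root node is a Document
print dom.body.type == ELEMENT   # <body> is an Element
for node in dom.body.children[:3]:
    print node.type, node.parent is dom.body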
Example #3
finder.apply_word_filter(filter_ngrams)
finder.nbest(bigram_measures.pmi, 10)


print scored

## readability
import pprint
printp = pprint.PrettyPrinter()
printp.pprint(text)


## Pattern
from pattern.web import URL, GET, plaintext
url = URL(url, method=GET)
text = url.download()
simple_text = plaintext(text, indentation=True)







#### Scoring:

scored = finder.score_ngrams(bigram_measures.likelihood_ratio)

# Group bigrams by first word in bigram.
prefix_keys = collections.defaultdict(list)
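
# A likely continuation (a sketch, not part of the original snippet): group each scored
# bigram under its first word, as the comment above describes.
for bigram, score in scored:
    prefix_keys[bigram[0]].append((bigram[1], score))

# The best completions of a given word can then be looked up, e.g.:
# prefix_keys['pattern'][:5]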
Example #4
def extract_tvseries(dom):

    # declare the observed URL
    url = URL(TARGET_URL)

    # download the URL to read the code
    dom = DOM(url.download(cached=True))

    # initialize list for the data we need
    data = []

    # iterate over parts of the list of tv series
    for e in dom('div[class="lister-item-content"]'):
        # look for title and accept all symbols
        title = e('a')[0].content.encode("utf-8")

        # look for rating
        rating = e('div[class="inline-block ratings-imdb-rating"] strong')[0].content

        # initialize list for genres
        genre = []

        # calculate amount of genres
        alengte = len(e('span[class="genre"]'))

        # iterate over all genres
        for a in range(alengte):
            # look for genres and accept all symbols
            genres = e('span[class="genre"]')[a].content.encode("utf-8")

            # add new genre to list of genres
            genre.append(genres)

        # remove extra whitespace and join list with commas in between
        stripped_genres = (",".join([s.strip() for s in genre]))

        # initialize list for actors
        actors = []

        # calculate amount of actors
        blengte = len(e('a'))

        # start looking at position of first actor
        for b in range(12, blengte):
            # look for actor and make sure all symbols are accepted
            actor = e('a')[b].content.encode("utf-8")

            # add new actor to list
            actors.append(actor)

        # join actors with commas in between
        stripped_actors = (", ".join(actors))

        # look for runtime
        runtime = e('span[class="runtime"]')[0].content

        # keep the digits, throw away 'min'
        runtime_digits = ''.join([i for i in runtime if i.isdigit()])

        # add the new data to data list
        data.append(
            (title, rating, stripped_genres, stripped_actors, runtime_digits))

    # return the collected data
    return data
Example #5
            col_len=2)

        xls_set_class.open_excel_and_process_block_data()
        xls_set_class.data_label_list

    if choice == 7:
        """ use the price extract here to pull """


    if choice == 8:
        """ Get the short sell volume. Need to get the shortsell vs the volumne traded that day
            Url need change with the current date

        """
        target_url = 'http://sgx.com/wps/wcm/connect/sgx_en/home/market_info/short_sale/short_sale_daily/DailyShortSell20150417.txt'
        url = URL(target_url)
        url_data = url.download(timeout=50)
        shortsell_list = pandas.io.html.read_html(url_data)
        shortsell_df = shortsell_list[1]
        # need to remove the first rows
        shortsell_df.rename(columns={
            0: 'Security',
            1: 'Short Sale Volume',
            2: 'Currency',
            3: 'Short Sale Value',
        },
                            inplace=True)
        shortsell_df = shortsell_df[1:-3]
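
        # The docstring above notes that the URL must change with the current date; a possible
        # way to build it for today (a sketch, not in the original code, assuming the same
        # DailyShortSellYYYYMMDD.txt naming convention and `import datetime` at the top):
        todays_url = ('http://sgx.com/wps/wcm/connect/sgx_en/home/market_info/'
                      'short_sale/short_sale_daily/DailyShortSell%s.txt'
                      % datetime.datetime.now().strftime('%Y%m%d'))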

    if choice == 9:
        """ combine the shortsell with the current price."""
Example #6
except:
    csv = Datasheet()
    seen = set()

for (level, bias, label, name), url in sources.items():
    try:
        f = Newsfeed()
        f = f.search(url, cached=False)
    except:
        continue

    for r in f:

        # 1) Download source & parse the HTML tree:
        try:
            src = URL(r.url).download(cached=True)
            dom = DOM(src)
        except Exception as e:
            continue

        # 2) Find article text w/ CSS selectors:
        for selector in (
                "article[class*='node-article']",  # The Hill
                "span[itemprop='articleBody']",
                "div[itemprop='articleBody']",
                "div[id='rcs-articleContent'] .column1",  # Reuters
                "div[class='story-body']",
                "div[class='article-body']",
                "div[class='article-content']",
                "div[class^='tg-article-page']",
                "div[class^='newsArticle']",
Example #7
r_url = redis.StrictRedis(host='localhost', port=6379, db=0)
r_text = redis.StrictRedis(host='localhost', port=6379, db=1)
# c=0 # debug counter
# output = open('test_output_redis.txt','w') # deprecated, was for debug

while True:
    # Take the url and add it to the redis URL store WITH an expire time of EXPIRE_IN seconds.
    # If the result of the redis INCR command is > 1, the URL was already there (but we still updated its TTL);
    # otherwise the url is new and we also resolve it semantically (should this be another beanstalk tube, another job?).
    # Resolving the url means: fetch it with pattern and check the mimetype to make sure we only parse
    # text-containing content, then use pattern to get chunks and noun phrases and shove them into
    # another redis store (where the key is the phrase and the value is just INCR?).
    # A sketch of the chunk/noun-phrase step appears further below.

    job = beanstalk.reserve()  # this is blocking, waits till there's something on the stalk
    url = URL(job.body)
    pipe = r_url.pipeline(transaction=True)
    # should I be updating the TTL? Experience-design question more than anything
    redis_response = pipe.incr(url).expire(url, EXPIRE_IN).execute()
    # print redis_response

    if (redis_response[0] < 2):
        print 'new url, we think', url
        try:
            s = url.download(cached=True)
            print url.mimetype
            if (url.mimetype in MIMETYPE_WEBPAGE) or (url.mimetype
                                                      in MIMETYPE_PLAINTEXT):
                s = plaintext(s)
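                # A sketch of the step described in the comments above (not part of the original
                # snippet): pull noun-phrase chunks out of the text with pattern's parsetree and
                # count them in the second redis store, keyed by phrase.
                # Assumes `from pattern.en import parsetree` at the top of the (truncated) script.
                for sentence in parsetree(s):
                    for chunk in sentence.chunks:
                        if chunk.type == "NP":
                            r_text.incr(chunk.string.lower())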
                '''
                parsetree(string,
Example #8
#print books(1)
#print books(2)

# We can use Chrome's Developer Tools to inspect the HTML of the overview page.
# It turns out each link to each book is contained in a <div class="prod"> element.

# In Pattern, the DOM (Document Object Model) is a tree of nested HTML elements,
# along with useful methods to traverse and search the tree.
# http://www.clips.ua.ac.be/pages/pattern-web#DOM
# It is easy to fetch each <div class="prod">:

corpus = Datasheet()

for i in range(45):  # How many pages?
    url = books(i + 1)
    url = URL(url)
    html = url.download(cached=True)  # Cache the HTML source locally.
    for product in DOM(html).by_class("prod"):
        #print product.source

        # The link to each book page looks something like:
        # http://www.amazon.fr/dieux-voyagent-toujours-incognito/dp/2266219154/
        a = product.by_tag("a")[0]
        a = a.attributes["href"]
        #print a

        # After some searching with Chrome,
        # I found that there is a page with 10 reviews about this book:
        # http://www.amazon.fr/product-reviews/2266219154/
        # So we want to parse the book id from the first link and mine its reviews page:
        id = a.split("/")[-2]
Example #9
import csv
from pattern.web import URL, DOM, plaintext
from collections import defaultdict

filterlist = ["1","2","3","4","5","6","7","8","9","0","/","-"]

playerlist = list()

players = ["3975/stephen-curry", "9/ray-allen", "552/reggie-miller", "841/jason-terry", "662/paul-pierce", "429/jason-kidd", "136/vince-carter", "165/jamal-crawford", "63/chauncey-billups", "2011/kyle-korver", "469/rashard-lewis", "813/peja-stojakovic", "1007/joe-johnson", "110/kobe-bryant"]

htmllink = "http://espn.go.com/nba/player/stats/_/id/"

output_file = open('total3pointers.json', 'w')

for player in players:
    TARGET_URL = URL(htmllink + player)
    dom = DOM(TARGET_URL.download(cached=True))
    dataofyear = dict()

    print player
    total3pointers = 0

    for e in dom.by_tag("div.mod-container mod-table mod-player-stats"):
        for a in e.by_tag("div.mod-content")[1:2]:
            for tablehead in a.by_class("tablehead"):
                year = -1
                for oddrow in tablehead.by_class("oddrow"):
                    madeshots = oddrow[4].content[:3]
                    madeshots = int(madeshots.replace("-", ""))

                    total3pointers += int(madeshots)
Example #10
    url = link
    r = requests.get(url)
    soup = BeautifulSoup(r.content)
    gdata = soup.find_all('div',{'class':'row'})
    for g in gdata:
        baselink = 'http://www.txlottery.org'
        try:
            if re.findall('[0-9]{2} [A-Za-z]{3} [0-9]{2}',str(g)):
                date = re.findall('[0-9]{2} [A-Za-z]{3} [0-9]{2}',str(g))
        except:
            pass
        for link in g('a'):
            try:
                if re.search('pdf$',str(link.get('href')),flags=re.IGNORECASE):
                    link = baselink+link.get('href')
                    pdfurl = URL(link)
                    pdfext = '/Users/macuser/Desktop/smithpdf'+link[link.rfind('/'):]
                    pdfext = re.sub('[!@#\$%\^&*]','',str(pdfext))
                    f = open(pdfext, 'wb')
                    f.write(pdfurl.download(cached=False))
                    f.close()
                    links.append(link)
                    callThis = 'pdftotext '+pdfext+' '+'/Users/macuser/Desktop/smithtxt'+pdfext[pdfext.rfind('/'):-4]+'.txt'
                    subprocess.call(callThis,shell=True)
            except:
                logging.exception('')
                pass


patht = '/Users/macuser/Desktop/smithtxt'
pathp = '/Users/macuser/Desktop/smithpdf'
Example #11
        self.load_stock_symbol_fr_file()


if __name__ == '__main__':

    print "start processing"

    choice = 2

    if choice == 1:
        """try the download format of  YQL"""
        url_address = 'https://query.yahooapis.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.finance.keystats%20WHERE%20symbol%3D%27BN4.SI%27&format=json&diagnostics=true&env=store%3A%2F%2Fdatatables.org%2Falltableswithkeys&callback='
        savefile = r'c:\data\temptryyql.json'

        cache.clear()
        url = URL(url_address)
        f = open(savefile, 'wb')  # save the JSON response to disk
        f.write(url.download())
        f.close()

    if choice == 2:
        """ Handling Json file
            how to include the multiple keys per --> use  w['query']['results']['stats'].keys()

        """

        savefile = r'c:\data\temptryyql.json'
        w = json.load(open(r'c:\data\temptryyql.json', 'r'))
        com_data_stock_list = list()
        for indivdual_set in w['query']['results']['stats']:
            temp_dict_data = {}
Example #12
File: update.py Project: waytai/pattern
#--- DOWNLOAD & UPDATE -------------------------------------------------------------------------------

for p in ("-", "-web", "-db", "-en", "-es", "-de", "-nl", "-search", "-vector",
          "-graph", "-canvas", "-metrics", "-shell", "stop-words", "mbsp-tags",
          "-dev"):
    # We include some useful pages (Penn Treebank tags, stop words) referenced in the documentation.
    if p.startswith("-"):
        p = "pattern" + p.rstrip("-")
        title = p.replace("-", ".")
    if p == "stop-words":
        title = "Stop words"
    if p == "mbsp-tags":
        title = "Penn Treebank II tag set"
    # Download the online documentation pages.
    print "Retrieving", url + p
    html = URL(url + p).download(cached=False)
    # Parse the actual documentation, we don't need the website header, footer, navigation, search.
    html = Document(html)
    html = html.by_id("content-area")
    html = html.by_class("node-type-page")[0]
    html = html.source
    html = strip_javascript(html)
    html = strip_between('<div id="navbar">', '/#navbar -->', html)
    html = strip_between('<div id="sidebar-right">', '/#sidebar-right -->',
                         html)
    html = strip_between('<div id="footer">', '/#footer -->', html)
    html = strip_between('<a href="http://twitter.com/share"', '</a>', html)
    # Link to local pages and images.
    # Link to online media.
    html = html.replace('href="/pages/MBSP"',
                        'href="%sMBSP"' % url)  # MBSP docs (online)
Example #13
from pattern.web import URL, DOM, extension, MIMETYPE_IMAGE
from pattern.web import Element, download
import urllib.request
import datetime

#libraries to check urllib (legacy vs not), pattern, requests
url = URL("http://www.dot.ca.gov/dist1/d1tmc/allcams.php")
dom = DOM(url.download(cached=True))
i = 0
try:
    for e in dom.by_tag('img'):
        if (extension(e.attr['src']) == '.jpg'):
            print(e.attr['src'])
            urllib.request.urlretrieve(e.attr['src'], "data/test/urllib{0}.jpg".format(i))
            #image = download(e.attr['src'], unicode= False, timeout= 5)
            #f = open("data/test/pattern{0}.jpg".format(i), 'wb')
            #f.write(image)
            i += 1
except:
    print ("error")
        
"""
image = "http://www1.dot.ca.gov/cwwp2/data/d1/cctv/image/us101northofcushingcreeklookingsouth/us101northofcushingcreeklookingsouth.jpg"
url = URL(image)
print (url.mimetype in MIMETYPE_IMAGE)
urllib.request.urlretrieve(image, 'data/test2.jpg')
"""

Example #14
File: 12-dom.py Project: Abhishek-1/temp
# The DOM object can be used for this, similar to the Javascript DOM.
# The DOM (Document Object Model) parses a string of HTML
# and returns a tree of nested Element objects.
# The DOM elements can then be searched by tag name, CSS id, CSS class, ...

# For example, top news entries on Reddit are coded as:
# <div class="_1poyrkZ7g36PawDueRza-J s1r3zmnv-7 bmeGah">
#     ...
#     <span class="y8HYJ-y_lTUHkQIc1mdCq yj3st6-1 kYJFRo">
#     ...
#         <a class="SQnoC3ObvgnGjWt90zD9Z " href="http://i.imgur.com/yDyPu8P.jpg">Bagel the bengal, destroyer of boxes</a>
#     ...
# </div>
#
# ... which - naturally - is a picture of a cat.
url = URL("http://www.reddit.com/top/")
dom = DOM(url.download(cached=True))
#print(dom.body.content)
for e in dom.by_tag("div._1poyrkZ7g36PawDueRza-J s1r3zmnv-7 bmeGah"
                    )[:5]:  # Top 5 reddit entries.
    for a in e.by_tag("a.SQnoC3ObvgnGjWt90zD9Z")[:1]:
        print(plaintext(a.content))
        print(a.attrs["href"])
        print("")

# The links in the HTML source code may be relative,
# e.g., "../img.jpg" instead of "www.domain.com/img.jpg".
# We can get the absolute URL by prepending the base URL.
# However, this can get messy with anchors, trailing slashes and redirected URL's.
# A good way to get absolute URL's is to use the module's abs() function:
from pattern.web import abs
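
# A quick illustration (a sketch, not part of the original truncated example), using the
# relative link mentioned in the comments above:
print(abs("../img.jpg", base="http://www.domain.com/home/"))  # http://www.domain.com/img.jpg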
Example #15
# The pattern.web module has a number of convenient search engines, as demonstrated.
# But often you will need to handle the HTML in web pages of your interest manually.
# The DOM object can be used for this, similar to the Javascript DOM.
# The DOM (Document Object Model) parses a string of HTML
# and returns a tree of nested Element objects.
# The DOM elements can then be searched by tag name, CSS id, CSS class, ...

# For example, top news entries on Reddit are coded as:
# <div class="entry">
#     <p class="title">
#         <a class="title " href="http://i.imgur.com/yDyPu8P.jpg">Bagel the bengal, destroyer of boxes</a>
#     ...
# </div>
#
# ... which - naturally - is a picture of a cat.
url = URL("http://www.reddit.com/top/")
dom = DOM(url.download(cached=True))
#print dom.body.content
for e in dom.by_tag("div.entry")[:5]:  # Top 5 reddit entries.
    for a in e.by_tag("a.title")[:1]:  # First <a class="title"> in entry.
        print plaintext(a.content)
        print a.attrs["href"]

# The links in the HTML source code may be relative,
# e.g., "../img.jpg" instead of "www.domain.com/img.jpg".
# We can get the absolute URL by prepending the base URL.
# However, this can get messy with anchors, trailing slashes and redirected URL's.
# A good way to get absolute URL's is to use the module's abs() function:
from pattern.web import abs
url = URL("http://nodebox.net")
for link in DOM(url.download()).by_tag("a"):
Example #16
package_categories = {}

with open(sys.argv[1], 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        package_categories[row[0]] = row[1]

package_titles = [x.replace(".apk","").replace("\n","") for x in open(sys.argv[2]).readlines()]

counter = 0
for package_title in package_titles:
    if package_title in package_categories:
        continue

    try:
        url = URL("https://play.google.com/store/apps/details?id=%s" % package_title)
        dom = DOM(url.download(cached=True))
        for e in dom('a.category'):
            category = e.href.split("/")[-1]
            package_categories[package_title] = category
    except:
        print "Request failed on %s" % package_title
        pass

    counter += 1

    if counter % 5 == 0:
        with open(sys.argv[1], 'wb') as f:
            writer = csv.writer(f)
            writer.writerows([(key, value) for key, value in package_categories.items()])
        print counter
Example #17
import re

from pattern.web import Google, URL
from pattern.web import Document, plaintext

# An interesting experiment on how to use the Google API 
# and http://amplicate.com for opinion mining.
# (let's hope we get a real Amplicate API soon!)

query = "smurf"

# An example result, containing all the information we need:
#   URL: http://amplicate.com/love/george-w-bush
# Title: <b>George</b> W <b>Bush</b> Hate - 64% People Agree (803 opinions)
for r in Google().search(query+" site:amplicate.com"):
    print r.title
    u = URL(r.url)
    if "love" in u.path \
    or "hate" in u.path:
        b = True
        p = u.page.lower().replace("-", "")
        for i, w in enumerate(query.lower().replace("-", " ").split()):
            if i == 0 and not p.startswith(w):
                b=False; break
            if w not in p: 
                b=False; break
        if b:
            love = "love" in u.path
            f = int(re.search("- ([0-9]{1,3})%", r.title).group(1)) * 0.01
            n = int(re.search("\(([0-9]+) opinions", r.title).group(1))
            print r.title
            print r.url
Example #18
        use_audio = 0

        if not use_audio:

            selected_video_obj = video.getbest('mp4')
            if selected_video_obj is None:
                selected_video_obj = video.getbest('flv')

            if selected_video_obj:
                download_fullpath = os.path.join(
                    r'c:\data\temp\youtube_videos',
                    selected_video_obj.filename)
                if not os.path.isfile(download_fullpath):
                    selected_video_obj.download(download_fullpath, quiet=True)

        else:
            bestaudio = video.getbestaudio()
            download_fullpath = os.path.join(r'c:\data\temp\youtube_videos',
                                             bestaudio.filename)
            if not os.path.isfile(download_fullpath):
                bestaudio.download(download_fullpath, quiet=True)

    if choice == 5:
        """ Retrieve indvidvial file """
        url_target = 'https://www.youtube.com/results?search_query=ogt+cat'
        url = URL(url_target)
        dom_object = DOM(url.download(cached=True))
        w = dom_object(
            'div[class="yt-lockup-content"] h3[class="yt-lockup-title"] a')
Example #19
        ##            ss.quick_set_symbol_and_param_type('S58.SI', 'analyst_opinion')
        ss.form_full_url()
        ##            print
        ##            print ss.get_list_of_param_selector_avaliable()
        ##            print ss.full_url_str
        ##            #ss.parse_company_desc()
        ##            d = ss.parse_analyst_opinion()
        ##            print ss.header_list, ss.value_list
        ss.parse_all_parameters()
        print ss.individual_stock_df
        ss.individual_stock_df.to_csv(r'c:\data\check.csv')

    if choice == 3:

        url_str = 'https://sg.finance.yahoo.com/q/ks?s=S24.SI'
        url = URL(url_str)
        dom_object = DOM(url.download(cached=True))
        # get the year
        w = dom_object('td[class="yfnc_tabledata1"]')
        w = dom_object('td[class="yfnc_tablehead1"]')
        for n in range(len(w)):
            print n
            print w[n].content

    if choice == 2:
        yf = YFinanceDirectScrape()
        yf.create_dom_object()
        # b tag inside the td
        # based on css selectors
        for n in yf.tag_element_results(yf.dom_object,
                                        'td[class="yfnc_modtitle1"] b'):
Example #20
import pandas as pd
pd.core.common.is_list_like = pd.api.types.is_list_like
from pandas_datareader import data as pdr
import fix_yahoo_finance
from pattern.web import URL

tickers = pd.read_csv('static/wilshire5000.csv', delimiter=',')
tickers.head()

#pdr.get_data_yahoo(tickers['Ticker'][0], '2015-01-01', '2015-01-08')

for stock in tickers['Ticker']:
    webpage = "http://financials.morningstar.com/ajax/exportKR2CSV.html?t=%s&culture=en-CA&region=USA&order=asc&r=314562" % stock
    url = URL(webpage)
    f = open('%s_keyratios.csv' % stock, 'wb')
    f.write(url.download())
    f.close()
Example #21
from s3pipeline import S3Connector
import json
from pattern.web import URL

if __name__ == "__main__":
    okrMain = S3Connector()
    okrMain.accessBucket('sumit_okr')

    okrPdfBc = S3Connector()
    okrPdfBc.accessBucket('sumit_okr_pdf')

    for key in okrMain.bucket:
        if okrPdfBc.bucket.get_key(key):
            print 'key exists... skipping %s' % key
            continue
        else:
            print 'saving .pdf of %s' % key
            record = json.loads(okrMain.getStringContent(key))
            pdfUrl = record['pdfUrl']
            try:
                url = URL(pdfUrl)
                okrPdfBc.storeStringContent(key, url.download(cached=False))
            except:
                print '%s is not a valid URL' % pdfUrl
                okrPdfBc.storeStringContent(key, '<No pdf for this article>')
Example #22
    def isPDF(self, param):
        url = URL(param)
        if "pdf" in extension(url.page):
            return 1
        else:
            return 0
Example #23
        print e
        print e.src
        print e.src.read()
        return {}, {}
    return bot_replies, user_replies


if __name__ == '__main__':
    import json
    from pattern.web import URL, Twitter

    # Tweet to post:
    tweet = "test tweet"

    url = URL("https://api.twitter.com/1.1/statuses/update.json",
              method="post",
              query={"status": tweet})

    twitter = Twitter(license=ccpattern)

    url = twitter._authenticate(url)

    try:
        # Send the post request.
        a = json.loads(url.open().read())
        reply_id = a["id"]
        print reply_id
    except Exception as e:
        print e
        print e.src
        print e.src.read()
Example #24
def save_csv(f, tvseries):
    '''
    Output a CSV file containing highest rated TV-series.
    '''
    writer = csv.writer(f)
    writer.writerow(['Title', 'Runtime', 'Genre', 'Rating', 'Actors'])
    
    # WRITE THE TV-SERIES TO DISK
    movies = 10 
    for i in range(movies):
        writer.writerow(tvseries[i][:])

if __name__ == '__main__':
    # Download the HTML file
    url = URL(TARGET_URL)
    html = url.download()

    # Save a copy to disk in the current directory; this serves as a backup
    # of the original HTML and will be used in grading.
    with open(BACKUP_HTML, 'wb') as f:
        f.write(html)

    # Parse the HTML file into a DOM representation
    dom = DOM(html)

    # Extract the tv series (using the function you implemented)
    tvseries = extract_tvseries(dom)

    # Write the CSV file to disk (including a header)
    with open(OUTPUT_CSV, 'wb') as output_file:
Example #25
#!/usr/bin/env python
# Name:Zelda Zeegers
# Student number:11397705
'''
This script scrapes IMDB and outputs a CSV file with highest rated tv series.
'''
import csv
import re
from pattern.web import URL, DOM
import unicodedata

TARGET_URL = URL(
    "http://www.imdb.com/search/title?num_votes=5000,&sort=user_rating,desc&start=1&title_type=tv_series"
)
dom = DOM(TARGET_URL.download(cached=True))
BACKUP_HTML = 'tvseries.html'
OUTPUT_CSV = 'tvseries.csv'


def extract_tvseries(dom):
    '''
    Extract a list of highest rated TV series from DOM (of IMDB page).

    Each TV series entry should contain the following fields:
    - TV Title
    - Rating
    - Genres (comma separated if more than one)
    - Actors/actresses (comma separated if more than one)
    - Runtime (only a number!)
    '''