import ast
import hashlib
import os
import re
import sqlite3
import time
from random import choice

# pattern 2.x (Python 2) provides the `web` module used throughout.
from pattern import web


def get_player_var_set(url, var_set):
    # Download the match report page and locate the Match Centre link.
    tmp_html = web.URL(url).download(cached=False)
    regM = re.search('href="(/Matches.{1,128}?)".{1,10}Match Centre', tmp_html, re.DOTALL)
    match_centre_url = 'http://www.whoscored.com' + regM.group(1)
    player_stats_url = match_centre_url.replace('Live', 'LiveStatistics')
    html = web.URL(player_stats_url).download(cached=False)
    # The page embeds its data as a JavaScript literal.
    regM = re.search('var initialData = (.*?);', html, re.DOTALL)
    data = regM.group(1)
    # Empty slots (",,") are not valid Python literals, so pad them before eval.
    while ',,' in data:
        data = data.replace(',,', ",' ',")
    data = ast.literal_eval(data)
    match_overview = data[0][0]
    match_details = data[0][1]
    print match_overview[2], match_overview[3]
    for team in match_details:
        player_stats = team[4]
        for p in player_stats:
            for var in p[3][0]:
                var_set.add(var[0])
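# Hypothetical usage sketch (not part of the original script): collect the union of
# per-player stat names across a few match-report pages. The match ids below are
# placeholders, not real matches.
var_set = set()
match_urls = [
    'http://www.whoscored.com/Matches/829535/MatchReport',  # placeholder id
    'http://www.whoscored.com/Matches/829536/MatchReport',  # placeholder id
]
for match_url in match_urls:
    get_player_var_set(match_url, var_set)
print(sorted(var_set))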
def get_links_from_page(number_of_pages):
    # get the initial url
    url = web.URL('http://www.imdb.com/search/title?sort=num_votes,desc&start=1&title_type=feature&year=1990,2012')
    # create an empty list to populate with the urls
    pages = []
    # the loop to get the links
    for page_index in range(number_of_pages):
        # to see which part of the DOM to use, right-click in Chrome
        # and use 'Inspect Element'
        dom = web.DOM(url.download(cached=False))
        entry = dom('span.pagination')[1].by_tag('a')
        if page_index == 0:
            # the first page only has a "Next" button, so the DOM is different
            href = 'http://www.imdb.com/' + entry[0].attributes.get('href')
        else:
            # after the first page there are both "Previous" and "Next" buttons,
            # so select the second link ("Next")
            href = 'http://www.imdb.com/' + entry[1].attributes.get('href')
        pages.append(href)
        print(pages)
        url = web.URL(href)
    # return a de-duplicated list of urls
    return list(set(pages))
def scrape_1_year_data_of_1_league(seed_url, start_week_num, num_weeks):
    # Example arguments:
    #   seed_url = 'http://www.whoscored.com/Regions/252/Tournaments/2/Seasons/1849'
    #   start_week_num = 30
    #   num_weeks = 48
    html = web.URL(seed_url).download(cached=False, user_agent='Mozilla/5.0')
    regM = re.search('<h2 class="tournament-tables-header">(.*?)</h2>', html, re.DOTALL)
    if regM is not None:
        league_name = regM.group(1).replace('Tables', '').strip()
        if league_name == 'Primera Division':
            league_name = 'LIGA BBVA'
    regM = re.search('<div id="sub-navigation".*?>.*?<a href="(.*?)"', html, re.DOTALL)
    regM = re.search('Stages/(\d+)/', regM.group(1), re.DOTALL)
    league_season_id = regM.group(1)
    regM = re.search("min = new Date\((\d+),", html, re.DOTALL)
    start_year = int(regM.group(1))
    # outfile = open('premier-league-2013-2014.txt', 'wb')
    # var_set = Set()
    conn = sqlite3.connect('whoscored.db')
    db_cur = conn.cursor()
    for i in range(start_week_num, start_week_num + num_weeks):
        y = start_year + i / 52  # integer division: year offset from the season start
        w = i % 52               # week of the year
        ajax_request_url = ('http://www.whoscored.com/tournamentsfeed/' + league_season_id +
                            '/Fixtures/?d=' + str(y) + 'W' + str(w) + '&isAggregate=false')
        ajax_return_str = web.URL(ajax_request_url).download(cached=False)
        matches_of_week = ast.literal_eval(ajax_return_str)
        if matches_of_week:
            for m in matches_of_week:
                match_id = m[0]
                match_url = 'http://www.whoscored.com/Matches/' + str(match_id) + '/MatchReport'
                print str(y) + '-W' + str(w) + ': ', match_url
                scrape_single_match(league_name, str(start_year) + '-' + str(start_year + 1),
                                    match_url, match_id, db_cur, conn)
                # outfile.write(url + os.linesep)
                # get_player_var_set(url, var_set)
    conn.close()
def get_data_from_pages(links):
    # start with an empty list
    data = []
    # loop over the links collected by the previous function
    for urltext in links:
        # parse the url
        url = web.URL(urltext)
        # print for a "matrix"-like effect (slower; comment this line out if unwanted)
        print "Getting data from: ", url
        try:
            # the main scraping loop; it is all about DOM manipulation.
            # learn more about the DOM at
            # http://code.tutsplus.com/tutorials/javascript-and-the-dom-series-lesson-1--net-3134
            dom = web.DOM(url.download(cached=False))
            for movie in dom.by_tag('td.title'):
                title = movie.by_tag('a')[0].content
                print title
                genres = movie.by_tag('span.genre')[0].by_tag('a')
                genres = [g.content for g in genres]
                print genres
                director = movie.by_tag('span.credit')[0].by_tag('a')[0].content
                print director
                first_actor = movie.by_tag('span.credit')[0].by_tag('a')[1].content
                print first_actor
                second_actor = movie.by_tag('span.credit')[0].by_tag('a')[2].content
                print second_actor
                runtime = movie.by_tag('span.runtime')[0].content
                print runtime
                rating = movie.by_tag('span.value')[0].content
                print rating
                data.append((title, genres, director, first_actor, second_actor, runtime, rating))
        except KeyboardInterrupt:
            break  # allow Ctrl+C to stop the crawl without losing the data collected so far
        except:
            pass   # skip the page instead of stopping in case of missing data
    return data
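# Hypothetical driver (not part of the original script): crawl a few IMDb result
# pages with the two helpers above and dump the rows to CSV. The output filename
# and the page count are arbitrary choices.
import csv

links = get_links_from_page(5)
rows = get_data_from_pages(links)

with open('imdb_top_movies.csv', 'wb') as f:  # 'wb' for the Python 2 csv module
    writer = csv.writer(f)
    writer.writerow(['title', 'genres', 'director', 'first_actor',
                     'second_actor', 'runtime', 'rating'])
    for (title, genres, director, first_actor, second_actor, runtime, rating) in rows:
        writer.writerow([title, '|'.join(genres), director, first_actor,
                         second_actor, runtime, rating])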
def test_url_query(self):
    # Assert URL.query and URL.querystring.
    v = web.URL(self.url)
    v.query["page"] = 10
    v.query["user"] = None
    self.assertEqual(v.query, {"q": 1, "page": 10, "user": None})
    self.assertEqual(v.querystring, "q=1&page=10&user=")
    # Assert URL.querystring encodes unicode arguments.
    q = ({u"ünîcødé": 1.5}, "%C3%BCn%C3%AEc%C3%B8d%C3%A9=1.5")
    v.query = q[0]
    self.assertEqual(v.querystring, q[1])
    # Assert URL.query decodes unicode arguments.
    v = web.URL("http://domain.com?" + q[1])
    self.assertEqual(v.query, q[0])
    print "pattern.web.URL.query"
    print "pattern.web.URL.querystring"
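# Standalone sketch of the query manipulation exercised by the test above
# (the example URL is made up; parameter order in the querystring may differ):
u = web.URL("http://www.example.com/search?q=cats")
u.query["page"] = 2
print(u.querystring)  # e.g. "q=cats&page=2"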
def test_url_string(self):
    # Assert URL._set_string().
    v = web.URL("")
    v.string = "https://domain.com"
    self.assertEqual(v.parts[web.PROTOCOL], "https")
    self.assertEqual(v.parts[web.DOMAIN], "domain.com")
    self.assertEqual(v.parts[web.PATH], [])
    print "pattern.web.URL.string"
def _test_search_image_size(self, api, source, license, Engine):
    # Assert that image URLs for the different sizes actually exist.
    if api == "Yahoo" and license == ("", ""):
        return
    e = Engine(license, throttle=0.25)
    for size in (web.TINY, web.SMALL, web.MEDIUM, web.LARGE):
        v = e.search("cats", type=web.IMAGE, count=1, size=size, cached=False)
        self.assertEqual(web.URL(v[0].url).exists, True)
        print "pattern.web.%s.search(type=IMAGE, size=%s)" % (api, size.upper())
def test_url_open(self):
    # Assert URLError.
    v = web.URL(self.live.replace("http://", "htp://"))
    self.assertRaises(web.URLError, v.open)
    self.assertEqual(v.exists, False)
    # Assert HTTPError.
    v = web.URL(self.live + "iphone/android.html")
    self.assertRaises(web.HTTPError, v.open)
    self.assertRaises(web.HTTP404NotFound, v.open)
    self.assertEqual(v.exists, False)
    # Assert socket connection.
    v = web.URL(self.live)
    self.assertTrue(v.open() != None)
    self.assertEqual(v.exists, True)
    # Assert user-agent and referer.
    self.assertTrue(v.open(user_agent=web.MOZILLA, referrer=web.REFERRER) != None)
    print "pattern.web.URL.exists"
    print "pattern.web.URL.open()"
def test_url_download(self):
    t = time.time()
    v = web.URL(self.live).download(cached=False, throttle=0.25, unicode=True)
    t = time.time() - t
    # Assert unicode content.
    self.assertTrue(isinstance(v, unicode))
    # Assert download rate limiting.
    self.assertTrue(t >= 0.25)
    print "pattern.web.URL.download()"
def test_url_parts(self):
    # Assert URL._parse and URL.parts{}.
    v = web.URL(self.url)
    for a, b in ((web.PROTOCOL, self.parts["protocol"]),
                 (web.USERNAME, self.parts["username"]),
                 (web.PASSWORD, self.parts["password"]),
                 (web.DOMAIN,   self.parts["domain"]),
                 (web.PORT,     self.parts["port"]),
                 (web.PATH,     self.parts["path"]),
                 (web.PAGE,     self.parts["page"]),
                 (web.QUERY,    self.parts["query"]),
                 (web.ANCHOR,   self.parts["anchor"])):
        self.assertEqual(v.parts[a], b)
    print "pattern.web.URL.parts"
def test_url(self):
    # Assert URL.copy().
    v = web.URL(self.url)
    v = v.copy()
    # Assert URL.__setattr__().
    v.username = "new-username"
    v.password = "new-password"
    # Assert URL.__getattr__().
    self.assertEqual(v.method,   web.GET)
    self.assertEqual(v.protocol, self.parts["protocol"])
    self.assertEqual(v.username, "new-username")
    self.assertEqual(v.password, "new-password")
    self.assertEqual(v.domain,   self.parts["domain"])
    self.assertEqual(v.port,     self.parts["port"])
    self.assertEqual(v.path,     self.parts["path"])
    self.assertEqual(v.page,     self.parts["page"])
    self.assertEqual(v.query,    self.parts["query"])
    self.assertEqual(v.anchor,   self.parts["anchor"])
    print "pattern.web.URL"
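# Standalone sketch of the URL attributes exercised by the tests above; the example
# URL is made up, and the values in the comments reflect the behaviour asserted above.
u = web.URL("http://user:pw@www.example.com:8080/path/page.html?q=1#top")
print(u.protocol)  # "http"
print(u.username)  # "user"
print(u.password)  # "pw"
print(u.domain)    # "www.example.com"
print(u.port)      # 8080
print(u.path)      # ["path"]
print(u.page)      # "page.html"
print(u.query)     # {"q": 1} (numeric values are cast, as in the tests above)
print(u.anchor)    # "top"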
# data = data.replace(',,', ",' ',")
# data = ast.literal_eval(data)
# for p in data[0][1][0][4]:
#     print p[0:3], p[4:]
# end of test
# return html

seed_url = 'http://www.whoscored.com/Regions/252/Tournaments/2/Seasons/3853'
start_week_num = 69
num_weeks = 10

html = web.URL(seed_url).download(cached=False)
regM = re.search("<h1>(.*?)</h1>", html, re.DOTALL)
league_name = regM.group(1)
regM = re.search("img/customstageheaders/(\d+)\.jpg", html, re.DOTALL)
league_season_id = regM.group(1)
regM = re.search("min = new Date\((\d+),", html, re.DOTALL)
start_year = int(regM.group(1))
# outfile = open('premier-league-2013-2014.txt', 'wb')
# var_set = Set()
conn = sqlite3.connect('whoscored.db')
db_cur = conn.cursor()
for i in range(start_week_num, start_week_num + num_weeks):
def test_url_redirect(self):
    # Assert the redirected URL (this depends on where you are).
    # In Belgium, it yields "http://www.google.be/".
    v = web.URL(self.live).redirect
    print "pattern.web.URL.redirect: " + self.live + " => " + str(v)
def test_url_headers(self):
    # Assert URL headers.
    v = web.URL(self.live).headers["content-type"].split(";")[0]
    self.assertEqual(v, "text/html")
    print "pattern.web.URL.headers"
def test_url_mimetype(self):
    # Assert URL MIME-type.
    v = web.URL(self.live).mimetype
    self.assertTrue(v in web.MIMETYPE_WEBPAGE)
    print "pattern.web.URL.mimetype"
def extract_data(package):
    (page, query) = package
    print "Checking %s" % page
    new_webpage = Webpage()
    new_webpage.url = page
    try:
        url = web.URL(page)
        mimetype = url.mimetype
        new_webpage.mimetype = mimetype
        print "Checking mimetype..."
        if mimetype == 'text/html':
            print "Mimetype ok (text/html)"
            # only load webpages
            domain = url.domain             # u'domain.com'
            url_feed = ''
            redirected_page = url.redirect  # actual URL after redirection, or None
            path = url.path                 # [u'path']
            # different options to open a webpage
            print "Opening %s" % page
            html = url.download(user_agent=choice(user_agents), cached=False)
            #html = urllib2.urlopen(page).read()
        else:
            print 'Wrong mimetype (not text/html)'
        new_webpage.successful_open = True
    except:
        print "Could not open page: %s" % page
        new_webpage.successful_open = False
    try:
        if check_query(query, str(html)):  # first make sure the query matches the full html
            new_webpage.successful_open = True
            dom = web.Document(html)
            try:
                title = dom.by_tag('title')[0]
                title = repr(web.plaintext(title.content))
                print "Setting page title to %s" % title
            except:
                print "No title found for %s" % page
                title = ''
            # two methods for charset detection:
            charset = None
            # option to detect the page encoding from the DOM structure
            # => does not seem to work, utf-8 is systematically retrieved...???
            # try:
            #     metas = dom.by_tag('meta')
            #     charset = looking4charset(metas)
            #     print 'charset', charset, 'in page', page
            # except:
            #     charset = None
            # # chardet library use
            # if charset == None:
            #     encoding = chardet.detect(html)
            #     html = html.decode(encoding['encoding'])
            # else:
            #     html = html.decode(charset)
            query_result, text_summary, html_summary = check_page_against_query(html, title, query)
            # the charset guess can be used to decode results
            # if charset == None:
            #     encoding = chardet.detect(html)
            #     html = html.decode(encoding['encoding'])
            # else:
            #     html = html.decode(charset)
            # save textual summaries in an output directory
            #fileout = open('temp/' + page[7:20] + '.htm', 'w')
            #print 'temp/' + page + '.htm'
            #fileout.write(html_summary)
            #fileout.close()
            #if query_result:
            #     dom = web.Document(html_summary)
            #     try:
            #         date = dom.by_tag('date')[0]
            #         date = repr(plaintext(date.content))
            #     except:
            #         date = ''
            #     print '######date', date
            dateregexp = re.compile(r'(\d{4})-|\\(\d{2})-|\\(\d{2})')
            date = ''
            if not redirected_page == None:
                print 'plus redirection: ', redirected_page
                try:
                    date = dateregexp.search(redirected_page).groups()
                    new_webpage.date = '-'.join(date)
                except:
                    pass
            else:
                try:
                    date = dateregexp.search(page).groups()
                    new_webpage.date = '-'.join(date)
                except:
                    pass
            #print '#############date', date
            if date == '':
                date_txt = pattern_date_fr.search(str(text_summary))
                if not date_txt == None:
                    date = date_txt.groups()
                    new_webpage.date = '-'.join(date)
                #date_txt = pattern_date_fr.search("Samedi 6 août 2011606/08/Août/201120:29")
            if query_result:
                try:
                    print 'page: ', new_webpage.url, ' with title: ', title, ' and date', new_webpage.date, 'was assessed as ', query_result
                except:
                    pass
            #print 'date_txt'
            #print 'date_txt:', str(date_txt)
            # fill in the webpage details
            new_webpage.url_redirected = redirected_page
            new_webpage.html = html
            new_webpage.html_summary = html_summary
            new_webpage.text_summary = text_summary
            new_webpage.domain = domain
            new_webpage.query_result = query_result
            new_webpage.url_feed = url_feed
            new_webpage.path = path
            new_webpage.charset = charset
            new_webpage.title = title
            new_webpage.opened = new_webpage.opened + 1
            new_webpage.md5 = hashlib.sha224(text_summary).hexdigest()
            new_webpage.text_html = web.plaintext(html, keep=[], replace=web.blocks,
                                                  linebreaks=2, indentation=False)
            #new_webpage.display_page()
            #new_webpage.links = None
        else:
            # the query is not even in the raw html
            new_webpage.successful_open = True
            new_webpage.query_result = False
    except:
        #print "*** Could not extract data from %s" % page
        pass
    return new_webpage
corpus_out = '/'.join(path.split('/')[:-1]) + '/' + query
print corpus_out
unzip_file_into_dir(path, corpus_out)
path = corpus_out
print 'Path: ', path
if seeks_search == 1:
    print "Seeks search enabled. Creating Seeks file in %s" % path
    make_seeds(query, path, nb_results=nb_results)
dirList = os.listdir(path)
print 'List of files in path: ', dirList
for fname in dirList[:]:
    pagelist = os.path.join(path, fname)
    try:
        url = web.URL(pagelist)
        chaine = url.download(cached=False)
        new_urls = map(lambda x: url_uniformer(x.split('">')[0]), web.find_urls(chaine, unique=True))
        if 'Google Search' in pagelist:
            new_urls = map(lambda x: x.split("&")[0], new_urls)
        for new_url in new_urls[:]:
            print "Checking for forbidden URL..."
            if not check_forbidden((new_url, '')) and not new_url in pages:
                pages[new_url] = inlinks_min
    except:
        pass
print 'Pages init: ', len(pages)
print 'Pages: ', pages
print "Naming database..."
db_name = os.path.join(result_path, query + '_crawl.db')
def scrape_single_match(league_name, season, url, match_id, db_cur, conn):
    tmp_html = web.URL(url).download(cached=False)
    regM = re.search('href="(/Matches.{1,128}?)".{1,10}Match Centre', tmp_html, re.DOTALL)
    match_centre_url = 'http://www.whoscored.com' + regM.group(1)
    player_stats_url = match_centre_url.replace('Live', 'LiveStatistics')
    html = web.URL(player_stats_url).download(cached=False)
    regM = re.search('var initialData = (.*?);', html, re.DOTALL)
    data = regM.group(1)
    # Empty slots (",,") are not valid Python literals, so pad them before eval.
    while ',,' in data:
        data = data.replace(',,', ",' ',")
    data = ast.literal_eval(data)
    match_overview, match_details = data[0][0:2]
    match_time = match_overview[4]
    print match_overview[4], ': ', match_overview[2], ' vs. ', match_overview[3], ', ', match_overview[12]
    for idx, team in enumerate(match_details):
        is_home_team = 'Y' if idx == 0 else 'N'
        team_id = team[0]
        team_name = team[1]
        player_stats = team[4]
        for player in player_stats:
            sql_stat = ('replace into player_stats(league_name,season,match_id,match_time,'
                        'is_home_team,team_id,team_name,player_id,player_name,player_score,'
                        'pos_category,pos_cur_match,substitution_flag,substitution_minute')
            player_id = player[0]
            player_name, player_score = player[1:3]
            pos_category, pos_cur_match = player[4:6]
            substitution_flag, substitution_minute = player[7:9]
            player_stats_detail = player[3][0]
            var_name_list = [var[0] for var in player_stats_detail]
            # Check whether any variable is missing from the current table; if so, add a new column.
            var_name_set = set(var_name_list)
            table_schema = db_cur.execute('PRAGMA table_info(player_stats);').fetchall()
            existing_var_list = [row[1] for row in table_schema]
            existing_var_set = set(existing_var_list)
            if not var_name_set.issubset(existing_var_set):
                new_vars = var_name_set.difference(existing_var_set)
                for v in new_vars:
                    db_cur.execute('alter table player_stats add column {0} float;'.format(v))
                    print 'Add New Variable: {0}'.format(v)
            var_name_str = ',' + ','.join(var_name_list)
            var_value_str = ',' + ','.join([
                "'" + var[1][0] + "'" if isinstance(var[1][0], str) else str(var[1][0])
                for var in player_stats_detail
            ])
            sql_stat = (sql_stat + var_name_str +
                        ") values('{0}','{1}',{2},'{3}','{4}',{5},'{6}',{7},'{8}',{9},{10},'{11}',{12},{13}".format(
                            league_name.replace("'", " "), season, match_id, match_time,
                            is_home_team, team_id, team_name.replace("'", " "), player_id,
                            player_name.replace("'", " "), player_score, pos_category,
                            pos_cur_match, substitution_flag, substitution_minute) +
                        var_value_str + ");")
            db_cur.execute(sql_stat)
    conn.commit()
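# scrape_single_match issues REPLACE INTO player_stats and relies on PRAGMA table_info
# to add per-stat columns on the fly, so it assumes the base table already exists.
# A minimal bootstrap sketch under that assumption (column types and the primary key
# are guesses, not taken from the original schema), followed by a sample invocation
# using the example arguments from scrape_1_year_data_of_1_league's own comment:
conn = sqlite3.connect('whoscored.db')
conn.execute('''
    CREATE TABLE IF NOT EXISTS player_stats (
        league_name         TEXT,
        season              TEXT,
        match_id            INTEGER,
        match_time          TEXT,
        is_home_team        TEXT,
        team_id             INTEGER,
        team_name           TEXT,
        player_id           INTEGER,
        player_name         TEXT,
        player_score        REAL,
        pos_category        INTEGER,
        pos_cur_match       TEXT,
        substitution_flag   INTEGER,
        substitution_minute INTEGER,
        PRIMARY KEY (match_id, player_id)
    );
''')
conn.commit()
conn.close()

scrape_1_year_data_of_1_league(
    'http://www.whoscored.com/Regions/252/Tournaments/2/Seasons/1849',
    start_week_num=30, num_weeks=48)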