def extract_tvseries(dom):
    url = URL(TARGET_URL)
    dom = DOM(url.download(cached=True))
    # print dom.body.content
    x = 0
    csv_row = []
    for series in dom.by_tag('td.title'):
        title = series.by_tag('a')[0].content.encode('ascii', 'ignore')
        ranking = series.by_tag('span.value')[0].content.encode('ascii', 'ignore')
        genres = series.by_tag('span.genre')[0].by_tag('a')
        genres = [g.content.encode('ascii', 'ignore') for g in genres]
        actors = series.by_tag('span.credit')[0].by_tag('a')
        actors = [a.content.encode('ascii', 'ignore') for a in actors]
        x = x + 1
        try:
            runtime = series.by_tag('span.runtime')[0].content.encode('ascii', 'ignore')
        except:
            runtime = "Unknown"
        # print x, title, ranking, genres, actors, runtime
        csv_titles = title
        csv_ranking = ranking
        csv_genres = genres
        csv_actors = actors
        csv_runtime = runtime
        row = [csv_titles, csv_ranking, csv_genres, csv_actors, csv_runtime]
        csv_row.append(row)
    return csv_row
def scrape_education(county_num):
    if county_num < 10:
        county_num = '0' + str(county_num)
    else:
        county_num = str(county_num)
    print county_num
    #url = 'http://dq.cde.ca.gov/dataquest/Staff/StaffEduLvl.aspx?cYear=2011-12&cChoice=CoEduc&TheCounty=01,ALAMEDA&cType=T&cGender=&Submit=1'
    url = 'http://dq.cde.ca.gov/dataquest/Staff/StaffEduLvl.aspx?cYear=2011-12&cChoice=CoEduc&TheCounty=' + county_num + '01,ALAMEDA&cType=T&cGender=&Submit=1'
    abs_url = URL(string=url)
    dom = DOM(abs_url.download(cached=True))  # download the DOM

    other = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[12].content.replace(',', '')
    associates = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[11].content.replace(',', '')
    bachelors = str(int(dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[9].content.replace(',', ''))
                    + int(dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[10].content.replace(',', '')))
    masters = str(int(dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[4].content.replace(',', ''))
                  + int(dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[5].content.replace(',', '')))
    jurisdoctor = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[3].content.replace(',', '')
    doctorate = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[2].content.replace(',', '')
    bachelors_and_less = str(int(bachelors) + int(associates) + int(other))
    post_grad = str(int(masters) + int(jurisdoctor) + int(doctorate))
    county = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("a")[0].content

    # write all the collected data to a new row of the output file
    writer.writerow([county, bachelors_and_less, post_grad, associates, bachelors, masters, jurisdoctor, doctorate])
def summarize(query=None, k=4, url=None):
    j = []
    if url:
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        j = [word for sentence in j for word in sentence.split()
             if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word or '"' in word]
        j = ' '.join(j)
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [sentence for sentence in sentences if len(sentence) > 1 and sentence != '']
        for sentence in sentences:
            lsa1.parse(sentence)
    else:
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = query.split('.')
        for sentence in sentences:
            lsa1.parse(sentence)
    lsa1.build()
    lsa1.calc()
    summary = [(sentences[i], norm(dot(diag(lsa1.S), lsa1.Vt[:, b]), 2))
               for i in range(len(sentences)) for b in range(len(lsa1.Vt))]
    sorted(summary, key=itemgetter(1))
    summary = dict((v[0], v) for v in sorted(summary, key=lambda summary: summary[1])).values()
    return '.'.join([a for a, b in summary][len(summary) - (k):])
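# The summarizers in this collection assume an LSA helper class (plus
# `stopwords` and `ignore_characters` globals) that is not included here.
# The sketch below is only a guess at the minimal interface they rely on --
# parse() per sentence, build() for a term-by-sentence matrix, calc() for an
# SVD exposing U, S and Vt -- not the original implementation.
from numpy import zeros
from numpy.linalg import svd


class LSA(object):
    def __init__(self, stopwords, ignore_characters):
        self.stopwords = stopwords
        self.ignore_characters = ignore_characters
        self.wdict = {}   # word -> indices of the sentences it appears in
        self.dcount = 0   # number of sentences parsed so far

    def parse(self, sentence):
        for w in sentence.lower().split():
            w = w.strip(self.ignore_characters)
            if w and w not in self.stopwords:
                self.wdict.setdefault(w, []).append(self.dcount)
        self.dcount += 1

    def build(self):
        # keep only words that occur in more than one sentence
        self.keys = sorted(k for k in self.wdict if len(self.wdict[k]) > 1)
        self.A = zeros([len(self.keys), self.dcount])
        for i, k in enumerate(self.keys):
            for d in self.wdict[k]:
                self.A[i, d] += 1

    def calc(self):
        # U: word space, S: singular values, Vt: sentence space
        self.U, self.S, self.Vt = svd(self.A)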
def download_pdfs():
    """download pdfs from fda"""
    # where to save pdfs
    path = 'classifier_docs/pdfs/'
    # create directory if it doesn't exist
    if not os.path.exists(path):
        os.makedirs(path)
    # load in non-standard pdf urls from 2012 to serve as control text
    # note: had to lookup urls manually
    # drugs are erivedge (203388) and sirturo (204384)
    # also, menhibrix (125363) has no medical review available
    urls = ['http://www.accessdata.fda.gov/drugsatfda_docs/nda/2012/203388Orig1s000MedRpdf.pdf',
            'http://www.accessdata.fda.gov/drugsatfda_docs/nda/2012/204384Orig1s000MedR_.pdf']
    for url in urls:
        m = re.search('20..\/(\d{6})', url)
        app_num = m.group(1)
        url = URL(url)
        # make sure that url points to PDF, print error otherwise
        if url.mimetype in MIMETYPE_PDF:
            # write pdf for medical review if it doesn't exist
            fn = path + app_num + '.pdf'
            if not os.path.exists(fn):
                print "writing {} from {}".format(fn, url)
                f = open(fn, 'w')
                f.write(url.download(cached=False))
                f.close()
            else:
                print "{} already exists".format(fn)
        else:
            print "warning: {} did not resolve to pdf".format(url)
    return
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    top_250_url = URL(url)
    top_250_html = top_250_url.download(cached=True)
    top_250_dom = DOM(top_250_html)
    # iterate over every title cell (a [:1] slice here would only return the first movie)
    for a in top_250_dom.by_tag("td.titleColumn"):
        for b in a.by_tag("a"):
            link_ext = b.attrs["href"].encode("utf-8")
            link_base = "http://www.imdb.com"
            link = link_base + link_ext
            movie_urls.append(link)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def get_patent_urls(keyword, limit=10):
    keyword = urllib.quote_plus(keyword)
    base_url = "http://www.lens.org"
    url = URL(base_url + "/lens/search?ft=true&l=en&st=true&n=" + str(limit) + "&q=" + keyword)
    dom = DOM(url.download())
    links = [base_url + a.attributes.get("href") for a in dom("a.link")]
    return links
def downloading_csv(self, download_type='hist'):
    """ Download the csv information for a particular stock.
        download_type can be hist or div. If hist, will download the hist price.
        If div, will download dividend history.
        Kwargs:
            download_type (str): hist or div (default hist).
    """
    self.download_fault = 0

    if download_type == 'hist':
        target_url = self.hist_quotes_full_url
        sav_filename = os.path.join(self.hist_quotes_csvfile_path, 'hist_stock_price_' + self.individual_stock_sym + '.csv')
    elif download_type == 'div':
        target_url = self.div_history_full_url
        sav_filename = os.path.join(self.hist_quotes_csvfile_path, 'div_hist_' + self.individual_stock_sym + '.csv')
    else:
        print 'wrong download type'
        raise ValueError('download_type must be "hist" or "div"')

    url = URL(target_url)
    f = open(self.tempfile_sav_location, 'wb')  # save as test.gif
    try:
        f.write(url.download())  # if have problem skip
    except:
        if self.__print_download_fault:
            print 'Problem with processing this data: ', target_url
        self.download_fault = 1
    f.close()

    if not self.download_fault:
        if self.enable_save_raw_file:
            shutil.copyfile(self.tempfile_sav_location, sav_filename)
def extract_data(stock_ticker):
    url_base = 'http://financials.morningstar.com/ajax/exportKR2CSV.html?&callback=?&t='
    url_end = '&region=usa&culture=en-US&cur=&order=asc'
    # May add more exchanges later on, but these cover the main US stock exchanges:
    # Nasdaq, New York SE, and Pink Sheets (OTC stocks), respectively
    # Loops through main stock exchanges to get proper URL for data extraction
    stock_exchange_list = ['XNAS:', 'XNYS:', 'PINX:']
    for exchange in stock_exchange_list:
        test = URL(url_base + exchange + stock_ticker + url_end)
        # A broken URL produces an empty string, which has memory size 33;
        # size 35 allows for minor variation in the size
        if sys.getsizeof(test.download()) > 35:
            break
    temp_data = 'C:/Users/Owner/Documents/temp.csv'
    f = open(temp_data, mode='w')
    try:
        f.write(test.download())
    except:
        raise IOError('There was an error processing this data')
    f.close()
    try:
        stock_data_df = pd.read_csv(temp_data, header=2, thousands=',', index_col=0,
                                    skiprows=[19, 20, 31, 41, 42, 43, 48, 58, 53, 64, 65, 72, 73, 95, 101, 102])
    except:
        os.remove(temp_data)
        raise IOError('Problem downloading files')
    os.remove(temp_data)
    stock_data_df = stock_data_df.transpose()
    return stock_data_df
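# A hypothetical usage sketch for extract_data(): the Morningstar export
# endpoint and the hard-coded temp path above may no longer be valid, so treat
# this purely as an illustration of the expected output shape.
if __name__ == '__main__':
    df = extract_data('AAPL')        # key ratios, one row per reporting period after transpose
    print(df.index.tolist())         # the reporting periods
    print(df.columns.tolist()[:5])   # first few ratio names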
def summarize_evaluation(query=None, url=None, summary=None):
    j = []
    if url:
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        j = [word for sentence in j for word in sentence.split()
             if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word or '"' in word]
        j = ' '.join(j)
        lsa = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [sentence for sentence in sentences if len(sentence) > 1 and sentence != '']
        for sentence in sentences:
            lsa.parse(sentence)
    else:
        lsa = LSA(stopwords, ignore_characters)
        for sentence in query:
            lsa.parse(sentence)
    lsa.build()
    lsa.calc()
    lsa2 = LSA(stopwords, ignore_characters)
    for sentence in summary:
        lsa2.parse(sentence)
    lsa2.build()
    lsa2.calc()
    vectors = [(dot(lsa.S, lsa.U[0, :]), dot(lsa.S, lsa.U[i, :])) for i in range(len(lsa.U))]
    vectors2 = [(dot(lsa2.S, lsa2.U[0, :]), dot(lsa2.S, lsa2.U[i, :])) for i in range(len(lsa2.U))]
    angles = [arccos(dot(a, b) / (norm(a, 2) * norm(b, 2))) for a in vectors for b in vectors2]
    return str(abs(1 - float(angles[1]) / float(pi / 2)))
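# Hypothetical check for summarize_evaluation(): when `query` is given it is
# iterated sentence by sentence, so pass lists of sentences; the returned
# string is a 0..1 similarity-style score derived from the angle between the
# two LSA spaces. The sentences below are illustrative only.
if __name__ == '__main__':
    source = ["Cats sleep most of the day", "Dogs chase cats", "Cats ignore dogs"]
    candidate = ["Dogs chase cats", "Cats sleep most of the day"]
    print(summarize_evaluation(query=source, summary=candidate))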
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    # Download the HTML file
    url = URL(url)
    html = url.download()

    # Parse the HTML file into a DOM representation
    dom = DOM(html)

    # Iterate through all 250 table rows on the index page
    for movies in dom('.lister-list > tr'):
        # take the movie's href attribute and put it in href
        href = movies('td.titleColumn a')[0].attrs["href"]
        # append the href attribute to the string, but also add http://www.imdb.com/ in front of it
        movie_urls.append("http://www.imdb.com/" + href)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def cats(self, namespace=0, start=None, acmin=1, count=100, cached=True, **kwargs):
    """ Returns an iterator over all article titles (for a given namespace id).
    """
    kwargs.setdefault("unicode", True)
    kwargs.setdefault("throttle", self.throttle)
    # Fetch article titles (default) or a custom id.
    id = kwargs.pop("_id", "title")
    id = "*"
    # Loop endlessly (= until the last request no longer yields an "apcontinue").
    # See: http://www.mediawiki.org/wiki/API:Allpages
    while start != -1:
        url = URL(self._url, method=GET, query={
            "action": "query",
            "list": "allcategories",
            "acfrom": start or "",
            "aclimit": min(count, 500),
            "acprop": "size",
            "acmin": max(1, acmin),
            "format": "json"
        })
        data = url.download(cached=cached, **kwargs)
        data = json.loads(data)
        for x in data.get("query", {}).get("allcategories", {}):
            # print(x)
            if x.get(id):
                # yield x[id]
                x['name'] = x.pop('*')
                yield x
        start = data.get("query-continue", {}).get("allcategories", {})
        start = start.get("accontinue", start.get("acfrom", -1))
    raise StopIteration
def getRandomHistoryDOM(language):
    url = URL("http://" + language + ".wikipedia.org/wiki/Special:Random")
    # Gets the url only of the page this redirects to
    redirectUrl = url.redirect
    try:
        # Grab the name of the wikipedia article from the url
        urlComponents = string.split(redirectUrl, '/')
    except AttributeError:
        # Use some recursion if we encounter a page with no history, or some other error
        return getRandomHistoryDOM(language)

    # Get the history section of the article
    redirectUrl = "http://" + language + ".wikipedia.org/w/index.php?title=" + urlComponents[4] + "&action=history"
    print "Current article is: " + str(urlComponents[4])
    # print redirectUrl
    url = URL(redirectUrl)
    dom = DOM(url.download(cached=False))
    try:
        historyList = dom.by_id("pagehistory").by_tag("li")
        return historyList, urlComponents[4]
    except AttributeError:
        # Use some recursion if we encounter a page with no history, or some other error
        return getRandomHistoryDOM(language)
def process_page():
    url = URL("http://www.imdb.com/search/title?num_votes=5000,&sort=user_rating,desc&start=1&title_type=tv_series")
    dom = DOM(url.download(cached=True))
    domIndex = 0

    for title in dom.by_class("title"):
        theTitle = str(title.by_tag("a")[0].content).encode('ascii', 'replace')
        titleCatalog.append(Title(theTitle))

        try:
            match = re.search("^(\d+).*$", str(dom.by_class("runtime")[domIndex].content).encode('ascii', 'replace'))
            # print match.group(1)
            # titleCatalog[domIndex].addRunTime(str(dom.by_class("runtime")[domIndex].content).encode('ascii', 'replace'))
            titleCatalog[domIndex].addRunTime(match.group(1))
        except Exception, e:
            pass

        try:
            titleCatalog[domIndex].addRank(str(dom.by_class("value")[domIndex].content).encode('ascii', 'replace'))
        except Exception, e:
            pass

        # advance to the next row's runtime/rank entries
        domIndex += 1
def main():
    table = Datasheet()
    tel = ''
    street = ''
    locality = ''
    title = ''
    for i in range(3):
        page = i + 1
        url = URL("http://torino.paginegialle.it/pgol/4-veterinari/3-torino/p-%s?mr=50" % page)
        print "collecting from %s" % url
        connection = url.open()
        doc = Document(connection.read())
        items = doc.by_class('item_sx')
        row = []
        for j, item in enumerate(items):
            divs = item.by_class('address')
            try:
                title = item.by_class('item_head')[0].by_tag('a')[0].content
            except IndexError, e:
                print >> sys.stderr, "%s" % j, e
                pass
            for z, div in enumerate(divs):
                if div != None:
                    try:
                        street = div.by_class('street-address')[0].content
                        locality = div.by_class('locality')[0].content
                        tel = div.by_class('tel')[0].by_class('value')[0].content
                    except IndexError, e:
                        print >> sys.stderr, "%s" % z, e
                        pass
                    save = "%s, %s %s, %s \n" % (
                        plaintext(title),
                        plaintext(street).replace(",", ""),
                        plaintext(locality).replace('(TO)', ''),
                        plaintext(tel).replace(",", "")
                    )
                    print >> sys.stderr, save
                    row.append(save)
def download_single_image(self, url_link, pic_prefix_str):
    """ Download data according to the url link given.
        Args:
            url_link (str): url str.
            pic_prefix_str (str): pic_prefix_str for unique label the pic
    """
    self.download_fault = 0
    file_ext = os.path.splitext(url_link)[1]  # use for checking valid pic ext
    temp_filename = pic_prefix_str + file_ext
    temp_filename_full_path = os.path.join(self.gs_raw_dirpath, temp_filename)

    valid_image_ext_list = ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff']  # not comprehensive

    url = URL(url_link)
    if url.redirect:
        return  # if there is re-direct, return
    if file_ext not in valid_image_ext_list:
        return  # return if not valid image extension

    f = open(temp_filename_full_path, 'wb')  # save as test.gif
    print url_link
    self.pic_info_list.append(pic_prefix_str + ': ' + url_link)
    try:
        f.write(url.download())  # if have problem skip
    except:
        # if self.__print_download_fault:
        print 'Problem with processing this data: ', url_link
        self.download_fault = 1
    f.close()
def getQuotes(sym):
    frontUrl = "http://real-chart.finance.yahoo.com/table.csv?s="
    endUrl = "&a=10&b=8&c=1997&d=10&e=8&f=2015&g=d&ignore=.csv"
    failed = []
    count = 1
    for ticker in sym:
        fname = "quotes/" + ticker + ".csv"
        df = object()
        tickerUrl = frontUrl + ticker + endUrl
        url = URL(tickerUrl)
        f = open(fname, 'wb')
        try:
            f.write(url.download())
        except:
            print "quotes csv download failed: " + ticker
            failed.append(ticker)
            count += 1
            continue
        f.close()
        count += 1
        print "progress: " + str(count) + "/" + str(len(sym))
    return failed
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    url = URL(url)
    html = url.download()
    dom = DOM(html)
    homeUrl = 'http://www.imdb.com'

    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    for e in dom.by_tag("td.titleColumn"):
        absoluteUrl = ''
        for a in e.by_tag("a"):
            link = a.attributes.get("href", "")
            absoluteUrl = homeUrl + link
        movie_urls.append(absoluteUrl)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def scrape_starrtest(county_num):
    if county_num < 10:
        county_num = '0' + str(county_num)
    else:
        county_num = str(county_num)
    print county_num
    #url = 'http://star.cde.ca.gov/star2012/ViewReport.aspx?ps=true&lstTestYear=2012&lstTestType=X&lstCounty=01&lstDistrict=&lstSchool=&lstGroup=1&lstSubGroup=1'
    url = 'http://star.cde.ca.gov/star2012/ViewReport.aspx?ps=true&lstTestYear=2012&lstTestType=X&lstCounty=' + str(county_num) + '&lstDistrict=&lstSchool=&lstGroup=1&lstSubGroup=1'
    abs_url = URL(string=url)
    dom = DOM(abs_url.download(cached=True))  # download the DOM

    #sciend_num = dom.by_class("rm")[4].content
    scicst_num = dom.by_class("rm")[3].content
    math_num = dom.by_class("rm")[2].content
    hist_num = dom.by_class("rm")[1].content
    ela_num = dom.by_class("rm")[0].content

    #sciend_percent = dom.by_class("rs")[4].content[:5]
    scicst_percent = dom.by_class("rs")[3].content[:5]
    math_percent = dom.by_class("rs")[2].content[:5]
    hist_percent = dom.by_class("rs")[1].content[:5]
    ela_percent = dom.by_class("rs")[0].content[:5]

    county = dom.by_tag("h2")[0].content

    # write all the collected data to a new row of the output file
    writer.writerow([county, ela_num, ela_percent, hist_num, hist_percent, math_num, math_percent, scicst_num, scicst_percent])
def scrape_top_250(url):
    """
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    """
    # This piece of code is needed to use the dom structure while it is not given as argument.
    TOP_250_URL = "http://www.imdb.com/chart/top"
    top_250_url = URL(TOP_250_URL)
    top_250_html = top_250_url.download(cached=True)
    dom = DOM(top_250_html)

    movie_urls = []

    """ Searches in the HTML of the top 250 page of IMDB for the urls of the
        individual pages per film. Uses CSS selectors to find the right urls
        and subsequently places them in a list.
    """
    for e in dom.by_tag("td.titleColumn"):
        for a in e.by_tag("a")[:1]:
            main = "http://www.imdb.com"
            Locallink = main + a.attrs["href"]
            movie_urls.append(Locallink)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
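# A small sanity check, assuming one of the scrape_top_250() variants above is
# in scope together with pattern.web.URL: the result should be 250 absolute
# IMDB links.
if __name__ == '__main__':
    links = scrape_top_250(URL("http://www.imdb.com/chart/top"))
    assert all(link.startswith("http://www.imdb.com") for link in links)
    print(len(links))  # expected: 250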
def download_single_image(url_link, pic_prefix_str, target_folder, image_size):
    """ Download data according to the url link given.
        Args:
            url_link (str): url str.
            pic_prefix_str (str): pic_prefix_str for unique label the pic
    """
    file_ext = os.path.splitext(url_link)[1]  # use for checking valid pic ext
    temp_filename = pic_prefix_str + ".jpg"
    temp_filename_full_path = os.path.join(target_folder, temp_filename)

    valid_image_ext_list = ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff']  # not comprehensive

    url = URL(url_link)
    if url.redirect:
        return  # if there is re-direct, return
    if file_ext not in valid_image_ext_list:
        return  # return if not valid image extension

    # save as test.gif
    print url_link
    try:
        response = url.download()
        img = resize_image(response, image_size)
        img.save(temp_filename_full_path, "JPEG")
    except Exception as e:
        # if self.__print_download_fault:
        print 'Problem with processing this data: ', str(e), url_link
def convertMapData():
    print '[2/2] Convert map data'

    # output dictionary
    d3mapData = {}

    # download the file
    url = URL(DATASET3)
    data = url.download()

    # create array
    data = list(json.loads(data))

    # fill output dictionary
    for dataRow in data:
        if dataRow['Year'] == '2014':
            population = dataRow['Value']
            fillColor = defineColor(dataRow['Value'])
            d3mapData[dataRow['Country Code']] = {'population': population, 'fillKey': fillColor}

    print '[2/2] Write to json'

    # write output dictionary to json file
    with open('D3LinkedViews/data_map.json', 'wb') as output_file:
        json.dump(d3mapData, output_file)

    print '[2/2] Finish'
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    #absolute_url = 'http://www.imdb.com'
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    from pattern.web import abs  # pattern's URL resolver, not the numeric builtin
    url = URL(url)
    dom = DOM(url.download(cached=True))  # parse the downloaded page into a DOM

    for e in dom('.titleColumn'):
        for link in e('a'):
            # the href is relative; resolve it against the page URL so the result is absolute
            movie_urls.append(abs(link.attributes.get('href'), base=url.redirect or url.string))

    # return url list
    return movie_urls
def get_by_year(year):
    url = URL("http://www.imdb.com/event/ev0000003/" + str(year))
    dom = DOM(url.download(cached=True))
    dictAll = {}

    awards = dom.by_class('award')
    awardTitles = awards[0].by_tag('h2')

    awardList = []
    for award in awardTitles:
        awardList.append(award.content)

    prize = awards[0].by_tag('blockquote')
    for index, title in enumerate(prize[1:25]):
        winner = title.by_tag('strong')[0].by_tag('a')[0].content
        winner_id = str(title.by_tag('strong')[0].by_tag('a')[0].attrs['href'][-8:-1])

        nomineeList = []
        for each in title.by_tag('strong')[1::]:
            name = each.by_tag('a')[0].content
            id = str(each.by_tag('a')[0].attrs['href'][-8:-1])
            nomineeList.append((clean_unicode(name), id))

        winnersAndNominees = {}
        winnersAndNominees['winner'] = (clean_unicode(winner), winner_id)
        winnersAndNominees['nominees'] = nomineeList
        dictAll[awardList[index]] = winnersAndNominees

    return dictAll
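# Illustrative call for get_by_year() (the IMDB event markup it parses may have
# changed since this was written): the result maps each award title to a dict
# holding the winner and the nominees as (name, imdb_id) tuples.
if __name__ == '__main__':
    oscars = get_by_year(2011)
    for award, people in oscars.items():
        print(award)
        print(people['winner'])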
def dl_byUrllib2(url, filename):
    myurl = URL(url)
    if os.path.exists(filename):
        return
    with open(filename, 'wb') as fp:
        fp.write(myurl.download(cached=False))
def getContributorInfo(devUrl):
    url = URL(devUrl)
    contribInfo = json.loads(url.download())
    return contribInfo
def downloadPDFs(dictListJSON, state, jsonExists=False):
    #state = dictListJSON[0, 2]
    dlJSONFile = open(dictListJSON, "r")
    dictJSON = json.load(dlJSONFile)
    dlJSONFile.close()

    #some condition to check if the JSON already exists
    if jsonExists:
        pdfDictList = dictJSON
    else:
        pdfDictList = findPDFLinks(dictJSON, state)

    count = 0
    for dict in pdfDictList:
        #test if date > 01/01/13
        fileName = "".join(str(dict["AdvertiserInfo"]).split())
        print "Writing to " + fileName
        url = dict["PDFLink"]
        url = re.sub(' ', '%20', url)
        print url
        if url != "NO URL":
            urlOpened = URL(url)
            f = open(fileName, 'wb')
            #download to state pdfs directory
            f.write(urlOpened.download(cached=False))
            f.close()
            count += 1
            if count > 4:
                break
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    from pattern.web import abs
    url = URL("http://www.imdb.com/chart/top")
    dom = DOM(url.download(cached=True))

    for e in dom.by_tag("td.titleColumn")[:250]:
        for link in e.by_tag("a"):
            link = link.attrs.get("href", "")
            link = abs(link, base=url.redirect or url.string)
            movie_urls.append(link)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def get_patent(url):
    url = URL(url + "/fulltext")
    html = url.download()
    dom = DOM(html)
    title = plaintext(dom("h3 a")[0].content)
    body = plaintext(dom("#contents")[0].content)
    return [title, body]
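# A hedged end-to-end sketch tying get_patent_urls() and get_patent() together;
# it assumes the lens.org markup these helpers target is still being served in
# the same form.
if __name__ == '__main__':
    for patent_url in get_patent_urls("solar cell", limit=3):
        title, body = get_patent(patent_url)
        print(title)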
def loadPage(numPage):
    # Load the content from the given page
    url = URL(url_estruc_1 + str(numPage) + url_estruc_2)
    dom = DOM(url.download(cached=True))
    for row in dom(ROWS_PATH)[1:]:
        # pprint.pprint(plaintext(row(CELLS_PATH)[0].content))
        RESULTS.append({"place": plaintext(row(CELLS_PATH)[0].content),
                        "place_gender": plaintext(row(CELLS_PATH)[1].content)})
    pprint.pprint(str(numPage + 1) + "/" + str(last_page))
def read_web(url):
    html = ''
    start = etime()
    try:
        uri = URL(url)
        html = uri.download(cached=True)
    except Exception, e:
        print 'HTTP Error:' + str(e.message)
def get_dom(url):
    try:
        s_content = URL(url).download(timeout=120, cached=False)
    except (URLError, HTTP404NotFound):
        print "Error downloading article"
        return None

    # for AJE compatibility
    try:
        s_content = s_content.decode('unicode_escape')
    except (UnicodeEncodeError):
        pass

    return Document(s_content)
def download_single_image(self, url_link, pic_prefix_str):
    """ Download data according to the url link given.
        Args:
            url_link (str): url str.
            pic_prefix_str (str): pic_prefix_str for unique label the pic
    """
    self.download_fault = 0
    file_ext = os.path.splitext(url_link)[1]  # use for checking valid pic ext
    temp_filename = pic_prefix_str + file_ext
    temp_filename_full_path = os.path.join(self.gs_raw_dirpath, temp_filename)

    valid_image_ext_list = [
        '.png', '.PNG', '.jpg', '.jpeg', '.JPG', '.JPEG',
        '.gif', '.GIF', '.bmp', '.BMP', '.tiff', '.TIFF'
    ]  # not comprehensive

    if type(url_link) is int:
        return
    url_link = urllib.unquote(url_link).decode('utf8')
    print(url_link)
    if self.is_image_watermarked(url_link):
        return

    url = URL(url_link)
    try:
        if url.redirect:
            return  # if there is re-direct, return
        if file_ext not in valid_image_ext_list:
            return  # return if not valid image extension
        self.pic_info_list.append(pic_prefix_str + ': ' + url_link)
        downloaded_img = url.download()
        if len(downloaded_img) > 0:  # sometimes download is empty
            f = open(temp_filename_full_path, 'wb')  # save as test.gif
            f.write(downloaded_img)  # if have problem skip
            f.close()
    except:
        # if self.__print_download_fault:
        print 'Problem with processing this data: ', url_link
        self.download_fault = 1
def scrape(url, f):
    week = url.split("/")
    week = week[-1]
    url = URL(url)
    dom = DOM(url.download(cached=True))

    # gives the week
    i = 1

    # select the top 40 list
    for l in dom.by_tag("ol.top40"):
        # select each track
        print "lijst top 40"
        for e in l.by_tag("div.clearfix")[0:40]:
            muziekGegevens = ""

            # position in the top 40
            muziekGegevens += str(i) + ","
            print i, 'positie'
            i += 1  # careful with resetting

            # select the artist
            for artiest in e.by_class("credit"):  # error: not too many elements!
                muziekGegevens += artiest.content + ","

            # position
            for inner in e.by_tag("strong")[1:2]:
                print inner.content, "1:2"
                muziekGegevens += inner.content + ","

            # highest position reached
            for inner in e.by_tag("strong")[2:3]:
                print inner.content, "2:3"
                muziekGegevens += inner.content + ","

            # number of points
            for inner in e.by_tag("strong")[3:4]:
                print inner.content, "3:4"
                muziekGegevens += inner.content + ","

            # year of the track
            for inner in e.by_tag("strong")[4:5]:
                print inner.content.strip(), "4:5"
                muziekGegevens += inner.content.strip()

            h = HTMLParser.HTMLParser()
            muziekGegevens = h.unescape(muziekGegevens)

            if not whatisthis(muziekGegevens):
                muziekGegevens = unicode(muziekGegevens, "utf-8")
                f.write(muziekGegevens + "\n")
            else:
                f.write(muziekGegevens + "\n")
def agregarInformacionDocumento(self, url, contenido):
    """Method to obtain the different parts of the document."""
    try:
        unaUrl = URL(url)
        if not 'pdf' in extension(unaUrl.page):
            html = contenido
            unElemento = Element(self.descargarContenidoHtml(url))
            body = self.getBody(unElemento)
            urlValues = self.getUrlValues(unElemento)
            titulo = self.getTitulo(unElemento)
            html = self.verificarContenidoVacio(html)
            body = self.verificarContenidoVacio(body)
            urlValues = self.verificarContenidoVacio(urlValues)
            titulo = self.verificarContenidoVacio(titulo)
            self.mongodb.setInformacionDocumento(html, url, titulo, urlValues, body)
        else:
            html = self.verificarContenidoVacio(contenido)
            body = ""
            urlValues = ""
            titulo = ""
            self.mongodb.setInformacionDocumento(html, url, titulo, urlValues, body)
    except Exception as e:
        print str(e)
def download_single_image(self, url_link, pic_prefix_str):
    self.download_fault = 0
    file_ext = os.path.splitext(url_link)[1]
    #print(pic_prefix_str, file_ext)
    temp_filename = pic_prefix_str + str(file_ext)
    temp_filename_full_path = os.path.join(self.gs_raw_dirpath, temp_filename)
    valid_image_ext_list = ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff']  # not comprehensive

    url = URL(url_link)
    if url.redirect:
        print("RD")
        return
    if file_ext not in valid_image_ext_list:
        print("Invalid file type")
        return

    f = open(temp_filename_full_path, 'wb')
    print(url_link)
    self.pic_info_list.append(pic_prefix_str + ': ' + url_link)
    try:
        urllib.request.URLopener.version = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134"
        #f.write(url.download(user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134"))
        f.write(urllib.request.urlopen(url_link).read())
        #urllib.request.urlretrieve(url_link, temp_filename_full_path)
    except:
        print('Problem with processing this data: ', url_link)
        self.download_fault = 1
    f.close()
def busca_google(self, service, versao='1.0', inicio=0, quant=8, lingua='pt_br', ip=None):
    if not ip:
        ip = socket.gethostbyname(socket.gethostname())
    query = urllib.urlencode({
        'v': versao,
        'start': str(inicio),
        'rsz': str(quant),
        'hl': lingua,
        'q': self.termo,
        'userip': ip
    })
    apiurl = "http://ajax.googleapis.com/ajax/services/search/"
    queryurl = apiurl + service + '?' + query
    try:
        search_results = URL(queryurl).download()
        json = simplejson.loads(search_results)
        results = json['responseData']['results']
        return results
    except (TypeError, URLError):
        print 'erro para a busca Google', service, self.termo
        pass
def busca(self):
    apiurl = 'http://data.alexa.com/data?cli=10&dat=snbamz&url='
    queryurl = apiurl + self.url
    try:
        return URL(queryurl).download()
    except:
        pass
def startClouds(self, urls):
    """
    startClouds: Receives a set of URLs in the list `urls`. For each URL it
    creates a directed graph whose only node is the one at n[0]; to do so it
    creates a Structure object, to which the one-element directed graph and
    the domain of the url (obtained via url.domain) are added.
    """
    clouds = list()
    for n in urls:  # iterate over every url in urls (urls is a list in which each url is itself a list)
        url = URL(n[0])  # create a pattern.web URL object from the url held at position 0 of n
        graph = nx.DiGraph()  # initialize an empty directed graph (self-loops allowed)
        graph.add_node(
            n[0],
            select=True,
            ID=0,
            weight_VSM=0.0,
            weight_WA=0.0,
            weight_OKAPI=0.0,
            weight_SVM=0.0,
            weight_CRANK=0.0,
            totalScore=0.0,
            link=n[0],
            methodData=None,
        )
        clouds.append(Structure(graph, url.domain))  # create a Structure object
    return clouds
def crawl(topic, N=100, Nbatch=25):
    t = Twitter()  # language='en','id'
    M = N // Nbatch  # integer
    i, Tweets, keepCrawling = None, [], True
    for j in tqdm(range(M)):
        if keepCrawling:
            for tweet in t.search(topic, start=i, count=Nbatch):
                try:
                    Tweets.append(tweet)
                    i = tweet.id
                except:
                    print("Twitter Limit reached")
                    keepCrawling = False  # Second Break (outer loop)
                    break
        else:
            break
    print('Making sure we get the full tweets, please wait ...')
    for i, tweet in enumerate(tqdm(Tweets)):
        try:
            webPage = URL(tweet.url).download()
            soup = bs(webPage, 'html.parser')
            full_tweet = soup.find_all('p', class_='TweetTextSize')[0]  # modify this to get all replies
            full_tweet = bs(str(full_tweet), 'html.parser').text
            Tweets[i]['fullTxt'] = full_tweet
        except:
            Tweets[i]['fullTxt'] = tweet.txt
    print('Done!... Total terdapat {0} tweet'.format(len(Tweets)))
    return Tweets
def main():
    '''
    Crawl the IMDB top 250 movies, save CSV with their information.

    Note:
        This function also makes backups of the HTML files in a sub-directory
        called HTML_BACKUPS (those will be used in grading).
    '''
    # Create a directory to store copies of all the relevant HTML files (those
    # will be used in testing).
    print 'Setting up backup dir if needed ...'
    create_dir(BACKUP_DIR)

    # Make backup of the IMDB top 250 movies page
    print 'Access top 250 page, making backup ...'
    top_250_url = URL(TOP_250_URL)
    top_250_html = top_250_url.download(cached=True)
    make_backup(os.path.join(BACKUP_DIR, 'index.html'), top_250_html)

    # extract the top 250 movies
    print 'Scraping top 250 page ...'
    url_strings = scrape_top_250(top_250_url)

    # grab all relevant information from the 250 movie web pages
    rows = []
    for i, url in enumerate(url_strings):  # Enumerate, a great Python trick!
        print 'Scraping movie %d ...' % i

        # Grab web page
        try:
            movie_html = URL(url).download(cached=True)
        except urllib2.URLError:
            print "Url timeout"
            time.sleep(10000)  # sleep() takes the number of seconds as a positional argument

        # Extract relevant information for each movie
        movie_dom = DOM(movie_html)
        rows.append(scrape_movie_page(movie_dom))

        # Save one of the IMDB's movie pages (for testing)
        if i == 83:
            html_file = os.path.join(BACKUP_DIR, 'movie-%03d.html' % i)
            make_backup(html_file, movie_html)

    # Save a CSV file with the relevant information for the top 250 movies.
    print 'Saving CSV ...'
    save_csv(os.path.join(SCRIPT_DIR, 'top250movies.csv'), rows)
def retrieve_shortsell_info(self):
    """ Retrieve the shortsell information.
        Will form the url and retrieve the information using pandas to make
        it into a table. The function sets self.shortsell_info_df.
        Iterates over the days to get the latest data.
    """
    for last_effective_date in range(7):
        self.form_shortsell_url(last_effective_date)
        url = URL(self.shortsell_full_url)
        try:
            # see if data is available for that current date
            url_data = url.download(timeout=50)
            shortsell_list = pandas.io.html.read_html(url_data)
            self.shortsell_info_df = shortsell_list[1]
        except:
            continue  # continue if there is no data

        if len(self.shortsell_info_df) == 0:
            continue

        self.shortsell_info_df.rename(columns={
            0: 'Security',
            1: 'Short Sale Volume',
            2: 'Currency',
            3: 'Short Sale Value',
        }, inplace=True)
        self.shortsell_info_df = self.shortsell_info_df[1:-3]

        # change type of the columns
        self.shortsell_info_df[['Short Sale Volume', 'Short Sale Value']] = \
            self.shortsell_info_df[['Short Sale Volume', 'Short Sale Value']].astype(float)

        # need a rank on the short sell
        self.shortsell_info_df['ranked_shortsell'] = \
            self.shortsell_info_df['Short Sale Volume'].rank(method='min', ascending=False)
        self.shortsell_info_df['shortsell_lastdate'] = \
            self.set_last_desired_date(last_effective_date)

        # need percentage as well
        # have a sorting of data?
        return

    print 'No suitable data found within time frame.'
    return
def detect(self, link):
    url = URL(link)
    # print url.domain
    if re.search(self.badLinks, url.domain) != None:
        bad = True
    else:
        bad = False
    return bad
def get_dom_object(self, url_target):
    """ Get dom object based on element for scraping.
        Take into consideration that there might be query problem.
        Args:
            url_target (str): url link to be searched.
        Returns:
            (DOM): dom object corresponding to the url.
    """
    try:
        url = URL(url_target)
        dom_object = DOM(url.download(cached=True))
    except:
        print 'Problem retrieving data for this url: ', url_target
        self.url_query_timeout = 1
        dom_object = None  # avoid returning an unbound name on failure

    return dom_object
def url_site_download(self):
    """ Download the csv information for particular stock depending on the retrieval type.
        Retrieval type determine by self.retrieval_type
        Return:
            (str): output html from url.
    """
    self.download_fault = 0
    self.form_url_str()
    url = URL(self.target_full_url)
    try:
        return url.download()
    except:
        if self.__print_download_fault:
            print('Problem with processing this data: ', self.target_full_url)
        self.download_fault = 1
    return None
def busca(self, service, query):
    apiurl = 'https://socialgraph.googleapis.com/'
    queryurl = apiurl + service + '?' + query
    try:
        search_results = URL(queryurl).download()
        results = simplejson.loads(search_results)
        return results
    except:
        print 'erro socialgraph'
def detect(link):
    badLinks = 'youtube|linkedin|amazon|books.google|facebook|twitter|instagram|plus.google|yahoo|ebay|ebayinc|flickr|t.co|.google.|youtu.be|microsoft|microsoftstore'
    url = URL(link)
    # print url.domain
    if re.search(badLinks, url.domain) != None:
        bad = True
    else:
        bad = False
    return bad
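# Example use of detect(): drop links whose domain matches the social/shopping
# blacklist above before scraping them further. The candidate URLs are purely
# illustrative.
if __name__ == '__main__':
    candidates = ["https://twitter.com/some_user", "http://example.org/article"]
    to_scrape = [link for link in candidates if not detect(link)]
    print(to_scrape)  # only http://example.org/article survives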
def download_single_image(self, url_link, pic_prefix_str):
    """ Download data according to the url link given.
        Args:
            url_link (str): url str.
            pic_prefix_str (str): pic_prefix_str for unique label the pic
    """
    self.download_fault = 0
    file_ext = os.path.splitext(url_link)[1]  # use for checking valid pic ext
    temp_filename = pic_prefix_str + file_ext
    temp_filename_full_path = os.path.join(self.gs_raw_dirpath, temp_filename)
    temp_filename_full_path = temp_filename_full_path.replace("+", " ")
    folder_name = temp_filename_full_path.split("/")
    if not os.path.exists(temp_filename_full_path.replace(folder_name[-1], "")):
        os.makedirs(temp_filename_full_path.replace(folder_name[-1], ""))

    valid_image_ext_list = ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff']  # not comprehensive

    url = URL(url_link.replace("%2F", "/").replace("%3A", ":"))
    try:
        if url.redirect:
            return  # if there is re-direct, return
        if file_ext not in valid_image_ext_list:
            return  # return if not valid image extension
        f = open(temp_filename_full_path, 'wb')  # save as test.gif
        print(url_link)
        self.pic_info_list.append(pic_prefix_str + ': ' + url_link)
        image = url.download()
        # import matplotlib.pyplot as p
        # p.imshow(image)
        # p.show(image)
        f.write(image)  # if have problem skip
        f.close()
    except:
        # if self.__print_download_fault:
        print('Problem with processing this data: ', url_link)
        self.download_fault = 1
def parseUrl(urlString):
    match = re.search('//', urlString)
    if not match:
        urlString = '//' + urlString
    url = urlparse.urlsplit(urlString)
    if not url.scheme:
        url = url._replace(scheme='http')
    return URL(url.geturl())
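# parseUrl() normalises bare host names into pattern.web.URL objects with an
# explicit scheme; a couple of illustrative inputs:
if __name__ == '__main__':
    print(parseUrl('example.com/page').string)          # -> http://example.com/page
    print(parseUrl('https://example.com/page').string)  # scheme left unchanged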
def busca_topsy(self, service):
    apiurl = 'http://otter.topsy.com/'
    queryurl = apiurl + service + '.json?' + self.query
    try:
        search_results = URL(queryurl).download()
        json = simplejson.loads(search_results)
        resultados = json['response']
        return resultados
    except:
        pass
def __init__(self, main_website):
    self.main_website = main_website
    self.browser_main = webdriver.PhantomJS()
    self.browser_main.set_window_size(1024, 768)
    self.browser_main.get(self.main_website)
    self.website_main = self.browser_main.page_source
    self.browser_main.quit()
    self.dom = web.Element(self.website_main)
    self.links = self.dom.by_class('expanded')
    self.main_url = URL(self.main_website)
def procesarSumario(url_sumario, allDocs):
    print url_sumario
    content = URL(url_sumario).download()
    xml = etree.XML(content)
    ids = etree.XPath("//item/@id")
    for id in ids(xml):
        url_doc = url_boe.format(id)
        allDocs.append(url_doc)
def box_office_titles():
    # download the webpage
    html = URL(BOX_OFFICE_URL).download()
    dom = DOM(html)

    # find the movie titles
    title_elements = dom(MOVIE_TITLE_TAG)
    titles = map(lambda x: x.content, title_elements)
    return titles
def inflect(word, language="italian"): inflections = {} url = "http://en.wiktionary.org/wiki/" + word.replace(" ", "_") dom = DOM(URL(url).download(throttle=10, cached=True)) pos = "" # Search the header that marks the start for the given language: # <h2><span class="mw-headline" id="Italian">Italian</span></h2> e = dom("#" + language)[0].parent while e is not None: # e = e.next_sibling if e.type == "element": if e.tag == "hr": # Horizontal line = next language. break if e.tag == "h3": # <h3>Adjective [edit]</h3> pos = plaintext(e.content.lower()) pos = pos.replace("[edit]", "").strip()[:3].rstrip("ouer") + "-" # Parse inflections, using regular expressions. s = plaintext(e.content) # affetto m (f affetta, m plural affetti, f plural affette) if s.startswith(word): for gender, regexp, i in ( ("m" , r"(" + word + r") m", 1), ("f" , r"(" + word + r") f", 1), ("m" , r"(" + word + r") (mf|m and f)", 1), ("f" , r"(" + word + r") (mf|m and f)", 1), ("m" , r"masculine:? (\S*?)(,|\))", 1), ("f" , r"feminine:? (\S*?)(,|\))", 1), ("m" , r"(\(|, )m(asculine)? (\S*?)(,|\))", 3), ("f" , r"(\(|, )f(eminine)? (\S*?)(,|\))", 3), ("mp", r"(\(|, )m(asculine)? plural (\S*?)(,|\))", 3), ("fp", r"(\(|, )f(eminine)? plural (\S*?)(,|\))", 3), ( "p", r"(\(|, )plural (\S*?)(,|\))", 2), ( "p", r"m and f plural (\S*?)(,|\))", 1)): m = re.search(regexp, s, re.I) if m is not None: # {"adj-m": "affetto", "adj-fp": "affette"} inflections[pos + gender] = m.group(i) #print s e = e.next_sibling return inflections
def get_raw_wikihow_page(title=None):
    if title is not None and 'how to' in title.lower():
        title = title.lower().replace('how to', '', 1).strip()

    # keep to "human" articles
    #allowed_cats = ['Youth', 'Family Life', 'Relationships', 'Personal Care and Style', 'Work World']
    allowed_cats = ['Youth', 'Family Life', 'Relationships']
    main_cat = ""
    s = ""
    while main_cat not in allowed_cats:
        try:
            s = URL('http://www.wikihow.com/{}'.format(title)).read() if title is not None \
                else URL('http://www.wikihow.com/Special:Randomizer').read()
            main_cat = Element(s)('ul#breadcrumb li a')[2].string
            print(main_cat)
        except:
            time.sleep(5)
    return s
def downloadText(link, dir, filename, sleep):
    """Downloads PDF file at given link and places in dir"""
    cur = os.getcwd()
    if not os.path.exists(dir):
        os.makedirs(dir)
    os.chdir(dir)
    Ddir = os.getcwd()
    files = [f for f in os.listdir(Ddir) if os.path.isfile(os.path.join(Ddir, f))]
    if filename + '.pdf' not in files:
        url = URL(link)
        try:
            f = open(filename + '.pdf', 'wb')
            f.write(url.download(cached=False))
            f.close()
            print filename + ' stored'
            time.sleep(sleep)
        except pattern.web.HTTP500InternalServerError, e:
            print '\n ' + filename + ' link broken' + '\n '
def extract_data_ML(i):
    url = 'http://macaulaylibrary.org/audio/%s' % i
    page = URL(url).download()
    dom = DOM(page)
    description = dom('meta')[0].attr['content']
    result = [x.content for x in dom('script') if 'jwplayer(' in x.content][0]
    result = [x.strip() for x in result.split('\n') if x.strip().startswith('file')][0]
    path_to_mp3 = result.split('"')[1]
    return {'index': i, 'desc': description, 'mp3': path_to_mp3}
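# Illustrative call for extract_data_ML(), assuming the Macaulay Library page
# for the given recording id still embeds its jwplayer configuration inline;
# the id below is arbitrary, not a known-good value.
if __name__ == '__main__':
    rec = extract_data_ML(53269)
    print(rec['desc'])
    print(rec['mp3'])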
def reply_tweet(tweet, reply_id, reply_user="******"):
    from pattern.web import URL, Twitter

    tweet = reply_user + " " + tweet
    url = URL("https://api.twitter.com/1.1/statuses/update.json", method="post", query={
        "status": tweet,
        "in_reply_to_status_id": reply_id
    })
    twitter = Twitter(license=ccpattern)
    url = twitter._authenticate(url)
    try:
        # Send the post request.
        url.open()
    except Exception as e:
        print e
        print e.src
        print e.src.read()
def research_on(self, what, where):
    url = URL("https://www.pagesjaunes.fr/pagesblanches/recherche?quoiqui=" + what + "&ou=" + where + "&proximite=0")
    dom = DOM(url.download(cached=True))

    for a in dom.by_tag("div.main-title pj-on-autoload "):
        for e in a.by_tag("span.denombrement"):
            number_of_results = int(self.decode_if_unicode(plaintext(e.content))[:3])
            number_of_page_results = number_of_results / 20
            if (number_of_results % 20 > 0):
                number_of_page_results += 1

    self.exctract_values(dom, self.myInfo)

    for i in range(2, number_of_page_results + 1):
        url = URL("https://www.pagesjaunes.fr/pagesblanches/recherche?quoiqui=" + what + "&ou=" + where +
                  "&proximite=0+" "&page=" + str(i))
        dom = DOM(url.download(cached=True))
        self.exctract_values(dom, self.myInfo)

    self.myInfo.sort_and_merge()
def get(self):
    url = URL("http://www.ltconline.ca/WebWatch/ada.aspx")
    try:
        dom = DOM(url.download(cached=True))
    except (HTTP404NotFound, URLTimeout):
        return {
            "message": "LTC WebWatch service looks down",
            "status": 408,
        }, 408

    routes = []
    for a in dom("a.ada"):
        a_split = a.content.split(",")
        route = a_split[0].strip()
        try:
            route = int(route)
        except ValueError:
            pass
        routes.append({
            "route": route,
            "name": a.content.split(", ", 1)[1].strip().title(),
        })
    return routes