def extract_data(stock_ticker):
    url_base = 'http://financials.morningstar.com/ajax/exportKR2CSV.html?&callback=?&t='
    url_end = '&region=usa&culture=en-US&cur=&order=asc'
    # May add more exchanges later on, but these cover the main US stock exchanges:
    # Nasdaq, New York SE, and Pink Sheets (OTC stocks), respectively.
    # Loops through the main stock exchanges to get the proper URL for data extraction.
    stock_exchange_list = ['XNAS:', 'XNYS:', 'PINX:']
    for exchange in stock_exchange_list:
        test = URL(url_base + exchange + stock_ticker + url_end)
        # A broken URL produces an empty string, which has memory size 33;
        # size 35 allows for minor variation in the size.
        if sys.getsizeof(test.download()) > 35:
            break
    temp_data = 'C:/Users/Owner/Documents/temp.csv'
    f = open(temp_data, mode='w')
    try:
        f.write(test.download())
    except:
        f.close()
        raise IOError('There was an error processing this data')
    f.close()
    try:
        stock_data_df = pd.read_csv(temp_data, header=2, thousands=',', index_col=0,
                                    skiprows=[19, 20, 31, 41, 42, 43, 48, 58, 53, 64, 65, 72, 73, 95, 101, 102])
    except:
        os.remove(temp_data)
        raise IOError('Problem downloading files')
    os.remove(temp_data)
    stock_data_df = stock_data_df.transpose()
    return stock_data_df
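# Usage sketch (not from the original source): 'AAPL' is a hypothetical ticker.
# extract_data() returns a pandas DataFrame, transposed so each reporting
# period is a row and each Morningstar key ratio is a column.
def _demo_extract_data():
    df = extract_data('AAPL')  # tries XNAS:, then XNYS:, then PINX:
    print df.index         # the reporting periods (fiscal years + TTM)
    print df.columns[:5]   # ratio names depend on Morningstar's CSV layout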
def research_on(self, what, where):
    url = URL("https://www.pagesjaunes.fr/pagesblanches/recherche?quoiqui="
              + what + "&ou=" + where + "&proximite=0")
    dom = DOM(url.download(cached=True))
    number_of_page_results = 0
    for a in dom.by_tag("div.main-title pj-on-autoload "):
        for e in a.by_tag("span.denombrement"):
            number_of_results = int(self.decode_if_unicode(plaintext(e.content))[:3])
            # 20 results per page; add one page for any remainder.
            number_of_page_results = number_of_results / 20
            if (number_of_results % 20 > 0):
                number_of_page_results += 1
    self.exctract_values(dom, self.myInfo)
    for i in range(2, number_of_page_results + 1):
        url = URL("https://www.pagesjaunes.fr/pagesblanches/recherche?quoiqui="
                  + what + "&ou=" + where + "&proximite=0&page=" + str(i))
        dom = DOM(url.download(cached=True))
        self.exctract_values(dom, self.myInfo)
    self.myInfo.sort_and_merge()
def cats(self, namespace=0, start=None, acmin=1, count=100, cached=True, **kwargs):
    """ Returns an iterator over all category names (for a given namespace id).
    """
    kwargs.setdefault("unicode", True)
    kwargs.setdefault("throttle", self.throttle)
    # Pop "_id" so it is not passed on to download(); the allcategories API
    # stores each category name under the "*" key.
    id = kwargs.pop("_id", "title")
    id = "*"
    # Loop endlessly (= until the last request no longer yields an "accontinue").
    # See: http://www.mediawiki.org/wiki/API:Allpages
    while start != -1:
        url = URL(self._url, method=GET, query={
            "action": "query",
            "list": "allcategories",
            "acfrom": start or "",
            "aclimit": min(count, 500),
            "acprop": "size",
            "acmin": max(1, acmin),
            "format": "json"
        })
        data = url.download(cached=cached, **kwargs)
        data = json.loads(data)
        for x in data.get("query", {}).get("allcategories", {}):
            if x.get(id):
                x['name'] = x.pop('*')
                yield x
        start = data.get("query-continue", {}).get("allcategories", {})
        start = start.get("accontinue", start.get("acfrom", -1))
    # A generator should simply return, not raise StopIteration (PEP 479).
    return
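# Usage sketch (assumption: `wiki` is an instance of the class defining cats(),
# e.g. a MediaWiki engine object with _url and throttle attributes). Each
# yielded dict carries the 'name' key set above plus the requested size fields.
def _demo_cats(wiki):
    for i, cat in enumerate(wiki.cats(acmin=5, count=100)):
        print cat['name']
        if i >= 9:  # stop after ten categories
            break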
def getRandomHistoryDOM(language):
    url = URL("http://" + language + ".wikipedia.org/wiki/Special:Random")
    # Gets the url only of the page this redirects to
    redirectUrl = url.redirect
    try:
        # Grab the name of the wikipedia article from the url
        urlComponents = redirectUrl.split('/')
    except AttributeError:
        # Use some recursion if we encounter a page with no history, or some other error
        return getRandomHistoryDOM(language)
    # Get the history section of the article
    redirectUrl = "http://" + language + ".wikipedia.org/w/index.php?title=" + urlComponents[4] + "&action=history"
    print "Current article is: " + str(urlComponents[4])
    url = URL(redirectUrl)
    dom = DOM(url.download(cached=False))
    try:
        historyList = dom.by_id("pagehistory").by_tag("li")
        return historyList, urlComponents[4]
    except AttributeError:
        # Use some recursion if we encounter a page with no history, or some other error
        return getRandomHistoryDOM(language)
def getContributorInfo(devUrl):
    url = URL(devUrl)
    contribInfo = json.loads(url.download())
    return contribInfo
def download_pdfs():
    """download pdfs from fda"""
    # where to save pdfs
    path = 'classifier_docs/pdfs/'
    # create directory if it doesn't exist
    if not os.path.exists(path):
        os.makedirs(path)
    # load in non-standard pdf urls from 2012 to serve as control text
    # note: had to lookup urls manually
    # drugs are erivedge (203388) and sirturo (204384)
    # also, menhibrix (125363) has no medical review available
    urls = ['http://www.accessdata.fda.gov/drugsatfda_docs/nda/2012/203388Orig1s000MedRpdf.pdf',
            'http://www.accessdata.fda.gov/drugsatfda_docs/nda/2012/204384Orig1s000MedR_.pdf']
    for url in urls:
        m = re.search(r'20../(\d{6})', url)
        app_num = m.group(1)
        url = URL(url)
        # make sure that url points to PDF, print error otherwise
        if url.mimetype in MIMETYPE_PDF:
            # write pdf for medical review if it doesn't exist
            fn = path + app_num + '.pdf'
            if not os.path.exists(fn):
                print "writing {} from {}".format(fn, url)
                f = open(fn, 'wb')  # binary mode, since PDFs are not text
                f.write(url.download(cached=False))
                f.close()
            else:
                print "{} already exists".format(fn)
        else:
            print "warning: {} did not resolve to pdf".format(url)
    return
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    # Download the HTML file
    url = URL(url)
    html = url.download()
    # Parse the HTML file into a DOM representation
    dom = DOM(html)
    # Iterate through all 250 table rows on the index page
    for movies in dom('.lister-list > tr'):
        # take the movie's href attribute and put it in href
        href = movies('td.titleColumn a')[0].attrs["href"]
        # the href is relative (it starts with a slash), so prepend the domain
        movie_urls.append("http://www.imdb.com" + href)
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
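# Usage sketch shared by the scrape_top_250() variants in this file (the chart
# URL is real, but IMDB's markup changes over time, so the CSS selectors above
# may need updating):
def _demo_scrape_top_250():
    movie_urls = scrape_top_250("http://www.imdb.com/chart/top")
    print len(movie_urls)  # expected: 250 absolute URLs
    print movie_urls[0]    # e.g. http://www.imdb.com/title/...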
def download_single_image(url_link, pic_prefix_str, target_folder, image_size):
    """ Download data according to the url link given.
        Args:
            url_link (str): url str.
            pic_prefix_str (str): pic_prefix_str for unique label the pic
    """
    file_ext = os.path.splitext(url_link)[1]  # use for checking valid pic ext
    temp_filename = pic_prefix_str + ".jpg"
    temp_filename_full_path = os.path.join(target_folder, temp_filename)
    valid_image_ext_list = ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff']  # not comprehensive
    url = URL(url_link)
    if url.redirect:
        return  # if there is re-direct, return
    if file_ext not in valid_image_ext_list:
        return  # return if not valid image extension
    print url_link
    try:
        response = url.download()
        img = resize_image(response, image_size)
        img.save(temp_filename_full_path, "JPEG")
    except Exception as e:
        #if self.__print_download_fault:
        print 'Problem with processing this data: ', str(e), url_link
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    url = URL(url)
    html = url.download()
    dom = DOM(html)
    homeUrl = 'http://www.imdb.com'
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    for e in dom.by_tag("td.titleColumn"):
        absoluteUrl = ''
        for a in e.by_tag("a"):
            link = a.attributes.get("href", "")
            absoluteUrl = homeUrl + link
        movie_urls.append(absoluteUrl)
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def extract_tvseries(dom):
    url = URL(TARGET_URL)
    dom = DOM(url.download(cached=True))
    csv_rows = []
    for series in dom.by_tag('td.title'):
        title = series.by_tag('a')[0].content.encode('ascii', 'ignore')
        ranking = series.by_tag('span.value')[0].content.encode('ascii', 'ignore')
        genres = series.by_tag('span.genre')[0].by_tag('a')
        genres = [g.content.encode('ascii', 'ignore') for g in genres]
        actors = series.by_tag('span.credit')[0].by_tag('a')
        actors = [a.content.encode('ascii', 'ignore') for a in actors]
        # Not every series lists a runtime.
        try:
            runtime = series.by_tag('span.runtime')[0].content.encode('ascii', 'ignore')
        except:
            runtime = "Unknown"
        csv_rows.append([title, ranking, genres, actors, runtime])
    return csv_rows
def scrape_education(county_num):
    if county_num < 10:
        county_num = '0' + str(county_num)
    else:
        county_num = str(county_num)
    print county_num
    # Example: 'http://dq.cde.ca.gov/dataquest/Staff/StaffEduLvl.aspx?cYear=2011-12&cChoice=CoEduc&TheCounty=01,ALAMEDA&cType=T&cGender=&Submit=1'
    url = 'http://dq.cde.ca.gov/dataquest/Staff/StaffEduLvl.aspx?cYear=2011-12&cChoice=CoEduc&TheCounty=' + county_num + '01,ALAMEDA&cType=T&cGender=&Submit=1'
    abs_url = URL(string=url)
    dom = DOM(abs_url.download(cached=True))  # download the DOM
    # All the county totals sit in the cells of the "gdTotal" row.
    cells = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")
    other = cells[12].content.replace(',', '')
    associates = cells[11].content.replace(',', '')
    bachelors = str(int(cells[9].content.replace(',', '')) + int(cells[10].content.replace(',', '')))
    masters = str(int(cells[4].content.replace(',', '')) + int(cells[5].content.replace(',', '')))
    jurisdoctor = cells[3].content.replace(',', '')
    doctorate = cells[2].content.replace(',', '')
    bachelors_and_less = str(int(bachelors) + int(associates) + int(other))
    post_grad = str(int(masters) + int(jurisdoctor) + int(doctorate))
    county = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("a")[0].content
    # write all the collected data to a new row of the output file
    writer.writerow([county, bachelors_and_less, post_grad, associates, bachelors,
                     masters, jurisdoctor, doctorate])
def summarize(query=None, k=4, url=None):
    j = []
    if url:
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        j = [word for sentence in j for word in sentence.split()
             if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word or '"' in word]
        j = ' '.join(j)
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [sentence for sentence in sentences if len(sentence) > 1 and sentence != '']
        for sentence in sentences:
            lsa1.parse(sentence)
    else:
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = query.split('.')
        for sentence in sentences:
            lsa1.parse(sentence)
    lsa1.build()
    lsa1.calc()
    # Score every sentence by the norm of its weighted LSA vector, then keep
    # the k highest-scoring ones (deduplicated by sentence).
    summary = [(sentences[i], norm(dot(diag(lsa1.S), lsa1.Vt[:, b]), 2))
               for i in range(len(sentences)) for b in range(len(lsa1.Vt))]
    summary = dict((v[0], v) for v in sorted(summary, key=lambda summary: summary[1])).values()
    return '.'.join([a for a, b in summary][len(summary) - k:])
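# Usage sketch (hypothetical URL; assumes the LSA class, stopwords and
# ignore_characters referenced above are defined in this module):
def _demo_summarize():
    text = summarize(url="http://en.wikipedia.org/wiki/Latent_semantic_analysis", k=3)
    print text  # the 3 sentences that score highest in the LSA space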
def dl_byUrllib2(url, filename):
    myurl = URL(url)
    if os.path.exists(filename):
        return
    with open(filename, 'wb') as fp:
        fp.write(myurl.download(cached=False))
def get_patent_urls(keyword, limit=10):
    keyword = urllib.quote_plus(keyword)
    base_url = "http://www.lens.org"
    url = URL(base_url + "/lens/search?ft=true&l=en&st=true&n=" + str(limit) + "&q=" + keyword)
    dom = DOM(url.download())
    links = [base_url + a.attributes.get("href") for a in dom("a.link")]
    return links
def get_patent(url):
    url = URL(url + "/fulltext")
    html = url.download()
    dom = DOM(html)
    title = plaintext(dom("h3 a")[0].content)
    body = plaintext(dom("#contents")[0].content)
    return [title, body]
def google_search(targetword, itemlist, targetpath):
    resultnum = 0
    engine = Google(license=None)
    outfile = codecs.open(targetpath, 'a', 'utf-8')
    patt = ur'\W+'
    for item in itemlist:
        for i in range(1, 5):
            for result in engine.search(item, type=SEARCH, start=i):
                url = URL(result.url)
                text = url.download(unicode=True)
                text = plaintext(text)
                text = correctPersianString(text)
                text = text.replace('\n', ' ')
                lines = text.split('.')
                for line in lines:
                    if targetword in line:
                        match = re.findall(patt, line)
                        output = ' '.join(match)
                        for punc in punclist:
                            if punc in line:
                                line = line.replace(punc, ' ')
                        print output
                        outfile.write(output)
                        outfile.write('\n')
                        resultnum += 1
    print str(resultnum) + " found in web"
    outfile.close()
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    from pattern.web import abs
    url = URL("http://www.imdb.com/chart/top")
    dom = DOM(url.download(cached=True))
    for e in dom.by_tag("td.titleColumn")[:250]:
        for link in e.by_tag("a"):
            link = link.attrs.get("href", "")
            # abs() resolves the relative href against the page URL.
            link = abs(link, base=url.redirect or url.string)
            movie_urls.append(link)
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def scrape_starrtest(county_num):
    if county_num < 10:
        county_num = '0' + str(county_num)
    else:
        county_num = str(county_num)
    print county_num
    # Example: 'http://star.cde.ca.gov/star2012/ViewReport.aspx?ps=true&lstTestYear=2012&lstTestType=X&lstCounty=01&lstDistrict=&lstSchool=&lstGroup=1&lstSubGroup=1'
    url = 'http://star.cde.ca.gov/star2012/ViewReport.aspx?ps=true&lstTestYear=2012&lstTestType=X&lstCounty=' + str(county_num) + '&lstDistrict=&lstSchool=&lstGroup=1&lstSubGroup=1'
    abs_url = URL(string=url)
    dom = DOM(abs_url.download(cached=True))  # download the DOM
    #sciend_num = dom.by_class("rm")[4].content
    scicst_num = dom.by_class("rm")[3].content
    math_num = dom.by_class("rm")[2].content
    hist_num = dom.by_class("rm")[1].content
    ela_num = dom.by_class("rm")[0].content
    #sciend_percent = dom.by_class("rs")[4].content[:5]
    scicst_percent = dom.by_class("rs")[3].content[:5]
    math_percent = dom.by_class("rs")[2].content[:5]
    hist_percent = dom.by_class("rs")[1].content[:5]
    ela_percent = dom.by_class("rs")[0].content[:5]
    county = dom.by_tag("h2")[0].content
    # write all the collected data to a new row of the output file
    writer.writerow([county, ela_num, ela_percent, hist_num, hist_percent,
                     math_num, math_percent, scicst_num, scicst_percent])
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    url = URL(url)
    dom = DOM(url.download(cached=True))
    for e in dom('.titleColumn'):
        for link in e('a'):
            # pattern.web's abs() needs a base URL to resolve the relative href.
            movie_urls.append(abs(link.attributes.get('href'), base=url.redirect or url.string))
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def summarize_evaluation(query=None, url=None, summary=None):
    j = []
    if url:
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        j = [word for sentence in j for word in sentence.split()
             if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word or '"' in word]
        j = ' '.join(j)
        lsa = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [sentence for sentence in sentences if len(sentence) > 1 and sentence != '']
        for sentence in sentences:
            lsa.parse(sentence)
    else:
        lsa = LSA(stopwords, ignore_characters)
        for sentence in query:
            lsa.parse(sentence)
    lsa.build()
    lsa.calc()
    lsa2 = LSA(stopwords, ignore_characters)
    for sentence in summary:
        lsa2.parse(sentence)
    lsa2.build()
    lsa2.calc()
    # Compare the first document vector against every other one in both spaces.
    vectors = [(dot(lsa.S, lsa.U[0, :]), dot(lsa.S, lsa.U[i, :])) for i in range(len(lsa.U))]
    vectors2 = [(dot(lsa2.S, lsa2.U[0, :]), dot(lsa2.S, lsa2.U[i, :])) for i in range(len(lsa2.U))]
    angles = [arccos(dot(a, b) / (norm(a, 2) * norm(b, 2))) for a in vectors for b in vectors2]
    return str(abs(1 - float(angles[1]) / float(pi / 2)))
def extract_incidents(dom):
    incident_list = []
    # Skip the header row, then follow each incident's detail link.
    for incident in dom.by_tag('tr')[1:]:
        link = INCIDENT_URL + incident.by_tag('a')[0].href
        print link
        url = URL(link)
        html = url.download(timeout=100)
        dom_incident = DOM(html)
        weapons = [weapon.strip() for weapon in dom_incident.by_tag('p')[16].content[27:].split('<br />')]
        weapons = ", ".join(weapons)[:-2]
        latitude = dom_incident.by_tag('p')[2].content[33:].strip()
        longitude = dom_incident.by_tag('p')[3].content[34:].strip()
        description = incident.by_tag('div')[0].content[1:].strip()
        date = incident.by_tag('td')[2].content[1:].strip()
        location = incident.by_tag('td')[3].content[1:].strip()
        violation = incident.by_tag('td')[4].content[1:].strip()
        incident_list.append([link.encode('utf-8'), location.encode('utf-8'),
                              latitude.encode('utf-8'), longitude.encode('utf-8'),
                              date.encode('utf-8'), violation.encode('utf-8'),
                              weapons.encode('utf-8'), description.encode('utf-8')])
    return incident_list
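# Usage sketch (assumes INCIDENT_URL is the module-level base URL whose index
# page holds the incident table; the hard-coded offsets above are tied to that
# page's layout):
def _demo_extract_incidents():
    dom = DOM(URL(INCIDENT_URL).download())
    for row in extract_incidents(dom)[:3]:
        print row[0], row[4]  # the link and date columns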
def convertMapData():
    print '[2/2] Convert map data'
    # output dictionary
    d3mapData = {}
    # download the file
    url = URL(DATASET3)
    data = url.download()
    # create array
    data = list(json.loads(data))
    # fill output dictionary
    for dataRow in data:
        if dataRow['Year'] == '2014':
            population = dataRow['Value']
            fillColor = defineColor(dataRow['Value'])
            d3mapData[dataRow['Country Code']] = {'population': population, 'fillKey': fillColor}
    print '[2/2] Write to json'
    # write output dictionary to json file
    with open('D3LinkedViews/data_map.json', 'wb') as output_file:
        json.dump(d3mapData, output_file)
    print '[2/2] Finish'
def plainTextConverter(self, link, metodo="SinEtiquetas"):
    reload(sys)
    sys.setdefaultencoding('utf-8')
    url = URL(link)
    txtContent = ""
    try:
        if url.mimetype in MIMETYPE_PDF:
            # Write the PDF bytes to a temp file, then shell out to pdf2txt.
            with open('temp.pdf', 'wb') as document:
                document.write(url.download())
            #txtContent = os.system('pdf2txt.py temp.pdf')
            txtContent = commands.getoutput('pdf2txt.py temp.pdf')
        else:
            page = url.download(user_agent='Mozilla/5')
            if metodo == "mantenerEtiquetas":
                txtContent = plaintext(page, keep={'title': [], 'h1': [], 'h2': [], 'strong': []})
            else:
                txtContent = plaintext(page, keep={})
    except:
        pass
    return txtContent
def download_single_image(self, url_link, pic_prefix_str):
    """ Download data according to the url link given.
        Args:
            url_link (str): url str.
            pic_prefix_str (str): pic_prefix_str for unique label the pic
    """
    self.download_fault = 0
    file_ext = os.path.splitext(url_link)[1]  # use for checking valid pic ext
    temp_filename = pic_prefix_str + file_ext
    temp_filename_full_path = os.path.join(self.gs_raw_dirpath, temp_filename)
    valid_image_ext_list = ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff']  # not comprehensive
    url = URL(url_link)
    if url.redirect:
        return  # if there is re-direct, return
    if file_ext not in valid_image_ext_list:
        return  # return if not valid image extension
    f = open(temp_filename_full_path, 'wb')
    print url_link
    self.pic_info_list.append(pic_prefix_str + ': ' + url_link)
    try:
        f.write(url.download())  # if have problem skip
    except:
        #if self.__print_download_fault:
        print 'Problem with processing this data: ', url_link
        self.download_fault = 1
    f.close()
def downloadPDFs(dictListJSON, state, jsonExists=False):
    #state = dictListJSON[0, 2]
    dlJSONFile = open(dictListJSON, "r")
    dictJSON = json.load(dlJSONFile)
    dlJSONFile.close()
    # some condition to check if the JSON already exists
    if jsonExists:
        pdfDictList = dictJSON
    else:
        pdfDictList = findPDFLinks(dictJSON, state)
    count = 0
    for entry in pdfDictList:
        # test if date > 01/01/13
        fileName = "".join(str(entry["AdvertiserInfo"]).split())
        print "Writing to " + fileName
        url = entry["PDFLink"]
        url = re.sub(' ', '%20', url)
        print url
        if url != "NO URL":
            urlOpened = URL(url)
            f = open(fileName, 'wb')
            # download to state pdfs directory
            f.write(urlOpened.download(cached=False))
            f.close()
            count += 1
        if count > 4:
            break
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    top_250_url = URL(url)
    top_250_html = top_250_url.download(cached=True)
    top_250_dom = DOM(top_250_html)
    for a in top_250_dom.by_tag("td.titleColumn"):
        for b in a.by_tag("a"):
            link_ext = b.attrs["href"].encode("utf-8")
            link_base = "http://www.imdb.com"
            link = link_base + link_ext
            movie_urls.append(link)
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def downloading_csv(self, download_type='hist'):
    """ Download the csv information for particular stock.
        download_type can be hist or div. If hist, will download the hist price.
        If div, will download dividend history.
        Kwargs:
            download_type (str): hist or div (default hist).
    """
    self.download_fault = 0
    if download_type == 'hist':
        target_url = self.hist_quotes_full_url
        sav_filename = os.path.join(self.hist_quotes_csvfile_path,
                                    'hist_stock_price_' + self.individual_stock_sym + '.csv')
    elif download_type == 'div':
        target_url = self.div_history_full_url
        sav_filename = os.path.join(self.hist_quotes_csvfile_path,
                                    'div_hist_' + self.individual_stock_sym + '.csv')
    else:
        raise ValueError('wrong download type')
    url = URL(target_url)
    f = open(self.tempfile_sav_location, 'wb')
    try:
        f.write(url.download())  # if have problem skip
    except:
        if self.__print_download_fault:
            print 'Problem with processing this data: ', target_url
        self.download_fault = 1
    f.close()
    if not self.download_fault:
        if self.enable_save_raw_file:
            shutil.copyfile(self.tempfile_sav_location, sav_filename)
def process_page():
    url = URL("http://www.imdb.com/search/title?num_votes=5000,&sort=user_rating,desc&start=1&title_type=tv_series")
    dom = DOM(url.download(cached=True))
    domIndex = 0
    for title in dom.by_class("title"):
        theTitle = str(title.by_tag("a")[0].content).encode('ascii', 'replace')
        titleCatalog.append(Title(theTitle))
        try:
            # Keep only the leading digits of e.g. "60 mins."
            match = re.search(r"^(\d+).*$", str(dom.by_class("runtime")[domIndex].content).encode('ascii', 'replace'))
            titleCatalog[domIndex].addRunTime(match.group(1))
        except Exception, e:
            pass
        try:
            titleCatalog[domIndex].addRank(str(dom.by_class("value")[domIndex].content).encode('ascii', 'replace'))
        except Exception, e:
            pass
        # Advance to the next runtime/value cell along with the next title.
        domIndex += 1
def getQuotes(sym):
    frontUrl = "http://real-chart.finance.yahoo.com/table.csv?s="
    endUrl = "&a=10&b=8&c=1997&d=10&e=8&f=2015&g=d&ignore=.csv"
    failed = []
    count = 1
    for ticker in sym:
        fname = "quotes/" + ticker + ".csv"
        tickerUrl = frontUrl + ticker + endUrl
        url = URL(tickerUrl)
        f = open(fname, 'wb')
        try:
            f.write(url.download())
        except:
            print "quotes csv download failed: " + ticker
            failed.append(ticker)
            count += 1
            f.close()
            continue
        f.close()
        count += 1
        print "progress: " + str(count) + "/" + str(len(sym))
    return failed
class documento(object):
    """Wraps a downloaded web page: its URL, raw HTML, plain text and DOM element."""
    url = ""
    clase = ""
    atributos = {}
    query = ""
    html = ""
    contenido = ""
    elemento = None  # unDocumento.elemento('a[href=""]') to get only the links that have an href

    def __init__(self, url, query):
        super(documento, self).__init__()
        self.url = url
        # Download the page at the given url (rather than a hard-coded site).
        self.urlObjet = URL(url)
        self.html = self.urlObjet.download(user_agent='Mozilla/5.0')
        self.contenido = plaintext(self.html, keep=[], replace=blocks, linebreaks=2, indentation=False)
        self.elemento = Element(self.html)

    def save(self, arg):
        pass

    def descargar(self, arg):
        pass

    def obtenerAtributos(self, arg):
        pass

    def setUnAtributo(self, atributo, valor):
        pass
def scrape_top_250(url):
    """
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    """
    # This piece of code is needed to use the dom structure while it is not given as argument.
    TOP_250_URL = "http://www.imdb.com/chart/top"
    top_250_url = URL(TOP_250_URL)
    top_250_html = top_250_url.download(cached=True)
    dom = DOM(top_250_html)
    movie_urls = []
    # Searches the HTML of the IMDB top 250 page for the urls of the individual
    # film pages, using CSS selectors, and places them in a list.
    for e in dom.by_tag("td.titleColumn"):
        for a in e.by_tag("a")[:1]:
            main = "http://www.imdb.com"
            Locallink = main + a.attrs["href"]
            movie_urls.append(Locallink)
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def get_by_year(year):
    url = URL("http://www.imdb.com/event/ev0000003/" + str(year))
    dom = DOM(url.download(cached=True))
    dictAll = {}
    awards = dom.by_class('award')
    awardTitles = awards[0].by_tag('h2')
    awardList = []
    for award in awardTitles:
        awardList.append(award.content)
    prize = awards[0].by_tag('blockquote')
    for index, title in enumerate(prize[1:25]):
        winner = title.by_tag('strong')[0].by_tag('a')[0].content
        winner_id = str(title.by_tag('strong')[0].by_tag('a')[0].attrs['href'][-8:-1])
        nomineeList = []
        for each in title.by_tag('strong')[1::]:
            name = each.by_tag('a')[0].content
            id = str(each.by_tag('a')[0].attrs['href'][-8:-1])
            nomineeList.append((clean_unicode(name), id))
        winnersAndNominees = {}
        winnersAndNominees['winner'] = (clean_unicode(winner), winner_id)
        winnersAndNominees['nominees'] = nomineeList
        dictAll[awardList[index]] = winnersAndNominees
    return dictAll
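# Usage sketch (ev0000003 is IMDB's Academy Awards event page; the year is
# arbitrary). The returned dict maps award names to winner/nominee pairs.
def _demo_get_by_year():
    awards = get_by_year(2014)
    for award in list(awards)[:3]:
        print award, '->', awards[award]['winner']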
def loadPage(numPage):
    # Load the content from the given page
    url = URL(url_estruc_1 + str(numPage) + url_estruc_2)
    dom = DOM(url.download(cached=True))
    for row in dom(ROWS_PATH)[1:]:
        #pprint.pprint(plaintext(row(CELLS_PATH)[0].content))
        RESULTS.append({"place": plaintext(row(CELLS_PATH)[0].content),
                        "place_gender": plaintext(row(CELLS_PATH)[1].content)})
    pprint.pprint(str(numPage + 1) + "/" + str(last_page))
def all_lyrics(artist):
    # Strip whitespace and apostrophes to build the artist's page name.
    clean = re.sub(r"\s+|'", '', artist)
    url = URL(BASE_URL + artist[0] + '/' + clean + '.html')
    dom = DOM(url.download())
    titles = [a.content for a in dom('div#listAlbum a')]
    # Resolve relative hrefs, then drop the Amazon affiliate links.
    ew_amazon = [abs(link.attributes.get('href', ''), base=url.redirect or url.string)
                 for link in dom('div#listAlbum a')]
    songlinks = [l for l in ew_amazon if 'amazon' not in l]
    lyrics = []
    for link in songlinks:
        song_url = URL(link)
        song_dom = DOM(song_url.download())
        lyrics.append(plaintext(song_dom('div#main div')[4:5][0].content))
    zippy_lyrics = zip(titles, lyrics)
    return json.dumps(zippy_lyrics, sort_keys=True)
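# Usage sketch (assumes BASE_URL points at the lyrics site's index by first
# letter; the artist name is arbitrary). The function returns a JSON string of
# (title, lyrics) pairs.
def _demo_all_lyrics():
    payload = all_lyrics('adele')
    print json.loads(payload)[0]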
def downloading_csv(self, url_address):
    """ Download the csv information from the url_address given.
    """
    cache.clear()
    url = URL(url_address)
    f = open(self.cur_quotes_csvfile, 'wb')  # save to the csv file
    f.write(url.download())
    f.close()
def read_web(url):
    html = ''
    start = etime()
    try:
        uri = URL(url)
        html = uri.download(cached=True)
    except Exception, e:
        print 'HTTP Error:' + str(e.message)
    return html
def downloading_xml(self, url_address):
    """ Download the xml information from the url_address given.
    """
    cache.clear()
    url = URL(url_address)
    f = open(self.feeds_xmlfile, 'wb')  # save to the xml file
    f.write(url.download())
    f.close()
def get_dom_object(self, url_target):
    dom_object = None  # so a failed download still returns a defined value
    try:
        url = URL(url_target)
        dom_object = DOM(url.download(cached=True))
    except:
        print('Problem retrieving data for this url: ', url_target)
        self.url_query_timeout = 1
    return dom_object
def scrape(url):
    with io.open("allMusicOneWeek.csv", "w", encoding="utf8") as f:
        url = "http://www.top40.nl/top40/2015/week-46"
        week = url.split("/")[-1]
        url = URL(url)
        dom = DOM(url.download(cached=True))
        i = 1
        # select the top 40 list
        for l in dom.by_tag("ol.top40"):
            # select each song
            for e in l.by_tag("div.clearfix"):
                muziekGegevens = ""
                # position in the top 40
                muziekGegevens += str(i) + ","
                i += 1
                # select the artist
                for artiest in e.by_class("credit"):
                    muziekGegevens += artiest.content + ","
                # previous position
                for inner in e.by_tag("strong")[1:2]:
                    muziekGegevens += inner.content + ","
                # highest position
                for inner in e.by_tag("strong")[2:3]:
                    muziekGegevens += inner.content + ","
                # number of points
                for inner in e.by_tag("strong")[3:4]:
                    muziekGegevens += inner.content + ","
                # year of the song
                for inner in e.by_tag("strong")[4:5]:
                    muziekGegevens += inner.content.strip()
                h = HTMLParser.HTMLParser()
                muziekGegevens = h.unescape(muziekGegevens)
                # io.open expects unicode, so decode byte strings before writing
                if not whatisthis(muziekGegevens):
                    muziekGegevens = unicode(muziekGegevens, "utf-8")
                f.write(muziekGegevens + "\n")
def download_json(self):
    """ Download the json file from the self.com_data_full_url.
        The save file is defaulted to the self.saved_json_file.
    """
    cache.clear()
    url = URL(self.com_data_full_url)
    f = open(self.saved_json_file, 'wb')  # save to the json file
    f.write(url.download(timeout=50))  # increase the timeout for this download
    f.close()
def extract_percentages(dom):
    file_url = URL(TARGET_URL)
    file_dom = DOM(file_url.download())
    percentage_list = []
    if file_dom.by_class('percentage'):
        for item in file_dom.by_class('percentage'):
            percentage_list.append(item.content.encode('utf-8'))
        return percentage_list[0]
    else:
        return "nodata"
def create_dom_object(self):
    """ Create dom object based on element for scraping
        Take into consideration that there might be query problem.
    """
    try:
        url = URL(self.full_url_str)
        self.dom_object = DOM(url.download(cached=True))
    except:
        if self.__print_url_finding_error:
            print 'Problem retrieving data for this url: ', self.full_url_str
        self.url_query_timeout = 1
def getKeyRatios(marketUrl, marketSbls, valDict):
    krUrl = "http://financials.morningstar.com/ajax/exportKR2CSV.html?&callback=?&t="
    endUrl = "&region=usa&culture=en-US&cur=&order=asc"
    temp = "temp/mskr.csv"
    # the ratios we want we define here
    indexes = ['Dividends USD', 'Payout Ratio %', 'Shares Mil',
               'Return on Assets %', 'Return on Equity %',
               'Current Ratio', 'Quick Ratio', 'Debt/Equity']
    columns = [str(year) for year in range(2006, 2015)] + ['TTM']
    failed = []
    count = 1
    for ticker in marketSbls:
        # the df to store the data we extract
        refinedDf = pd.DataFrame(columns=columns, index=indexes)
        tickerUrl = krUrl + marketUrl + ticker + endUrl
        url = URL(tickerUrl)
        f = open(temp, 'wb')
        try:
            # actually download
            f.write(url.download())
        except:
            print "could not download csv: " + ticker
            count += 1
            failed.append(ticker)
            f.close()
            continue
        f.close()
        try:
            # turn csv into dataframe
            df = pd.read_csv(temp, header=2, thousands=",", index_col=0)
        except:
            count += 1
            failed.append(ticker)
            continue
        # change nans to dash
        df = df.fillna('-')
        # rename columns
        df.columns = columns
        # extract rows needed
        for year in columns:
            refinedDf[year] = df[year][indexes]
        # append to valuation dataframe
        combined = valDict[ticker].append(refinedDf)
        print "saving combined KR and val data to db: " + ticker
        # save combined df to csv, into database
        combined.to_csv("db/" + ticker + ".csv", mode='w', encoding='utf-8')
        print "Progress: " + str(count) + "/" + str(len(marketSbls))
        count += 1
    return failed
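# Usage sketch (hypothetical symbols; assumes valDict maps each ticker to a
# previously built valuation DataFrame, since the key ratios are appended to
# it before being written to db/<ticker>.csv):
def _demo_getKeyRatios(valDict):
    failed = getKeyRatios("XNAS:", ["AAPL", "MSFT"], valDict)
    print "failed tickers:", failed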
def descargarContenidoHtml(self, url):
    try:
        unaUrl = URL(url)
        if "pdf" in extension(unaUrl.page):
            return self.descargarPDF(unaUrl)
        else:
            return unaUrl.download()
    except Exception as e:
        # Fall back to a plain urllib download if pattern.web fails.
        try:
            return self.urlLibDescarga(url)
        except Exception as e:
            print "except " + str(e)
            print url
def downloading_csv(self):
    """ Download the csv information for particular stock.
    """
    self.download_fault = 0
    url = URL(self.com_data_full_url)
    f = open(self.ms_stats_extract_temp_csv, 'wb')  # save to the temp csv file
    try:
        f.write(url.download())  # if have problem skip
    except:
        if self.__print_download_fault:
            print 'Problem with processing this data: ', self.com_data_full_url
        self.download_fault = 1
    f.close()
def descargarContenido(self, url):
    """Download the content of web documents, whether HTML or PDF."""
    try:
        unaUrl = URL(url)
        if "pdf" in extension(unaUrl.page):
            return self.descargarPDF(unaUrl)
        else:
            return plaintext(unaUrl.download())
    except Exception as e:
        # Fall back to a plain urllib download if pattern.web fails.
        try:
            return plaintext(self.urlLibDescarga(url))
        except Exception as e:
            print "except " + str(e)
            print url
def conjugate(verb, language="italian"): url = URL("http://en.wiktionary.org/wiki/%s" % verb) dom = DOM(url.download(throttle=10, cached=True)) conj = {"infinitive": verb} mood = None for table in dom("table.inflection-table"): # Search the header that marks the start for the given language: # <h2><span class="mw-headline" id="Italian">Italian</span></h2> h2 = table.parent.parent while h2: h2 = h2.previous if getattr(h2, "tag", "") == "h2" and \ getattr(h2("span")[0], "id", "") != language: continue for tr in table("tr"): for th in tr("th"): # <th>indicative</th> if th.content in MOOD: mood = th.content # <th>present</th><td>sono</td><td>sei></td>... if th.content in TENSE: conj[th.content, mood] = [plain(td.content) for td in tr("td")] # <th>gerund</th><td>essendo</td> if th.content in PARTICIPLE: conj[th.content] = plain(th.next.next.content) # <th>imperative</th></tr><tr><td></td><td>sii</td>... if mood == "imperative" and len(tr("th")) == 0: conj["present", mood] = [plain(td.content) for td in tr("td")] return conj return {}