Example #1
def extract_data(stock_ticker):
    url_base = 'http://financials.morningstar.com/ajax/exportKR2CSV.html?&callback=?&t='
    url_end = '&region=usa&culture=en-US&cur=&order=asc'
    # May add more exchanges later on, but these cover the main US stock exchanges: Nasdaq, New York SE, and Pink Sheets (OTC stocks), respectively
    # Loops through main stock exchanges to get proper URL for data extraction
    stock_exchange_list = ['XNAS:','XNYS:','PINX:'] 
    for exchange in stock_exchange_list:
        test = URL(url_base+exchange+stock_ticker+url_end)
        if sys.getsizeof(test.download()) > 35: #A broken URL produces an empty string, which has memory size 33; size 35 allows for minor variation in the size
            break
    temp_data = 'C:/Users/Owner/Documents/temp.csv'
    f = open(temp_data, mode='w')
    try:
        f.write(test.download())
    except Exception:
        f.close()
        raise IOError('There was an error processing this data')
    f.close()
    try:
        stock_data_df = pd.read_csv(temp_data, header=2, thousands=',', index_col=0,
                                    skiprows=[19,20,31,41,42,43,48,58,53,64,65,72,73,95,101,102])
    except Exception:
        os.remove(temp_data)
        raise IOError('Problem downloading files')
    os.remove(temp_data)
    stock_data_df = stock_data_df.transpose()
    return stock_data_df
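
The size check above leans on sys.getsizeof, whose value for an empty string (33 bytes) is a CPython detail. A minimal sketch of a more direct test on the downloaded text, reusing the url_base/url_end/stock_exchange_list variables from the example:

# sketch: accept an exchange prefix only if the downloaded CSV body is non-empty
for exchange in stock_exchange_list:
    content = URL(url_base + exchange + stock_ticker + url_end).download()
    if content.strip():  # an empty string means the wrong exchange prefix, try the next one
        break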
Example #2
    def research_on(self, what, where):

        url = URL(
            "https://www.pagesjaunes.fr/pagesblanches/recherche?quoiqui=" +
            what + "&ou=" + where + "&proximite=0")
        dom = DOM(url.download(cached=True))

        for a in dom.by_tag("div.main-title pj-on-autoload "):
            for e in a.by_tag("span.denombrement"):
                number_of_results = int(
                    self.decode_if_unicode(plaintext(e.content))[:3])

        number_of_page_results = number_of_results / 20
        if (number_of_results % 20 > 0):
            number_of_page_results += 1

        self.exctract_values(dom, self.myInfo)

        for i in range(2, number_of_page_results + 1):
            url = URL(
                "https://www.pagesjaunes.fr/pagesblanches/recherche?quoiqui=" +
                what + "&ou=" + where + "&proximite=0+"
                "&page=" + str(i))
            dom = DOM(url.download(cached=True))
            self.exctract_values(dom, self.myInfo)

        self.myInfo.sort_and_merge()
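
The page count above is a round-up division over 20 results per page; an equivalent one-liner, shown only to make the rounding explicit:

# ceil(number_of_results / 20) without the separate remainder check
number_of_page_results = -(-number_of_results // 20)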
Example #3
    def cats(self, namespace=0, start=None, acmin=1, count=100, cached=True, **kwargs):
        """ Returns an iterator over all category names (for a given namespace id).
        """
        kwargs.setdefault("unicode", True)
        kwargs.setdefault("throttle", self.throttle)
        # Fetch category names: the custom "_id" is overridden so the "*" field of each result is used.
        id = kwargs.pop("_id", "title")
        id = "*"
        # Loop endlessly (= until the last request no longer yields an "accontinue").
        # See: http://www.mediawiki.org/wiki/API:Allcategories
        while start != -1:
            url = URL(self._url, method=GET, query={
                "action": "query",
                "list": "allcategories",
                "acfrom": start or "",
                "aclimit": min(count, 500),
                "acprop": "size",
                "acmin": max(1, acmin),
                "format": "json"
            })
            data = url.download(cached=cached, **kwargs)
            data = json.loads(data)
            for x in data.get("query", {}).get("allcategories", {}):
                if x.get(id):
                    x['name'] = x.pop('*')
                    yield x

            start = data.get("query-continue", {}).get("allcategories", {})
            start = start.get("accontinue", start.get("acfrom", -1))
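
A usage sketch for the generator above; wiki stands for an instance of the (unshown) MediaWiki wrapper class that provides cats() along with the _url and throttle attributes it relies on:

# print the first ten category records; each dict carries 'name' plus the ac* props requested
for i, cat in enumerate(wiki.cats(acmin=5, count=200)):
    print(cat['name'])
    if i >= 9:
        break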
Example #4
def getRandomHistoryDOM(language):
    url = URL("http://"+language+".wikipedia.org/wiki/Special:Random")
    #Gets the url only of the page this redirects to
    redirectUrl = url.redirect
    try:
        #Grab the name of the wikipedia article from the url
        urlComponents = string.split(redirectUrl, '/')
    except AttributeError:
        #Use some recursion if we encounter a page with no history, or some other error
        return getRandomHistoryDOM(language)

    #Get the history section of the article
    redirectUrl = "http://"+language+".wikipedia.org/w/index.php?title="+urlComponents[4]+"&action=history"
    print "Current article is: " +str(urlComponents[4])
    #print redirectUrl
    url = URL(redirectUrl)
    dom = DOM(url.download(cached=False))
    try:
        historyList = dom.by_id("pagehistory").by_tag("li")
        return historyList, urlComponents[4]
    except AttributeError:
        #Use some recursion if we encounter a page with no history, or some other error
        return getRandomHistoryDOM(language)
Example #5
def getContributorInfo(devUrl):

    url = URL(devUrl)
    contribInfo = json.loads(url.download())

    return contribInfo
Example #6
def download_pdfs():
    """download pdfs from fda"""

    # where to save pdfs
    path = 'classifier_docs/pdfs/'

    # create directory if it doesn't exist
    if not os.path.exists(path):
        os.makedirs(path)

    # load in non-standard pdf urls from 2012 to serve as control text
    # note: had to lookup urls manually
    # drugs are erivedge (203388) and sirturo (204384)
    # also, menhibrix (125363) has no medical review available 
    urls = ['http://www.accessdata.fda.gov/drugsatfda_docs/nda/2012/203388Orig1s000MedRpdf.pdf',
            'http://www.accessdata.fda.gov/drugsatfda_docs/nda/2012/204384Orig1s000MedR_.pdf']
    for url in urls:
        m = re.search('20..\/(\d{6})', url)
        app_num = m.group(1)
        url = URL(url)
        # make sure that url points to PDF, print error otherwise
        if url.mimetype in MIMETYPE_PDF:
            # write pdf for medical review if it doesn't exist
            fn = path + app_num + '.pdf'
            if not os.path.exists(fn):
                print "writing {} from {}".format(fn, url)
                f = open(fn, 'wb')  # write in binary mode so the PDF bytes are not mangled
                f.write(url.download(cached = False))
                f.close()
            else:
                print "{} already exists".format(fn)
        else:
            print "warning: {} did not resolve to pdf".format(url)

    return
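
As a quick check of the capture group used above, applied to the first URL in the list (the six-digit application number follows the year segment):

import re

m = re.search(r'20..\/(\d{6})',
              'http://www.accessdata.fda.gov/drugsatfda_docs/nda/2012/203388Orig1s000MedRpdf.pdf')
print(m.group(1))  # -> '203388'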
Example #7
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    # Download the HTML file
    url = URL(url)
    html = url.download()

    # Parse the HTML file into a DOM representation
    dom = DOM(html)

    # Iterate through all 250 table rows on the index page
    for movies in dom('.lister-list > tr'):
        # take the movie's href attribute and put it in href
        href = movies('td.titleColumn a')[0].attrs["href"]
        # append the href attribute to the string, but also add http://www.imdb.com/ in front of it
        movie_urls.append("http://www.imdb.com/" + href)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
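
A hedged usage sketch that only restates the contract from the docstring (a list of absolute IMDB URLs); the exact count depends on how the index page parses:

links = scrape_top_250('http://www.imdb.com/chart/top')
print(len(links))  # 250 if every table row parsed
print(all(l.startswith('http://www.imdb.com/') for l in links))  # every link should be absolute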
def download_single_image(url_link, pic_prefix_str, target_folder, image_size):
    """ Download data according to the url link given.
        Args:
            url_link (str): url str.
            pic_prefix_str (str): pic_prefix_str for unique label the pic
    """
    file_ext = os.path.splitext(url_link)[1] #use for checking valid pic ext
    temp_filename = pic_prefix_str + ".jpg"
    temp_filename_full_path = os.path.join(target_folder, temp_filename)

    valid_image_ext_list = ['.png','.jpg','.jpeg', '.gif', '.bmp', '.tiff'] #not comprehensive

    url = URL(url_link)
    if url.redirect:
        return # if there is re-direct, return

    if file_ext not in valid_image_ext_list:
        return #return if not valid image extension

    # download the image, resize it, and save it as JPEG
    print url_link
    try:
        response = url.download()
        img = resize_image(response, image_size)
        img.save(temp_filename_full_path, "JPEG")
    except Exception as e:
        #if self.__print_download_fault:
        print 'Problem with processing this data: ', str(e), url_link
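
A usage sketch for the function above; the image URL, prefix and folder are hypothetical placeholders, and image_size is whatever the surrounding code's resize_image() expects:

download_single_image('http://example.com/sample.jpg',        # hypothetical image URL
                      pic_prefix_str='sample_0001',            # hypothetical unique label
                      target_folder='classifier_docs/images',  # hypothetical output folder
                      image_size=(256, 256))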
Example #9
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    url = URL(url)
    html = url.download()
    dom = DOM(html)
    homeUrl = 'http://www.imdb.com'
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    for e in dom.by_tag("td.titleColumn"):
        absoluteUrl = ''
        for a in e.by_tag("a"):
            link = a.attributes.get("href", "")
            absoluteUrl = homeUrl + link
            movie_urls.append(absoluteUrl)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Example #10
def extract_tvseries(dom):

    url = URL(TARGET_URL)
    dom = DOM(url.download(cached=True))
    #print dom.body.content
    x = 0
    csv_row = []
    for series in dom.by_tag('td.title'):    
        title = series.by_tag('a')[0].content.encode('ascii', 'ignore')
        ranking = series.by_tag('span.value')[0].content.encode('ascii', 'ignore')
        genres = series.by_tag('span.genre')[0].by_tag('a')
        genres = [g.content.encode('ascii', 'ignore') for g in genres]
        actors = series.by_tag('span.credit')[0].by_tag('a')
        actors = [a.content.encode('ascii', 'ignore') for a in actors]
        x = x + 1
        try:
            runtime = series.by_tag('span.runtime')[0].content.encode('ascii', 'ignore')
        except:
            runtime = "Unknown"
        #print x, title, ranking, genres, actors, runtime

        csv_titles = title
        csv_ranking = ranking
        csv_genres = genres
        csv_actors = actors
        csv_runtime = runtime
        row = [csv_titles, csv_ranking, csv_genres, csv_actors, csv_runtime]
        csv_row.append(row)

    return csv_row
def scrape_education(county_num):
	if county_num<10:
		county_num = '0' + str(county_num)
	else:
		county_num = str(county_num)
	
	print county_num
	#url = 'http://dq.cde.ca.gov/dataquest/Staff/StaffEduLvl.aspx?cYear=2011-12&cChoice=CoEduc&TheCounty=01,ALAMEDA&cType=T&cGender=&Submit=1'
	url = 'http://dq.cde.ca.gov/dataquest/Staff/StaffEduLvl.aspx?cYear=2011-12&cChoice=CoEduc&TheCounty=' + county_num + '01,ALAMEDA&cType=T&cGender=&Submit=1'
	abs_url = URL(string = url)
	dom = DOM(abs_url.download(cached=True))#download the DOM

	

	other = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[12].content.replace(',','')
	associates = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[11].content.replace(',','')
	bachelors = str(int(dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[9].content.replace(',','')) + int(dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[10].content.replace(',','')))

	masters = str(int(dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[4].content.replace(',','')) + int(dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[5].content.replace(',','')))
	jurisdoctor = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[3].content.replace(',','')
	doctorate = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[2].content.replace(',','')
	
	bachelors_and_less = str(int(bachelors) + int(associates) + int(other))
	
	post_grad = str(int(masters) + int(jurisdoctor) + int(doctorate))
	
	county = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("a")[0].content

	# write all the collected data to a new row of the output file
	writer.writerow([county, bachelors_and_less, post_grad, associates, bachelors, masters, jurisdoctor, doctorate])
Example #12
def summarize(query=None, k=4,url=None):
    j = []
    if url:
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        j = [word for sentence in j for word in sentence.split() if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word or '"' in word]
        j = ' '.join(j)
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [sentence for sentence in sentences if len(sentence)>1 and sentence != '']
        for sentence in sentences:
            lsa1.parse(sentence)
    else:
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = query.split('.')
        for sentence in sentences:
            lsa1.parse(sentence)
    lsa1.build()
    lsa1.calc()
    summary =[(sentences[i], norm(dot(diag(lsa1.S),lsa1.Vt[:,b]),2)) for i in range(len(sentences)) for b in range(len(lsa1.Vt))]
    summary = dict((v[0],v) for v in sorted(summary, key=lambda summary: summary[1])).values()
    return '.'.join([a for a, b in summary][len(summary)-(k):])
Example #13
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    
    top_250_url = URL(url)
    top_250_html = top_250_url.download(cached=True)
    top_250_dom = DOM(top_250_html)

    for a in top_250_dom.by_tag("td.titleColumn"):
        for b in a.by_tag("a"):
            link_ext = b.attrs["href"].encode("utf-8")
            link_base = "http://www.imdb.com"
            link = link_base+link_ext
            movie_urls.append(link)
             

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Example #14
def summarize(query=None, k=4, url=None):
    j = []
    if url:
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        j = [
            word for sentence in j for word in sentence.split()
            if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word
            or '"' in word
        ]
        j = ' '.join(j)
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [
            sentence for sentence in sentences
            if len(sentence) > 1 and sentence != ''
        ]
        for sentence in sentences:
            lsa1.parse(sentence)
    else:
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = query.split('.')
        for sentence in sentences:
            lsa1.parse(sentence)
    lsa1.build()
    lsa1.calc()
    summary = [(sentences[i], norm(dot(diag(lsa1.S), lsa1.Vt[:, b]), 2))
               for i in range(len(sentences)) for b in range(len(lsa1.Vt))]
    summary = dict(
        (v[0], v)
        for v in sorted(summary, key=lambda summary: summary[1])).values()
    return '.'.join([a for a, b in summary][len(summary) - (k):])
Example #15
def dl_byUrllib2(url, filename):
    myurl = URL(url)
    if os.path.exists(filename):
        return
    with open(filename,'wb') as fp:
        fp.write(myurl.download(cached=False))
Example #16
def get_patent_urls(keyword, limit=10):
    keyword = urllib.quote_plus(keyword)
    base_url = "http://www.lens.org"
    url = URL(base_url + "/lens/search?ft=true&l=en&st=true&n=" + str(limit) + "&q=" + keyword)
    dom = DOM(url.download())
    links = [base_url + a.attributes.get("href") for a in dom("a.link")]
    return links
Example #17
def get_patent(url):
    url = URL(url + "/fulltext")
    html = url.download()
    dom = DOM(html)
    title = plaintext(dom("h3 a")[0].content)
    body = plaintext(dom("#contents")[0].content)
    return [title, body]
Example #18
def google_search(targetword, itemlist, targetpath):
    resultnum = 0
    engine = Google(license=None)
    file = codecs.open(targetpath, 'a', 'utf-8')
    patt = ur'\W+'
    for item in itemlist:
        for i in range(1, 5):
            for result in engine.search(item, type=SEARCH, start=i):
                url = URL(result.url)
                text = url.download(unicode=True)
                text = plaintext(text)
                text = correctPersianString(text)
                text = text.replace('\n', ' ')
                lines = text.split('.')
                for line in lines:
                    if targetword in line:
                        match = re.findall(patt, line)
                        output = ' '.join(match)
                        # use a separate loop variable so the outer search term is not overwritten
                        for punc in punclist:
                            if punc in line:
                                line = line.replace(punc, ' ')
                        print output
                        file.write(output)
                        file.write('\n')
                        resultnum += 1
    print str(resultnum) + " found in web"
    file.close()
Example #19
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    url = URL(url)
    html = url.download()
    dom = DOM(html)
    homeUrl = 'http://www.imdb.com'
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    for e in dom.by_tag("td.titleColumn"):
        absoluteUrl = ''
        for a in e.by_tag("a"):
            link = a.attributes.get("href","")
            absoluteUrl = homeUrl + link
            movie_urls.append(absoluteUrl)
        
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Example #20
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''

    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    from pattern.web import abs
    url = URL("http://www.imdb.com/chart/top")
    dom = DOM(url.download(cached = True))
    for e in dom.by_tag("td.titleColumn")[:250]:
        for link in e.by_tag("a"):
            link = link.attrs.get("href","")
            link = abs(link, base=url.redirect or url.string)
            movie_urls.append(link)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def scrape_starrtest(county_num):
	if county_num<10:
		county_num = '0' + str(county_num)
	else:
		county_num = str(county_num)
	
	print county_num
	#url = 'http://star.cde.ca.gov/star2012/ViewReport.aspx?ps=true&lstTestYear=2012&lstTestType=X&lstCounty=01&lstDistrict=&lstSchool=&lstGroup=1&lstSubGroup=1'
	url = 'http://star.cde.ca.gov/star2012/ViewReport.aspx?ps=true&lstTestYear=2012&lstTestType=X&lstCounty=' + str(county_num) + '&lstDistrict=&lstSchool=&lstGroup=1&lstSubGroup=1'
	abs_url = URL(string = url)
	dom = DOM(abs_url.download(cached=True))#download the DOM

	
	#sciend_num = dom.by_class("rm")[4].content
	scicst_num = dom.by_class("rm")[3].content
	math_num = dom.by_class("rm")[2].content
	hist_num = dom.by_class("rm")[1].content
	ela_num = dom.by_class("rm")[0].content
	
	#sciend_percent = dom.by_class("rs")[4].content[:5]
	scicst_percent = dom.by_class("rs")[3].content[:5]
	math_percent = dom.by_class("rs")[2].content[:5]
	hist_percent = dom.by_class("rs")[1].content[:5]
	ela_percent = dom.by_class("rs")[0].content[:5]
	
	county = dom.by_tag("h2")[0].content
	
	
	# write all the collected data to a new row of the output file
	writer.writerow([county, ela_num,ela_percent, hist_num, hist_percent, math_num, math_percent,scicst_num, scicst_percent])
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    #absolute_url = 'http://www.imdb.com'

    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    url = URL(url)
    dom = DOM(url.download(cached=True))

    #return dom

    for e in dom('.titleColumn'):
        for link in e('a'):
            # pattern.web's abs() needs the page URL as base to build an absolute link
            movie_urls.append(abs(link.attributes.get('href'), base=url.redirect or url.string))

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Example #23
def summarize_evaluation(query=None, url=None, summary=None):
    j=[]
    if url:
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        j = [word for sentence in j for word in sentence.split() if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word or '"' in word]
        j = ' '.join(j)
        lsa = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [sentence for sentence in sentences if len(sentence)>1 and sentence != '']
        for sentence in sentences:
            lsa.parse(sentence)
    else:
        lsa = LSA(stopwords, ignore_characters)
        for sentence in query:
            lsa.parse(sentence)
    lsa.build()
    lsa.calc()
    lsa2 = LSA(stopwords, ignore_characters)
    for sentence in summary:
        lsa2.parse(sentence)
    lsa2.build()
    lsa2.calc()
    vectors =[(dot(lsa.S,lsa.U[0,:]),dot(lsa.S,lsa.U[i,:])) for i in range(len(lsa.U))]
    vectors2 =[(dot(lsa2.S,lsa2.U[0,:]),dot(lsa2.S,lsa2.U[i,:])) for i in range(len(lsa2.U))]
    angles = [arccos(dot(a,b)/(norm(a,2)*norm(b,2))) for a in vectors for b in vectors2]
    return str(abs(1 - float(angles[1])/float(pi/2)))
Example #24
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.
    Args:
        url: pattern.web.URL instance pointing to the top 250 index page
    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    #absolute_url = 'http://www.imdb.com'

    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    url = URL(url)
    dom = DOM(url.download(cached=True))
    
    for e in dom('.titleColumn'):
        for link in e('a'):
            # pattern.web's abs() needs the page URL as base to build an absolute link
            movie_urls.append(abs(link.attributes.get('href'), base=url.redirect or url.string))

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def extract_incidents(dom):

    incident_list = []
    i = 0

    for incident in dom.by_tag('tr'):
        if i > 0:
            link = INCIDENT_URL + incident.by_tag('a')[0].href
            print link

            url = URL(link)
            html = url.download(timeout=100)
            dom_incident = DOM(html)

            weapons = [weapon.strip() for weapon in dom_incident.by_tag('p')[16].content[27:].split('<br />')]
            weapons = ", ".join(weapons)[:-2]
            latitude = dom_incident.by_tag('p')[2].content[33:].strip()
            longitude = dom_incident.by_tag('p')[3].content[34:].strip()

            description = incident.by_tag('div')[0].content[1:].strip()
            date = incident.by_tag('td')[2].content[1:].strip()
            location = incident.by_tag('td')[3].content[1:].strip()
            violation = incident.by_tag('td')[4].content[1:].strip()
            incident_list.append([link.encode('utf-8'), location.encode('utf-8'), latitude.encode('utf-8'), longitude.encode('utf-8'), date.encode('utf-8'), violation.encode('utf-8'), weapons.encode('utf-8'), description.encode('utf-8')])

        i += 1

    return incident_list
Example #26
def convertMapData():
    print '[2/2] Convert map data'

    # output dictionary
    d3mapData = {}

    # download the file
    url = URL(DATASET3)
    data = url.download()

    # create array
    data = list(json.loads(data))

    # fill output dictionary
    for dataRow in data:
        if dataRow['Year'] == '2014':
            population = dataRow['Value']
            fillColor = defineColor(dataRow['Value'])
            d3mapData[dataRow['Country Code']] = {'population': population, 'fillKey': fillColor}

    print '[2/2] Write to json'

    # write output dictionary to json file
    with open('D3LinkedViews/data_map.json', 'wb') as output_file:
        json.dump(d3mapData, output_file)

    print '[2/2] Finish'
Example #27
def get_patent(url):
    url = URL(url + "/fulltext")
    html = url.download()
    dom = DOM(html)
    title = plaintext(dom('h3 a')[0].content)
    body = plaintext(dom('#contents')[0].content)
    return [title, body]
Example #28
 def plainTextConverter(self, link, metodo="SinEtiquetas"):
     reload(sys)
     sys.setdefaultencoding('utf-8')
     url = URL(link)
     txtContent = ""
     try:
         if url.mimetype in MIMETYPE_PDF:
             document = open('temp.pdf', 'wb')  # binary mode so the PDF bytes survive intact
             document.write(url.download())
             document.close()
             #txtContent=os.system('pdf2txt.py temp.pdf')
             txtContent = commands.getoutput('pdf2txt.py temp.pdf')
         else:
             page = URL(url).download(user_agent='Mozilla/5')
             if metodo == "mantenerEtiquetas":
                 txtContent = plaintext(page,
                                        keep={
                                            'title': [],
                                            'h1': [],
                                            'h2': [],
                                            'strong': []
                                        })
             else:
                 txtContent = plaintext(page, keep={})
     except:
         pass
     return txtContent
Example #29
def dl_byUrllib2(url, filename):
    myurl = URL(url)
    if os.path.exists(filename):
        return
    with open(filename,'wb') as fp:
        fp.write(myurl.download(cached=False))
    def download_single_image(self, url_link, pic_prefix_str):
        """ Download data according to the url link given.
            Args:
                url_link (str): url str.
                pic_prefix_str (str): pic_prefix_str for unique label the pic
        """
        self.download_fault = 0
        file_ext = os.path.splitext(url_link)[1] #use for checking valid pic ext
        temp_filename = pic_prefix_str + file_ext
        temp_filename_full_path = os.path.join(self.gs_raw_dirpath, temp_filename )
 
        valid_image_ext_list = ['.png','.jpg','.jpeg', '.gif', '.bmp', '.tiff'] #not comprehensive
 
        url = URL(url_link)
        if url.redirect:
            return # if there is re-direct, return
 
        if file_ext not in valid_image_ext_list:
            return #return if not valid image extension
 
        f = open(temp_filename_full_path, 'wb')  # open the image file for binary writing
        print url_link
        self.pic_info_list.append(pic_prefix_str + ': ' + url_link )
        try:
            f.write(url.download())#if have problem skip
        except:
            #if self.__print_download_fault:
            print 'Problem with processing this data: ', url_link
            self.download_fault =1
        f.close()
Example #31
def downloadPDFs(dictListJSON, state, jsonExists = False):
    #state = dictListJSON[0, 2]
    dlJSONFile = open(dictListJSON, "r")
    dictJSON = json.load(dlJSONFile)
    dlJSONFile.close()
    #some condition to check if the JSON already exists
    if jsonExists:
        pdfDictList = dictJSON
    else:
        pdfDictList = findPDFLinks(dictJSON, state)


    count = 0
    for record in pdfDictList:
        #test if date > 01/01/13
        fileName = "".join(str(record["AdvertiserInfo"]).split())
        print "Writing to " + fileName
        url = record["PDFLink"]
        url = re.sub(' ', '%20', url)
        print url
        if url != "NO URL":
            urlOpened = URL(url)
            f = open(fileName, 'wb')
            #download to state pdfs directory
            f.write(urlOpened.download(cached=False))
            f.close()
        count += 1
        if count > 4:
            break
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''

    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    from pattern.web import abs
    url = URL("http://www.imdb.com/chart/top")
    dom = DOM(url.download(cached=True))
    for e in dom.by_tag("td.titleColumn")[:250]:
        for link in e.by_tag("a"):
            link = link.attrs.get("href", "")
            link = abs(link, base=url.redirect or url.string)
            movie_urls.append(link)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Example #33
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    
    top_250_url = URL(url)
    top_250_html = top_250_url.download(cached=True)
    top_250_dom = DOM(top_250_html)

    for a in top_250_dom.by_tag("td.titleColumn"):
        for b in a.by_tag("a"):
            link_ext = b.attrs["href"].encode("utf-8")
            link_base = "http://www.imdb.com"
            link = link_base+link_ext
            movie_urls.append(link)
             

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
    def downloading_csv(self, download_type = 'hist'):
        """ Download the csv information for particular stock.
            download_type can be hist or div. If hist, will download the hist price.
            If div, will download dividend history.
            Kwargs:
                download_type (str): hist or div (default hist).
        """
        self.download_fault = 0

        if download_type == 'hist':
            target_url = self.hist_quotes_full_url
            sav_filename = os.path.join(self.hist_quotes_csvfile_path,'hist_stock_price_'+ self.individual_stock_sym+ '.csv')
        elif download_type == 'div':
            target_url = self.div_history_full_url
            sav_filename = os.path.join(self.hist_quotes_csvfile_path,'div_hist_'+ self.individual_stock_sym+ '.csv')
        else:
            print 'wrong download type'
            raise ValueError('wrong download type')

        url = URL(target_url)
        f = open(self.tempfile_sav_location, 'wb')  # write the csv to the temporary location first
        try:
            f.write(url.download())#if have problem skip
        except:
            if self.__print_download_fault: print 'Problem with processing this data: ', target_url
            self.download_fault =1
        f.close()

        if not self.download_fault:
            if self.enable_save_raw_file:
                shutil.copyfile(self.tempfile_sav_location,sav_filename )
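
A hedged usage sketch for the method above; scraper stands for an instance of the surrounding (unshown) class, which must already define the hist/div URL attributes, hist_quotes_csvfile_path, individual_stock_sym and tempfile_sav_location used in the body:

scraper.downloading_csv(download_type='hist')  # saves hist_stock_price_<symbol>.csv when raw-file saving is enabled
scraper.downloading_csv(download_type='div')   # saves div_hist_<symbol>.csv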
    def download_single_image(self, url_link, pic_prefix_str):
        """ Download data according to the url link given.
            Args:
                url_link (str): url str.
                pic_prefix_str (str): pic_prefix_str for unique label the pic
        """
        self.download_fault = 0
        file_ext = os.path.splitext(url_link)[
            1]  #use for checking valid pic ext
        temp_filename = pic_prefix_str + file_ext
        temp_filename_full_path = os.path.join(self.gs_raw_dirpath,
                                               temp_filename)

        valid_image_ext_list = [
            '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff'
        ]  #not comprehensive

        url = URL(url_link)
        if url.redirect:
            return  # if there is re-direct, return

        if file_ext not in valid_image_ext_list:
            return  #return if not valid image extension

        f = open(temp_filename_full_path, 'wb')  # open the image file for binary writing
        print url_link
        self.pic_info_list.append(pic_prefix_str + ': ' + url_link)
        try:
            f.write(url.download())  #if have problem skip
        except:
            #if self.__print_download_fault:
            print 'Problem with processing this data: ', url_link
            self.download_fault = 1
        f.close()
Example #36
def process_page():

    url = URL("http://www.imdb.com/search/title?num_votes=5000,&sort=user_rating,desc&start=1&title_type=tv_series")
    dom = DOM(url.download(cached=True))
    domIndex = 0

    for title in dom.by_class("title"):

        theTitle = str(title.by_tag("a")[0].content).encode('ascii', 'replace')
        titleCatalog.append(Title(theTitle))
    
        try:

            match = re.search("^(\d+).*$", str(dom.by_class("runtime")[domIndex].content).encode('ascii', 'replace'))
            #print match.group(1)
            # titleCatalog[domIndex].addRunTime( str(dom.by_class("runtime")[domIndex].content).encode('ascii', 'replace'))
            titleCatalog[domIndex].addRunTime(match.group(1))

        except Exception, e:
            pass

        try:
            titleCatalog[domIndex].addRank(str(dom.by_class("value")[domIndex].content).encode('ascii', 'replace'))
        except Exception, e:
            pass

        # advance to the next title's runtime and rank entries
        domIndex += 1
Example #37
def getQuotes(sym):
	frontUrl = "http://real-chart.finance.yahoo.com/table.csv?s="
	endUrl = "&amp;a=10&amp;b=8&amp;c=1997&amp;d=10&amp;e=8&amp;f=2015&amp;g=d&amp;ignore=.csv"
	
	failed = []
	count = 1

	for ticker in sym:
		fname = "quotes/" + ticker + ".csv"
		df = object()
		tickerUrl = frontUrl + ticker + endUrl
		url = URL(tickerUrl)
		f = open(fname, 'wb')
		try:
			f.write(url.download())
		except:
			print "quotes csv download failed: " + ticker
			failed.append(ticker)
			count += 1
			continue
		f.close()
		count+=1
		print "progress: " + str(count) + "/" + str(len(sym))

	return failed
Example #38
def getQuotes(sym):
    frontUrl = "http://real-chart.finance.yahoo.com/table.csv?s="
    endUrl = "&amp;a=10&amp;b=8&amp;c=1997&amp;d=10&amp;e=8&amp;f=2015&amp;g=d&amp;ignore=.csv"

    failed = []
    count = 1

    for ticker in sym:
        fname = "quotes/" + ticker + ".csv"
        df = object()
        tickerUrl = frontUrl + ticker + endUrl
        url = URL(tickerUrl)
        f = open(fname, 'wb')
        try:
            f.write(url.download())
        except:
            print "quotes csv download failed: " + ticker
            failed.append(ticker)
            count += 1
            continue
        f.close()
        count += 1
        print "progress: " + str(count) + "/" + str(len(sym))

    return failed
Example #39
class documento(object):
    """docstring for documento"""

    url = ""
    clase = ""
    atributos = {}
    query = ""
    html = ""
    contenido = ""
    elemento = None
    # unDocumento.elemento('a[href=""]') to get only the links that have an href attribute

    def __init__(self,url, query):
        super(documento, self).__init__()
        self.url = url
        self.urlObjet = URL('http://www.clips.ua.ac.be')
        self.html = self.urlObjet.download(user_agent='Mozilla/5.0')
        self.contenido = plaintext(self.html, keep=[], replace=blocks, linebreaks=2, indentation=False)
        self.elemento = Element(self.html)

    def save(self,arg):
        pass

    def descargar(self, arg):
        pass

    def obtenerAtributos(self,arg):
        pass

    def setUnAtributo(self,atributo,valor):
        pass
def scrape_top_250(url):
    """
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    """

    # This piece of code is needed to use the dom structure while it is not given as argument.
    TOP_250_URL = "http://www.imdb.com/chart/top"
    top_250_url = URL(TOP_250_URL)
    top_250_html = top_250_url.download(cached=True)
    dom = DOM(top_250_html)
    movie_urls = []

    """
    Searches in the HTML of the top 250 page of IMDB for the urls of the individual pages per film.
    Uses CSS selectors to find the right urls and subsequently places them in a list
    """

    for e in dom.by_tag("td.titleColumn"):
        for a in e.by_tag("a")[:1]:
            main = "http://www.imdb.com"
            Locallink = main + a.attrs["href"]
            movie_urls.append(Locallink)
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Example #41
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''

    # This piece of code is needed to use the dom structure while it is not given as argument.
    TOP_250_URL = 'http://www.imdb.com/chart/top'
    top_250_url = URL(TOP_250_URL)
    top_250_html = top_250_url.download(cached=True)
    dom = DOM(top_250_html)
    movie_urls = []

    '''
    Searches in the HTML of the top 250 page of IMDB for the urls of the individual pages per film.
    Uses CSS selectors to find the right urls and subsequently places them in a list
    '''

    for e in dom.by_tag("td.titleColumn"): 
        for a in e.by_tag("a")[:1]:
            main = "http://www.imdb.com"
            Locallink = main + a.attrs["href"]
            movie_urls.append(Locallink)
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Example #42
def get_by_year(year):

    url = URL("http://www.imdb.com/event/ev0000003/" + str(year))
    dom = DOM(url.download(cached=True))
    
    dictAll = {}
    
    awards = dom.by_class('award')
    awardTitles = awards[0].by_tag('h2')
    awardList = []
    for award in awardTitles:
        awardList.append(award.content)

    prize = awards[0].by_tag('blockquote')
    for index, title in enumerate(prize[1:25]):
        winner = title.by_tag('strong')[0].by_tag('a')[0].content
        winner_id = str(title.by_tag('strong')[0].by_tag('a')[0].attrs['href'][-8:-1])

        nomineeList = []
        for each in title.by_tag('strong')[1::]:
            name = each.by_tag('a')[0].content
            id = str(each.by_tag('a')[0].attrs['href'][-8:-1])
            nomineeList.append((clean_unicode(name),id))
            
        winnersAndNominees = {}
        winnersAndNominees['winner'] = (clean_unicode(winner),winner_id)
        winnersAndNominees['nominees'] = nomineeList
        dictAll[awardList[index]] =  winnersAndNominees
    return dictAll
Example #43
def download_single_image(url_link, pic_prefix_str, target_folder, image_size):
    """ Download data according to the url link given.
        Args:
            url_link (str): url str.
            pic_prefix_str (str): pic_prefix_str for unique label the pic
    """
    file_ext = os.path.splitext(url_link)[1]  #use for checking valid pic ext
    temp_filename = pic_prefix_str + ".jpg"
    temp_filename_full_path = os.path.join(target_folder, temp_filename)

    valid_image_ext_list = ['.png', '.jpg', '.jpeg', '.gif', '.bmp',
                            '.tiff']  #not comprehensive

    url = URL(url_link)
    if url.redirect:
        return  # if there is re-direct, return

    if file_ext not in valid_image_ext_list:
        return  #return if not valid image extension

    # download the image, resize it, and save it as JPEG
    print url_link
    try:
        response = url.download()
        img = resize_image(response, image_size)
        img.save(temp_filename_full_path, "JPEG")
    except Exception as e:
        #if self.__print_download_fault:
        print 'Problem with processing this data: ', str(e), url_link
Example #44
def loadPage(numPage):
    #Load the content from the given page
    url = URL(url_estruc_1 + str(numPage) + url_estruc_2)
    dom = DOM(url.download(cached=True))
    for row in dom(ROWS_PATH)[1:]:
        #pprint.pprint(plaintext(row(CELLS_PATH)[0].content))
        RESULTS.append({"place": plaintext(row(CELLS_PATH)[0].content), "place_gender": plaintext(row(CELLS_PATH)[1].content) })
    pprint.pprint(str(numPage + 1) + "/" + str(last_page))
Example #45
def all_lyrics(artist):
    clean = re.sub(r"\s+|'", '', artist)
    url = URL(BASE_URL + artist[0] + '/' + clean + '.html')
    dom = DOM(url.download())
    titles = [a.content for a in dom('div#listAlbum a')]
    ew_amazon = [
        abs(link.attributes.get('href', ''), base=url.redirect or url.string)
        for link in dom('div#listAlbum a')
    ]
    songlinks = [l for l in ew_amazon if 'amazon' not in l]
    lyrics = []
    for link in songlinks:
        song_url = URL(link)
        song_dom = DOM(song_url.download())
        lyrics.append(plaintext(song_dom('div#main div')[4:5][0].content))
    zippy_lyrics = zip(titles, lyrics)
    return json.dumps(zippy_lyrics, sort_keys=True)
Example #46
 def downloading_csv(self, url_address):
     """ Download the csv information from the url_address given.
     """
     cache.clear()
     url = URL(url_address)
     f = open(self.cur_quotes_csvfile, 'wb')  # open the quotes csv file for writing
     f.write(url.download())
     f.close()
Example #47
def read_web(url):
    html = ''
    start = etime()
    try:
        uri = URL(url)
        html = uri.download(cached=True)
    except Exception, e:
        print 'HTTP Error:' + str(e.message)
 def downloading_xml(self, url_address):
     """ Download the xml information from the url_address given.
     """
     cache.clear()
     url = URL(url_address)
     f = open(self.feeds_xmlfile, 'wb')  # open the feeds xml file for writing
     f.write(url.download())
     f.close()
Example #49
def get_patent_urls(keyword, limit=10):
    keyword = urllib.quote_plus(keyword)
    base_url = "http://www.lens.org"
    url = URL(base_url + "/lens/search?ft=true&l=en&st=true&n=" + str(limit) +
              "&q=" + keyword)
    dom = DOM(url.download())
    links = [base_url + a.attributes.get('href') for a in dom('a.link')]
    return links
Example #50
 def get_dom_object(self, url_target):
     try:
         url = URL(url_target)
         dom_object = DOM(url.download(cached=True))
     except:
         print('Problem retrieving data for this url: ',
               self.target_url_str)
         self.url_query_timeout = 1
     return dom_object
Example #51
def scrape(url):
    with io.open("allMusicOneWeek.csv", "w", encoding="utf8") as f:
        url = "http://www.top40.nl/top40/2015/week-46"
        week = url.split("/")
        week = week[-1]
        url = URL("http://www.top40.nl/top40/2015/week-46")
        dom = DOM(url.download(cached=True))
        # gives the week
        i = 1
        # select the top 40 list

        for l in dom.by_tag("ol.top40"):
            # select each track
            print "lijst top 40"
            for e in l.by_tag("div.clearfix"):
                muziekGegevens = ""
                # position in the top 40
                muziekGegevens += str(i) + ","
                print i, 'positie'
                i += 1  # careful with resetting
                # select the artist
                for artiest in e.by_class("credit"):
                    muziekGegevens += artiest.content + ","
                # position
                for inner in e.by_tag("strong")[1:2]:
                    print inner.content, "1:2"
                    muziekGegevens += inner.content + ","
                # highest position reached
                for inner in e.by_tag("strong")[2:3]:
                    print inner.content, "2:3"
                    muziekGegevens += inner.content + ","
                # number of points
                for inner in e.by_tag("strong")[3:4]:
                    print inner.content, "3:4"
                    muziekGegevens += inner.content + ","
                # year of the track
                for inner in e.by_tag("strong")[4:5]:
                    print inner.content.strip(), "4:5"
                    muziekGegevens += inner.content.strip()
                h = HTMLParser.HTMLParser()
                muziekGegevens = h.unescape(muziekGegevens)

                if not whatisthis(muziekGegevens):
                    muziekGegevens = unicode(muziekGegevens, "utf-8")
                    f.write(muziekGegevens + "\n")
                else:
                    f.write(muziekGegevens + "\n")

    #                     1 positie
    # week-45
    # ,1,
    # Traceback (most recent call last):
    #   File "testhtmlscraper.py", line 58, in <module>
    #     f.write(muziekGegevens + "\n")
    # TypeError: must be unicode, not str ???
Example #52
    def download_json(self):
        """ Download the json file from the self.com_data_full_url.
            The save file is defaulted to the self.saved_json_file.

        """
        cache.clear()
        url = URL(self.com_data_full_url)
        f = open(self.saved_json_file, 'wb')  # open the target json file for writing
        f.write(url.download(timeout=50))  # increase the timeout for this download
        f.close()
Example #53
    def extract_percentages(dom):
        file_url = URL(TARGET_URL)
        file_dom = DOM(file_url.download())

        percentage_list = []
        if file_dom.by_class('percentage'):
            for item in file_dom.by_class('percentage'):
                percentage_list.append(item.content.encode('utf-8'))
            return percentage_list[0]
        else:
            return "nodata"
Example #54
 def create_dom_object(self):
     """ Create dom object based on element for scraping
         Take into consideration that there might be query problem.
         
     """
     try:
         url = URL(self.full_url_str)
         self.dom_object = DOM(url.download(cached=True))
     except:
         if self.__print_url_finding_error:
             print 'Problem retrieving data for this url: ', self.full_url_str
         self.url_query_timeout = 1
Example #55
def getKeyRatios(marketUrl, marketSbls, valDict):
    krUrl = "http://financials.morningstar.com/ajax/exportKR2CSV.html?&callback=?&t="
    endUrl = "&region=usa&culture=en-US&cur=&order=asc"
    temp = "temp/mskr.csv"
    #the ratios we want we define here
    indexes = [
        'Dividends USD', 'Payout Ratio %', 'Shares Mil', 'Return on Assets %',
        'Return on Equity %', 'Current Ratio', 'Quick Ratio', 'Debt/Equity'
    ]
    columns = [str(year) for year in range(2006, 2015)] + ['TTM']
    failed = []
    count = 1

    for ticker in marketSbls:
        #the df to store the data we extract
        refinedDf = pd.DataFrame(columns=columns, index=indexes)
        #the df to store all the data from the csv downloaded
        df = object()
        tickerUrl = krUrl + marketUrl + ticker + endUrl
        url = URL(tickerUrl)
        f = open(temp, 'wb')
        try:
            #actually download
            f.write(url.download())
        except:
            print "could not download csv: " + ticker
            count += 1
            failed.append(ticker)
            continue
        f.close()
        try:
            #turn csv into dataframe
            df = pd.read_csv(temp, header=2, thousands=",", index_col=0)
        except:
            count += 1
            failed.append(ticker)
            continue
        #change nans to dash
        df = df.fillna('-')
        #rename columns
        df.columns = columns
        #extract the rows needed
        for year in columns:
            refinedDf[year] = df[year][indexes]

        #append to valuation dataframe
        combined = valDict[ticker].append(refinedDf)
        print "saving combined KR and val data to db: " + ticker
        #save combined df to csv, into database
        combined.to_csv("db/" + ticker + ".csv", mode='w', encoding='utf-8')
        print "Progress: " + str(count) + "/" + str(len(marketSbls))
        count += 1
    return failed
 def descargarContenidoHtml(self, url):
     try:
         unaUrl = URL(url)
         if "pdf" in extension(unaUrl.page):
             return self.descargarPDF(unaUrl)
         else:
             return unaUrl.download()
     except Exception as e:
         try:
             return self.urlLibDescarga(url)
         except Exception as e:
             print "except " + str(e)
             print url
Example #57
    def downloading_csv(self):
        """ Download the csv information for particular stock.
 
        """
        self.download_fault = 0
 
        url = URL(self.com_data_full_url)
        f = open(self.ms_stats_extract_temp_csv, 'wb')  # open the temporary csv file for writing
        try:
            f.write(url.download())#if have problem skip
        except:
            if self.__print_download_fault: print 'Problem with processing this data: ', self.com_data_full_url
            self.download_fault =1
        f.close()
 def descargarContenido(self, url):
     """Metodo para descargar el contenido de los documentos webs siendo url o pdf"""
     try:
         unaUrl = URL(url)
         if "pdf" in extension(unaUrl.page):
             return self.descargarPDF(unaUrl)
         else:
             return plaintext(unaUrl.download())
     except Exception as e:
         try:
             return plaintext(self.urlLibDescarga(url))
         except Exception as e:
             print "except " + str(e)
             print url
Example #59
def conjugate(verb, language="italian"):

    url = URL("http://en.wiktionary.org/wiki/%s" % verb)
    dom = DOM(url.download(throttle=10, cached=True))
    conj = {"infinitive": verb}
    mood = None

    for table in dom("table.inflection-table"):

        # Search the header that marks the start for the given language:
        # <h2><span class="mw-headline" id="Italian">Italian</span></h2>

        h2 = table.parent.parent

        while h2:
            h2 = h2.previous

            if getattr(h2, "tag", "") == "h2" and \
               getattr(h2("span")[0], "id", "") != language:
                continue

        for tr in table("tr"):

            for th in tr("th"):

                # <th>indicative</th>

                if th.content in MOOD:
                    mood = th.content

                # <th>present</th><td>sono</td><td>sei></td>...

                if th.content in TENSE:
                    conj[th.content,
                         mood] = [plain(td.content) for td in tr("td")]

                # <th>gerund</th><td>essendo</td>

                if th.content in PARTICIPLE:
                    conj[th.content] = plain(th.next.next.content)

            # <th>imperative</th></tr><tr><td></td><td>sii</td>...

            if mood == "imperative" and len(tr("th")) == 0:
                conj["present", mood] = [plain(td.content) for td in tr("td")]

        return conj

    return {}
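
A usage sketch, assuming the MOOD, TENSE and PARTICIPLE constants plus the plain() helper referenced above are defined; the forms in the inline comments ('sono', 'sei', 'essendo', 'sii') come from "essere", so that is the natural test verb:

conj = conjugate("essere", language="Italian")
print(conj["infinitive"])                   # 'essere'
print(conj.get(("present", "indicative")))  # e.g. ['sono', 'sei', ...] if the page parsed as expected
print(conj.get("gerund"))                   # e.g. 'essendo'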