Example #1
def parse(html):
    '''
    Parse the page. If the content received is ERROR_NUM, the request timed
    out and there is nothing to parse; otherwise extract each good's id,
    name, price and stat, and write them to a file named after the date.
    '''
    if not html:
        logger.info('======pass parse=====')
        return {}

    items = {}
#    print isinstance(html, str)
    parse_page = BeautifulSoup(html)
    goods = parse_page.find_all('div', class_='goods-content')

    for good in goods:

        good_id = good['nctype_goods'][1:]  # the attribute value starts with a space

        good_name = good.select('div[class="goods-name"]')[0].a.text.replace(',', '_')

        good_price = good.select('em[class="sale-price"]')[0].text
        if re.findall(u'\u4e07', good_price):  # handle prices like '1.3万' (i.e. 1.3 * 10000)
            good_price = str(float(good_price[:-1])*10000)
        else:  # strip the RMB currency sign from the price
            good_price = good_price[1:]

        good_stat = good.select('a[class="status"]')[0].text

        items[good_id] = good_name + ',' + good_price + ',' + good_stat

    return items
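To see what this parser produces, here is a minimal, self-contained sketch that runs the same extraction steps against a hand-written fragment of the goods markup; the HTML below is an assumption about the page structure, not the real page.

from bs4 import BeautifulSoup

sample_html = '''
<div class="goods-content" nctype_goods=" 12345">
  <div class="goods-name"><a>Sample, item</a></div>
  <em class="sale-price">¥199</em>
  <a class="status">in stock</a>
</div>
'''

soup = BeautifulSoup(sample_html, 'html.parser')
good = soup.find_all('div', class_='goods-content')[0]
good_id = good['nctype_goods'][1:]                                # drop the leading space
good_name = good.select('div[class="goods-name"]')[0].a.text.replace(',', '_')
good_price = good.select('em[class="sale-price"]')[0].text[1:]    # strip the currency sign
good_stat = good.select('a[class="status"]')[0].text
print({good_id: ','.join([good_name, good_price, good_stat])})
# {'12345': 'Sample_ item,199,in stock'}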
Example #2
def moderate_tags(html):
    """replaces instances of <a> and <img>
    with "item in moderation" alerts
    """
    from askbot.conf import settings
    soup = BeautifulSoup(html, 'html5lib')
    replaced = False
    if settings.MODERATE_LINKS:
        links = soup.find_all('a')
        if links:
            template = get_template('widgets/moderated_link.jinja')
            aviso = BeautifulSoup(template.render(), 'html5lib').find('body')
            for link in links:
                link.replaceWith(aviso)
            replaced = True

    if settings.MODERATE_IMAGES:
        images = soup.find_all('img')
        if images:
            template = get_template('widgets/moderated_link.jinja')
            aviso = BeautifulSoup(template.render(), 'html5lib').find('body')
            for image in images:
                image.replaceWith(aviso)
            replaced = True

    if replaced:
        return force_text(soup.find('body').renderContents(), 'utf-8')

    return html
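The same replace-everything-with-a-notice idea can be tried without the askbot settings and templates; the sketch below is only an illustration that swaps the rendered jinja template for a hard-coded notice string.

from bs4 import BeautifulSoup

def replace_with_notice(html, notice='[item in moderation]'):
    soup = BeautifulSoup(html, 'html.parser')
    replaced = False
    for tag in soup.find_all(['a', 'img']):
        tag.replace_with(notice)        # replace_with is the current name for replaceWith
        replaced = True
    return str(soup) if replaced else html

print(replace_with_notice('see <a href="http://x">this</a> and <img src="y.png"/>'))
# see [item in moderation] and [item in moderation]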
Example #3
def from_pmml(self, pmml):
    """Returns a model with the intercept and coefficients represented in PMML file."""

    model = self()
    
    # Reads the input PMML file with BeautifulSoup.
    with open(pmml, "r") as f:
        lm_soup = BeautifulSoup(f, "xml")

    if not lm_soup.RegressionTable:
        raise ValueError("RegressionTable not found in the input PMML file.")

    else:
        # TODO: consider pulling this block out into its own function.
        # Pulls out intercept from the PMML file and assigns it to the
        # model. If the intercept does not exist, assign it to zero.
        intercept = 0
        if "intercept" in lm_soup.RegressionTable.attrs:
            intercept = lm_soup.RegressionTable['intercept']
        model.intercept_ = float(intercept)

        # Pulls out coefficients from the PMML file, and assigns them
        # to the model.
        if not lm_soup.find_all('NumericPredictor'):
            raise ValueError("NumericPredictor not found in the input PMML file.")
        else:
            coefs = []
            numeric_predictors = lm_soup.find_all('NumericPredictor')
            for i in numeric_predictors:
                i_coef = float(i['coefficient'])
                coefs.append(i_coef)
            model.coef_ = numpy.array(coefs)
            
    return model
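A made-up PMML fragment is enough to see what the loader reads; note that the "xml" features string requires lxml to be installed.

from bs4 import BeautifulSoup

pmml_text = '''<?xml version="1.0"?>
<PMML>
  <RegressionModel>
    <RegressionTable intercept="1.5">
      <NumericPredictor name="x1" coefficient="0.25"/>
      <NumericPredictor name="x2" coefficient="-3.0"/>
    </RegressionTable>
  </RegressionModel>
</PMML>'''

lm_soup = BeautifulSoup(pmml_text, 'xml')
intercept = float(lm_soup.RegressionTable['intercept'])
coefs = [float(p['coefficient']) for p in lm_soup.find_all('NumericPredictor')]
print(intercept, coefs)   # 1.5 [0.25, -3.0]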
Example #4
    def test_23_admin_add_category(self):
        """Test ADMIN add category works"""
        self.create()
        category = {'name': 'cat', 'short_name': 'cat',
                    'description': 'description'}
        # Anonymous user
        url = '/admin/categories'
        res = self.app.post(url, data=category, follow_redirects=True)
        dom = BeautifulSoup(res.data)
        err_msg = "Anonymous users should be redirected to sign in"
        assert dom.find(id='signin') is not None, err_msg

        # Authenticated user but not admin
        self.signin(email=self.email_addr2, password=self.password)
        res = self.app.post(url, data=category, follow_redirects=True)
        err_msg = "Non-Admin users should get 403"
        assert res.status_code == 403, err_msg
        self.signout()

        # Admin
        self.signin(email=self.root_addr, password=self.root_password)
        res = self.app.post(url, data=category, follow_redirects=True)
        err_msg = "Category should be added"
        assert "Category added" in res.data, err_msg
        assert category['name'] in res.data, err_msg

        category = {'name': 'cat', 'short_name': 'cat',
                    'description': 'description'}

        self.signin(email=self.root_addr, password=self.root_password)
        res = self.app.post(url, data=category, follow_redirects=True)
        err_msg = "Category form validation should work"
        assert "Please correct the errors" in res.data, err_msg
Example #5
def replace_links_with_text(html):
    """any absolute links will be replaced with the
    url in plain text, same with any img tags
    """
    soup = BeautifulSoup(html, 'html5lib')
    abs_url_re = r'^http(s)?://'

    images = soup.find_all('img')
    for image in images:
        url = image.get('src', '')
        text = image.get('alt', '')
        if url == '' or re.match(abs_url_re, url):
            image.replaceWith(format_url_replacement(url, text))

    links = soup.find_all('a')
    for link in links:
        url = link.get('href', '')
        text = ''.join(link.text) or ''

        if text == '':  # this is due to an issue with url inlining in comments
            link.replaceWith('')
        elif url == '' or re.match(abs_url_re, url):
            link.replaceWith(format_url_replacement(url, text))

    return force_text(soup.find('body').renderContents(), 'utf-8')
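format_url_replacement() and force_text() come from the surrounding project, so the easiest way to try this idea in isolation is with stand-in helpers; everything below is an assumption made for illustration only (html5lib must be installed for the 'html5lib' parser).

import re
from bs4 import BeautifulSoup

def format_url_replacement(url, text):      # stand-in for the project helper
    return '%s (%s)' % (text, url) if text and text != url else url

html = '<p>see <a href="https://example.com">docs</a> and <img src="https://example.com/x.png" alt="pic"/></p>'
soup = BeautifulSoup(html, 'html5lib')
abs_url_re = r'^http(s)?://'

for image in soup.find_all('img'):
    image.replace_with(format_url_replacement(image.get('src', ''), image.get('alt', '')))
for link in soup.find_all('a'):
    if re.match(abs_url_re, link.get('href', '')):
        link.replace_with(format_url_replacement(link.get('href', ''), link.text))

print(soup.find('body').decode_contents())
# <p>see docs (https://example.com) and pic (https://example.com/x.png)</p>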
Example #6
def scrap_items():
	for itemlist in ITEMLIST:
		soup = BS(urllib2.urlopen(''.join([LOLWIKI, itemlist])).read())
		item_table = soup.find('table', class_='stdt sortable')

		for tr in item_table.find_all('tr'):
			tds = tr.find_all('td')
			if len(tds) < 1:
				continue
			if tr.find('p') == None:
				continue

			item_name = tr.find('p').text.strip()
			item_url = tr.find('img')['src']

			if item_url.split(':')[0] == 'data':
				item_url = tr.find('img')['data-src']

			if not HOOKED:
				continue

			#store item in database
			d_item = Item()
			d_item.name = item_name

			t_img = NamedTemporaryFile(delete=True)
			t_img.write(urllib2.urlopen(item_url).read())
			t_img.flush()
			t_img.name = '.'.join([item_name, 'jpg'])

			d_item.picture = File(t_img)
			d_item.save()
Example #7
	def __call__(self, url, count_of_crawler):
		"""
		Function which fetch the content from the given URL and collect all the
		URL in the content and pass the first url of the page to fetch the
		content.
		"""
		try:
			page = urllib2.urlopen(url)
			soup = BeautifulSoup(page.read())	

			links_on_page = map(lambda anchor: anchor.get('href'), 
						soup.find_all('a'))

			cleaned_url = map(
				lambda link: link
				if urlparse(link).scheme and urlparse(url).netloc
				else (urlparse(url).scheme + "://" + urlparse(url).netloc + link
					if link[0] == "/" else url + link),
				links_on_page)
			visited_url.append(url)
			total_collected_url.append(cleaned_url)
			next_url_to_visit = [next_url for next_url in cleaned_url\
				 if not next_url in visited_url and not "#" in next_url][0]
		
			if count_of_crawler and next_url_to_visit:	
				count_of_crawler = crawler(next_url_to_visit, 
								count_of_crawler-1)
	
		except:
			print "It seems there is some issue in URL "+url
	
		return count_of_crawler
Example #8
def crawlSearch(url,pages):
	try:
		arr=[]
		countS=0  # counter for followed scholarship links; must be initialised before use below
		source_code=requests.get(url)
		plain_text=source_code.text
		soup=BeautifulSoup(plain_text)
		for link in soup.findAll('a'):

			href=link.get('href')
			href_test=str(href)
			#if href_test[0]!='/' and href_test[0]!='j' and href_test!='none' and href_test[0]!='#':
			if is_in_arr(pages,str(href))==False:
				if "microsoft" not in href_test and "facebook" not in href_test and "twitter" not in href_test and "google" not in href_test:
					if href_test.startswith("http"):
						if "bing" not in href_test:
							if "scholarships.com" not in href_test:
								pages.append(href)
								print str(href)
							else:
								if countS<2:
									crawl(href,pages)
									print "Crawling "+str(href)
									countS=countS+1
								else:
									print "Skiping "+str(href)
					else:
						pass


	except:
		print "Error at: "+str(url)
Example #9
def show_options(id):
    r = requests.get("https://interaktiv.mx.dk/toolbox/" + votetype + "/get/" + id)
    soup2 = BeautifulSoup(r.text, "lxml")

    clear_console()
    print_logo()
    print "(Interaktiv version. Kør scriptet med -h eller --help for flere indstillinger.)"
    print

    vote_text = soup2.find("div", attrs={"id": "vote_text"}).text
    print vote_text
    print

    if votetype == "advancedvotes":
            for option in soup2.find_all("div", attrs={"class": "vote_button"}):

                number = option.get("data-vote")
                text = option.text

                print "(%s) %s" % (number, text)
            print

    else:

            for option in soup2.find_all("div", attrs={"class": "vote_button"}):
                if option.get("id") == "vote_yes":
                    number = "1"

                else:
                    number = "0"

                text = option.text
                print "(%s) %s" % (number, text)
            print
Example #10
    def reverseIP(self):
        # normalize the url into the form we need (www.url.com)
        if self.url.startswith("http://"):
            url = self.url.replace("http://","")  # replace the scheme with an empty string
        else:
            url = self.url

        # sent via POST because the page uses a form to ask for the url to scan
        # data is the POST payload, i.e. the url
        # remoteHost is the parameter name under which the url is sent (see the connection below)
        data = {"remoteHost" : url}
        connection = requests.post(
            # parameters required for the connection
            url="http://www.ipfingerprints.com/scripts/getReverseIP.php", data=data
        )

        # connection.text is the html returned by the request
        # BeautifulSoup parses it into something less horrible
        # html.parser gives cleaner output
        beautifulOut = BeautifulSoup(connection.text, "html.parser")

        # here we will store every link found in the tag
        response = list()

        # find_all collects all the tags; 'a' filters for just that tag type
        for link in beautifulOut.find_all("a"):
            # href holds the domain name (the only part of the tag we care about)
            currentLink = link.get("href")
            response.append(currentLink[11:-2])

        return response
Example #11
def crawlLinkScoial(url):
	try:
		pages=[]
		arr=[]
		source_code=requests.get(url)
		plain_text=source_code.text
		soup=BeautifulSoup(plain_text)
		for link in soup.findAll('a'):

			href=link.get('href')
			href_test=str(href)
			#if href_test[0]!='/' and href_test[0]!='j' and href_test!='none' and href_test[0]!='#':
			if is_in_arr(pages,str(href))==False:
				if "facebook" in href_test or "twitter" in href_test or "google" in href_test:

					lin=getGoodLink(url)
					pages.append(lin+str(href))
		newArr=deleteDuplicates(pages)
		for page in newArr:
			socialFile.write(page)
			socialFile.write("\n")
		allFile.write("Social-Media-Links: \n")
		for page in newArr:
			allFile.write(page)
			allFile.write("\n")



	except:
		print "Error at: "+str(url)
Example #12
def convert_links(text, quote="\""):
    soup = BeautifulSoup(text, "html.parser")
    for t in soup.findAll(text=True):
        if has_link_parent(t):
            continue
        split = re.split(r"(?:(https?://)|(www\.))([\S]+\.[^\s<>\"\']+)", t)
        if len(split) == 1:
            continue
        r = ""
        n = 0
        split = [s or "" for s in split]
        while split:
            if n % 2 == 0:
                r += split[0]
                split.pop(0)
            else:
                r += "<a href=%shttp://%s%s%s>%s%s%s</a>" % (
                    quote, split[1], split[2], quote,
                    split[0], split[1], split[2]
                    )
                split.pop(0)
                split.pop(0)
                split.pop(0)
            n += 1

        t.replaceWith(BeautifulSoup(r, "html.parser"))
    return str(soup)
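has_link_parent() is defined elsewhere in that project; with the convert_links() above in scope and a small stand-in for the helper (the stand-in below is an assumption), the function can be exercised directly.

import re
from bs4 import BeautifulSoup

def has_link_parent(node):                  # stand-in: is any ancestor an <a> tag?
    return any(parent.name == 'a' for parent in node.parents)

print(convert_links('visit www.example.com for details'))
# roughly: visit <a href="http://www.example.com">www.example.com</a> for details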
Example #13
def parse_data(data):
    page = BeautifulSoup(data)

    results = page.find("div", id="res")
    if results is None:
        raise NoResultsException

    calc = results.find("img", src="/images/icons/onebox/calculator-40.gif")
    if calc is not None:
        calc = results.find("h2", {"class": "r"})
        if calc is not None:
            superscripts = calc.find_all("sup")
            if superscripts is not None and len(superscripts):
                for x in superscripts:
                    x.contents[0].replaceWith("^" + x.contents[0])
            return [dict(type="string", string=util.strip_html(calc).decode("utf-8"))]

    nresults = results.find_all("li", {"class": "g"})
    if len(nresults) == 0:
        raise NoResultsException

    processed_results = []
    for x in nresults:
        a_tag = x.find("a")
        if a_tag is not None:
            processed_results.append(
                dict(type="result", href=urlparse.parse_qs(urlparse.urlparse(a_tag["href"]).query)["q"][0],
                     text=util.strip_html(a_tag).decode("utf-8")))

    return processed_results
Example #14
def get_sp500_symbols():
    page_html = wiki_html('List_of_S%26P_500_companies', 'SP500.html')
    wiki_soup = BeautifulSoup(page_html, "html.parser")
    symbol_table = wiki_soup.find(attrs={'class': 'wikitable sortable'})

    symbol_data_list = list()

    for symbol in symbol_table.find_all("tr"):
        symbol_data_content = dict()
        symbol_raw_data = symbol.find_all("td")
        td_count = 0
        for symbol_data in symbol_raw_data:
            if(td_count == 0):
                symbol_data_content[
                    'symbol'] = symbol_data.text
            elif(td_count == 1):
                symbol_data_content[
                    'company'] = symbol_data.text
            elif(td_count == 3):
                symbol_data_content[
                    'sector'] = symbol_data.text
            elif(td_count == 4):
                symbol_data_content[
                    'industry'] = symbol_data.text
            elif(td_count == 5):
                symbol_data_content[
                    'headquarters'] = symbol_data.text

            td_count += 1

        symbol_data_list.append(symbol_data_content)

    return symbol_data_list[1::]
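wiki_html() is a project helper that fetches and caches the Wikipedia page; the table-walking part can be checked on its own against a toy table (the markup below is made up).

from bs4 import BeautifulSoup

toy_table = '''
<table class="wikitable sortable">
  <tr><th>Symbol</th><th>Company</th></tr>
  <tr><td>MMM</td><td>3M</td></tr>
</table>'''

soup = BeautifulSoup(toy_table, 'html.parser')
table = soup.find(attrs={'class': 'wikitable sortable'})
rows = []
for tr in table.find_all('tr'):
    cells = tr.find_all('td')
    if cells:
        rows.append({'symbol': cells[0].text, 'company': cells[1].text})
print(rows)   # [{'symbol': 'MMM', 'company': '3M'}]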
Example #15
 def parse(self, response):
     logger.info("Parsing {}".format(response.url))
     soup = BeautifulSoup(response.body, "html.parser")
     trs = soup.find_all("tr", "item")
     if trs:
         for tr in trs:
             link = tr.find("a")
             article_url = DETAIL_URL.format(link["href"])
             r = scrapy.Request(article_url,
                                      callback=self.parse_article)
             yield r
     # next urls
     try:
         next_url = soup.find(class_="next").a
         cat_url = response.url
         u = urlparse(cat_url)
         query = None
         # Strip the query part
         u = u._replace(query=query)
         follow_url = urlunparse(u) + next_url["href"]
         r = scrapy.Request(follow_url, callback=self.parse)
         yield r
     except AttributeError:
         logger.info("Done with".format(response.url))
         pass
Example #16
def _get_new_brunswick_flows(requests_obj):
    """
    Gets current electricity flows in and out of New Brunswick.

    There is no reported data timestamp in the page. The page returns
    current time and says "Times at which values are sampled may vary by
    as much as 5 minutes."
    """

    url = 'https://tso.nbpower.com/Public/en/SystemInformation_realtime.asp'
    response = requests_obj.get(url)

    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.find('table', attrs={'bordercolor': '#191970'})

    rows = table.find_all('tr')

    headers = rows[1].find_all('td')
    values = rows[2].find_all('td')

    flows = {headers[i].text.strip(): float(row.text.strip())
             for i, row in enumerate(values)}

    return flows
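The header/value comprehension is easy to verify against a toy table shaped like the NB Power page (the markup below is an assumption).

from bs4 import BeautifulSoup

toy = '''<table bordercolor="#191970">
  <tr><td>title row</td></tr>
  <tr><td> ISO-NE </td><td> QUEBEC </td></tr>
  <tr><td> 145.0 </td><td> -60.5 </td></tr>
</table>'''

soup = BeautifulSoup(toy, 'html.parser')
rows = soup.find('table', attrs={'bordercolor': '#191970'}).find_all('tr')
headers = rows[1].find_all('td')
values = rows[2].find_all('td')
flows = {headers[i].text.strip(): float(row.text.strip()) for i, row in enumerate(values)}
print(flows)   # {'ISO-NE': 145.0, 'QUEBEC': -60.5}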
Example #17
def get_page_info(id_no, s=None):
    '''
    Extract restaurant information from Charlotte's health inspection website

    INPUT:  id_no = int, id # for ESTABLISHMENT
            s = request.Session(), [OPTIONAL] 
    OUTPUT: out = dict, establishment-level information
    '''
    if s is None:
        s = requests.Session()
    link = 'https://public.cdpehs.com/NCENVPBL/INSPECTION/ShowESTABLISHMENTPage.aspx'
    payload = {'ESTABLISHMENT':id_no, 'esttst_cty':60}
    z = s.get(link, params=payload)
    soup = BeautifulSoup(z.content, from_encoding='UTF-8')
    
    t = soup.findAll('table')[0]
    
    insp_info = np.array([y.text for y in t.findAll('td', attrs={'class':'ttc'})]).reshape(-1,4)
    
    if insp_info.shape[0] < 1:
        return None
    
    r = t.findAll('td', attrs={'class':'dfv'})
    rest_info = [x.text for x in r]
    
    return {'name'       :rest_info[0],
            'address'    :rest_info[2],
            'city'       :rest_info[8],
            'state'      :rest_info[9],
            'zip'        :rest_info[10],
            'type'       :rest_info[16],
            'county'     :rest_info[19],
            'inspections':insp_info}
Example #18
def htmlfile(url):
  r = urllib2.urlopen(url)
  soup = BeautifulSoup(r)
  
  html = []
  #html- title, css (body width 960px)
  html.append('<html><head><title>'+soup.title.string+'</title><link rel="stylesheet" type="text/css" href="page.css"></head><body>')
  
  # parse for content only in the article div - obviously depends on the site
  content =  soup.find('div', {'class': 'layout-block-a'})
  
  # get the html paragraphs and h1 headings - should be altered for each website's style
  for text in content.find_all(['p', 'h1']):
    if text.name == 'p':
      html.append(str(text).decode("ascii", "ignore"))
    else:
      html.append(str(text).decode("ascii", "ignore"))
    
  html.append('</body></html>')
    
  # creates html files here
  out = open(soup.title.string+'.html', 'a')
  for line in html:
    out.write(line)
  out.close()
  
if __name__ == '__main__':
  main()
Example #19
 def getWeibos(self, keyword,  page=1, count=None):
     url = 'http://t.hexun.com/k/topic.html?type=1&value=%s&pg=%d' % (json.dumps(keyword).replace('\\', '%').replace('"', ''), page)
     result = WeiboCrawler.request(self, url, self.headers)
     if 'result' in result and result['result']:
         infos = result['info'].decode('gb2312')
         soup = BeautifulSoup(infos)
         total_soup = soup.select('.headerR1')[0]
         total_num = total_soup.get_text().split('共')[-1].split('条')[0].strip()
         return_val = {'total_count': int(total_num), 'msgs':[]}
         allmsgs = []
         msgs_soup = soup.select('.nr_con')
         for msg_soup in msgs_soup:
             avatar =  'http://t.hexun.com%s' % msg_soup.select('.nr_conLa > a')[0].get('href')
             nickandtext = msg_soup.select('.nr_shuo')[0].get_text().split(':')
             nickname = nickandtext[0]
             text = nickandtext[1]
             ts = msg_soup.select('.nr_tan > h3 > a')[0].get_text()
             allmsgs.append({
                 'avatar': avatar,
                 'nickname': nickname,
                 'text': text,
                 'datetime': ts,
                 })
         return_val['msgs'] = allmsgs
         return return_val
Example #20
def getCategoryUrl(site="",url=""):
    catDb = openTable(tableName=global_setting['catTable'])
    r = session.get(url)
    if not r.text:
        return False

    soup = BeautifulSoup(r.text)
    for level1 in soup.select('.classify_books'):
        curLevel1 = level1.select('.classify_title')[0].text
        curLevel1 = re.sub('\s', '', curLevel1)
        for level2 in level1.select('.classify_kind'):
            curLevel2 = level2.select('.classify_kind_name')[0].text
            curLevel2 = re.sub('\s', '', curLevel2)
            for level3 in level2.select('ul li a'):
                #curLevel3 = re.sub('\s', '', level3.text)
                curLevel3 =  level3.text.strip()
                curlUrl = level3['href']
                retFind = re.findall(r'\/cp(.*)\.html',curlUrl)
                if retFind:
                    curCatID = retFind[0]
                    catType = 'book'
                else:
                    retFind = re.findall(r'\/cid(.*)\.html',curlUrl)
                    if retFind:
                        curCatID = retFind[0]
                        catType = 'nonbook'
                if retFind:
                    if catDb.find({'catId':curCatID}).count() >0:
                        logger.debug('category %s exists, skip\n'%(curCatID))
                    else:
                        catDb.insert({'catId':curCatID,'level1':curLevel1, 'level2':curLevel2, 'level3':curLevel3, 'catUrl':curlUrl,'catType':catType, 'site':site})
    return True
Example #21
def extract_images(base,html):
  images = []
  soup = BeautifulSoup(html)
  for img in soup.find_all("img"):
    if img.has_attr("src"):
      images.append(urljoin(base,img["src"]))
  return images
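This one is self-contained apart from its imports, so a quick usage check only needs a couple of lines (the sample markup and URLs are made up; on Python 3 urljoin lives in urllib.parse).

from bs4 import BeautifulSoup
from urllib.parse import urljoin

html = '<img src="/logo.png"><img alt="no src"><img src="http://cdn.example.com/a.jpg">'
print(extract_images('http://example.com/page', html))
# ['http://example.com/logo.png', 'http://cdn.example.com/a.jpg']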
Example #22
    def insert_push(self):
        uw = user_website.UserWebsite()
        userids = uw.get_user_ids_by_website_id(self.website_id)
        for id in userids:
            p = push.Push()
            p.website_id = self.website_id
            p.user_id = id
            p.title = "has new notice"

            soup_diff = BeautifulSoup(self.get_different())

            new_link_list =  soup_diff.find_all('a')

            new_link_count = len(new_link_list)

            if new_link_count == 1:
                content = "one notice is published:\n"
            else:
                content = str(new_link_count) + " notices are published:\n"

            content += self.get_different()


            p.content = content
            p.content = p.content.replace('"',"'")


            p.date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            p.website_id = self.website_id
            p.content_url = ""
            p.insert()
Example #23
def prettify(which, id):
    prefix = which[0]
    bs = BeautifulSoup(open(os.path.join(root,which, i+"-" + prefix + ".xml")), 'xml')
    sgm = i + "-" + prefix + ".sgm"
    out = bs.prettify(encoding='utf-8')
    [first, rest] = out.split("\n",1)
    return rest.replace(sgm, i) # the ID in the files look like "atwoma-b.sgm" rather than "atwoma"
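The split("\n", 1) exists to drop the XML declaration that prettify() emits; a tiny in-memory document shows the shape of the output. Note that with an encoding argument prettify() returns bytes, so on Python 3 the split/replace arguments have to be bytes as well, and the 'xml' features string needs lxml installed.

from bs4 import BeautifulSoup

bs = BeautifulSoup('<doc id="atwoma-b.sgm"><p>text</p></doc>', 'xml')
out = bs.prettify(encoding='utf-8')
first, rest = out.split(b"\n", 1)
print(first)     # typically b'<?xml version="1.0" encoding="utf-8"?>'
print(rest.replace(b"atwoma-b.sgm", b"atwoma").decode('utf-8'))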
Example #24
def Get_All_Teams():
    data_path = '../data/'
    # get the teams
    url = 'http://espn.go.com/nba/teams'
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html, 'lxml')
    # print (soup.prettify())
    tables = soup.find_all('ul', class_ = 'medium-logos')

    tables[0].find_all('li')[0].h5.a

    name_pref_Tuples = []
    city_name_Dict = {}
    for table in tables:
        lis = table.find_all('li')
        for li in lis:
            info = li.h5.a
            team_url = info['href']
            team_name = info.text
            pref = team_url.split('/')[-2]
            city_name = ' '.join(info.text.split()[:-1])
            if team_name == 'Portland Trail Blazers':
                city_name = 'Portland'
            city_name_Dict[city_name] = team_name
            name_pref_Tuples.append((team_name, pref))

    print 'output two files: city_name.pickle and name_pref.pickle'
    print 'city_name.pickle is a dict with (city, team_name) pairs'
    print 'name_pref.pickle is a list of (team_name, team_name_prefix) tuples'
    pk.dump(city_name_Dict, open(data_path + 'city_name.pickle', 'wb'))
    pk.dump(name_pref_Tuples, open(data_path + 'name_pref.pickle', 'wb'))
Example #25
def get_text_from_html(html_text):
    """Returns the content part from an HTML document
    retains links and references to images and line breaks.
    """
    soup = BeautifulSoup(html_text, 'html5lib')

    # replace <a> links with plain text
    links = soup.find_all('a')
    for link in links:
        url = link.get('href', '')
        text = ''.join(link.text) or ''
        link.replaceWith(format_url_replacement(url, text))

    # replace <img> tags with plain text
    images = soup.find_all('img')
    for image in images:
        url = image.get('src', '')
        text = image.get('alt', '')
        image.replaceWith(format_url_replacement(url, text))

    # extract and join phrases
    body_element = soup.find('body')
    filter_func = lambda s: bool(s.strip())
    phrases = map(
        lambda s: s.strip(),
        filter(filter_func, body_element.get_text().split('\n'))
    )
    return '\n\n'.join(phrases)
Example #26
 def _login(self, username=None, store_password=False):
     if username is None:
         if self.USERNAME == "":
             raise LoginError("If you do not pass a username to login(), you should configure a default one!")
         else:
             username = self.USERNAME
     # Get password from keyring or prompt
     password_from_keyring = keyring.get_password("astroquery:www.eso.org", username)
     if password_from_keyring is None:
         if system_tools.in_ipynb():
             log.warn("You may be using an ipython notebook:"
                      " the password form will appear in your terminal.")
         password = getpass.getpass("{0}, enter your ESO password:\n".format(username))
     else:
         password = password_from_keyring
     # Authenticate
     log.info("Authenticating {0} on www.eso.org...".format(username))
     # Do not cache pieces of the login process
     login_response = self._request("GET", "https://www.eso.org/sso/login", cache=False)
     login_result_response = self._activate_form(login_response,
                                                 form_index=-1,
                                                 inputs={'username': username,
                                                         'password': password})
     root = BeautifulSoup(login_result_response.content, 'html5lib')
     authenticated = not root.select('.error')
     if authenticated:
         log.info("Authentication successful!")
     else:
         log.exception("Authentication failed!")
     # When authenticated, save password in keyring if needed
     if authenticated and password_from_keyring is None and store_password:
         keyring.set_password("astroquery:www.eso.org", username, password)
     return authenticated
Example #27
def get_visible_text(html):
    """returns visible text from html
    http://stackoverflow.com/a/19760007/110274
    """
    soup = BeautifulSoup(html, 'html5lib')
    [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
    return soup.get_text()
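A quick check, with the function above in scope and html5lib installed; the sample markup is made up.

html = '''<html><head><title>t</title><style>p {color: red}</style></head>
<body><p>Hello</p> <script>var x = 1;</script> <p>world</p></body></html>'''
print(get_visible_text(html))
# prints the paragraph text only; the title, style rules and script body are stripped out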
Example #28
def get_games(date, output_file=None):

    # games_url = base + '/scoreboard/' + format_date(date) + '/games.json'
    games_url = si_base + 'schedule'
    #print format_date(date)

    result = requests.get(games_url, params={'date': format_date(date)})

    #print games_url + format_date(date)

    soup = BeautifulSoup(result.text)

    #date_string = date.strftime('%B %d,%Y')

    games = soup.find_all('tr', 'component-scoreboard-list final')

    game_ids = []

    for game in games:
        game_date_elem = game.find('div', 'game-anchor')
        game_date_text = game_date_elem['id']
        game_date = date_parser.parse(game_date_text).date()
        if game_date == date:
            game_id = int(game['data-id'])
            game_ids.append(game_id)

    if output_file is not None:
        of = open(output_file, 'w')
        of.write(json.dumps({'game_date': format_date(date), 'game_ids': game_ids}))
        of.close()

    return game_ids
Example #29
def getMoviesActors(movieList):
    """

    :param A list containing formatted movie list
    :return: A list containing ID of the movie and all actors in that movie including actors ID
    """
    actorsInMovies = {}

    for x in movieList:
        req = urllib.request.Request(BASE_URL+movieList[x]["Url"]+"/fullcredits")
        #print(req.full_url)
        # Header is necessary to get the right movie titles, as in the english title
        req.add_header('Accept-Language', 'en-US,en')
        # Send the request and get response
        response = urllib.request.urlopen(req)

        bsoup = BeautifulSoup(response)

        findCastList = bsoup.find("table", {"class": "cast_list"})

        findAllActors = findCastList.findAll("td", itemprop="actor")

        actors = {}
        for d in findAllActors:
            actorName = d.find("span", itemprop="name")
            actorNumber = d.find("a", href=re.compile("\/name\/nm"))
            actorID = re.match("(?:\/name\/nm)(?P<userid>\d+)", actorNumber["href"]).group("userid")
            actors[actorID] = actorName.contents[0]

        actorsInMovies[movieList[x]["ID"]] = actors

    return actorsInMovies
Example #30
    def get_Comics(self, name, comic_url):
        if not self.mkdir(name):
            again = ''
            while (1):
                again = str(input('Directory ' + name + ' already exists, do you wanna to download again? (Y/N)'))
                if again == 'Y' or again == 'N':
                    break
            if again == 'N':
                print('Folder \'BLEACH/' + name + '\' already exists!')
                return
            else:
                shutil.rmtree(self.path)
                self.mkdir(name)

        # Parse html
        page_url = self.prefix + comic_url
        data = urllib.request.urlopen(page_url).read().decode('utf-8', 'ignore')
        data.encode('utf-8')
        soup = BeautifulSoup(data, 'lxml')
        lists = soup.findAll('img', {'class': 'BDE_Image'})

        print('Downloading: ' + name)
        # Define progress bar's length
        progress_bar = tqdm(unit='Pic', total=len(lists))
        count = 0

        for each in lists:
            pic_url = each['src']
            filename = '%03d' % count + '.' + pic_url.split('.')[-1]
            urllib.request.urlretrieve(pic_url, filename = self.path + '/' + filename)
            progress_bar.update(1)
            count = count + 1

        # Close bar
        progress_bar.close()