示例#1
0
    def _parse(self, htmlString, encode='utf8'):
        """
        Parse an HTML string into a DOM document.

        Returns None when htmlString has fewer than two characters.
        Otherwise the string is parsed (as unfinished HTML), the result is
        passed to self._testDom and then returned.  The htmlencoding
        option is only forwarded to the parser when encode is truthy.
        """
        if len(htmlString) < 2:
            return None

        # Build the parser options once instead of duplicating the call.
        parser_options = {'html': 1, 'unfinished': 1}
        if encode:
            parser_options['htmlencoding'] = encode

        document = parseString(htmlString, **parser_options)
        self._testDom(document)
        return document
示例#2
0
 def fetch_labels(self, query):
     """
     Search Wikipedia for ``query`` and collect labels for each result page.

     Each result URL (at most ``self.max_docs``) is downloaded and parsed.
     A ``DocLabels`` record holding the title, categories, section
     headlines, external-link texts, total anchor count and 1-based rank
     is stored in ``self.labels_for_urls`` under that URL.  A page that
     raises ``urllib2.HTTPError`` or ``IndexError`` (for example when no
     ``firstHeading`` element exists) is skipped; the error is reported
     only when ``self.debug`` is set.
     """
     self.__clear_labels()
     self.wq.search(query, sites='en.wikipedia.org', count=self.max_docs)
     opener = urllib2.build_opener()
     opener.addheaders = [('User-agent', 'MwClient-0.6.4')]
     for idx, url in enumerate(self.wq.urls()[0:self.max_docs]):
         try:
             infile = opener.open(url)
             try:
                 page = infile.read()
             finally:
                 # Always release the HTTP response, even if read() fails.
                 infile.close()
             doc = libxml2dom.parseString(page, html=1)
             if self.debug:
                 util.log("url", url)
             labels = DocLabels()
             labels.title = self.__collect_text(doc.xpath("//*[@id='firstHeading']")[0])
             labels.categories = self.__nodes_to_array(doc.xpath("//*[@id='mw-normal-catlinks']/span"))
             # Drop the disambiguation marker from the categories.
             dp_str = 'Disambiguation pages'
             if dp_str in labels.categories:
                 labels.categories.remove(dp_str)
             # Headline text.
             labels.headlines = [self.__collect_text(node)
                                 for node in doc.xpath("//h3/*[@class='mw-headline']")]
             labels.num_anchors = len(doc.getElementsByTagName("a"))
             # Only the texts of external links are kept.
             labels.anchors = [self.__collect_text(node)
                               for node in doc.xpath("//ul/li/*[@class='external text']")]
             labels.rank = idx + 1
             self.labels_for_urls[url] = labels
         except (urllib2.HTTPError, IndexError) as e:
             if self.debug:
                 util.error("%s, url: %s" % (e, url))
示例#3
0
 def fetch_labels(self, query):
     """
     Search Wikipedia for ``query`` and collect labels for each result page.

     Each result URL (at most ``self.max_docs``) is downloaded and parsed.
     A ``DocLabels`` record holding the title, categories, section
     headlines, external-link texts, total anchor count and 1-based rank
     is stored in ``self.labels_for_urls`` under that URL.  A page that
     raises ``urllib2.HTTPError`` or ``IndexError`` (for example when no
     ``firstHeading`` element exists) is skipped; the error is reported
     only when ``self.debug`` is set.
     """
     self.__clear_labels()
     self.wq.search(query, sites='en.wikipedia.org', count=self.max_docs)
     opener = urllib2.build_opener()
     opener.addheaders = [('User-agent', 'MwClient-0.6.4')]
     for idx, url in enumerate(self.wq.urls()[0:self.max_docs]):
         try:
             infile = opener.open(url)
             try:
                 page = infile.read()
             finally:
                 # Always release the HTTP response, even if read() fails.
                 infile.close()
             doc = libxml2dom.parseString(page, html=1)
             if self.debug:
                 util.log("url", url)
             labels = DocLabels()
             labels.title = self.__collect_text(
                 doc.xpath("//*[@id='firstHeading']")[0])
             labels.categories = self.__nodes_to_array(
                 doc.xpath("//*[@id='mw-normal-catlinks']/span"))
             # Drop the disambiguation marker from the categories.
             dp_str = 'Disambiguation pages'
             if dp_str in labels.categories:
                 labels.categories.remove(dp_str)
             # Headline text.
             labels.headlines = [
                 self.__collect_text(node)
                 for node in doc.xpath("//h3/*[@class='mw-headline']")]
             labels.num_anchors = len(doc.getElementsByTagName("a"))
             # Only the texts of external links are kept.
             labels.anchors = [
                 self.__collect_text(node)
                 for node in doc.xpath("//ul/li/*[@class='external text']")]
             labels.rank = idx + 1
             self.labels_for_urls[url] = labels
         except (urllib2.HTTPError, IndexError) as e:
             if self.debug:
                 util.error("%s, url: %s" % (e, url))
示例#4
0
def getParams(data, formName=None, params=None):
    """
    Collect the submit parameters of the forms found in an HTML string.

    data     -- HTML markup to parse.
    formName -- when given, only forms whose ``name`` attribute equals it
                are considered; otherwise every form is used.
    params   -- optional dict to update; a fresh dict is created when
                omitted (the default is no longer shared between calls).

    Returns the dict mapping control names to values:
      * ``input`` elements contribute their ``value`` (``reset`` inputs
        are skipped, checkboxes only count when ``checked``);
      * ``select`` elements contribute the value of their selected option
        (the last one wins when several are selected, after a warning);
      * ``textarea`` elements contribute their text content.
    """
    if params is None:
        params = {}
    doc = libxml2dom.parseString(data, html=1)
    xpath = "//form"
    if formName is not None:
        xpath += "[@name = '%s']" % formName
    for form in doc.xpath(xpath):
        for inputElm in form.xpath(".//input"):
            # An input without a type attribute must not crash the scan.
            elmType = (inputElm.getAttribute("type") or "").lower()
            if elmType == "reset":
                continue
            if elmType != "checkbox" or inputElm.hasAttribute("checked"):
                params[inputElm.getAttribute("name")] = inputElm.getAttribute(
                    "value")
        for selectElm in form.xpath(".//select"):
            # Use the select's own name (the original reused the last input).
            selectName = selectElm.getAttribute("name")
            options = selectElm.xpath(".//option[@selected]")
            if len(options) > 1:
                print("Error: Multiple selected options not handled: %s (%d)" % (
                    selectName, len(options)))
            for optionElm in options:
                params[selectName] = optionElm.getAttribute("value")
        # Search at any depth, consistent with the input/select queries.
        for textElm in form.xpath(".//textarea"):
            params[textElm.getAttribute("name")] = textElm.textContent
    return params
示例#5
0
def parse_issue_page(link):
    """
    Download one issue page, record it in ``issues_dict`` and print it.

    The id is the part of ``link`` after its last '/'.  Subject, status,
    priority, category and (when present) the affected version are read
    from the page; the entry ``[id, status, subject, priority,
    aff_version]`` is appended to ``issues_dict[category]``.
    """
    dom = libxml2dom.parseString(get_page_string(link), html=1)

    def first_text(node, query):
        # Serialized form of the first node matched by the XPath query.
        return node.xpath(query)[0].toString()

    issue_id = link[link.rfind('/')+1:]

    subject_dom = dom.xpath('.//div[@class="subject"]')[0]
    subject = first_text(subject_dom, './/h3/text()')

    attributes_dom = dom.xpath('.//table[@class="attributes"]')[0]
    status = first_text(attributes_dom, './/td[@class="status"]/text()')
    priority = first_text(attributes_dom, './/td[@class="priority"]/text()')
    category = first_text(attributes_dom, './/td[@class="category"]/text()')

    # The affected version is optional; default to an empty string.
    aff_nodes = attributes_dom.xpath('.//td[@class="cf_4"]/a/text()')
    aff_version = aff_nodes[0].toString() if len(aff_nodes) > 0 else ""

    if category not in issues_dict:
        issues_dict[category] = []
    issues_dict[category].append(
        [issue_id, status, subject, priority, aff_version])

    print(" ".join(
        [issue_id, category, subject, status, priority, aff_version]))
示例#6
0
    def getEinzelwerteListe(self, strIndex):
        """
        Return the stocks listed on the constituents page of an index.

        The page URL is the template with "XXX" replaced by strIndex.
        Every table cell whose last whitespace-separated token passes
        the ISIN check becomes a CStock: the token is its ISIN and the
        preceding tokens, joined by single spaces, are its Name.

        Raises NameError when no stock was found.
        """
        url = self.__UrlEinzelwerteListePerIndex.replace("XXX", strIndex)
        page = self.__webConnect.runGETRequest(url)

        # Replace line-break tags by spaces so tokens stay separated.
        for br_tag in ("<br>", "<br/>", "<br />"):
            page = page.replace(br_tag, " ")

        doc = libxml2dom.parseString(page, html=1)

        # Walk all td elements; a cell ending in an ISIN is a valid entry.
        stocks = []
        for cell in doc.getElementsByTagName("td"):
            tokens = cell.textContent.split()
            if len(tokens) <= 1 or self.__checkForISIN(tokens[-1]) != 1:
                continue
            entry = CStock()
            entry.ISIN = tokens[-1]
            entry.Name = " ".join(tokens[:-1])
            stocks.append(entry)

        if not stocks:
            raise NameError('Achtung: Aktienliste fuer ' + strIndex +
                            ' hat keine Werte!')
        return stocks
示例#7
0
    def test_TreeContent(self):
        """
        Unittest to check tree content after load
        """
        html = """
<html>
    <body>
        <div>
            Ola
            <div>
                Mundo
            </div>
            <div>
                <a>
                !
                </a>
            </div>
        </div>
    </body>
</html>
        """

        dom = parseString(html, html=1)

        tree = Node().loadNodeTree(dom, 0)
        print tree.childNodes[0].childNodes[0].dom.localName
        self.assertEquals(tree.childNodes[0].childNodes[0].depth, 2)
        self.assertEquals(tree.childNodes[0].childNodes[0].height, 5)

        #        print tree.childNodes[0].childNodes[0].childNodes[0].childNodes[1].str
        self.assertEquals(tree.childNodes[0].childNodes[0].childNodes[0].childNodes[1].depth, 4)
        self.assertEquals(tree.childNodes[0].childNodes[0].childNodes[0].childNodes[1].height, 2)
def getArticleInfoForPage(page):
	"""
	Extract the article items listed on a page, tagged with their topic.

	The second child of the element with id 'content_list_view' is
	scanned: 'h1' children set the current domain, 'li' children set the
	current category, and 'ul' children are handed to parseSubCategory.
	Every returned item gets item['topic'] = domain/category/subcategory.

	A 'ul' that cannot be processed is skipped (best effort), but
	KeyboardInterrupt and SystemExit are no longer swallowed.
	"""
	doc=libxml2dom.parseString(page,html=True)
	doc=doc.getElementById('content_list_view').childNodes[1]

	items=[]

	currDomain=''
	currCat=''
	for ch in doc.childNodes:
		if ch.tagName=='h1':
			currDomain=ch.textContent
		if ch.tagName=='li':
			currCat=ch.textContent.strip()
		if ch.tagName=='ul':
			try:
				if ch.getAttribute('style')=='margin-left:10px':
					# The item list starts right away: no subcategory.
					curSubcat=''
					sublist=ch
				else:
					# First child names the subcategory; the nested
					# 'ul' holds its items.
					curSubcat=ch.childNodes[0].textContent.strip()
					sublist=[t for t in ch.childNodes if t.tagName=='ul'][0]
				info=parseSubCategory(sublist)
				topic=currDomain+'/'+currCat+'/'+curSubcat
				for item in info:
					item['topic']=topic
				items+=info
			except Exception:
				# Deliberately best effort: skip lists that do not
				# follow the expected structure.
				pass
	return items
示例#9
0
 def getEinzelwerteListe(self, strIndex):
     """
     Return the stocks listed on the constituents page of an index.

     The page URL is the template with "XXX" replaced by strIndex.
     Every table cell whose last whitespace-separated token passes the
     ISIN check becomes a CStock: the token is its ISIN and the
     preceding tokens, joined by single spaces, are its Name.

     Raises NameError when no stock was found.
     """
     url = self.__UrlEinzelwerteListePerIndex.replace("XXX", strIndex)
     page = self.__webConnect.runGETRequest(url)

     # Replace line-break tags by spaces so tokens stay separated.
     for br_tag in ("<br>", "<br/>", "<br />"):
         page = page.replace(br_tag, " ")

     doc = libxml2dom.parseString(page, html=1)

     # Walk all td elements; a cell ending in an ISIN is a valid entry.
     stocks = []
     for cell in doc.getElementsByTagName("td"):
         tokens = cell.textContent.split()
         if len(tokens) <= 1 or self.__checkForISIN(tokens[-1]) != 1:
             continue
         entry = CStock()
         entry.ISIN = tokens[-1]
         entry.Name = " ".join(tokens[:-1])
         stocks.append(entry)

     if not stocks:
         raise NameError('Achtung: Aktienliste fuer ' + strIndex + ' hat keine Werte!')
     return stocks
示例#10
0
def parse(htmlString, encoding=None, url='', category='', language=''):
    """
    Parse an HTML string into a DOM document.

    The markup is parsed as unfinished HTML and always decoded as
    'utf8'.  The ``encoding``, ``url``, ``category`` and ``language``
    arguments are accepted but not used by this function.
    """
    return parseString(htmlString, html=1, unfinished=1, htmlencoding='utf8')
示例#11
0
    def test_TreeStructure(self):
        """
        Unittest tree strucure
        """
        html = """
<html>
    <body>
        <div>
            Ola
            <div>
                Mundo
            </div>
            <div>
                <a>
                !
                </a>
            </div>
        </div>
    </body>
</html>
        """

        dom = parseString(html, html=1)

        tree = Node().loadNodeTree(dom, 0)

        self.assert_(tree.dom == dom)
        self.assertEquals(tree.childNodes[0].dom.localName, "html")
        self.assert_(tree.childNodes[0].dom == dom.childNodes[0])

        #        print tree.childNodes[0].childNodes[0].dom.localName
        self.assertEquals(tree.childNodes[0].childNodes[0].str, "bodydivdivdiva")

        #        print ttree.childNodes[0].childNodes[0].childNodes[0].childNodes[1].str
        self.assertEquals(tree.childNodes[0].childNodes[0].childNodes[0].childNodes[1].str, "diva")
示例#12
0
    def __getDataforStock(self, stock):
        """
        Request about 380 days of historical prices for ``stock`` and
        fill in the price attributes of this object.

        The table cells of the response are scanned; for every cell
        holding a valid date (format dd.mm.YYYY) the price is read from
        the cell two positions later.  An attribute is only written
        while it is still 0, so the first matching row wins:

          * AktienkursHeute        -- date within the last 3 days
          * AktienkursVor6Monaten  -- date at least 180 days back
          * AktienkursVor12Monaten -- date at least 360 days back
          * AktieList[0..3]        -- date on or before the last day of
                                      each of the four preceding months

        All prices are parsed the same way now: thousands separators
        ('.') are removed before the decimal comma is converted, so a
        value such as "1.234,56" no longer raises ValueError.
        """
        today = date.today()
        start = today - timedelta(days=380)

        self.__POSTDataAktie["pkAktieNr"] = stock.FinanzenNetId
        self.__POSTDataAktie["strBoerse"] = stock.strBoerseFinanzenNet

        self.__POSTDataAktie["dtTag1"] = start.day
        self.__POSTDataAktie["dtMonat1"] = start.month
        self.__POSTDataAktie["dtJahr1"] = start.year

        self.__POSTDataAktie["dtTag2"] = today.day
        self.__POSTDataAktie["dtMonat2"] = today.month
        self.__POSTDataAktie["dtJahr2"] = today.year

        page = self.__webConnect.runPOSTRequest(
            self.__FinanzenNetHistorischeKurseURL, self.__POSTDataAktie)

        doc = libxml2dom.parseString(page, html=1)
        td_elements = doc.getElementsByTagName("td")

        def price_at(index):
            # "." groups thousands and "," is the decimal mark.
            text = td_elements[index].textContent
            return float(text.replace(".", "").replace(",", "."))

        # Last day of each of the four preceding months, newest first.
        # Subtracting a date's own day number lands on the last day of
        # the month before it.
        month_ends = []
        reference = today
        for _ in range(4):
            reference = reference - timedelta(reference.day)
            month_ends.append(reference)

        for c, cell in enumerate(td_elements):
            data = cell.textContent
            if self.__validateDate(data) != 1:
                continue

            # The date is valid: check which thresholds it satisfies
            # and set the values that are still missing.
            datum = datetime.datetime.strptime(data, '%d.%m.%Y').date()

            if (self.AktienkursHeute == 0
                    and datum >= today - timedelta(days=3)):
                self.AktienkursHeute = price_at(c + 2)

            if (self.AktienkursVor6Monaten == 0
                    and datum <= today - timedelta(days=180)):
                self.AktienkursVor6Monaten = price_at(c + 2)

            if (self.AktienkursVor12Monaten == 0
                    and datum <= today - timedelta(days=360)):
                self.AktienkursVor12Monaten = price_at(c + 2)

            for k, month_end in enumerate(month_ends):
                if self.AktieList[k] == 0 and datum <= month_end:
                    self.AktieList[k] = price_at(c + 2)
示例#13
0
def get_problem(url):
	"""
	Download ``url`` and return the serialized "wiki_text_block" node.

	Only the direct children of the element with id "main" are checked.
	Returns None when that element or a matching child does not exist.
	"""
	data = urllib.urlopen(url)
	try:
		s = data.read()
	finally:
		# Release the connection even if reading fails.
		data.close()
	doc = libxml2dom.parseString(s,html=1)
	main = doc.getElementById("main")
	if main is None:
		return None
	for node in main.childNodes:
		if node.getAttribute("class") == "wiki_text_block":
			return node.toString()
	return None
示例#14
0
def parseString(s, html=0, htmlencoding=None, unfinished=0, impl=None):
    """
    Parse ``s`` with libxml2dom and initialise events on the document.

    All options are forwarded to ``libxml2dom.parseString``; when
    ``impl`` is falsy, ``default_impl`` is used instead.  The document
    is passed to ``initialiseEvents`` before it is returned.
    """
    if not impl:
        impl = default_impl
    document = libxml2dom.parseString(
        s, html=html, htmlencoding=htmlencoding,
        unfinished=unfinished, impl=impl)
    initialiseEvents(document)
    return document
示例#15
0
 def __init__(self, path, nodes):
     """
     Download the page at ``path`` and prepare its "gc-wrapper" element.

     The element with id "gc-wrapper" is stored as ``self.content``.
     ``self.wrapper`` is computed before ``nodes`` is stored, and the
     nodes are collected last.
     """
     sock = urllib.urlopen(path)
     try:
         html_source = sock.read()
     finally:
         # Close the connection even if reading fails.
         sock.close()
     doc = libxml2dom.parseString(html_source, html=1)
     self.content = doc.getElementById("gc-wrapper")
     self.wrapper = self.__get_wrapper()
     self.__nodes = nodes
     self.__collect_nodes()
	def load(self,file_name):
		"""
		Read ``file_name`` and parse its content as HTML.

		Returns the parsed DOM document, or None (after printing a
		message) when an IOError occurs.  The file is read in one call,
		so an empty file no longer raises TypeError, and it is closed
		even when reading fails.
		"""
		try:
			f = open(file_name,'r')
			try:
				doc_string = f.read()
			finally:
				f.close()

			return libxml2dom.parseString(doc_string, html=1)
		except IOError:
			print("unable to read file  %s" % (file_name,))
		return None
示例#17
0
def parse(url, max_attempts=3):
    """
    Download ``url`` and parse it as HTML, retrying on parse failure.

    url          -- address handed to ``download``.
    max_attempts -- how many download/parse rounds are tried before the
                    last TypeError is re-raised (default 3).

    Returns the parsed DOM document.  The previous version referenced an
    undefined ``attempts`` name in its error handler and recursed
    without limit; the retry is now a bounded loop with a real counter.
    """
    attempts = 0
    while True:
        attempts += 1
        html = download(url)
        try:
            return libxml2dom.parseString(html, html=1)
        except TypeError as detail:
            print(detail)
            if attempts >= max_attempts:
                raise
            print("Could not parse document, attempt  %s ; retrying " % attempts)
示例#18
0
 def __init__(self, gameUrl):
   """
   Download the "/rating-systems" page of a game and locate its panel.

   Newlines are removed from ``gameUrl`` before the suffix is added.
   The parsed page is kept in ``self.docGame``.  ``self.descDiv`` is
   the last div having an attribute whose value is "rightPanelMain",
   or None when there is no such div.
   """
   response = urllib2.urlopen(gameUrl.replace("\n","")+"/rating-systems")
   try:
     openGame = response.read()
   finally:
     # Release the connection even if reading fails.
     response.close()
   self.docGame = libxml2dom.parseString(openGame, html=1)

   self.descDiv=None
   for div in self.docGame.getElementsByTagName("div"):
     for atr in div.attributes:
       if(atr.nodeValue=="rightPanelMain"):
         self.descDiv=div
示例#19
0
def genmoedict(result, dirname, fnames):
	"""
	Collect word entries from the ``m_<number>.html`` files of a directory.

	result  -- list that receives ``(number, getwordmeanfromdoc(doc))``
	           tuples.
	dirname -- directory to process; it must contain ``index.html``.
	fnames  -- names of the entries present in ``dirname``.

	The file names are taken from ``index.html`` via ``getindex`` (the
	first match is skipped).  Directory links, names missing from
	``fnames`` and names not shaped like ``m_*.html`` are skipped.
	Unreadable or unparsable files are reported on stderr and skipped;
	an error while processing a parsed document is reported and
	re-raised.

	NOTE(review): the (arg, dirname, names) signature looks like an
	``os.path.walk`` visitor -- confirm against the caller.
	"""
	indexfile = os.path.join(dirname, 'index.html')
	assert os.path.exists(indexfile)
	f = open(indexfile, 'r')
	try:
		buf = f.read()
	finally:
		f.close()
	# A plain check instead of an assert inside try/except: the warning
	# must not vanish when Python runs with -O.
	if buf.find('</html>') <= 0:
		sys.stderr.write(
			'Error: incomplete index.html under  %s\n' % (dirname,))
	flist = getindex.findall(buf)[1:]
	for fname in flist:
		isdir = False
		if fname.endswith('/'):
			fname = fname[:-1]
			isdir = True
		elif fname.find('/') > 0:
			p = fname.find('/')
			if fname[p+1:] == 'index.html':
				fname = fname[:p]
				isdir = True
		if fname not in fnames:
			sys.stderr.write('Warning: %s not in %s\n' % (
					fname, dirname))
			continue
		if isdir: continue
		if not (fname.startswith('m_') and fname.endswith('.html')):
			sys.stderr.write('Ignoring ' + fname + '\n')
			continue
		# Strip the 'm_' prefix and the '.html' suffix.
		num = int(fname[2:-5])
		try:
			f = open(os.path.join(dirname, fname), 'r')
			try:
				buf = f.read()
			finally:
				f.close()
		except (IOError, OSError):
			sys.stderr.write('Error reading %s under %s\n' % (
					fname, dirname))
			continue
		try:
			doc = libxml2dom.parseString(buf, html=1,
					htmlencoding='latin-1')
		except Exception:
			sys.stderr.write('Error parsing %s under %s\n' % (
					fname, dirname))
			continue
		try:
			result.append((num, getwordmeanfromdoc(doc)))
		except Exception:
			sys.stderr.write('Error processing %s under %s\n' % (
					fname, dirname))
			if DEBUG:
				traceback.print_exc()
				pdb.set_trace()
			raise
示例#20
0
def getMeasure(w):
   """
   Query the DAL sentence service for ``w``.

   Returns ``[valence, activation]``, the two attributes of the first
   "measure" element of the XML response.
   """
   # NOTE(review): w is appended to the URL unescaped -- confirm that
   # callers only pass values that are safe inside a query string.
   response = urllib.urlopen("http://compling.org/cgi-bin/DAL_sentence_xml.cgi?sentence="+w)
   xml_text = response.read()
   response.close()

   # Parse the XML result to obtain valence and activation scores.
   first_measure = libxml2dom.parseString(xml_text).getElementsByTagName("measure")[0]
   return [first_measure.getAttribute(name) for name in ("valence", "activation")]
	def get_element(self,url):
		"""Fetch ``url`` through ``self.proxy`` and parse it as HTML.

		Returns the parsed DOM document.  Returns None when the proxy
		returns None, or when an IOError occurs (a message is printed
		in that case).
		"""
		try:
			# The proxy also offers make_request_without_proxy(url).
			doc_string = self.proxy.make_request(url)
			if doc_string is not None:
				return libxml2dom.parseString(doc_string, html=1)
		except IOError:
			print("unable to connect to server ")
		return None
示例#22
0
文件: main.py 项目: kp7/plan
def parse(html_data, header):
    """
    Collect the links of the list that follows a matching heading.

    The HTML is parsed and walked depth-first.  When an ``h2``/``h3``
    node whose text matches ``header`` is visited, its visit index and
    depth are remembered.  An ``ol``/``ul`` node that is visited exactly
    three nodes later, or that sits at the remembered depth, marks its
    subtree as wanted, and every ``a`` node inside such a subtree is
    recorded.

    header -- either a ``str`` (compared after stripping both sides) or
              a callable taking the heading text and returning a truth
              value.

    Returns a list of objects with two attributes: ``link`` (the result
    of ``absolute_url`` applied to the anchor's ``href``) and ``name``
    (the stripped anchor text).
    """
    # Plain attribute container so the nested functions can share and
    # modify the traversal state.
    class St:
        pass

    st = St()
    st.id = -1            # visit counter, incremented on every dfs call
    st.level = 0          # current depth, +1 on entry and -1 on exit
    st.saved_id = -100    # visit index of the matching heading (sentinel: none)
    st.saved_level = -1   # depth of the matching heading (-1: none / cleared)
    st.results = []
    st.used = False       # True once a link was collected after the heading

    def check_header(h):
        # Exact class check: anything that is not a plain str (including
        # subclasses) is called as a predicate.
        if header.__class__ == str:
            return h.strip() == header.strip()
        else:
            return header(h)

    def dfs(node, good_ol=False):
        # good_ol is passed down by value, so it only applies to the
        # subtree of the list node that switched it on.
        st.id += 1
        st.level += 1
        # print str(node.name) +" " + str(st.id)
        # 	print "name=%s id=%d content=%s" %( node.name, st.id, node.textContent)
        if ["h2", "h3"].count(enc(node.name)) and check_header(enc(node.textContent)):
            # Matching heading: remember where it was found.
            st.saved_id = st.id
            st.saved_level = st.level
            st.used = False
        # NOTE(review): node.name is compared without enc() here, unlike
        # the heading test above -- confirm this is intended.
        elif ["ol", "ul"].count(node.name) and (st.id == st.saved_id + 3 or st.level == st.saved_level):
            good_ol = True
        elif st.level == st.saved_level and node.name != "text" and st.used:
            # A non-text sibling after links were collected ends the section.
            # print "clearing for item type %s" %(node.name,)
            st.saved_level = -1
        elif st.level < st.saved_level:
            # The walk left the heading's parent: forget the heading depth.
            st.saved_level = -1
        if node.name == "a" and good_ol:

            # One result record per collected anchor.
            class C:
                pass

            c = C()
            c.link = absolute_url(node.getAttribute("href"))
            c.name = node.textContent.strip()
            st.results.append(c)
            st.used = True
        for x in node.childNodes:
            dfs(x, good_ol)
        st.level -= 1

    dom = libxml2dom.parseString(html_data, html=True, htmlencoding="utf-8")
    dfs(dom)
    return st.results
示例#23
0
def getOffers(isbn):
	"""
	Return the ``[price, name]`` offers for ``isbn``, using ISBNcache.

	On a cache miss the bookscouter price page is downloaded.  Table
	cells are read in (name, price) pairs, at most NUMBER_VENUES of
	them.  Spaces in names become underscores; the price is the text of
	the first link inside the price cell.  The result is cached and the
	function sleeps half a second before returning.
	"""
	if isbn not in ISBNcache:
		response = urllib.urlopen("http://m.bookscouter.com/prices.php?isbn="+isbn)
		page = libxml2dom.parseString(response.read(), html=1)
		offers = []
		cells = page.getElementsByTagName("td")
		limit = min(len(cells),NUMBER_VENUES*2)
		for pos in xrange(0,limit,2):
			venue_name = cells[pos].textContent.replace(" ","_")
			venue_price = cells[pos+1].xpath(".//a")[0].textContent
			offers.append([venue_price, venue_name])
		ISBNcache[isbn] = offers
		time.sleep(0.5) # Be nice to bookscouter
	return ISBNcache[isbn]
示例#24
0
def get_problems(page):
	"""
	List the problems on one page of the infoarena archive.

	``page`` selects the 250-entry window (first entry = 250 * page).
	Every distinct link whose href starts with "/problema" is returned
	as ``[href, "http://infoarena.ro" + href]``.
	"""
	listing_url = "http://infoarena.ro/arhiva?display_entries=250&first_entry="+str(250*page)
	markup = urllib.urlopen(listing_url).read()
	doc = libxml2dom.parseString(markup, html=1)

	# A set removes the duplicates caused by open problems, which are
	# linked a second time.
	unique_paths = set()
	for anchor in doc.getElementsByTagName("a"):
		href = anchor.getAttribute("href")
		if re.match(r'^\/problema.*', href):
			unique_paths.add(href)

	return [[path, "http://infoarena.ro"+path] for path in unique_paths]
示例#25
0
 def __getDataforStock(self, stock):
     """
     Request about 380 days of historical prices for ``stock`` and fill
     in the price attributes of this object.

     The table cells of the response are scanned; for every cell holding
     a valid date (format dd.mm.YYYY) the price is read from the cell
     two positions later.  An attribute is only written while it is
     still 0, so the first matching row wins:

       * AktienkursHeute        -- date within the last 3 days
       * AktienkursVor6Monaten  -- date at least 180 days back
       * AktienkursVor12Monaten -- date at least 360 days back
       * AktieList[0..3]        -- date on or before the last day of
                                   each of the four preceding months

     All prices are parsed the same way now: thousands separators ('.')
     are removed before the decimal comma is converted, so a value such
     as "1.234,56" no longer raises ValueError.
     """
     today = date.today()
     start = today - timedelta(days=380)

     self.__POSTDataAktie["pkAktieNr"] = stock.FinanzenNetId
     self.__POSTDataAktie["strBoerse"] = stock.strBoerseFinanzenNet

     self.__POSTDataAktie["dtTag1"] = start.day
     self.__POSTDataAktie["dtMonat1"] = start.month
     self.__POSTDataAktie["dtJahr1"] = start.year

     self.__POSTDataAktie["dtTag2"] = today.day
     self.__POSTDataAktie["dtMonat2"] = today.month
     self.__POSTDataAktie["dtJahr2"] = today.year

     page = self.__webConnect.runPOSTRequest(self.__FinanzenNetHistorischeKurseURL, self.__POSTDataAktie)

     doc = libxml2dom.parseString(page, html=1)
     td_elements = doc.getElementsByTagName("td")

     def price_at(index):
         # "." groups thousands and "," is the decimal mark.
         text = td_elements[index].textContent
         return float(text.replace(".", "").replace(",", "."))

     # Last day of each of the four preceding months, newest first.
     # Subtracting a date's own day number lands on the last day of the
     # month before it.
     month_ends = []
     reference = today
     for _ in range(4):
         reference = reference - timedelta(reference.day)
         month_ends.append(reference)

     for c, cell in enumerate(td_elements):
         data = cell.textContent
         if self.__validateDate(data) != 1:
             continue

         # The date is valid: check which thresholds it satisfies and
         # set the values that are still missing.
         datum = datetime.datetime.strptime(data, '%d.%m.%Y').date()

         if self.AktienkursHeute == 0 and datum >= today - timedelta(days=3):
             self.AktienkursHeute = price_at(c + 2)

         if self.AktienkursVor6Monaten == 0 and datum <= today - timedelta(days=180):
             self.AktienkursVor6Monaten = price_at(c + 2)

         if self.AktienkursVor12Monaten == 0 and datum <= today - timedelta(days=360):
             self.AktienkursVor12Monaten = price_at(c + 2)

         for k, month_end in enumerate(month_ends):
             if self.AktieList[k] == 0 and datum <= month_end:
                 self.AktieList[k] = price_at(c + 2)
示例#26
0
    def doWork(self,work):
        """
        Refresh self.document from the main frame, then run the work items.

        Each item of ``work`` is indexable: item[0] names a method of
        this object and item[1] holds its positional arguments.  A
        failure while parsing the frame, or inside any single item, is
        printed and does not stop the remaining items.
        """
        main_frame = Jaime.getInstance().page.mainFrame()
        try :
            markup = main_frame.toHtml().encode('utf-8','ignore')
            self.document = libxml2dom.parseString(markup, html=1)
        except Exception as err:
            print('Error en el frame to html %s' % err)
        for task in work:
            try:
                getattr(self, task[0])(*task[1])
            except Exception as err:
                print('Excepcion en doWork %s' % err)
示例#27
0
    def run(self, html):
        """
        Process the first table of ``html`` and return it as markup.

        Records whether any td / th carries a width (tds_width and
        ths_width), runs the table through parse_table and
        set_size_table, serializes it, and finally turns the entities
        '&lt;' and '&gt;' back into '<' and '>'.
        """
        doc = libxml2dom.parseString(html, html=1)
        table = doc.getElementsByTagName('table')[0]
        self.tds_width = self._some_has_width(doc.getElementsByTagName('td'))
        self.ths_width = self._some_has_width(doc.getElementsByTagName('th'))

        table = self.set_size_table(self.parse_table(table))

        serialized = table.toString()
        return serialized.replace('&lt;', '<').replace('&gt;', '>')
示例#28
0
 def __getKBV(self, stock):
     """
     Read the KBV figure for ``stock`` from its overview page.

     The page for the stock's ISIN is scanned cell by cell.  While
     self.KBV is still "NA", the first cell whose text contains "KBV"
     selects the following cell, whose text (decimal comma replaced by
     a point) is stored as a float in self.KBV.
     """
     page = self.__webConnect.runGETRequest( self.__UrlKBV + str(stock.ISIN) )
     td_elements = libxml2dom.parseString(page, html=1).getElementsByTagName("td")

     for position, cell in enumerate(td_elements):
         text = cell.textContent
         if self.KBV != "NA":
             continue
         if text.find("KBV") < 0:
             continue
         # NOTE(review): the original used the find() result as a truth
         # value, so this only rejects text that *starts* with the
         # marker (find() == 0); "not found" (-1) passes.  Kept as-is to
         # preserve behaviour -- confirm what was intended.
         if text.find("title=\"Kurs/Buchungs") == 0:
             continue
         self.KBV = float( td_elements[position+1].textContent.replace(",", ".") )
示例#29
0
    def run(self, html):
        """
        Process the first table of ``html`` and return it as markup.

        Records whether any td / th carries a width (tds_width and
        ths_width), runs the table through parse_table and
        set_size_table, serializes it, and finally turns the entities
        '&lt;' and '&gt;' back into '<' and '>'.
        """
        doc = libxml2dom.parseString(html, html=1)
        table = doc.getElementsByTagName('table')[0]
        self.tds_width = self._some_has_width(doc.getElementsByTagName('td'))
        self.ths_width = self._some_has_width(doc.getElementsByTagName('th'))

        table = self.set_size_table(self.parse_table(table))

        serialized = table.toString()
        return serialized.replace('&lt;', '<').replace('&gt;', '>')
示例#30
0
文件: main.py 项目: kp7/plan
def get_lecture_text(html_data):
    """
    Extract the text below the element with id "bodyContent".

    When the page is recognised as a table of contents (see
    try_toc_format), the texts of all linked sub-pages are fetched and
    concatenated instead.  Returns a string in which every collected
    fragment is preceded by a single space.
    """
    def try_toc_format(dom):
        """
        Return the combined text of the linked pages, or None when the
        page is not treated as a table of contents.
        """
        def dfs(node):
            # Returns the number of direct children of the "bodyContent"
            # node.  A truthy count found deeper in the tree is passed
            # straight up; nodes without that id return None.
            count = None
            if node.getAttribute("id") == "bodyContent":
                count = 0
            for x in node.childNodes:
                r = dfs(x)
                if r:
                    return r
                if count != None:
                    count += 1
            return count

        res = ""
        # NOTE(review): "Spis tre¶ci" looks like a mis-encoded
        # "Spis treści" (Polish for "table of contents") -- confirm it
        # matches the heading text as seen after enc().
        entries = parse(html_data, "Spis tre¶ci")
        # d = dfs(dom)
        # print "dfs returned %d, len(entries)=%d" %(d, len(entries))
        # print "html_data: " + html_data
        # Only a page whose bodyContent has exactly 19 or 22 children and
        # that lists at least 5 entries counts as a table of contents.
        # The constants presumably match known page layouts -- TODO confirm.
        if [19, 22].count(dfs(dom)) == 0 or len(entries) < 5:
            return None
        for entry in entries:
            # print "subentry: %s %s" %(enc( entry.name), enc( entry.link))
            html_data2 = fetch_html(absolute_url(entry.link))
            # Recurse: a linked page may itself be a table of contents.
            res += " " + get_lecture_text(html_data2)
        return res

    def dfs(node, good_ol=False):
        # Appends the text nodes found at or below the "bodyContent"
        # element to st.results; good_ol is switched on for that subtree.
        if node.getAttribute("id") == "bodyContent":
            good_ol = True
        if good_ol and node.name == "text":
            st.results += " " + enc(node.textContent)
        for x in node.childNodes:
            dfs(x, good_ol)

    # Plain attribute container so the nested dfs can accumulate text.
    class St:
        pass

    st = St()
    st.results = ""

    dom = libxml2dom.parseString(html_data, html=True, htmlencoding="utf-8")
    res = try_toc_format(dom)
    if res:
        return res
    # Not a table of contents (or it produced no text): use this page.
    dfs(dom)
    return st.results
示例#31
0
    def __getKBV(self, stock):
        """
        Read the KBV figure for ``stock`` from its overview page.

        The page for the stock's ISIN is scanned cell by cell.  While
        self.KBV is still "NA", the first cell whose text contains
        "KBV" selects the following cell, whose text (decimal comma
        replaced by a point) is stored as a float in self.KBV.
        """
        page = self.__webConnect.runGETRequest(self.__UrlKBV + str(stock.ISIN))
        td_elements = libxml2dom.parseString(
            page, html=1).getElementsByTagName("td")

        for position, cell in enumerate(td_elements):
            text = cell.textContent
            if self.KBV != "NA":
                continue
            if text.find("KBV") < 0:
                continue
            # NOTE(review): the original used the find() result as a
            # truth value, so this only rejects text that *starts* with
            # the marker (find() == 0); "not found" (-1) passes.  Kept
            # as-is to preserve behaviour -- confirm what was intended.
            if text.find("title=\"Kurs/Buchungs") == 0:
                continue
            self.KBV = float(
                td_elements[position + 1].textContent.replace(",", "."))
示例#32
0
def get_site_description(res):
    """
    Return a description for the site behind the response ``res``.

    The content of the first meta tag named "description" (matched
    case-insensitively) is used.  If there is none, a web search for
    ``site:<res.url>`` is made and the content of its first result is
    returned.  Returns '' when the search fails or has no results.
    """
    doc = libxml2dom.parseString(res.read(), html=1)

    for meta in doc.getElementsByTagName('meta'):
        if str(meta.getAttribute('name')).lower() == "description":
            return meta.getAttribute('content')

    # No description meta tag was found: fall back to the search service.
    search_url = ('http://ajax.googleapis.com/ajax/services/search/web'
                  '?v=1.0&q=site:' + res.url)
    search_res = get_http_response(search_url)
    if not search_res:
        return ''
    hits = simplejson.load(search_res)['responseData']['results']
    return hits[0]['content'] if hits else ''
示例#33
0
文件: Onvista.py 项目: sscit/trader
 def getOnvistaId(self, stock):
     """
     Return the numeric Onvista id of ``stock`` as a string.

     The overview page for the stock's ISIN is scanned for the first
     link labelled "Kennzahlen" that points to the fundamentals page;
     the id is the part of its href between the first and second '='.

     Raises NameError when no purely numeric id was found.
     """
     page = self.__webConnect.runGETRequest( self.__StockOverviewUrl + str(stock.ISIN) )
     doc = libxml2dom.parseString(page, html=1)

     onvistaId = ""
     for anchor in doc.getElementsByTagName("a"):
         if anchor.textContent != "Kennzahlen":
             continue
         href = anchor.attributes["href"].value
         if "kennzahlen/fundamental.html?ID_OSI" in href:
             onvistaId = str(href.split("=")[1])
             break

     if not onvistaId.isdigit():
         raise NameError('Error: getOnvistaId, Id nicht numeric: ' + onvistaId)

     return onvistaId
示例#34
0
def getUrlList(mobygameUrl):
  """
  Extract the game links from the markup of a game-list page.

  Despite its name, ``mobygameUrl`` is parsed as HTML markup; nothing
  is downloaded here.  The href of the first link of every row in the
  first tbody of the table with id "mof_object_list" is returned.

  Returns an empty list (after printing an error) when the table or
  its tbody is missing.  Rows without a link are skipped.
  """
  doc = libxml2dom.parseString(mobygameUrl, html=1)
  # mof_object_list - id of the table containing the list of all games
  table = doc.getElementById("mof_object_list")
  urlList=[]
  if(table is None):
    print ("ERROR! No table found!")
    return urlList

  tbody = table.getElementsByTagName("tbody")
  if(len(tbody) == 0):
    print ("ERROR! No tbody found")
    return urlList
  if(len(tbody) > 1):
    # Still usable: only the first tbody is read.
    print ("ERROR! More then one tbody in table found!")

  for tr in tbody[0].getElementsByTagName("tr"):
    anchors = tr.getElementsByTagName("a")
    if(len(anchors) > 0):
      urlList.append(anchors[0].getAttribute("href"))
  return urlList
示例#35
0
 def getFinanzenNetId(self, stock):
     """
     Return the numeric finanzen.net id of ``stock`` as a string.

     The page for the stock's ISIN is scanned for the first link
     labelled "Historisch" that points to kurse_historisch.asp; the id
     is the value after the first '=' of its href, cut at the next '&'.

     Raises NameError when no purely numeric id was found.
     """
     page = self.__webConnect.runGETRequest( self.__UrlKBV + stock.ISIN )
     doc = libxml2dom.parseString(page, html=1)

     finanzenNetId = ""
     for anchor in doc.getElementsByTagName("a"):
         if anchor.textContent != "Historisch":
             continue
         href = anchor.attributes["href"].value
         if "kurse_historisch.asp" in href:
             finanzenNetId = str(href.split("=")[1].split("&")[0])
             break

     if not finanzenNetId.isdigit():
         raise NameError('Error: getFinanzenNetId, Id nicht numeric: ' + finanzenNetId)

     return finanzenNetId
示例#36
0
    def __getDataForIndex(self, stock):
        """
        Request about 380 days of historical values for the index of
        ``stock`` and fill in self.IndexList.

        The table cells of the response are scanned; for every cell
        holding a valid date (format dd.mm.YYYY) the value is read from
        the cell two positions later.  IndexList[k] (k = 0..3) is set
        from the first row dated on or before the last day of the k-th
        preceding month, and only while it is still 0.

        The month-end dates are computed once before the scan instead
        of once per cell, and the scan no longer reuses its loop
        variable for the inner loop.
        """
        today = date.today()
        start = today - timedelta(days=380)

        url = self.__FinanzenNetHistorischeKurseIndizesURL.replace(
            "XXX", stock.strIndexFinanzenNet)

        self.__POSTDataIndex["dtTag1"] = start.day
        self.__POSTDataIndex["dtMonat1"] = start.month
        self.__POSTDataIndex["dtJahr1"] = start.year

        self.__POSTDataIndex["dtTag2"] = today.day
        self.__POSTDataIndex["dtMonat2"] = today.month
        self.__POSTDataIndex["dtJahr2"] = today.year

        page = self.__webConnect.runPOSTRequest(url, self.__POSTDataIndex)

        doc = libxml2dom.parseString(page, html=1)
        td_elements = doc.getElementsByTagName("td")

        # Last day of each of the four preceding months, newest first.
        # Subtracting a date's own day number lands on the last day of
        # the month before it.
        month_ends = []
        reference = today
        for _ in range(4):
            reference = reference - timedelta(reference.day)
            month_ends.append(reference)

        for c, cell in enumerate(td_elements):
            data = cell.textContent
            if self.__validateDate(data) != 1:
                continue

            # The date is valid: check which month ends it satisfies
            # and set the values that are still missing.
            datum = datetime.datetime.strptime(data, '%d.%m.%Y').date()

            for k, month_end in enumerate(month_ends):
                if self.IndexList[k] == 0 and datum <= month_end:
                    # "." groups thousands and "," is the decimal mark.
                    self.IndexList[k] = float(
                        td_elements[c + 2].textContent.replace(
                            ".", "").replace(",", "."))
示例#37
0
def no_header(source, headers, table_index):
    """Extract selected cells from every row of one table in an HTML string.

    Args:
        source: HTML markup to parse.
        headers: Iterable of cell indexes to pick from each row.
        table_index: Index into the list of ``<table>`` elements found in
            the document.

    Returns:
        A list with one entry per ``<tr>`` inside the chosen table. Each
        entry is a list holding the text of the requested ``<td>`` cells
        that exist in that row; indexes the row does not have are skipped.
        If no table exists at ``table_index`` the single-element list
        ``['The table index was not found']`` is returned instead.
    """
    doc = libxml2dom.parseString(source, html=1)
    tables = doc.getElementsByTagName('table')

    try:
        main_table = tables[table_index]
    except (IndexError, TypeError):
        # Out-of-range or non-integer index: report it the way callers
        # already expect, without hiding unrelated failures.
        return ['The table index was not found']

    return_list = []
    for row in main_table.getElementsByTagName('tr'):
        cells = row.getElementsByTagName('td')

        cell_list = []
        for i in headers:
            try:
                cell_list.append(cells[i].textContent)
            except (IndexError, TypeError):
                # This row has no such cell; leave it out of the result.
                continue
        return_list.append(cell_list)

    return return_list
示例#38
0
    def __getDataForIndex(self, stock):
        """Fill ``self.IndexList`` with index quotes for the last four month ends.

        Posts a date range covering the past 380 days to the URL built from
        ``stock.strIndexFinanzenNet`` and scans every ``<td>`` of the reply.
        For each cell holding a valid ``dd.mm.yyyy`` date, every slot of
        ``self.IndexList`` that is still 0 and whose month-end date is on or
        after that date receives the number found two cells further on.

        Args:
            stock: Object providing ``strIndexFinanzenNet``, the value that
                replaces the ``XXX`` placeholder in the request URL.

        Raises:
            IndexError: If a date cell is not followed by two more cells.
            ValueError: If the quote cell does not hold a parsable number.
        """
        today = date.today()
        range_start = today - timedelta(days=380)

        url = self.__FinanzenNetHistorischeKurseIndizesURL.replace(
            "XXX", stock.strIndexFinanzenNet)

        post_data = self.__POSTDataIndex
        post_data["dtTag1"] = range_start.day
        post_data["dtMonat1"] = range_start.month
        post_data["dtJahr1"] = range_start.year

        post_data["dtTag2"] = today.day
        post_data["dtMonat2"] = today.month
        post_data["dtJahr2"] = today.year

        page = self.__webConnect.runPOSTRequest(url, post_data)

        doc = libxml2dom.parseString(page, html=1)
        td_elements = doc.getElementsByTagName("td")

        # Last day of each of the four previous months, newest first.
        # Subtracting a date's own day number lands on the last day of the
        # month before it. These do not depend on the table, so build them
        # once instead of once per date cell.
        month_ends = []
        reference = today
        for _ in range(4):
            reference = reference - timedelta(reference.day)
            month_ends.append(reference)

        for position, cell in enumerate(td_elements):
            text = cell.textContent
            if self.__validateDate(text) != 1:
                continue

            quote_date = datetime.datetime.strptime(text, '%d.%m.%Y').date()

            # The date is valid: set each slot that is still unset (0) once
            # the table has reached a date on or before its month end.
            for slot, month_end in enumerate(month_ends):
                if self.IndexList[slot] == 0 and quote_date <= month_end:
                    # Drop "." and turn the decimal comma into a point
                    # before converting the quote to float.
                    self.IndexList[slot] = float(
                        td_elements[position + 2].textContent.replace(
                            ".", "").replace(",", "."))
示例#39
0
    def getFinanzenNetId(self, stock):
        """Look up the numeric id embedded in the stock's "Historisch" link.

        Fetches the page at ``self.__UrlKBV + stock.ISIN`` and takes the first
        ``<a>`` whose text is ``Historisch`` and whose ``href`` contains
        ``kurse_historisch.asp``. The id is the part of that ``href`` between
        the first ``=`` and the next ``&``.

        Returns:
            The id as a string of digits.

        Raises:
            NameError: If no matching link is found or the extracted value
                is not purely numeric.
        """
        overview_page = self.__webConnect.runGETRequest(
            self.__UrlKBV + stock.ISIN)
        document = libxml2dom.parseString(overview_page, html=1)

        found_id = ""
        for anchor in document.getElementsByTagName("a"):
            if anchor.textContent != "Historisch":
                continue
            target = anchor.attributes["href"].value
            if "kurse_historisch.asp" not in target:
                continue
            # First query parameter value: text after "=" up to the next "&"
            found_id = str(target.split("=")[1].split("&")[0])
            break

        if not found_id.isdigit():
            raise NameError('Error: getFinanzenNetId, Id nicht numeric: ' +
                            found_id)

        return found_id
示例#40
0
    def getOnvistaId(self, stock):
        """Look up the numeric id embedded in the stock's "Kennzahlen" link.

        Fetches the page at ``self.__StockOverviewUrl + str(stock.ISIN)`` and
        takes the first ``<a>`` whose text is ``Kennzahlen`` and whose
        ``href`` contains ``kennzahlen/fundamental.html?ID_OSI``. The id is
        the part of that ``href`` after the first ``=`` (up to the next
        ``=``, if there is one).

        Returns:
            The id as a string of digits.

        Raises:
            NameError: If no matching link is found or the extracted value
                is not purely numeric.
        """
        overview_page = self.__webConnect.runGETRequest(
            self.__StockOverviewUrl + str(stock.ISIN))
        document = libxml2dom.parseString(overview_page, html=1)

        found_id = ""
        for anchor in document.getElementsByTagName("a"):
            if anchor.textContent != "Kennzahlen":
                continue
            target = anchor.attributes["href"].value
            if "kennzahlen/fundamental.html?ID_OSI" not in target:
                continue
            found_id = str(target.split("=")[1])
            break

        if not found_id.isdigit():
            raise NameError('Error: getOnvistaId, Id nicht numeric: ' +
                            found_id)

        return found_id
示例#41
0
 def _retrieve_summary(self, url):
     """Retrieve the episode summary"""
     # reader = HtmlLib.Reader()
     if not url:
         print >> sys.stderr, 'Warning: no URL for summary'
         return ''
     print "Retrieving %s" % url
     show = urllib.urlopen(url)
     showstr = show.read()
     # Website sometimes contains invalid characters, which cause the
     # DOM parser to fail. Discard any non-ASCII character
     showstr = showstr.decode('ascii', 'ignore')
     show.close()
     doc = libxml2dom.parseString(showstr, html=1)
     for node in doc.getElementsByTagName('p'):
         if node.hasAttribute('class'):
             if 'deck' in node.getAttribute('class'):
                 for child_node in node.childNodes:
                     if child_node.nodeType == child_node.TEXT_NODE:
                         summary = child_node.nodeValue.strip()
                         if summary:
                             return summary
     return ''
示例#42
0
文件: ininj.py 项目: eblot/miscripts
 def _retrieve_summary(self, url):
     """Retrieve the episode summary"""
     # reader = HtmlLib.Reader()
     if not url:
         print >> sys.stderr, 'Warning: no URL for summary'
         return ''
     print "Retrieving %s" % url
     show = urllib.urlopen(url)
     showstr = show.read()
     # Website sometimes contains invalid characters, which cause the
     # DOM parser to fail. Discard any non-ASCII character
     showstr = showstr.decode('ascii', 'ignore')
     show.close()
     doc = libxml2dom.parseString(showstr, html=1)
     for node in doc.getElementsByTagName('p'):
         if node.hasAttribute('class'):
             if 'deck' in node.getAttribute('class'):
                 for child_node in node.childNodes:
                     if child_node.nodeType == child_node.TEXT_NODE:
                         summary = child_node.nodeValue.strip()
                         if summary:
                             return summary
     return ''
示例#43
0
    def __parseOnvistaSummary(self, stock):
        """Scrape key figures for *stock* from its fundamentals table page.

        Fetches ``self.__OnvistaFundamentaldatenTabelleUrl + str(stock.OnvistaId)``
        and walks every ``<td>``. When a cell's text equals one of the known
        labels, the cell that follows it is parsed as a number (decimal
        comma) and stored on the matching private attribute. A value that
        cannot be parsed is stored as the string ``"NA"``.

        Labels handled: ``Marktkap.:`` (scaled by 1000 * 1000),
        ``Dividendenrendite in %``, ``Kurs-Buchwert-Verhältnis``, ``KGV``
        (also the mean of the five following cells),
        ``Eigenkapitalquote in %``, ``EBIT-Marge in %`` and
        ``Eigenkapitalrendite in %``.

        Raises:
            IndexError: If a label is not followed by the expected cells.
            ValueError: Re-raised with the stock's name and ISIN if a
                ValueError escapes the per-field handling.
        """
        page = self.__webConnect.runGETRequest(
            self.__OnvistaFundamentaldatenTabelleUrl + str(stock.OnvistaId))

        doc = libxml2dom.parseString(page, html=1)
        td_elements = doc.getElementsByTagName("td")

        def cell_as_float(position, drop=()):
            # Convert one cell to float: remove the *drop* substrings, then
            # turn the decimal comma into a decimal point.
            text = td_elements[position].textContent
            for token in drop:
                text = text.replace(token, "")
            return float(text.replace(",", "."))

        def cell_or_na(position, drop=()):
            # Same as cell_as_float, but report unparsable text as "NA".
            try:
                return cell_as_float(position, drop)
            except ValueError:
                return "NA"

        for c, cell in enumerate(td_elements):
            data = cell.textContent
            try:
                if data == "Marktkap.:":
                    # The cell carries a " Mio EUR" suffix, so scale the
                    # figure to plain EUR. "." is dropped before parsing
                    # (presumably a thousands separator).
                    value = cell_or_na(c + 1, (".", " Mio EUR"))
                    if value != "NA":
                        value = value * 1000 * 1000
                    self.__MarktkapitalisierungInEuro = value

                elif data == "Dividendenrendite in %":
                    self.__DivRenditeAktJahrProzent = cell_or_na(c + 1)

                elif data == u"Kurs-Buchwert-Verhältnis":
                    self.__KBVAktJahr = cell_or_na(c + 1)

                elif data == "KGV":
                    self.__KGVAktJahr = cell_or_na(c + 1)

                    # The mean is all-or-nothing: one unparsable cell among
                    # the five makes the whole mean "NA".
                    try:
                        summe = 0
                        for j in range(1, 6):
                            summe += cell_as_float(c + j)
                        self.__KGVMean5Years = summe / 5
                    except ValueError:
                        self.__KGVMean5Years = "NA"

                elif data == "Eigenkapitalquote in %":
                    self.__EigenkapitalquoteAktJahrProzent = cell_or_na(c + 1)

                elif data == "EBIT-Marge in %":
                    self.__EbitMargeAktJahrProzent = cell_or_na(c + 1, ("%",))

                elif data == "Eigenkapitalrendite in %":
                    self.__EKRAktJahrProzent = cell_or_na(c + 1, ("%",))

            except ValueError:
                # Safety net: number parsing is already handled per field
                # above, so this only fires for an unexpected ValueError.
                traceback.print_exc()
                raise ValueError("Error parseOnvistaSummary, Stock " +
                                 stock.Name + ", ISIN " + stock.ISIN)
示例#44
0
# Each CMS has an id associated with it: the "acid" query parameter of the
# start.asp link found in cmsList.
# All patterns below are raw strings so that regex escapes such as \? and \(
# are not subject to Python's own string-escape handling.

ACIDRE = re.compile(r".*start.asp\?acid=([^&]*).*", re.DOTALL)

acidMatch = ACIDRE.match(cmsList)
if acidMatch is None:
    flush_print("Could not find site ID")
    sys.exit(-1)

acid = acidMatch.group(1)
flush_print("Using Site ID: '%s'" % acid)

# Pull the IDs out for each site

cmsListDoc = libxml2dom.parseString(cmsList, html=1)
# Captures the host and the value of the "c" parameter of a lookup.asp URL
CMSRE = re.compile(r"http://([^/]+)/site/lookup.asp\?c=(.+)", re.DOTALL)
# Captures the quoted second argument of a ShowMenu(event, '...') call
MENURE = re.compile(r"ShowMenu\(event, *'(.*)'.*")
# Capture the numeric first argument of fExpandCollapse(...) calls; the two
# variants differ in whether that argument is quoted and in what follows it
FOLEXPCOLLRE = re.compile(r"fExpandCollapse\(([0-9]*), *[01][^01]")
FILEXPCOLLRE = re.compile(r"fExpandCollapse\('([0-9]*)', *1[01]")
# Captures the braced value of a "cid={...}" parameter not preceded by "a"
# (so "acid=" does not match)
CIDRE = re.compile(r".*[^a]cid={([^}]*)}.*", re.DOTALL)
# Captures the contents of a leading {...} group
BINIDRE = re.compile(r"{([^}]*)}.*")

# Using the parseFile method is causing the program to abort with a
# "too many files open" error


def parseFile(filename, html=True):
    """Log *filename*, open it and parse it with ``libxml2dom.parse``.

    NOTE(review): only the beginning of this function is visible here. In
    the visible lines the opened handle is never closed and the ``html``
    argument is not used. Confirm against the rest of the function whether
    that is what causes the "too many files open" abort.
    """
    flush_print("Parsing: %s" % filename)
    fileHandle = open(filename)
    fileDoc = libxml2dom.parse(fileHandle)
示例#45
0
#		if child.nodeType == child.TEXT_NODE:
#			result += child.nodeValue
#		else:
#			result += TextInNode(child)
#	return result

# Process every *.htm file in the current directory, reporting progress on
# stderr as "<filename> <n> of <total>".
filelist = glob.glob('*.htm')
filenum = len(filelist)
num = 0
# NOTE(review): not filled in the visible part; presumably collects the
# names of files that failed to process. Confirm further down.
errorfiles = []
for filename in filelist:
    num += 1
    print >> sys.stderr, filename, num, 'of', filenum
    try:
        fp = open(filename, 'r')
        doc = libxml2dom.parseString(fp.read(), html=1)
        fp.close()
        # Take the text of the first <style> element and pull out the name
        # of the CSS class whose rule begins with "display: none".
        # A missing <style> raises IndexError; a failed search leaves
        # `style` as None, so .group(1) raises AttributeError.
        style = doc.getElementsByTagName("style")[0].textContent
        style = re.search(r'(?s)\s*\.(\S+)\s*{\s*display:\s*none', style)
        displaynone = style.group(1)
        # Keep only the <div> elements whose class is exactly "tab-page"
        tabpages = doc.getElementsByTagName("div")
        tabpages = filter(lambda s: s.getAttribute("class") == "tab-page",
                          tabpages)
        for tabpage in tabpages:
            # A tab page is selected when its first <h2> child element has
            # the text `whattoextract`; later <h2> children are not looked
            # at because of the unconditional break.
            found = False
            for node in tabpage.childNodes:
                if node.nodeType == node.ELEMENT_NODE and node.name == 'h2':
                    if node.textContent == whattoextract:
                        found = True
                    break
            if found:
示例#46
0
def appendHTML(node, html):
    """Parse an HTML fragment and append its nodes to *node*.

    The fragment is wrapped in ``<html>...</html>`` and parsed as HTML. Each
    child of the first child of the resulting document element is imported
    (deep copy) via ``node.importNode`` and appended to *node*. Nothing is
    appended when the document element has no children.
    """
    wrapped = "<html>%s</html>" % html
    parsed = libxml2dom.parseString(wrapped, html=1)
    top_level = parsed.documentElement.childNodes
    if not top_level.length > 0:
        return
    for fragment_child in top_level[0].childNodes:
        imported = node.importNode(fragment_child, True)
        node.appendChild(imported)
示例#47
0
def parseString(s, html=0, htmlencoding=None, unfinished=0, impl=None):
    """Parse the string *s* by delegating to ``libxml2dom.parseString``.

    All options are passed through unchanged, except that ``default_impl``
    is substituted when *impl* is not given (or is otherwise falsy).
    """
    chosen_impl = impl if impl else default_impl
    options = {
        "html": html,
        "htmlencoding": htmlencoding,
        "unfinished": unfinished,
        "impl": chosen_impl,
    }
    return libxml2dom.parseString(s, **options)
示例#48
0
        logging.critical("""To use downloadFromSomeSite function, \
you must provide options baselink and urlsearch in your config""")
        return None
    global libxml2dom
    try:
        libxml2dom
    except NameError:
        import libxml2dom
    try:
        a = codecs.open(os.path.join(directory, filename), 'rb')
    except IOError, m:
        failedProcedure(
            u"""%s: could not even open our just written file.leaving \
function..""" % m, directory, filename, threadName, rssItemNode, downloadDict)
        return None
    p = libxml2dom.parseString(a.read(), html=True)
    try:
        link = "%s%s" % (baselink, [
            x.getAttribute('href') for x in p.getElementsByTagName('a') if
            x.hasAttribute('href') and x.getAttribute('href').count(urlsearch)
        ][0])
        # if you want a regex. Then, instead of
        # x.getAttribute('href').count(urlsearch) do:
        # re.search(urlsearch, x.getAttribute('href'))
    except IndexError, m:
        failedProcedure(
            u"""%s: could not find href for downloaded %s item for \
redownload""" % (m, threadName), directory, filename, threadName, rssItemNode,
            downloadDict)
        return None
    try: