Example #1
import urllib2
import lxml.html as lh

def check_product_status(url, f):
    try:
        doc = lh.parse(url)
    except:
        print "page unable to load: " + url
        return

    try:
        List = doc.xpath('.//*[@id="section2"]/div/div/div/a[1]/@href')
        List.extend(doc.xpath('.//*[@id="section3"]/div/div/div/a[1]/@href'))
        List.extend(doc.xpath('.//*[@id="section4"]/div/div/div/a[1]/@href'))

        list_set = set(List)

        for element in list_set:
            try:
                p_url = 'http://aol.nextag.com'+element
                #p_url=p_url.replace('\'','')
                #p_url=p_url.replace(' ','%20')
                response = urllib2.urlopen(p_url)
                doc = lh.parse(response)
                p_list = doc.xpath('.//*[@id="search_results_content_id_0"]/li/div/a/@href')

                if len(p_list) == 0:
                    raise Exception
               
            except Exception:
                f.write(p_url+'\n')
    except:
        pass
Example #2
def get_pmc_number(doc_tuple):
    if isinstance(doc_tuple, tuple):
        sim_num = doc_tuple[0]
        doc_file_name = DOCUMENT_FILE_NAMES[sim_num]
        pmc_num = ''
        pmc_page_title = ''
        try:
            pmc_num = TITLE_DICT[doc_file_name][0]
            pmc_link = 'http://www.ncbi.nlm.nih.gov/pubmed/{}'.format(pmc_num)
            try:
                pmc_page_title = pubmed_link_cache[pmc_link]
            except KeyError:
                print "didn't get from cache"
                t = html.parse(pmc_link)
                pmc_page_title = t.find(".//title").text
                pubmed_link_cache[pmc_link] = pmc_page_title

        except KeyError:
            return None
        if pmc_num == '':
            pmid = TITLE_DICT[doc_file_name][1]
            pmid_link = 'http://www.ncbi.nlm.nih.gov/pubmed/?term={}'.format(pmid)
            try:
                pmc_page_title = pubmed_link_cache[pmid_link]
            except KeyError:
                print "didn't get frmo cache"
                t = html.parse(pmid_link)
                pmc_page_title = t.find(".//title").text
                pubmed_link_cache[pmid_link] = pmc_page_title
            return pmid_link, pmc_page_title
        return pmc_link, pmc_page_title
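The caching pattern in Example #2 (a plain dict keyed by URL, with a KeyError fallback to html.parse) can be factored into a small helper. A minimal sketch assuming only lxml; the helper name and cache dict are illustrative, not part of the original module:

from lxml import html

_title_cache = {}  # stand-in for pubmed_link_cache above

def fetch_title(link):
    # return the cached <title> text if present, otherwise parse the page once and remember it
    try:
        return _title_cache[link]
    except KeyError:
        title = html.parse(link).find(".//title").text
        _title_cache[link] = title
        return title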
Example #3
File: xkcdb.py Project: dgw/sopel-xkcdb
def xkcdb(bot, trigger):
    qid = trigger.group(3)
    if qid:  # specific quote lookup
        page = html.parse('http://www.xkcdb.com/%s' % qid).getroot()
    else:  # random quote
        page = html.parse('http://www.xkcdb.com/random1').getroot()
    try:
        quoteblock = page.cssselect('p.quoteblock')[0]
    except IndexError:
        bot.say("XKCDB quote %snot found!" % ("#%s " % qid) if qid else "")
        return
    header = quoteblock.cssselect('span.quotehead')[0]
    quote = quoteblock.cssselect('span.quote')[0]
    for br in quote.xpath('*//br'):
        br.tail = '\n' + br.tail if br.tail else '\n'
    lines = quote.text_content().split('\n')
    qid = int(header.cssselect('.idlink')[0].text_content()[1:])
    ratings = re.search('\(\+(?P<up>\d+)/\-(?P<down>\d+)\)', header.text_content())
    up = formatting.color('+%s' % ratings.group('up'), 'green')
    down = formatting.color('-%s' % ratings.group('down'), 'red')
    url = 'http://www.xkcdb.com/%s' % qid

    bot.say("XKCDB quote #%s (%s/%s) - %s" % (qid, up, down, url))
    if len(lines) <= 6:
        for line in lines:
            bot.say(line)
    else:
        for line in lines[:3]:
            bot.say(line)
        bot.say("[Quote truncated. Visit %s to read the rest.]" % url)
Example #4
def CacheItems():
    items_cache = open(CACHEFile, 'w')
    try:
        MainMenuLinks = []
        #if not os.path.exists(imagesDir):
            #os.makedirs(imagesDir)
        page = urllib2.urlopen(site_url + '/catalog')
        tree = html.parse(page)
        root = tree.getroot()
        catalog_elem = root.get_element_by_id('catalog')
        #catalog_elem = catalog_elem.find_class('inner').pop()
        if catalog_elem is not None:
            MainMenuItems = catalog_elem.find_class('menu').pop()
            #print MainMenuItems
            # parse the categories
            if (MainMenuItems is not None) and (MainMenuItems.tag == 'ul'):
                for MainMenuItem in MainMenuItems:
                    for link in MainMenuItem.iterlinks():
                        if 'catalog' in link[2]:
                            MainMenuLinks.append(link[2])
                            print 'MainMenu link:' + link[2]

        # walk through all the catalog pages
        for MainMenuLink in MainMenuLinks:
            page_num = 1
            ItemsEnded = False
            First_stored = False
            while not ItemsEnded:
                try:
                    print 'Opening: ' + site_url + MainMenuLink + '?PAGEN_1={0}'.format(page_num)
                    page = urllib2.urlopen(site_url + MainMenuLink + '?PAGEN_1={0}'.format(page_num), timeout=10000)
                    tree = html.parse(page)
                    root = tree.getroot()
                    if not root.find_class('errortext'):
                        lst = root.find_class('product-list').pop()
                        ItemsEnded = True
                        for link in lst.iterlinks():
                            if re.search('^/catalog/[A-Za-z_0-9]+/[A-Za-z_0-9]+/$', link[2]):
                                if not First_stored:
                                    First_item = link[2]
                                    First_stored = True
                                if (page_num != 1) and (First_item == link[2]):
                                    ItemsEnded = True
                                    break
                                else:
                                    ItemsEnded = False
                                print 'Cached:' + link[2]
                                items_cache.write(link[2] + '\n')
                    else:
                        ItemsEnded = True
                except:
                    print site_url + MainMenuLink + '?PAGEN_1={0}'.format(page_num) + ' is broken!!!'
                    page_num += 1
                    continue

                page_num += 1
        items_cache.close()
    except:
        items_cache.close()
        raise
Example #5
	def fillemission(self,query=""):
		emissions=[]
		html_parser = etree.HTMLParser(encoding='utf-8', recover=True,strip_cdata=True)
		page= html.parse(self.url)
	
		try:
			expressiontitle = GenericTranslator().css_to_xpath(self.argtitle)
			expressionurl = GenericTranslator().css_to_xpath(self.argurl)
		except SelectorError:
			return 0
			#feedparser.error('Invalid CSS selector')
	
		for e,eid in zip(page.xpath(expressiontitle),page.xpath(expressionurl)):
			if eid.get("href"):
				try:
					if self.name=="France culture":
						foundb =re.search('/podcast/(.*)', eid.get("href")).group(1)
						pageb = html.parse("http://www.franceculture.fr/podcast/"+foundb) 
						aaa= pageb.xpath(GenericTranslator().css_to_xpath(".lien-rss"))[0]
						found = re.search("http.*rss_(.*)\.xml",aaa.get("href")).group(1)
						print found
					else:
						found =re.search('http.*rss_(.*)\.xml', eid.get("href")).group(1)
				except AttributeError:
				    found = '' 
			else:
				found=""
			etemp = emissionradiofrance(e.text,found)
			emissions.append(etemp)
		self.emissions=emissions
Example #6
	def fillemissionindb(self,query=""):
		self.cleardb()
		conn = sqlite3.connect('podcast.db')
		c = conn.cursor()
		html_parser = etree.HTMLParser(encoding='utf-8', recover=True,strip_cdata=True)
		page= html.parse(self.url)
	
		try:
			expressiontitle = GenericTranslator().css_to_xpath(self.argtitle)
			expressionurl = GenericTranslator().css_to_xpath(self.argurl)
		except SelectorError:
			return 0
			#feedparser.error('Invalid CSS selector')
	
		for e,eid in zip(page.xpath(expressiontitle),page.xpath(expressionurl)):
			if eid.get("href"):
				try:
					if self.name=="France culture":
						foundb =re.search('/podcast/(.*)', eid.get("href")).group(1)
						pageb = html.parse("http://www.franceculture.fr/podcast/"+foundb) 
						aaa= pageb.xpath(GenericTranslator().css_to_xpath(".lien-rss"))[0]
						found = re.search("http.*rss_(.*)\.xml",aaa.get("href")).group(1)
						print found
					else:
						found =re.search('http.*rss_(.*)\.xml', eid.get("href")).group(1)
				except AttributeError:
				    found = '' 
			else:
				found=""
			etemp = emissionradiofrance(e.text,found)
			qqq = "INSERT INTO emissions (station, title, podcasturl, idemission) VALUES (\""+self.name+"\",\""+etemp.name+"\",'"+etemp.podcasturl+"','"+str(etemp.idpod)+"')"
			print qqq
			c.execute(qqq)
		conn.commit()
		conn.close()
Example #7
File: Linia.py Project: d33tah/OpenMPK
	def listuj_linie(url):
		"""
		Funkcja wchodzi na podanego URL'a (w przypadku strony MPK, musi
		nast¹piæ przekierowanie, bo mamy index.jsp, a nie .html) i 
		pobiera listê linii.

		TODO: rozró¿niaæ autobusy dzienne/nocne i tramwaje? Na tej 
		podstronie jest taka mo¿liwoœæ.
		"""
		tree = html.parse(url+'/index.html')
		przekierowanie = tree.xpath('//meta [@http-equiv="refresh"]')
		if przekierowanie:
			# Take the first element of this list and grab the text
			# to the right of "URL=" in its 'content' attribute.
			nowy_url = przekierowanie[0].attrib['content'].split(
					'URL=')[-1]
			tree = html.parse(nowy_url)

		linie_tree = wybierz_ramke(tree,'rozklad',url)
		linie_td = linie_tree.xpath('//div [contains(@id,bx1)]//td \
				[@class="nagl" and not(contains( \
				.,"Aktualny"))]')
		ret = []
		
		makedir_quiet('przetworzone')
		f = open('przetworzone/lista_linii.txt','w')

		for linia in linie_td:
			link = linia.xpath('a')[0]
			#wytnij "Linia: " z linka i uznaj to za nazwê linii
			nazwa_linii = link.text_content().lstrip("Linia: ")
			url_linii = url+link.attrib['href']
			ret += [Linia(nazwa_linii,url_linii,url)]
			print(nazwa_linii,file=f)
		return ret
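The redirect handling in listuj_linie (pulling the target out of a <meta http-equiv="refresh"> tag and re-parsing) is a reusable step on its own. A minimal sketch assuming lxml; the function name is illustrative:

from lxml import html

def follow_meta_refresh(url):
    # parse the page; if it declares a meta refresh, parse the redirect target instead
    tree = html.parse(url)
    meta = tree.xpath('//meta[@http-equiv="refresh"]')
    if meta:
        target = meta[0].attrib['content'].split('URL=')[-1]
        tree = html.parse(target)
    return tree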
Example #8
def get_article(url):
    ## Section 0 - Initial set.
    blog_url = list()
    dem = list()

    ## Section 1 - Got frame src.
    dem.append(html.parse(url).getroot())
    blog_url.append("http://blog.daum.net" + dem[0][1][0].attrib["src"])
    # print "[System] Got blog-url[1] from iframe successfully. :", blog_url[0]

    ## Section 2 - Get frame src(2).
    dem.append(html.parse(blog_url[0]).getroot())
    frames = dem[1].cssselect("iframe")
    for frame in frames:
        if "if_b" in frame.get("name"):
            blog_url.append("http://blog.daum.net" + frame.get("src"))
    # print "[System] Got blog-url[2] from iframe successfully. :", blog_url[1]

    ## Section 3 - Get contents of article.
    dem.append(html.parse(blog_url[1]).getroot())
    article = dem[2].cssselect("div#contentDiv")[0]

    img_links = get_images(article)

    ## Section 4 - Return data.
    return st.strip_html(html.tostring(article, encoding="utf-8", method="html")), img_links
Example #9
	def _get_tree_from_url(self, url):
		if not url:
			return
		if url.startswith('/'):
			url = "http://www.redbus.in" + url
#		print url , " in _get_tree_from_url"
		urlstr = url.replace('/', '_')
		try:
#			print "\nIn Try (_get_tree_from_url)"
			f = open("dump/%s" % urlstr, 'r')
			doc = html.fromstring(f.read())
			tree = etree.ElementTree(doc)
			print 'Found'
		except:
#			print "\nIn Except (_get_tree_from_url)"
			print "Error:", sys.exc_info()[0]
#			print 'Downloading'
			tree = html.parse(url)
#			print "Tree :- " , tree
			if not tree:
				print "\nFalling back"
				tree = html.parse(url)
			output = open("dump/%s" % urlstr, 'w')
			output.write(html.tostring(tree))
			output.close()
		return tree
Example #10
 def get_poster(self, download=True, force=False):
     imdb_id = self.movie_imdb_id
     genre = list()
     
     if self.has_poster() and not force:
         return os.path.join(R_POSTERS_PATH, str(imdb_id)+'.jpg')
     
     second_page = parse(self.__create_request(movie_url(imdb_id)))
     if second_page:
         try:
             poster_page_url = second_page.xpath("//td[@id='img_primary']/a")[0].attrib.get('href')
         except IndexError:
             poster_page_url = None
             poster_url = None
         if poster_page_url is not None:
             poster_page = parse(self.__create_request('http://www.imdb.com'+poster_page_url))
             try:
                 poster_url = poster_page.xpath("//div[@id='photo-container']/div[@id='canvas']//img[@id='primary-img']")[0].attrib.get('src')
             except IndexError:
                 poster_url = None
         if poster_url and download:
             print poster_url
             try:
                 f = self.__create_request(poster_url)
             except:
                 pass
             else:
                 path = os.path.join(POSTERS_PATH, str(imdb_id)+'.jpg')
                 with open(path, 'w') as local:
                     local.write(f.read())
                 poster_url = os.path.join(R_POSTERS_PATH, str(imdb_id)+'.jpg')
     
     if poster_url is None:
         poster_url = UNKNOWN_POSTER_PATH
     return poster_url
Example #11
File: scraper.py Project: sseveiN/Hecho
def scrapeGit():
    lst = []
    nextPage = 'https://github.com/showcases'
    curr = 1
    last = 0
    while(curr > last):
        url = urlopen(nextPage)
        tree = parse(url)
        #Gets a list of categories on the page
        page = tree.xpath('//*[@id="site-container"]/div[2]/ul//li/a/@href')
        #Goes through each category and gets the repo titles and descriptions
        for i in range(0, 2):
            time.sleep(3)
            url = urlopen('https://github.com' + page[i])
            tree2 = parse(url)
            title = tree2.xpath('//*[@id="site-container"]/div[2]/div[2]/div/div[1]/ul[2]//li/h3/a/@href')
            des = tree2.xpath('//*[@id="site-container"]/div[2]/div[2]/div/div[1]/ul[2]//li/p/text()')
            for x in range(0, len(title)-1):
                newTitle = ''
                for j in reversed(title[x]):
                    if j == '/':
                        break
                    else:
                        newTitle = j + newTitle
                newDes = des[x].strip().replace('\n', '')
                " ".join(newDes.split())
                link = 'github.com'+ title[x]
                source = 'GIT'
                newProj = project(newTitle, newDes, link, "", source)
                lst += [newProj]
        nextPage = tree.xpath('//*[@id="site-container"]/div[2]/div[3]/div/a[last()]/@href')[0]
        last += 1
        curr = int(nextPage[-1])

    return lst
Example #12
File: crawler.py Project: blackcan/Ya-hook
def fetch_ipeen_info(url):
    root = parse(_IPEEN_BASE_URL + url).getroot()

    # get basic information
    info_rows = root.xpath('//table[@class="binfo"]/tr/td/div')
    basic_info_list = [_remove_space(row.text_content()) for row in info_rows]

    # get comments
    comment_links = root.xpath('//h2[@class="absTitle"]/a')
    comment_list = _extract_links(comment_links)

    # get more shops
    path = './/div[@class="name"]/a'
    shop_blocks = root.xpath('//div[@class="sblock rec"]')
    shop_list = {
        'rel': _extract_links(shop_blocks[0].iterfind(path)),
        'near': _extract_links(shop_blocks[1].iterfind(path))
    }

    # get photos
    url = url.replace('/shop/','/shop/photos/')
    root = parse(_IPEEN_BASE_URL + url).getroot()
    photo_imgs = root.xpath('//a[@rel="shop_photos_share"]/img')
    photo_list = ['http:' + img.get('src') for img in photo_imgs]

    # wrap infomation
    info = {
        'basic_info': basic_info_list,
        'comments': comment_list,
        'more_shop': shop_list,
        'photos': photo_list
    }

    return info
Example #13
def get_apocopes(list_urls):
    apo_urls = []
    for list_url in list_urls:
        for node in parse(list_url).findall('.//div[@class="mw-category"]//li/a[@href]'):
            apo_urls.append((node.text, 'http://fr.wiktionary.org' + node.attrib['href']))
    
    with codecs.open('wiki.log', 'w', 'utf-8') as log:
        apos = {}
        for short, url in sorted(apo_urls):
            short = short.lower()
            if short not in apos:
                apos[short] = []
            fulls = apos[short]
            for node in parse(url).findall('.//dl/dd'): #/i/a[@href]
                text = etree.tostring(node, encoding = 'unicode', method = "text").lower().replace('\n', '')
                fulls_sub = []
                for match in extractor.findall(text):
                    for full in match:
                        full = cleaner.sub('\\1', full)
                        if not full:
                            continue
                        fulls_sub.append(full)
                log.write(delim.join([short, str(fulls_sub), text]) + newline)
                if not fulls_sub:
                    print short, '=>', text
                    continue
                for full in fulls_sub:
                    if full not in fulls:
                        fulls.append(full)
    return apos
Example #14
    def replace_terms(html):
        html = force_text(html)
        remove_body = False
        remove_p = False
        etree = parse(StringIO(html))
        root_node = etree.getroot()
        if not _looks_like_full_html_unicode(html):
            root_node = root_node.getchildren()[0]
            remove_body = True
            if root_node.getchildren()[0].tag == 'p' and html[:3] != '<p>':
                remove_p = True

        variants_dict = Term.objects.variants_dict()
        replace_dict = Term.objects.replace_dict()
        replace_regexp = Term.objects.replace_regexp()
        replace_regexp__sub = replace_regexp.sub
        translate = get_translate_function(replace_dict, variants_dict)

        for node in get_interesting_contents(root_node, replace_regexp):
            new_content = replace_regexp__sub(
                translate, tostring(node, encoding='unicode'))
            new_node = parse(StringIO(new_content)).getroot().getchildren()[0]
            if node.tag != 'body':
                new_node = new_node.getchildren()[0]
            node.getparent().replace(node, new_node)

        if remove_body:
            if remove_p:
                root_node = root_node.getchildren()[0]
            out = root_node.text or ''
            out += ''.join([tostring(node, encoding='unicode')
                            for node in root_node.getchildren()])
            return out
        return tostring(etree, encoding='unicode')
Example #15
def fetch_or_load(spec_path):
    """
    Fetch a new specification or use the cache if it's current.

    :argument spec_path: the path to a cached specification

    """

    headers = {}

    try:
        modified = datetime.utcfromtimestamp(os.path.getmtime(spec_path))
        date = modified.strftime("%a, %d %b %Y %I:%M:%S UTC")
        headers["If-Modified-Since"] = date
    except OSError as error:
        if error.errno != errno.ENOENT:
            raise

    request = urllib.Request(VALIDATION_SPEC, headers=headers)
    response = urllib.urlopen(request)

    if response.code == 200:
        with open(spec_path, "w+b") as spec:
            spec.writelines(response)
            spec.seek(0)
            return html.parse(spec)

    with open(spec_path) as spec:
        return html.parse(spec)
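A possible usage sketch for fetch_or_load, assuming the surrounding module defines VALIDATION_SPEC and the imports used above; the cache file name and query are illustrative:

spec = fetch_or_load("validation_spec.html")
# the result is an lxml tree that can be queried as usual
section_titles = [h.text_content() for h in spec.findall(".//h2")]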
Example #16
	def get_poster(self):
		movie_title = self.movie_title
		page_search_list = parse(urllib2.urlopen('http://movieposterdb.com/browse/search?'+urllib.urlencode({'type':'movies', 'query': movie_title})))
		page_movie_gallery = page_search_list.xpath('//tr/td/b/a')[0].attrib.get('href')
		movie_poster_gallery = parse(urllib2.urlopen(page_movie_gallery))
		movie_poster = movie_poster_gallery.xpath('//tr/td/div/a/img')[0].attrib.get('src')
		return dict(poster=movie_poster)
Example #17
def get_index():
    """ Traverse the search results of an empty query for projects in 
    the CORDIS database. """

    # fetch an initial page:
    doc = html.parse(INITIAL_URL)
    # infinite loop isn't nice, but we'll break when no 'next' link is
    # available.
    while True:
        # iterate over the links for all projects on this page
        for project_link in doc.findall('//div[@id="PResults"]//a'):

            # join up URLs to generate the proper path
            href = project_link.get('href').replace('..', '')
            yield urljoin(INITIAL_URL, href)

        next_url = None

        # look at all links in the navigation section of the listing
        for nav in doc.findall('//p[@class="PNav"]/a'):

            # if the link is a 'next' link, follow it
            if 'Next' in nav.text:
                href = nav.get('href').replace('..','')
                next_url = urljoin(INITIAL_URL, href)

                # replace the document to traverse the next page in
                # the following iteration
                doc = html.parse(next_url)

        # no next link was found, so cancel
        if not next_url:
            break
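Because get_index is a generator that yields one project URL at a time, it can be consumed lazily. A usage sketch assuming INITIAL_URL and the imports above; islice is only used here to cap the crawl for illustration:

from itertools import islice

for project_url in islice(get_index(), 10):
    print(project_url)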
Example #18
def identify_and_get_right_url(url):
    tree = html.parse(url).getroot()
    is_problem_statement_string = tree.xpath('/html/body/table/tr/td[3]/table[1]/tr/td[3]/span/text()')[0].strip(' \t\n\r')

    # check if its a Problem statement page that is passed
    if re.search(r'Problem Statement', is_problem_statement_string):
        problem_detail_url = tree.xpath('/html/body/table/tr/td[3]/table[2]/tr[1]/td/table/tr[10]/td/a/@href')[0].strip(' \t\n\r')
        url = 'http://community.topcoder.com' + problem_detail_url
        print 'Given url is a problem statement url, trying to get a problem detailed url out of it'

        if check_is_url(url):
            print 'Extracted problem detailed page url = ', url
            return url
        else:
            print "ERROR: couldn't find problem detailed page url. Exiting!"
            sys.exit(1)

    # check if its a Problem detail page url
    tree = html.parse(url).getroot()
    is_problem_detail_string = tree.xpath('/html/body/table/tr/td[3]/table/tr/td[3]/span/text()')[0].strip(' \t\n\r')
    if re.search(r'Problem Detail', is_problem_detail_string):
        print 'Given url is a problem detail url'
        return url

    print "ERROR: Doesn't look like a topcoder url"
    sys.exit(1)
Example #19
File: eatiht.py Project: voidfiles/eatiht
def get_sentence_xpath_tuples(url, xpath_to_text=TEXT_FINDER_XPATH):
    """
    Given a url and xpath, this function will download, parse, then
    iterate though queried text-nodes. From the resulting text-nodes,
    extract a list of (text, exact-xpath) tuples.
    """
    try:
        parsed_html = html.parse(url)

    except IOError as e:
        # use requests as a workaround for problems in some
        # sites requiring cookies like nytimes.com
        # http://stackoverflow.com/questions/15148376/urllib2-returning-no-html
        page = requests.get(url)

        # http://lxml.de/parsing.html
        parsed_html = html.parse(BytesIO(page.content), html.HTMLParser())

    xpath_finder = parsed_html.getroot().getroottree().getpath

    nodes_with_text = parsed_html.xpath(xpath_to_text)

    sent_xpath_pairs = [
        ('\n\n' + s, xpath_finder(n)) if e == 0     # hard-code paragraph breaks (there has to be a better way)
        else (s, xpath_finder(n))
        for n in nodes_with_text
        for e, s in enumerate(sentence_token_pattern.split(bracket_pattern.sub('', ''.join(n.xpath('.//text()')))))
        if s.endswith(tuple(sentence_ending))
        ]

    return sent_xpath_pairs
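A possible usage sketch for get_sentence_xpath_tuples; the article URL is illustrative and TEXT_FINDER_XPATH is assumed to be defined by the module as above:

pairs = get_sentence_xpath_tuples('http://example.com/article.html')
for sentence, xpath in pairs[:5]:
    print(sentence)
    print(xpath)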
Example #20
File: imdb.py Project: incuna/ircbot
def imdb(ircbot, input):
    origterm = input.groups()[1]
    if not origterm:
        return ircbot.say('Perhaps you meant ".wik Zen"?')
    origterm = origterm.encode('utf-8')

    doc = parse("http://m.imdb.com/find?q=" + urllib.quote(origterm))
    try:
        first_result = doc.xpath("/html/body/section/div/div/div")[0]
        movie_name = first_result.text_content().strip()
        movie_url = first_result.xpath("a")[0].get("href")
    except:
        return ircbot.say("No result")

    re_uri = re.compile("\/title\/tt[0-9]*\/")

    if re_uri.match(movie_url):
        doc = parse("http://m.imdb.com" + movie_url).getroot()

        details = doc.cssselect("section.details")[0]

        for i in details.xpath('div/h1'):
            if i.text == "Genre":
                genre = i.getnext().text

        try:
            rating = doc.xpath("/html/body/section/a/p/strong")[0].text  # Unreleased movies have no rating
        except:
            rating = ""
    else:
        return ircbot.say("No result")
    return ircbot.say(movie_name + " - " + genre + " - " + rating + "/10 - http://imdb.com" + movie_url)

    ircbot.say(movie_name + " " + movie_url)
Example #21
def get_sentence_xpath_tuples(url, xpath_to_text = TEXT_FINDER_XPATH):
    """
    Given a url and xpath, this function will download, parse, then iterate though
    queried text-nodes. From the resulting text-nodes, extract a list of (text, exact-xpath) tuples.
    """
    try:
        parsed_html = html.parse(url)

    except IOError as e:
        # workaround for problems in some sites requiring cookies
        # like nytimes.com
        # http://stackoverflow.com/questions/15148376/urllib2-returning-no-html
        import requests

        page = requests.get(url)

        try:
            from cStringIO import StringIO as BytesIO
        except ImportError:
            from io import BytesIO

        # http://lxml.de/parsing.html
        parsed_html = html.parse( BytesIO(page.content), html.HTMLParser() )

    xpath_finder = parsed_html.getroot().getroottree().getpath

    nodes_with_text = parsed_html.xpath(xpath_to_text)

    sent_xpath_pairs = [(s, xpath_finder(n))
        for n in nodes_with_text
        for s in sentence_token_pattern_C.split( bracket_pattern.sub( '', ''.join( n.xpath( './/text()') ) ) )
        if s.endswith('.')]

    return sent_xpath_pairs
Example #22
    def parse(self, url):
        page = html.parse(url)
        dates = page.xpath(self.regex_date)
        exhibitions = page.xpath(self.regex_event)

        for date in dates:
            dprint("Date: %s" % date.text)

        events_list = []
        for exhibition in exhibitions:

            dprint("=" * 20)
            dprint("Exhibition name: %s" % exhibition.text)
            dprint("Additional info: %s" % exhibition.attrib['href'])
            url_description = exhibition.attrib['href']
            page_additional_info = html.parse(url_description)

            event_description = "Пусто"
            try:
                event_description = page_additional_info.xpath(self.regex_event_description).pop().text
            except Exception:
                pass

            dprint("Description: %s" % event_description)

            event_address = page_additional_info.xpath(self.regex_address).pop().text
            event_date = page_additional_info.xpath(self.regex_date).pop().text

            dprint("Address: %s" % event_address)
            dprint("Date: %s" % event_date)

            event = Event(description=event_description, address=event_address, date=event_date)
            events_list.append(event)

        return events_list
Example #23
    def _get(self):
        h = html.parse(self.URL).getroot()
        h.make_links_absolute(self.URL)
        urls = set(
            re.sub('-(\d)\.', '-0\\1.', e.get('href'))
            for e in h.cssselect('.page')
        )

        for url in urls:
            h = html.parse(url).getroot()
            h.make_links_absolute(self.URL)

            h.cssselect('#advcenter')[0].getparent().drop_tree()
            entries = h.cssselect('#proxylist tr:nth-child(n+2)')
            data_url = h.cssselect('#ipportonly > a')[0].get('href')

            h = html.parse(data_url).getroot()
            data = h.cssselect('#content pre')[0].text

            for i, line in enumerate(data.splitlines()):
                ip, port = line.split(':')

                yield Proxy(
                    ip, port,
                    country=entries[i][3].text, anonlevel=entries[i][1].text,
                    source=self.__class__.__name__
                )
Example #24
def scraper(url, package, tmp):
    """find and validate source tar.gz with md5 and pgp signatures
    searches for links to the 'package' on the 'url', downloads the .tar.gz, .tar.gz.md5, and .tar.gz.asc
    uses the .tar.gz.md5 and the .tar.gz.asc to validate the .tar.gz
    returns the path to the .tar.gz file inside of 'tmp'
    """
    # print "%s %s" % ( url, package )
    doc = parse(urlopen(url)).getroot()
    doc.make_links_absolute(url)
    links = doc.xpath("//a[contains(@href,'%s')]/@href" % package, )
    download_url = [i for i in links if i.endswith('.tar.gz')][0]
    # sometimes the download link does not let you download
    if download_url.startswith('http://www.apache.org/dyn/closer.cgi'):
        doc2 = parse(urlopen(download_url)).getroot()
        download_url = doc2.xpath("//a[contains(@href,'%s')][1]/@href" % package, )[0]
    # pp(download_url)
    archive = downloadChunks( download_url , tmp)
    md5_file = downloadChunks( [i for i in links if i.endswith('tar.gz.md5')][0], tmp)
    checksum = md5sum(archive)
    # make sure the checksum is correct
    print checksum
    assert(checksum in open(md5_file).read())
    pgp_file = downloadChunks( [i for i in links if i.endswith('tar.gz.asc')][0], tmp)
    subprocess.check_call(["gpg", "--verify", pgp_file, archive ])
    return archive
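downloadChunks and md5sum are helpers that are not shown in this example. A minimal sketch of what an md5sum helper could look like, using hashlib; this is an assumption about its behaviour (hex digest of a file on disk), not the original implementation:

import hashlib

def md5sum(path, chunk_size=8192):
    # stream the file so large archives do not need to fit in memory
    digest = hashlib.md5()
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()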
Example #25
def scrape_thread(thread):
    print base+"/community/pr.aspx"+thread.attrib["href"][23:]
    title = thread.text
    qid = re.findall('\d*$', thread.attrib['href'])[0]
    t = html.parse(base+"/community/pr.aspx"+thread.attrib["href"][23:])
    for br in t.xpath("*//br"):
        br.tail = "\n" + br.tail if br.tail else "\n"
    no_signatures = re.sub('<hr.*?/td>', "", etree.tostring(t), flags=re.DOTALL)
    meta = t.xpath('//td[@class="printHead"]')
    posters = set()
    post_content = html.parse(StringIO(no_signatures)).xpath('//td[@class="printBody"]')[1:]
    for i, post in enumerate(zip(meta, post_content)):
        inferred_replies = set()
        local_id = i - 1
        reply_to = qid + "_top" if local_id >= 0 else " "
        poster = post[0].xpath('b')[0].text
        date = post[0].xpath('b')[0].tail[3:]
        content = post[1].text_content()
        unique_id = qid + "_top" if local_id < 0 else qid + "_" + str(local_id)
        for p in posters:
            if p in content:
                inferred_replies.add(p)
        row = [unique_id, qid, local_id, title, poster, date, reply_to, content, ' | '.join(inferred_replies), subforum]
        w.writerow(row)
        f.flush()
        posters.add(poster)
Example #26
 def find_path():
     html_file = "ETOZGianfranco.html"
     input_string = "Gianfranco Frattini"
     elem_tree = lh.parse(html_file)
     xpath = "//*[contains(normalize-space(.), '{0}') and not(.//*[contains(normalize-space(.), '{0}')])]/*"
     node = elem_tree.xpath(xpath.format(input_string))[0]
     path = elem_tree.getpath(node)
     #Use parent path
     path = path[:path.rfind('/')]
     #Use template
     result = elem_tree.xpath(path)[0]
     result_html = tostring(result)
     result_class = result.attrib.get('class')
     print '{0} -> {1}'.format(input_string, elem_tree.getpath(node))
     #Use template
     html_file2 = "ETOZCocktail.html"
     elem_tree2 = lh.parse(html_file2)
     result2 = elem_tree2.xpath(path)[0]
     result2_html = tostring(result2)
     #Create dao
     dao = Dao()
     #Update template
     dao.update_path(19, path)
     #Insert record
     dao.insert_record('TestUrl', result_html, 19)
     dao.insert_record('TestUrl', result2_html, 19)
Example #27
 def grabinfo(self, url):                
     try:
         company = html.parse(url)
     except Exception:
         print('Bad URL: ', url)
         return ['Bad URL', '---', '---', '---', '---', url]
     offices = company.xpath("//div[@class='offices']/text()")
     if len(offices) > 0 and self.city['ru'] in offices[0]:  # self.city['ru'].decode('utf-8') in Python 2.7
         offices = offices[0].strip()
     else:
         return None
     companyname = company.xpath("//h1[@class='g-h2']/text()")[0].strip()
     staff = company.xpath("//div[@class='company-info']/text()")
     if len(staff) > 0:
         staff = max([el.strip() for el in staff])
     else:
         staff = ''
     site = company.xpath("//div[@class='site']/a/@href")
     if len(site) > 0:
         site = site[0]
     else:
         site = ''
     companyoffice = html.parse(url+'offices/')
     adress = companyoffice.xpath("//a[@name='"+self.city['en']+"']/../div/div[2]/div[1]/div/div[1]/text()")
     if len(adress) > 0:
         adress = adress[0].strip()
     else:
         adress = ''
     return [companyname, staff, offices, adress, site, url]
Example #28
File: full.py Project: SkyRzn/d3_stat
def load_page(username):
	try:
		page = html.parse('html/%s.htm' % username)
	except:
		load_html(username)
		page = html.parse('html/%s.htm' % username)
	return page.getroot()
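load_page tries the locally cached html/<username>.htm first and only calls load_html (defined elsewhere in the project) to download the page when that fails. A usage sketch; the username and xpath are illustrative:

root = load_page('some_player')
headings = [h.text_content() for h in root.xpath('//h3')]
print(len(headings))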
Example #29
def main():
    url = 'http://www.onekp.com/public_data.html'
    html = 'public.html'
    handle = open('result.tsv', 'w')

    if not exists(html):
        raw = parse(url)
    else:
        raw = parse(html)
    # /html/body/table/tr/td/self|b|a.href
    all_tr = raw.xpath('//tr')
    for tr in all_tr:
        all_td = tr.findall('td')
        for td in all_td:
            # some <td> have descdents, get a.href
            for i in td.iter():
                if i.tag == 'a':
                    handle.write(i.attrib['href'])
                    break
                elif i.tag == 'td':
                    handle.write(i.text_content()+' ')
                else:
                    pass
            handle.write('\t')
        handle.write('\n')
    handle.close()
    print('Done')
Example #30
def getPages(manga, chapter):
	url = URI + '/' +  manga + '/' + chapter + '/'
	root = html.parse(url)
	soup = root.xpath('//li')
	url = URI + soup[0][0].get('href')
	root = html.parse(url)
	pages = root.xpath('//select[@class="page-select"]')
	return [page.text for page in pages[0].getchildren()]
Example #31
import urllib
from HTMLParser import HTMLParser
from lxml.html import parse

urltext = []


class myHTMLParser(HTMLParser):
    def handle_data(self, data):
        if data != '\n':
            urltext.append(data)


if __name__ == '__main__':
    fileParser = myHTMLParser()
    testUrl = "http://www.shopping.com/products?KW=<keword>"
    parsedURL = parse(testUrl)
    doc = parsedURL.getroot()
    links = doc.findall('.//a')
    linksSet = []
    for entry in links:
        linksSet.append(entry.get('href'))
    for entry in linksSet:
        print entry
    #pageHandle = urllib.urlopen(testUrl).read()
    #print pageHandle+"\n\n"
    #fileParser.feed(pageHandle)
    #fileParser.close()
    #print urltext
Example #32
def get_lxml_elements(url, element):
    _skip_if_no('lxml')
    from lxml.html import parse
    doc = parse(url)
    return doc.xpath('.//{0}'.format(element))
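A possible usage sketch for get_lxml_elements; the URL is illustrative and _skip_if_no comes from the surrounding pandas test module:

tables = get_lxml_elements('http://example.com/page.html', 'table')
print(len(tables))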
Example #33
# -*- coding: UTF-8 -*-

from lxml import html
import os

import sys

reload(sys)
sys.setdefaultencoding('utf-8')
seed_url = u"http://www.kekenet.com/read/essay/ats/"
x = html.parse(seed_url)
spans = x.xpath("*//ul[@id='menu-list']//li/h2/a")
for span in spans[:10]:
    details_url = span.xpath("attribute::href")[0]
    xx = html.parse(details_url)
    name = 'story//' + span.text.replace(u' ', u'_')
    f = open(name, 'a')
    try:
        contents = xx.xpath("//div[@id='article']//p/text()")
        for content in contents:
            if len(str(content)) > 1:
                f.write(content.encode('utf-8') + '\n')
    except Exception, e:
        print "wrong!!!!", e
        f.close()
        os.remove(name)
    else:
        f.close()
Example #34
import datetime
import sys

links = []


class MyHTMLParser(HTMLParser):
    def handle_starttag(self, tag, attributes):
        if tag == 'a':
            for name, value in attributes:
                if name == 'routerlink':
                    links.append(value)


parser = MyHTMLParser()
tree = html.parse('src/app/app.component.html')
parser.feed(html.tostring(tree).decode("utf-8"))
parser.close()

# prune /home since it's a duplicate of base url
links.remove('/home')
base_url = 'https://www.egill.rocks'
missing_links = links.copy()

tree = ET.parse('deploy/sitemap.xml')
root = tree.getroot()
url_el = list(root)
for el in url_el:
    for link in links:
        # assume sub-element 'loc' is the first one in the order
        if el[0].text == base_url + link:
Example #35
def html_parse(site_string):

    # tmp = lhtml.parse(url_string)
    site = lhtml.parse(StringIO(site_string))
    cleaner = Cleaner(style=True,
                      links=True,
                      add_nofollow=True,
                      page_structure=False,
                      safe_attrs_only=False)
    html = cleaner.clean_html(site)
    body = html.getroot().cssselect('body')[0]

    for ele in body.cssselect('.header'):
        ele.drop_tree()
    for ele in body.cssselect('#header'):
        ele.drop_tree()
    for ele in body.cssselect(".ui-toolkit"):
        ele.drop_tree()
    for ele in body.cssselect('#footer'):
        ele.drop_tree()
    for ele in body.cssselect('nav'):
        ele.drop_tree()

    #goSquared
    for ele in body.cssselect('.navOffset'):
        ele.drop_tree()
    #exoscale
    for ele in body.cssselect('hgroup'):
        ele.drop_tree()
    #vircurex
    for ele in body.cssselect('.banner'):
        ele.drop_tree()
    #tyntec
    for ele in body.cssselect('.bar'):
        ele.drop_tree()
    #1linx
    for ele in body.cssselect('section'):
        ele.drop_tag()
    #one signal
    for ele in body.cssselect('#hub-header'):
        ele.drop_tree()
    for ele in body.cssselect('header'):
        ele.drop_tag()
    #clever tap
    for ele in body.cssselect('.doc-article__breadcrumb'):
        ele.drop_tree()

    for ele in body.iter():
        if 'div' == ele.tag:
            ele.drop_tag()
    if len(body.cssselect('h1')) > 0:
        for ele in body.cssselect('h1'):
            body = ele.getparent()
            break
    elif len(body.cssselect('h2')) > 0:
        for ele in body.cssselect('h2'):
            body = ele.getparent()
            break
    elif len(body.cssselect('h3')) > 0:
        for ele in body.cssselect('h3'):
            body = ele.getparent()
            break
    elif len(body.cssselect('h4')) > 0:
        for ele in body.cssselect('h4'):
            body = ele.getparent()
            break
    elif len(body.cssselect('h5')) > 0:
        for ele in body.cssselect('h5'):
            body = ele.getparent()
            break
    elif len(body.cssselect('h6')) > 0:
        for ele in body.cssselect('h6'):
            body = ele.getparent()
            break
    fo = open("what.txt", "w+")
    fo.write(lhtml.tostring(body))
    return body
Example #36
    def run(self,
            content,
            no_lectures=False,
            no_exercises=False,
            class_code=None):
        doc = parse(StringIO(content)).getroot()
        subject = Subject.objects.get(abbr=self.parse_subject(doc))

        year, is_winter = self.parse_semester(doc)
        semester = Semester.objects.get(year=year, winter=is_winter)

        classes = list(
            map(str.strip,
                doc.xpath('//tr[@class="rowClass1"]/th/div/span[1]/text()')))
        labels = list(doc.xpath('//tr[@class="rowClass1"]/th/div/@title'))

        default_classes = []
        for code in class_code or []:
            try:
                default_classes.append(
                    Class.objects.get(semester__year=year,
                                      semester__winter=is_winter,
                                      code=code,
                                      subject__abbr=opts['subject']))
            except Class.DoesNotExist:
                raise ImportException(
                    f"Class with code {code} does not exist.")

        class_in_db = {}
        for c, label in zip(classes, labels):
            if not self.is_allowed(c, no_lectures, no_exercises):
                continue
            try:
                class_in_db[c] = Class.objects.get(code=c,
                                                   semester=semester,
                                                   subject=subject)
            except Class.DoesNotExist:
                s = label.split(' ')

                class_in_db[c] = Class()
                class_in_db[c].code = c
                class_in_db[c].day = s[6].upper()
                class_in_db[c].hour = s[7]
                class_in_db[c].year = datetime.datetime.now().year
                class_in_db[c].winter = datetime.datetime.now().month >= 9
                class_in_db[c].time = s[7]
                class_in_db[c].subject = subject
                class_in_db[c].semester = semester

                first_name, last_name = label.replace(',', '').replace(
                    'Ph.D.',
                    '').replace('Bc', '').replace('DiS',
                                                  '').strip().split(' ')[-2:]
                if first_name and last_name:
                    teacher = User.objects.filter(first_name=first_name,
                                                  last_name=last_name)
                    if not teacher:
                        raise ImportException(
                            f"Teacher '{first_name}' '{last_name}' not found")
                    class_in_db[c].teacher = teacher[0]

                class_in_db[c].save()

        for row in doc.xpath('//table[@class="dataTable"]//tr')[1:]:

            def clean_name(s):
                for remove in [
                        'Ing', 'Bc', 'BA', 'MBA', 'Mgr', 'MgrA', '.', ','
                ]:
                    s = s.replace(remove, '')

                return ' '.join(s.split()).strip()

            login = row.xpath('./td[2]/a/text()')[0].strip()
            email = row.xpath('./td[2]/a/@href')[0].replace('mailto:',
                                                            '').strip()
            name = clean_name(row.xpath('./td[3]/a/text()')[0])
            lastname, firstname = name.strip().split(' ', 1)

            member_of = []
            created = False

            user = None
            try:
                user = User.objects.get(username=login)
            except User.DoesNotExist:
                user = User.objects.create_user(login.upper(), email)
                user.first_name = firstname
                user.last_name = lastname
                user.save()
                created = True

            for i, el in enumerate(row.xpath('.//input')):
                clazz = classes[i]
                if "checked" in el.attrib:
                    if not self.is_allowed(clazz, no_lectures, no_exercises):
                        continue

                    if user not in class_in_db[clazz].students.all():
                        member_of.append(clazz)
                        class_in_db[clazz].students.add(user)
                elif clazz in class_in_db:
                    class_in_db[clazz].students.remove(user)

            for clazz in default_classes:
                if user not in clazz.students.all():
                    member_of.append(clazz.code)
                    clazz.students.add(user)

            classess = []
            for c in Class.objects.filter(students__username=login,
                                          semester__year=year,
                                          semester__winter=is_winter,
                                          subject_id=subject.id):
                classess.append(f"{c.timeslot} {c.teacher.username}")

            yield {
                'login': login,
                'firstname': firstname,
                'lastname': lastname,
                'created': created,
                'classes': classess,
            }
Example #37
 def test_torrent_rows(self):
     request = urlopen(str(self.torrents.url))
     document = html.parse(request)
     rows = self.torrents._get_torrent_rows(document.getroot())
     self.assertEqual(len(rows), 30)
Example #38
                text = element.text_content().strip()
            if (element.tag == 'br') and skip_state == 1:
                if element.tail is not None:
                    text = element.tail.strip()
            if text != '':
                full_text.append(text)
    return {
        'title': '%s_%s_%s' % (category, collect, title),
        'text': full_text
    }

url = 'http://www.zwbk.org/MyLemmaShow.aspx?lid=76385'
connect = urlopen(url)

content = connect.read()
page = html.parse(StringIO(content.decode('utf-8')))
table = page.xpath('//table/tr/td[2]/div/div[7]')

collect_list = []
for links in table[0].find_class('classic'):
    title = links.text_content().split(u'·')
    if len(title) > 3:
        page_url = links.attrib.get('href')
        collect_list.append({
            'category': title[1],
            'collect': title[2],
            'title': title[3],
            'page_url': page_url
        })

result = map(lambda x: get_fulltext(**x), collect_list)
Example #39
fpi = "https://www.fpi.nsdl.co.in/web/Reports/Latest.aspx"
header = {
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    'Host': 'nseindia.com',
    'Referer':
    'https://nseindia.com/live_market/dynaContent/live_watch/live_index_watch.htm',
    'User-Agent':
    'Mozilla/5.0 (Windows NT 6.1;WOW64;rv:28.0) Gecko Firefox/45',
    'X-Requested-With': 'XMLHttpRequest'
}

req = urllib2.Request(fpi, headers=header)
page = urllib2.urlopen(req)
parsed = parse(page)
soup = BeautifulSoup(page)
page.status_code == 200

doc = parsed.getroot()
tables = doc.findall('.//table')
table = parse_options_data(tables[0])

xfc = requests.get(fpi, headers=header)
xfc.status_code == 200
xpage = urllib2.urlopen(xfc.content)
xparse = parse(xfc.content)

xtsf = soup(xfc.content)
tsd = table_to_2d(xtsf)
xtsf.find_all("row")
Example #40
# -*- coding: UTF-8 -*-
import lxml.html as html
from lxml.etree import Element, SubElement, ElementTree
from lxml import etree
tag=[]
url_list=[]
an_news_list=[]
newsTags_list = []
news_title_list = []
for i in range(1,10):
    root = html.parse('http://www.kinopoisk.ru/news/perpage/200/page/{0}/'.format(i)).getroot()
    tag.extend(root.find_class('item'))
for i in tag:
    for j in i.iterlinks():
        if j[2] == '/name/7418/' or 'id_actor=7418' in j[2]:
            for y in i.find_class('title').pop().iterlinks():
                if 'news' in y[2]:
                    url_list.append('http://www.kinopoisk.ru'+y[2])
            an_news_list.append(i.find_class('descr').pop().text_content())

for ind, url in enumerate(url_list):
    page1 = html.parse(url)
    root1 = page1.getroot()
    tag1 = root1.find_class('newsHeaderTitle').pop()
    news_title = tag1.text_content().strip()
    news_title_list.append(news_title)
    tag2 = root1.find_class('newsTags').pop()
    newsTags = tag2.text_content().split()
    for i in range(len(newsTags)-1):
        if u'премьер' in newsTags[i]:
            newsTags[i] = newsTags[i] + " " + newsTags.pop(i+1)
Example #41
# -*- coding: utf-8 -*-
import lxml.html as html
tag=[]
url_list=[]
title_list=[]
for i in range(1,493):
    root = html.parse('http://kinogo.co/page/{0}/'.format(i)).getroot()
    tag.extend(root.find_class('shortstory'))
for i in tag:
    for j in i.iterlinks():
        if 'indijskie_filmy' in j[2]:
            for y in i.find_class('zagolovki').pop().iterlinks():
                if '2010' in y[2]:
                    url_list.append(y[2])
                    title_list.append(i.find_class('zagolovki').pop().text_content())
for i in range(len(url_list)):
    print url_list[i]
    print title_list[i]
Example #42
csvr = csv.reader(f)
csvw = csv.writer(fo)
url_tpl = 'http://www.expansion.com/mercados/bolsa/dividendos/{suffix}'

p = re.compile('(\d*\.?\d+,\d+)')
got_to_sps = False
for r in csvr:
    if r[0] == 'SPS': got_to_sps = True
    if not got_to_sps: continue
    url = url_tpl.format(suffix=r[1])
    print('processing %s' % url)
    try:
        page = urlopen(url)
    except HTTPError:
        continue
    root = html.parse(page)
    ttrr = root.findall('.//div[@id="dividendos_doble_izquierda"]//tr')
    if ttrr and len(ttrr) > 1:
        for tr in ttrr[1:]:
            ttdd = tr.findall('.//td')
            d = ttdd[0].text.replace('.', '-')
            net = p.match(ttdd[2].text).group(0).replace('.', '').replace(',', '.')
            try:
                gross = p.match(ttdd[1].text).group(0).replace('.', '').replace(',', '.')
            except AttributeError: # shit happens
                gross = float(net) * 1.3333
            csvw.writerow([r[0], d, gross, net, ttdd[3].text, ttdd[4].text])

f.close()
fo.close()
Example #43
def get_categorie_content(category_link):
    # Get the page
    allrecords = []

    parser = etree.HTMLParser(encoding='utf-8')
    data = etree.parse(rooturl + category_link, parser)
    # Get the category
    category = data.xpath('/html/body/div/div[5]/div/div[1]//h1/text()')[0].strip()
    # category = urllib.unquote(category).decode('utf8')
    if (verbose): print 'Category: ' + ascii_only(category)

    datasets = get_datasets(data)
    numdatasets = len(datasets)

    if (verbose): print 'There are ' + str(numdatasets) + ' datasets'

    # Now get the html for each one. This is painful.
    # The bit of html concerning the datasets:
    corehtml = data.xpath('//div[@id=\'ContentBlock\']')[0]
    # First try to split by the horizontal rules. This usually works, but not always
    datasetparts = etree.tostring(corehtml).split('<hr id="hr')
    if (verbose): print 'Found ' + str(len(datasetparts)) + ' datasets by splitting by hr elements with ids'
    if len(datasetparts) != numdatasets:
        if (verbose): print 'This doesn\'t match. Trying with links to TOC'
        # If there is TOC, this works. There isn\'t always one.
        datasetparts = etree.tostring(corehtml).split('nach oben')
        del datasetparts[len(datasetparts) - 1]
        for index in range(0, len(datasetparts)):
            datasetparts[index] = datasetparts[index] + '</a>'
        if (verbose): print 'Found ' + str(len(datasetparts)) + ' datasets by splitting by links to TOC'
        if len(datasetparts) != numdatasets:
            if (verbose): print 'Well, that didn\'t work either. Giving up'
            print 'Exiting because of a serious error - turn on verbose in the code to find out what dataset is causing the problem'
            exit()
    else:
        if numdatasets > 1:
            for index in range(1, len(datasetparts)):
                # That split makes for bad HTML. Make it better.
                datasetparts[index] = '<hr id="hr' + datasetparts[index]

    count = 1

    for datasetpart in datasetparts:
        data = etree.HTML(datasetpart)
        record = {}
        record['city'] = 'bochum'
        record['categories'] = []
        record['categories'].append(category)

        datasets = get_datasets(data)
        record['title'] = datasets[0]

        if (verbose): print 'Parsing dataset ' + ascii_only(record['title'])
        if 'noch im Aufbau' in record['title']:
            # Nothing to see here
            if (verbose): print 'Empty category'
            continue
        record['url'] = rooturl + category_link + '#par' + str(count)
        count += 1
        datatables, filetables = findfilesanddata(data)

        if len(datatables) == 0:
            if (verbose): print 'This record contains no data... checking for link to another page...'
            checkforsubpage = data.xpath('//span//a')

            for link in checkforsubpage:
                if (verbose): print etree.tostring(link)
                if len(link.xpath('text()')) > 0 and u'zu den Daten' in link.xpath('text()')[0]:
                    testurl = link.xpath('@href')[0]
                    if (verbose): print 'Following/updating URL: ' + rooturl + testurl
                    record['url'] = rooturl + testurl
                    datatables, filetables = findfilesanddata(html.parse(rooturl + testurl))

        # get the data on the files, and get each link in it
        record['filelist'] = []
        for table in filetables:
            record['filelist'].extend([(rooturl + x) for x in etree.HTML(table).xpath('//a/@href')])

        record['formats'] = set()
        record['spatial'] = False
        for file in record['filelist']:
            formatarray = file.split('/')[-1].split('.')
            format = 'Unknown'
            if len(formatarray)>1:
                format = formatarray[1].upper().split('?')[0]
            elif 'WMS' in formatarray[0]:
                format = 'WMS'
            elif 'WFS' in formatarray[0]:
                format = 'WFS'
            record['formats'].add(format)
            if (format.upper() in metautils.geoformats):
                record['spatial'] = True
        record['formats'] = list(record['formats'])

        if len(datatables) > 1:
            if (verbose): print 'ERROR: More than one data table'
            print 'Exiting because of a serious error - turn on verbose in the code to find out what dataset is causing the problem'
            exit()
        elif len(datatables) == 0:
            if (verbose): print 'ERROR: No data table'
            print 'Exiting because of a serious error - turn on verbose in the code to find out what dataset is causing the problem'
            exit()

        # parse the data table by row
        if (verbose): print 'Reading datatable...'
        rowelements = etree.HTML(datatables[0]).xpath('//tr')
        for row in rowelements:
            if len(row.xpath('td[1]/text()')) == 0: continue
            key = row.xpath('td[1]/text()')[0]
            if (verbose): print ascii_only(key)
            if len(row.xpath('td[2]/text()')) != 0:
                val = row.xpath('td[2]/text()')[0]
            elif len(row.xpath('td[2]//a')) != 0:
                val = row.xpath('td[2]//a/text()')[0]
            else:
                if (verbose): print 'ERROR: Missing value'
                print 'Exiting because of a serious error - turn on verbose in the code to find out what dataset is causing the problem'
                exit()
            if (verbose): print ascii_only('Parsing key ' + key.replace(':', '') + ' with value ' + val)
            if u'veröffentlicht' in key:
                record['publisher'] = val
            elif u'geändert' in key:
                record['temporalextent'] = val.split(' ')[2]
            elif u'Lizenz' in key:
                record['licenseshort'] = metautils.long_license_to_short(val)
                record['open'] = metautils.isopen(record['licenseshort'])
            elif u'Webseite' in key:
                record['website'] = row.xpath('td[2]//a/@href')[0]  # keep, as 'original' metadata
                if 'http://' not in record['website']:
                    record['website'] = rooturl + record['website']
            elif u'Kontakt' in key:
                record['contact'] = rooturl + row.xpath('td[2]//a/@href')[0]

        allrecords.append(record)
    return allrecords
Example #44
    search_tags = [tag.strip() for tag in search_tags.split(",")]
    search_tags = [tag for tag in search_tags if not (tag == "" or "*" in tag)]

    info = {
        "item_id": item_id,
        "title": title,
        "subtitle": subtitle,
        "description_parts": description_parts,
        "item_pools": item_pools,
        "item_types": item_types,
        "search_tags": search_tags
    }
    return info


platinumgod_content = html.parse('http://platinumgod.co.uk')

print("Extracting image information")
item_containers = platinumgod_content.xpath(
    '//div[contains(@class, "items-container")]')
item_infos = {}
for item_container in item_containers:
    items = item_container.xpath('.//li[contains(@class, "textbox")]')
    bar = progressbar.ProgressBar()
    for item in bar(items):
        item_info = extract_item_info(item)
        item_infos[item_info["item_id"]] = item_info

print("Extracting trinket information")
trinket_containers = platinumgod_content.xpath(
    '//div[contains(@class, "trinkets-container")]')
Example #45
if len(sys.argv) < 2:
    print "Usage"
    sys.exit(1)

genre = sys.argv[1]

datadir = 'mc-data/' + genre

csvfile = open(os.path.join(datadir, genre + '.csv'), 'w')
writer = csv.writer(csvfile)
writer.writerow(['Artist', 'Album', 'Score'])
for page in sorted(os.listdir(datadir)):
    if page.endswith('html'):
        print 'parsing page ' + page
        page_file = open(os.path.join(datadir, page), 'r')
        doc = html.parse(page_file).getroot()
        try:
            for li in doc.cssselect('li.release_product'):
                album = li.cssselect(
                    'div.product_title')[0].text_content().strip()
                score = li.cssselect(
                    'span.metascore')[0].text_content().strip()
                artist = li.cssselect('li.product_artist')[0].cssselect(
                    'span.data')[0].text_content().strip()
                print 'artist: %s, album: %s, score: %s' % (artist, album,
                                                            score)
                writer.writerow([artist, album, score])

        except Exception as e:
            print e
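
# Note (a sketch; the snippet is truncated here): the CSV handle should be closed once
# all pages have been processed so buffered rows are flushed to disk, e.g. csvfile.close().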
def parsePages():

    # get the pagetitle
    path = r'/Users/carolm/Desktop/lingrad'

    for dirpath, subdirs, files in os.walk(path):
        for x in files:
            if fnmatch.fnmatch(x, '*.html'):
                item = os.path.join(dirpath, x)
                doc = parse(item).getroot()
                print doc.text_content()
                cleaner = Cleaner(
                    style=True,
                    links=False,
                )
                cleaned = cleaner.clean_html(doc)

                titles = cleaned.find_class('Pagetitle')
                if titles:
                    # snag the page title - find_class returns a list; there's really only one
                    title = titles[0].text_content()
                else:
                    try:
                        titlesel = cleaned.xpath('//p[@class="Subhead"]')
                        title = titlesel[0].text_content()
                    except IndexError:
                        title = "no title"

                # get the description
                descrips = cleaned.find_class('Summarytext')
                if descrips:
                    descrip = descrips[0].text_content()
                else:
                    descrip = "no description"
                # get the body
                if cleaned.find_class('Summarytext'):
                    bodies = cleaned.xpath(
                        '//p[@class="Summarytext"]/following-sibling::p')
                elif cleaned.find_class('Subhead'):
                    bodies = cleaned.xpath(
                        '//p[@class="Subhead"]//following-sibling::p')
                else:
                    bodies = cleaned.xpath('*//p')

                html = "".join([
                    lxml.html.tostring(body, method='xml') for body in bodies
                ])
                html = html.replace('\n', ' ').replace('\r', ' ')
                html = html.replace('&#10;', ' ').replace('&#13;', ' ')
                html = html.replace('&#xa;', ' ').replace('&#xd;', ' ')
                html = html.replace('&#8226;', '').replace('&#160;', '')
                html = html.replace('&nbsp', '')
                html = html.replace('class="msoNormal"', '').replace('###', '')
                html = html.replace('<span> </span>', '')
                #  html = re.sub(r'<p.*?[.*?Body text:.*?].*?</p>', r'', html)
                html = re.sub(r'<p class="Bullettext">(.*?)</p>',
                              r'<li>\1</li>', html)
                html = re.sub(r'<p class="Subhead1">(.*?)</p>', r'<h3>\1</h3>',
                              html)

                newbody = html

                #Need to have temporary id
                id = str(random.randint(0, 99999999))

                target.invokeFactory("Document", id)
                obj = target[id]
                obj.setTitle(title)
                obj.setDescription(descrip)
                obj.setText(newbody)

                # Will finish Archetypes content item creation process,
                # rename-after-creation and such
                obj.processForm()
                transaction.savepoint(optimistic=True)

                # Need to perform manual normalization for id,
                # as we don't have title available during the creation time
                normalizer = getUtility(IIDNormalizer)
                new_id = normalizer.normalize(obj.Title())

                if new_id in target.objectIds():
                    raise RuntimeError("Item already exists:" + new_id +
                                       " in " + target.absolute_url())

                obj.aq_parent.manage_renameObject(id, new_id)
                transaction.commit()
                obj.reindexObject()
예제 #47
0
    def send_FIXATION(self, tokens, tokenvals, targetval, destination,
                      postvars, postvals, fixvars, fixvals, idsrc):
        """ EXPERIMENTAL: This is the fixation handler. It needs a lot of work and is very simple at the moment. It's currently
        experimental and only demonstrates a proof of concept of this type of attack. """

        # A lot of this is duplicated from the previous payload; it is kept separate
        # until it is clear how this handler will evolve.

        # Give the value to the meta refresh
        metadest = "0;" + destination

        # Make the request for the idsrc
        request = urllib2.Request(idsrc)
        opener = urllib2.build_opener()

        # Add a user agent to the request. This should eventually be user-definable.
        request.add_header(
            'User-Agent',
            'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.0.11) Gecko/2009060214 Firefox/3.0.11'
        )
        response = opener.open(request).read()

        root = html.parse(StringIO(response))

        ###########################
        # ToDo: Currently only looks for name = value situations. Needs to
        # possibly look for others too such as id
        ###########################

        # Grab the data values for fixation

        for index, value in enumerate(fixvars):
            for node in root.iter():
                if node.get('name') == value:
                    fixvals[index] = node.get('value')
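
        # A possible extension for the ToDo above (a sketch, not part of the handler's
        # current behavior): fall back to the 'id' attribute when no 'name' matches,
        # e.g. by widening the test to
        #     if node.get('name') == value or node.get('id') == value:
        # so pages that only set id= still have their tokens captured.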

        # Append the fixated values in to the POST variables and values
        for value in fixvars:
            postvars.append(value)
        for value in fixvals:
            postvals.append(value)

        header = '<meta http-equiv="refresh" content="%s" />' % metadest

        page2 = open("page2.html", "wb")

        innerpage = markup.page()

        innerpage.init()
        formsubmit = "javascript:document.myform.submit()"
        formname = "myform"
        formaction = "post"
        inputtype = "hidden"

        innerpage.body(onload=formsubmit)

        innerpage.form(name=formname, action=targetval, method=formaction)

        for index, val in enumerate(postvars):
            innerpage.input(name=val, type=inputtype, value=postvals[index])

        innerpage.form.close()

        page2.write(str(innerpage))
        page2.close()

        # Create primary page
        page = markup.page()

        page.init(header=header)

        # This is a hack for Markup.py so it will properly close the iframe tag
        ifrmtext = "this"

        ifrmsrc = "page2.html"
        page.iframe(ifrmtext, height="1", width="1", src=ifrmsrc)

        # page.form(formvals, name=formname, method=attacktype, action=targetval)

        # print(page)
        self.wfile.write(str(page))
예제 #48
0
import smtplib
import main

from os.path import basename
#from pandas import DataFrame
from time import gmtime, strftime
from lxml import html

addresses, email_list = main.ReadConfig("main.ini", "mosclinic")
main_domain_stat = addresses[0].split("/")[2]
print main_domain_stat
today = strftime("%d.%m.%Y %H:%M", gmtime())
a = []
b = []
table_data = []
for page_link in addresses:
    page = html.parse(page_link)

    #for el in page.getroot().find_class('noline'):
    for el in page.getroot().find_class('margin15 font_arial12 as_a2'):
        link = el.values()[2]
        if "medreview" in link:
            page1 = html.parse('%s' % (link))
            content = page1.getroot().find_class(
                'margin15 font_arial12')[0].text_content()
            #imgs = page1.getroot().findall(".//img[@style]")
            dates = page1.getroot().findall(".//meta[@itemprop]")
            for date in dates:
                if date.get("itemprop") == "datePublished":
                    time = date.get("content")
            content_link = content + "<br>" + link
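
# A sketch of how the collected data might be emailed (the snippet is truncated before
# that step; the smtplib import, email_list from main.ReadConfig and the timestamp in
# today suggest it, and the sender address below is hypothetical):
#
#     from email.mime.text import MIMEText
#     msg = MIMEText(content_link, 'html', 'utf-8')
#     msg['Subject'] = 'mosclinic reviews %s' % today
#     msg['From'] = 'reports@example.com'
#     msg['To'] = ', '.join(email_list)
#     smtplib.SMTP('localhost').sendmail(msg['From'], email_list, msg.as_string())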
예제 #49
0
    def on_data(self, data):
        global f
        global filecnt
        global tweetcnt
        global chkFlag

        #Checking if the file count has reached 50 (i.e 5GB)
        if (filecnt >= 50):
            print "filecnt"
            chkFlag = False
            return False

        #Checks the number of tweets
        if tweetcnt >= numTweets and numTweets != 0:
            print "first"
            chkFlag = False
            return False

        #Create a new text file every 100MB
        if (f.tell() >= 104857600):
            print "last"
            f.close()
            chkFlag = True
            filecnt += 1

            file_output_path = dirName + "/tweets_data{0}.txt".format(filecnt)
            f = open(file_output_path, 'a')

        decoded = json.loads(data)

        #Get hashtags (collect all of them, space-separated)
        hashTags = decoded['entities']['hashtags']
        hashTags = " ".join(
            unicode(htags['text']).encode("ascii", "ignore") for htags in hashTags)

        #Get tweet
        tweet = unicode(decoded['text']).encode("ascii", "ignore").replace(
            '\n', ' ').replace('\t', '').replace('\r', '')

        #Get Co-ordinates
        coord = unicode(decoded['coordinates']).encode("ascii", "ignore")

        #Get tweet time
        tweetTime = unicode(decoded['created_at'])

        #Get retweet count
        retweetCount = unicode(decoded['retweet_count']).encode(
            "ascii", "ignore")

        #Get reply count
        replyCount = unicode(decoded['reply_count']).encode("ascii", "ignore")

        #Get favorite count
        favoriteCount = unicode(decoded['favorite_count']).encode(
            "ascii", "ignore")

        #Get URLs
        urls = unicode(decoded['entities']['urls']).encode("ascii", "ignore")

        #Get title
        pageTitle = None
        expanded_url = None
        if urls != "[]":
            expanded_url = unicode(
                decoded['entities']['urls'][0]['expanded_url']).encode(
                    "ascii", "ignore")
            try:
                page = urllib2.urlopen(expanded_url)
                p = parse(page)
                pageT = p.find(".//title")
                if (pageT != None):
                    pageTitle = unicode(p.find(".//title").text).encode(
                        "ascii", "ignore")

            except urllib2.HTTPError, err:
                if err.code == 404:
                    print "Page not found!"
                elif err.code == 403:
                    print "Access denied!"
                else:
                    print "Error:", err.code
            except urllib2.URLError, err:
                print "URL error:", err.reason
예제 #50
0
    def convert(self, fn=None, dirn=None):
        global sample_name, lab, fl

        def extract_file_data(lns, dlm):
            chapters = re.split(dlm + "{3,}", lns)
            for chapter in chapters:
                chapter = chapter.strip()
                div = etree.SubElement(doc, 'block')

                paragraphs = re.split(dlm + "{1,2}", chapter)
                for i in range(len(paragraphs)):
                    pgph = etree.SubElement(div, 'block')
                    pgph.text = paragraphs[i]

        if fn is None:
            print('Type in filename')
            fn = input()
            fl = fn
            sample_name = ''
            lab = 'lab1'
        else:
            fl = Path(dirn) / fn
            sample_name = ''.join(
                filter(bool, re.split(r'/|\w(?!\w*/$)', dirn))) + '/'
            lab = re.match(r'\w+(?=/)', dirn).group(0)

        route = re.split(r'/', fn)
        xml_fn = '.xml'.join(re.split(r'\.\w+$', route[len(route) - 1]))

        doc = etree.Element('doc')
        doc.attrib['name'] = xml_fn

        if re.search(r'\.txt$', fn):
            f = open(fl, encoding='utf8')
            lines = ''.join(f.readlines())
            delim = r'\n'
            extract_file_data(lines, delim)

        elif re.search(r'\.html$', fn):
            file = codecs.open(fl, 'r')
            file_content = file.read()
            parser = html.HTMLParser()
            html_tree = html.parse(io.StringIO(file_content), parser)
            for b in html_tree.xpath('//div[p]'):
                block = etree.SubElement(doc, 'block')
                for idx, p in enumerate(html_tree.xpath('//div/p')):
                    paragraph = etree.SubElement(block, 'block')
                    p_child_text = ''
                    for el in html_tree.xpath('//div/p[' + str(idx + 1) +
                                              ']/*'):
                        p_inner = etree.SubElement(paragraph, 'block')
                        p_inner.text = escape(el.text_content())

                        p_child_text = ''.join(p_child_text.split(el.text_content())) \
                            if p_child_text \
                            else ''.join(p.text_content().split(el.text_content()))
                    paragraph.text = escape(''.join(
                        re.split(r'\n{2,}| +\n', p_child_text)))
        elif re.search(r'\.docx$', fn):
            file = docx.Document(fl)
            lines = []
            for p in file.paragraphs:
                lines.append(p.text)
            lines = '\n'.join(lines)
            delim = r'\n'
            extract_file_data(lines, delim)
        elif re.search(r'\.pdf$', fn):
            rsc_mngr = PDFResourceManager()
            fh = io.StringIO()
            converter = TextConverter(rsc_mngr, fh)
            pg_interp = PDFPageInterpreter(rsc_mngr, converter)

            fp = open(fl, 'rb')
            for pg in PDFPage.get_pages(fp,
                                        caching=True,
                                        check_extractable=True):
                pg_interp.process_page(pg)

            lines = ''.join(re.split(r'\n{2,}|\x0c', fh.getvalue()))
            converter.close()
            fh.close()

            delim = ' '
            extract_file_data(lines, delim)
        else:
            print('Incorrect filename extension!')

        tree = etree.ElementTree(doc)
        tree.write("%s/xml_samples/%s%s" % (lab, sample_name, xml_fn),
                   pretty_print=True,
                   xml_declaration=True,
                   encoding='UTF-8')
        return '%s/xml_samples/%s%s' % (lab, sample_name, xml_fn)
예제 #51
0
def get_listings():
    page = 1
    totalPages = 1
    shows = []
    while page <= totalPages:

        response = urlopen(DATA_URL % str(page))
        data = json.loads(response.read().decode())

        totalPages = data["totalPages"]
        for product in data["data"]:

            show = {}

            show["title"] = product["name"]
            show["image"] = product["imageUrl"]
            show["type"] = "movie"

            print(show["title"], product["prodUrl"])
            # get price from data url
            try:
                doc = lh.parse(urlopen(product["prodUrl"]))
                prices = doc.xpath(".//span[contains(@class, 'price')]")
                if len(prices) > 0:
                    price = prices[0].text.strip()[1:]
                    print(price)
                    show["episodes"] = [{
                        "show": product["name"],
                        "uri": product["prodUrl"],
                        "s": 0,
                        "e": 0,
                        "price": price
                    }]
                    shows.append(show)
            except:
                pass
        page = page + 1

    page = 1
    totalPages = 1
    while page <= totalPages:

        response = urlopen(DATA_URL_TV % str(page))
        data = json.loads(response.read().decode())

        totalPages = data["totalPages"]
        for product in data["data"]:

            show = {}
            id = product["id"]

            series = 0
            title = re.sub(r' (Series|Season) \d+[a-zA-Z]?\b', '',
                           product["name"].strip())

            matches = re.search(r"\d+[a-zA-Z]?\b", product["name"].strip())
            if matches:
                series = matches.group(0)

            episodes = get_episodes(id, series, product["prodUrl"])

            for x in shows:
                # merge seasons
                if x["title"] == title:
                    x["episodes"] = x["episodes"] + episodes
                    break
            else:
                # new show
                show = {}
                show["title"] = title
                show["type"] = "tv"
                show["episodes"] = episodes
                show["image"] = product["imageUrl"]
                shows.append(show)

        page = page + 1

    return shows
예제 #52
0
파일: dhs.py 프로젝트: habinez/lxml_crawler
"""download DHS Yearbook tables of Immigration Statistics 2015
"""

from lxml import html
from urllib import request
import numpy as np
import pandas as pd

if __name__ == "__main__":
    captions = set()
    writer = pd.ExcelWriter('dhs.xlsx', engine="xlsxwriter")
    for table_num in range(1, 42):
        url = "https://www.dhs.gov/immigration-statistics/"\
                    "yearbook/2015/table{0}".format(table_num)
        tree = html.parse(request.urlopen(url))
        _path = '//*[@id="content-area"]/div/div/article/div[1]/div[2]/div/div/table'
        tables = tree.xpath(_path)
        """ Some tables contain data both by continent and by country.
        The per-country rows override the per-continent ones
        in the following for loop.
        """
        for table in tables:
            # relative XPath so that only this table's rows and caption are read
            data = [row.text_content().strip().split("\n")
                        for row in table.xpath('.//tr')]
            cap = table.xpath('.//caption')[0].text_content()
            df = pd.DataFrame(data=data[1:])
            if df.shape[1] == len(data[0]):
                df.columns=data[0]
                df.replace('-', np.nan, inplace=True)
                df.to_excel(writer, sheet_name="Table{}".format(table_num), index=False)
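    # Sketch of the missing final step (the snippet is truncated here): with the
    # xlsxwriter engine the workbook only reaches disk once the writer is closed,
    # e.g. writer.save() (or writer.close()) after the table loop.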
예제 #53
0
movie_comment_xpath = "//ol/li[{}]/div/div/div/p/span/text()"
movie_ratting_xpath = "//ol/li[{}]/div/div/div/div/span[@class='rating_num']/text()"

movie_name_Chineses = []
movie_name_Englishs = []
movie_comments = []
movie_rattings = []
movie_directors = []
movie_actors = []
movie_ages = []
movie_countrys = []
movie_categorys = []

#2. Start crawling the page content
while next_pages:
    dom = html.parse(urlopen(next_pages))
    movies = dom.xpath(movie_xpath)
    for i in range(len(movies)):
        movie_name_Chinese = dom.xpath(movie_name_Chinese_xpath.format(i + 1))
        movie_name_English = dom.xpath(
            movie_name_English_xpath.format(i + 1))  # strip the slash and spaces
        movie_info = dom.xpath(movie_info_xpath.format(i + 1))
        movie_comment = dom.xpath(movie_comment_xpath.format(i + 1))
        movie_ratting = dom.xpath(movie_ratting_xpath.format(i + 1))
        movie_info_detail = regular_expression(movie_info)
        director = movie_info_detail[0]
        list_append(director, movie_directors)

        actor = movie_info_detail[1]
        list_append(actor, movie_actors)
예제 #54
0
 a8 = []
 a9 = []
 a10 = []
 a21 = []
 a22 = []
 a23 = []
 a24 = []
 a25 = []
 a26 = []
 a27 = []
 a28 = []
 bar = Bar('Processing', max=part)
 for i in range(s, s + part):
     br = mec.Browser()
     page = br.open(sitelinks[i])
     tree = html.parse(page)
     get1 = tree.xpath('/html/body/div[2]/div[3]/div[3]/div[1]/dl/dd[1]/text()')
     get2 = tree.xpath('/html/body/div[2]/div[3]/div[3]/div[1]/dl/dd[2]/text()')
     get3 = tree.xpath('/html/body/div[2]/div[3]/div[3]/div[1]/dl/dd[3]/text()')
     get4 = tree.xpath('/html/body/div[2]/div[3]/div[3]/div[1]/dl/dd[4]/text()')
     get5 = tree.xpath('/html/body/div[2]/div[3]/div[3]/div[1]/dl/dd[5]/text()')
     get6 = tree.xpath('/html/body/div[2]/div[3]/div[3]/div[1]/dl/dd[6]/text()')
     get7 = tree.xpath('/html/body/div[2]/div[3]/div[3]/div[1]/dl/dd[7]/text()')
     get8 = tree.xpath('/html/body/div[2]/div[3]/div[3]/div[1]/dl/dd[8]/text()')
     get9 = tree.xpath('/html/body/div[2]/div[3]/div[3]/div[1]/dl/dd[9]/text()')
     get10 = tree.xpath('/html/body/div[2]/div[3]/div[3]/div[1]/dl/dd[10]/text()')
     try:
         get21 = tree.xpath('/html/body/div[2]/div[3]/div[2]/div[1]//tr/td[1]/text()')[0]
         get22 = tree.xpath('/html/body/div[2]/div[3]/div[2]/div[1]//tr/td[1]/text()')[1]
         get23 = tree.xpath('/html/body/div[2]/div[3]/div[2]/div[1]//tr/td[1]/text()')[2]
         get24 = tree.xpath('/html/body/div[2]/div[3]/div[2]/div[1]//tr/td[1]/text()')[3]
예제 #55
0
import scraperwiki
from itertools import count
from lxml import html

BASE = "http://www.openpr.de/news/%s"


initial = scraperwiki.sqlite.get_var('num', 1)

for i in count(initial):
    url = BASE % i
    try:
        doc = html.parse(url)
        pm = doc.find('//div[@id="pm"]')
        pm_str = html.tostring(pm)
        scraperwiki.sqlite.save(["id"], {'id': i, 'pm': pm_str, 'url': url})
        print "AYE", i
    except:
        print "FAIL", i
    scraperwiki.sqlite.save_var('num', i)
예제 #56
0
    url = data_url % (street, house)
    print url
    doc = html.parse(url)
    data = {'strnr': street, 'hausnr': house}
    for row in doc.findall('//table[@class="hnrresult"]//tr'):
        name, value = row.findall('./td')
        name = str(name.text_content().encode('ascii', 'ignore'))
        name = name.replace(':', '_').replace('.', '-').replace(' ', '_')
        value = value.xpath('string()').strip()
        data[name] = value
    print data
    #scraperwiki.sqlite.save(unique_keys=["strnr", "hausnr"],
    #    data=data)


doc = html.parse(base_url)
for option in doc.findall('//select[@name="otnr"]/option'):
    sdoc = html.parse(streets_url % option.get('value'))
    for street in sdoc.findall('//input[@name="strnr"]'):
        hdoc = html.parse(houses_url % street.get('value'))
        for house in hdoc.findall('//input[@name="hausnr"]'):
            print house.items()
            print dir(house)
            #get_data(street.get('value'), house.get('value'))

import scraperwiki
from lxml import html

base_url = "http://fbinter.stadt-berlin.de/rbs/rbs-lookup.jsp"
#streets_url = "http://fbinter.stadt-berlin.de/rbs/rbs-lookup.jsp?beznr=&otnr=%s"
#streets_url = "http://fbinter.stadt-berlin.de/rbs/rbs-slct-str.jsp?beznr=&otnr=%s&strnr=&strname=&hausnr=&go=&mapLabel=&targetUrl="
예제 #57
0
def getsurf():

    if debugParse:
        try:
            tree = html.parse('Status.html')
        except Exception as e:
            eprint(e)
            return os.EX_IOERR
    else:

        # Wait for network to come up from system sleep
        if sleepsec > 0:
            time.sleep(sleepsec)

        # Try to bring up network device with ping.
        if pings > 0:
            try:
                ping = subprocess.run([
                    "ping", "-o", "-q", "-i",
                    str(pingwait), "-c",
                    str(pings), "-n", ip
                ],
                                      stdin=subprocess.DEVNULL,
                                      stdout=subprocess.DEVNULL,
                                      stderr=subprocess.DEVNULL)
                if ping.returncode != 0:
                    eprint("warning: {} returned {}".format(
                        ' '.join(ping.args), ping.returncode))
            except Exception as e:
                eprint(e)

        # read surfboard admin password from file on working directory
        try:
            with open('surfboard_password.txt', 'r') as pwdfile:
                passwd = pwdfile.readline().strip()
        except Exception as e:
            eprint(e)
            return os.EX_IOERR

        login_url = 'http://' + ip + '/cgi-bin/adv_pwd_cgi'
        status_url = 'http://' + ip + '/cgi-bin/status'
        logout_url = 'http://' + ip + '/cgi-bin/status#'
        ar_nonce = '{:08d}'.format(random.randint(0, 99999999))

        payload = {
            'username': '******',
            'password': passwd,
            'ar_nonce': ar_nonce
        }

        try:
            with requests.Session() as s:
                p = s.post(login_url, data=payload, timeout=30)
                # print(p.text)
                if p.status_code != requests.codes.ok:
                    eprint("{}, code={}".format(login_url, p.status_code))

                # An authorised request.
                r = s.get(status_url, timeout=30)
                if r.status_code != requests.codes.ok:
                    eprint("{}, code={}".format(status_url, r.status_code))

                tree = html.fromstring(r.text)

                lo = s.get(logout_url, timeout=30)
                if lo.status_code != requests.codes.ok:
                    eprint("{}, code={}".format(logout_url, lo.status_code))

                if tree is None:
                    eprint("{}, no content, code={}".format(
                        status_url, r.status_code))
                    return os.EX_IOERR

        except Exception as e:
            eprint(e)
            return os.EX_IOERR

    try:
        timeel = tree.xpath('//*[text()=\'Current System Time:\']')
        if not timeel or len(timeel) < 1:
            eprint("Time not found")
            return os.EX_IOERR

        if timeel[0].tag != 'p':
            timeel = timeel[0].xpath('./ancestor::p')
            if not timeel or len(timeel) < 1:
                eprint("Time not found")
                return os.EX_IOERR

        timestr = timeel[0].text_content().encode("UTF-8").decode()

        timestr = timestr.split(':', 1)
        if not timestr or len(timestr) != 2:
            eprint("time={}, not parseable".format(timestr))
            return os.EX_IOERR

        timestr = timestr[1].strip()

        try:
            timeval = datetime.datetime.strptime(timestr,
                                                 '%a %b %d %H:%M:%S %Y')
        except ValueError as e:
            eprint("time={}, not parseable: {}".format(timestr, e))
            return os.EX_IOERR

        tbls = tree.xpath('//table')

        for tbl in tbls:
            # look for Downstream Bonded Channels table
            if tbl.xpath(
                    './/*[contains(text(),"Downstream Bonded Channels")]'):

                rows = tbl.getchildren()
                for row in rows:
                    # first row has only the "Downstream ..." th
                    # second row has "Channel" header
                    tds = row.xpath('./td')
                    if len(tds) == 0 or tds[0].text_content() == "Channel":
                        continue

                    vals = [
                        col.text_content().encode('UTF-8').decode().strip()
                        for col in tds
                    ]
                    if len(vals) < 7:
                        eprint("Only {} values in table row".format(len(vals)))
                        continue

                    vals[4] = vals[4].replace('MHz', '').strip()
                    vals[5] = vals[5].replace('dBmV', '').strip()
                    vals[6] = vals[6].replace('dB', '').strip()
                    vals = [val.replace('----', '') for val in vals]
                    print("{0},{1}".format(timeval, ','.join(vals)))

    except etree.XPathEvalError as e:
        eprint('xpath exception={}'.format(e))
        return os.EX_IOERR

    return os.EX_OK
예제 #58
0
import re
import urllib2

from lxml.html import parse

# Matplotlib module
import matplotlib.pyplot as plt

# general urllib2 config
user_agent = 'Mozilla/5.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
url = "http://it.wikipedia.org/wiki/Demografia_d'Italia"

# prepare the request and open the url
req = urllib2.Request(url, headers=headers)
response = urllib2.urlopen(req)

# we parse the webpage, getroot() return the document root
doc = parse(response).getroot()

# find the data table, using css elements
table = doc.cssselect('table.wikitable')[0]

# prepare data structures, will contain actual data
years = []
people = []

# iterate over the rows of the table, except first and last ones
for row in table.cssselect('tr')[1:-1]:
    # get the row cell (we will use only the first two)
    data = row.cssselect('td')

    # the first cell is the year
    tmp_years = data[0].text_content()
예제 #59
0
import lxml.html as html
tag = []
url_list = []
title_list = []
for i in range(1,137):
    root = html.parse('http://gidonline.club/genre/melodrama/page/{0}/'.format(i)).getroot()
    tag.extend(root.find_class('mainlink'))
for i in tag:
    try:
        y = i.find_class('mqn').pop().text_content()
        if y == '2010':
            for y in i.find_class('mainlink').pop().iterlinks():
                if y[1] == 'href':
                    url_list.append(y[2])
            title_list.append(i.text_content().split('\n')[0])
    except IndexError:
        y = i.find_class('mqx').pop().text_content()
for i in range(len(url_list)):
    print "Title: ", title_list[i]
    print "URL: ", url_list[i], '\n'
예제 #60
0
import requests
from lxml.html import parse
from io import StringIO

# Enter the keyword for the images to search for
keyword = input("Enter the image search keyword: ")
url = 'https://www.google.co.kr/search?q=' + keyword + '&source=lnms&tbm=isch&sa=X&ved=0ahUKEwic-taB9IXVAhWDHpQKHXOjC14Q_AUIBigB&biw=1842&bih=990'

# fetch the HTML source
text = requests.get(url).text

# parse it as an HTML document
text_source = StringIO(text)
parsed = parse(text_source)

# root node
doc = parsed.getroot()

# image URLs are in the src attribute of the img tags (only about 20 get crawled here, most likely because the remaining results are loaded dynamically via JavaScript)
imgs = doc.findall('.//img')

img_list = []  # list that will hold the image URLs
for a in imgs:
    img_list.append(a.get('src'))
    print(a.get('src'))
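
# A minimal follow-up sketch (assuming the collected src values are plain http(s)
# thumbnail URLs; missing entries and data: URIs are skipped): save the images into
# a folder named after the keyword.
import os
import urllib.request

os.makedirs(keyword, exist_ok=True)
for idx, src in enumerate(img_list):
    if not src or not src.startswith('http'):
        continue
    urllib.request.urlretrieve(src, os.path.join(keyword, '{:03d}.jpg'.format(idx)))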