Example #1
def process_feeds(tuples, user):
    """ Take tuples from detect_feeds_in_HTML() and create an outline entry for opml
    """
    for t in tuples:
        # Decode any HTML entities in the user's page URL
        html = parser(user.url,
                      convertEntities=parser.HTML_ENTITIES).contents[0]

        # Ignore feeds for comments
        if "comment" in t:
            continue

        # Ignore annoying typo in the html from a friend
        if "\"" in t:
            continue

        # Convert relative URLs
        if "http" in t:
            xml = parser(t, convertEntities=parser.HTML_ENTITIES).contents[0]
        else:
            myxml = html + t
            xml = parser(myxml,
                         convertEntities=parser.HTML_ENTITIES).contents[0]

        # If we've got something, rather than nothing, return a dict
        if xml:
            return {'title': user.name, 'html_url': html, 'xml_url': xml}

    # No usable feed found
    return None
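
A minimal usage sketch, assuming `parser` is BeautifulSoup 3's `BeautifulSoup` class (which accepts `convertEntities`) and that `user` is any object with `name` and `url` attributes; `SimpleUser` below is purely illustrative:

from BeautifulSoup import BeautifulSoup as parser  # BeautifulSoup 3


class SimpleUser(object):
    """Illustrative stand-in for the real user object."""
    def __init__(self, name, url):
        self.name = name
        self.url = url


user = SimpleUser("Alice", "http://example.com/")
feed_candidates = ["http://example.com/feed.xml",
                   "http://example.com/comments/feed.xml"]

entry = process_feeds(feed_candidates, user)
# entry is e.g. {'title': 'Alice',
#                'html_url': u'http://example.com/',
#                'xml_url': u'http://example.com/feed.xml'}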
Example #2
    def process_feeds(self, tuples, user):
        """ Take tuples from detect_feeds_in_HTML() and create an outline entry for opml
        """
        for t in tuples:
            html = parser(user.url, convertEntities=parser.HTML_ENTITIES).contents[0]

            # Ignore feeds for comments
            if "comment" in t:
                continue

            # Ignore annoying typo in the html from a friend
            if "\"" in t:
                continue

            # Convert relative URLs
            if "http" in t:
                xml = parser(t, convertEntities=parser.HTML_ENTITIES).contents[0]
            else:
                myxml = html + t
                xml = parser(myxml, convertEntities=parser.HTML_ENTITIES).contents[0]

            # If we've got something, rather than nothing, return a dict
            if xml:
                return {'title': user.name, 'html_url': html, 'xml_url': xml}

        # No usable feed found
        return None
Example #3
def detect_feeds_in_HTML(input_stream):
    """ Examines an open text stream with HTML for referenced feeds.

    This is achieved by detecting all ``link`` tags that reference a feed in HTML.

    :param input_stream: an arbitrary opened input stream that has a :func:`read` method.
    :type input_stream: an input stream (e.g. open file or URL)
    :return: a list of feed URLs
    :rtype: ``list(str)``
    """
    # check that we really got an input stream
    if not hasattr(input_stream, "read"):
        raise TypeError("An opened input *stream* should be given, was %s instead!" % type(input_stream))
    result = []

    # get the textual data (the HTML) from the input stream
    html = parser(input_stream.read())

    # find all <link> tags whose rel attribute is "alternate" (feed references)
    feed_urls = html.findAll("link", rel="alternate")

    # extract the URL of each referenced feed
    for feed_link in feed_urls:
        url = feed_link.get("href", None)
        # only keep it if a URL is actually present
        if url:
            result.append(url)
    return result
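
A self-contained usage sketch, again assuming `parser` is BeautifulSoup 3; any object with a `read()` method works, so an in-memory stream is enough to exercise the function:

from StringIO import StringIO  # Python 2; on Python 3 use io.StringIO

sample_html = (
    '<html><head>'
    '<link rel="alternate" type="application/rss+xml" '
    'href="http://example.com/feed.xml">'
    '</head><body></body></html>'
)

feed_urls = detect_feeds_in_HTML(StringIO(sample_html))
# feed_urls == ['http://example.com/feed.xml']

# Passing a plain string instead of a stream raises the TypeError above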
Example #4
def find_url(domain, page, text, only_out=True):
    soup = parser(text)
    links = soup('a')
    links_final = []
    # strip the scheme and "www" so only the host root is compared
    page_root = page.replace('http://', '').replace('www', '').split('/')[0]
    for link in links:
        try:
            if 'href' in dict(link.attrs):
                url = urljoin(page, link['href'])
                if url.find("'") != -1:
                    continue
                url = url.split('#')[0]  # remove location portion
                if url[0:4] == 'http':
                    linkText = gettextonly(link)
                    if only_out:
                        # keep only links that point outside the current site
                        link_root = link['href'].replace(
                            'http://', '').replace('www', '').split('/')[0]
                        if link_root != page_root:
                            links_final.append((url_uniformer(url), linkText))
                    else:
                        links_final.append((url_uniformer(url), linkText))
        except Exception:
            # skip malformed links rather than aborting the whole page
            pass
    return links_final
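
A sketch of a call, assuming `urljoin` comes from `urlparse` (Python 2) and that `gettextonly` and `url_uniformer` are helper functions defined elsewhere in the same module (their exact behaviour is not shown here):

from urlparse import urljoin  # on Python 3: from urllib.parse import urljoin

page = "http://example.com/blog/"
html_text = ('<a href="http://other-site.org/post">An outbound link</a>'
             '<a href="http://example.com/about">About us</a>')

# With only_out=True only links whose host differs from the page's host are kept
outbound = find_url("example.com", page, html_text, only_out=True)
# e.g. [('http://other-site.org/post', 'An outbound link')]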
Example #5
    def request(self, url):
        self.response = requests.get(self.valid_url)
        status_code = self.response.status_code

        if status_code == 200:
            self.parsed_html = parser(self.response.text)
            if not self.folder_created:  # Create folder on first request
                print("Fetching links from server %s" % self.valid_url)
                self.directory = "./" + self.title + "/"
                self.create_folder(self.directory)
                self.folder_created = True
                self.crawl_links(primary_links=True)
            else:
                self.crawl_links()
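
This appears to be a method of a small crawler class; the skeleton below is a hedged guess at the attributes and helpers it relies on (`valid_url`, `title`, `folder_created`, `create_folder`, `crawl_links`), shown only to make the dependencies explicit:

import os
import requests


class LinkCrawler(object):            # illustrative name, not from the source
    def __init__(self, valid_url, title):
        self.valid_url = valid_url    # base URL fetched by request()
        self.title = title            # used to name the download folder
        self.folder_created = False   # request() flips this after the first call
        self.response = None
        self.parsed_html = None
        self.directory = None

    def create_folder(self, path):
        if not os.path.exists(path):
            os.makedirs(path)

    def crawl_links(self, primary_links=False):
        # Follow the links collected in self.parsed_html (omitted here)
        pass

    # request() from the example above would be defined here as well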
Example #6
    def parse_raw_html(self, raw_html):
        # Parse a raw HTML string into a navigable document object
        return parser(raw_html)
Example #7
    def beautify(self, url):
        response_data = self.simple_request(url)
        # Parse the response data into HTML, if the request returned anything
        if response_data:
            return parser(response_data)
        else:
            return None
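
A sketch of the surrounding class, assuming `simple_request` is a sibling method that returns the response body as text (or `None` on failure) and `parser` is BeautifulSoup:

import requests


class Scraper(object):                # illustrative wrapper, not from the source
    def simple_request(self, url):
        # Return the response body, or None if the request fails
        try:
            response = requests.get(url)
            response.raise_for_status()
            return response.text
        except requests.RequestException:
            return None

    # beautify() from the example above would be defined here as well


# soup = Scraper().beautify("http://example.com/")
# soup is a parsed document on success, or None if the page could not be fetched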