Exemplo n.º 1
0
 def parse_url(self, url):
     """Mark *url* as visited, remove it from the pending queue, and
     return the links scraped from its page."""
     normalized = url.rstrip("/")
     self.visited[normalized] = True
     # NOTE(review): assumes the head of self.unvisited is this URL —
     # confirm queue ordering against the caller.
     self.unvisited.pop(0)
     page_parser = LinkParser(normalized)
     page_parser.get_all_links()
     return page_parser.links
Exemplo n.º 2
0
def parserURL(baseUrl,url):
    """Fetch *url* over HTTP and return ``(html_text, links)``.

    *baseUrl* is handed to the LinkParser so relative hrefs can be
    resolved against it.
    """
    # Spoof a desktop browser; some sites reject default client agents.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
    }
    html = requests.get(url, headers=headers).text

    # Extract links from the downloaded markup.
    link_parser = LinkParser()
    link_parser.baseUrl = baseUrl
    link_parser.feed(html)

    return html, link_parser.links
Exemplo n.º 3
0
def get_links(url, keyword=None, parent=None):
    """Fetch *url*, parse its HTML for hyperlinks, and return a URL object.

    Parameters:
        url: address to fetch.
        keyword: optional keyword handed to LinkParser to search for.
        parent: URL object that linked to this one (crawl-tree bookkeeping).

    Returns:
        A URL object with ``.status`` always set; on a successful fetch
        ``.links`` (absolute URLs), ``.key`` and ``.title`` are filled in.
    """
    #create URL object and set parent
    link = URL(url)
    link.parent = parent

    try:
        #make get request
        response = urllib.request.urlopen(link.url)

        #set status
        link.status = response.getcode()

    except urllib.error.HTTPError as e:
        #set response to nothing since an error occurred
        response = None
        #log the returned status code
        link.status = e.code

    #check for errors from response
    if response:
        # BUG FIX: the original used str(response.read()), which yields the
        # *repr* of the bytes object — a leading "b'", a trailing "'", and
        # escape sequences (\n, \t, \xNN) left as literal text — and then
        # stripped literal backslash-n pairs. Decode the body instead and
        # remove real newlines. utf-8 with errors="replace" never raises;
        # TODO confirm whether charset sniffing from headers is needed.
        page = response.read().decode("utf-8", errors="replace")
        #remove any unwanted newlines prior to parsing
        page = page.replace("\n", "")

        #parse HTML content
        parser = LinkParser(keyword)
        parser.reset_parser()
        parser.feed(page)

        #check if no follow is set
        if not parser.no_follow:
            #assign links to URL object
            link.links = parser.links.copy()

        #check if keyword was found and store title
        link.key = parser.key_found
        link.title = parser.title
        parser.close()

        #iterate through hrefs and convert relative URLs
        for i, href in enumerate(link.links):
            #takes base url and combines with link if link does not have base of its own
            link.links[i] = urllib.parse.urljoin(link.url, href)

    return link
Exemplo n.º 4
0
 def setUp(self):
     # Fresh parser instance for each test case.
     self.p = LinkParser.Parser()
 def setUp(self):
     """Create a parser and a BeautifulSoup tree of the Italian-flags fixture."""
     self.p = LinkParser.Parser()
     fixture = f"{environ['CONTAINER_DATA_PATH']}/html/List_of_Italian_flags.html"
     with open(fixture) as _f:
         markup = _f.read()
     self.soup = BeautifulSoup(markup, features="html.parser")
 def setUp(self):
     """Extract every link from the Flag_of_Italy fixture page."""
     path = f"{environ['CONTAINER_DATA_PATH']}/html/Flag_of_Italy.html"
     with open(path) as _f:
         html = _f.read()
     self.all_links = LinkParser.Parser().find_links(html)
Exemplo n.º 7
0
 def parseAndGetLinks(self):  # parse HTML, save links
     """Parse ``self.file`` with LinkParser and return its anchor list."""
     # NOTE(review): anchorlist is invoked as a method here — confirm this
     # custom LinkParser exposes it as a callable (htmllib's was an attribute).
     return LinkParser(self.file).anchorlist()