Python CheckUrl.validifyUrl примеры использования

Язык программирования: Python

Класс/Тип: CheckUrl

Метод/Функция: validifyUrl

Примеров на hotexamples.com: 4

Python CheckUrl.validifyUrl - 4 примера найдено. Это лучшие примеры Python кода для CheckUrl.validifyUrl, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

validifyUrl(3)

checkUrl(1)

Основные методы

validifyUrl (3)

checkUrl (1)

Пример #1

Показать файл

Файл: Crawlable.py Проект: WangCHX/Crawler

def isCrawlable(url):
    """Return whether robots.txt on the URL's host allows agent "*" to fetch it.

    The robots.txt location is derived naively: the URL is split on "/" and
    the scheme and host pieces are joined back together. Parsers are looked
    up in the module-level ``robotHash`` keyed by the robots.txt URL; a
    freshly fetched parser is stored there so later calls for the same host
    reuse it instead of fetching again.

    :param url: URL to check; it is first passed through CheckUrl.validifyUrl.
    :return: False when validifyUrl rejects the URL, when fetching robots.txt
        raises urllib2.URLError, or when the rules disallow the URL;
        True otherwise.
    """
    url = CheckUrl.validifyUrl(url)
    # NOTE(review): other code using validifyUrl compares its result with -1
    # to detect a rejected URL -- confirm. Without this guard the split()
    # below would fail on a non-string value.
    if url == -1:
        return False

    # use naive method to get root for given url
    root = url
    strs = url.split('/')
    if len(strs) > 2:
        root = strs[0] + "//" + strs[2]

    robotUrl = root + "/robots.txt"

    rerp = robotHash.get(robotUrl)
    if rerp is None:
        rerp = RobotExclusionRulesParser.RobotExclusionRulesParser()
        try:
            rerp.fetch(robotUrl, 3)
        except urllib2.URLError:
            return False
        # Remember the parser; otherwise the lookup above can never hit.
        robotHash[robotUrl] = rerp

    # Test the requested URL itself, not just the host root, so that
    # path-specific Allow/Disallow rules are honoured.
    return bool(rerp.is_allowed("*", url))

Пример #2

Показать файл

def isCrawlable(url):
    """Tell whether the host's robots.txt permits user agent "*" to fetch ``url``.

    The robots.txt address is built with a naive split of the URL on "/"
    (scheme piece + "//" + host piece + "/robots.txt"). The module-level
    ``robotHash`` maps that address to an already-fetched rules parser; when
    no parser is stored yet, one is fetched and then saved in ``robotHash``
    for reuse.

    :param url: URL to check; normalised with CheckUrl.validifyUrl first.
    :return: True if the rules allow the URL. False if they disallow it, if
        validifyUrl rejects the URL, or if fetching robots.txt raises
        urllib2.URLError.
    """
    url = CheckUrl.validifyUrl(url)
    # NOTE(review): -1 appears to be validifyUrl's "invalid URL" result
    # (another caller checks for it) -- confirm. A non-string here would
    # make split() below raise.
    if url == -1:
        return False

    # use naive method to get root for given url
    pieces = url.split('/')
    if len(pieces) > 2:
        root = pieces[0] + "//" + pieces[2]
    else:
        root = url

    robotUrl = root + "/robots.txt"

    rerp = robotHash.get(robotUrl)
    if rerp is None:
        rerp = RobotExclusionRulesParser.RobotExclusionRulesParser()
        try:
            rerp.fetch(robotUrl, 3)
        except urllib2.URLError:
            return False
        # Store the parser so the cache lookup above can actually succeed.
        robotHash[robotUrl] = rerp

    # Ask about the requested URL rather than the root, so rules that apply
    # to specific paths are taken into account.
    return bool(rerp.is_allowed("*", url))

Пример #3

Показать файл

 def processUrl(self, href):
     """
     Resolve a link against ``self.baseUrl``, validate it and record its score.

     :param href: link to process; joined with ``self.baseUrl`` before
         validation.

     Nothing happens when CheckUrl.validifyUrl returns -1. A URL with no
     entry in ``dict`` is added to both ``queue`` and ``dict`` with
     ``self.score``. A URL that already has an entry in ``dict`` only has
     ``self.score`` added to its ``queue`` entry, and only if it still has one.
     """
     target = CheckUrl.validifyUrl(urlparse.urljoin(self.baseUrl, href))
     if target == -1:
         return

     if dict.get(target) is None:
         # First time this URL is seen: register it in both tables.
         queue[target] = self.score
         dict[target] = self.score
         return

     # Already known: accumulate the score while the URL is still queued.
     if queue.get(target) is not None:
         queue[target] = queue[target] + self.score

Пример #4

Показать файл

    this is used to computer priority score of that page, use naive method
    compute the number of keywords in the content of that page
    """
    content = content.lower()
    content = content.split()
    priorityScore = 0
    for keyword in keywords:
        for word in content:
            if keyword == word:
                priorityScore += 1
    return priorityScore

# Seed `queue` and `dict` with search results for `query`. Two requests are
# made against the same base URL: "&rsz=8", then "&rsz=2&start=8".
queryUrl = "https://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=" + urllib.quote(query)

# Every seed URL gets the score -1000. Per the original author's notes,
# heapq is a min-heap, so negative scores are used to get max-heap ordering.
response1 = urllib2.urlopen(queryUrl + "&rsz=8")
try:
    for result in json.load(response1)['responseData']['results']:
        url = CheckUrl.validifyUrl(result['unescapedUrl'])
        # NOTE(review): -1 looks like validifyUrl's "invalid URL" result
        # (another caller checks for it) -- confirm. Skip it so that -1 is
        # never stored as a key.
        if url == -1:
            continue
        queue[url] = -1000
        dict[url] = -1000
finally:
    # Close the response even if the JSON payload is malformed.
    response1.close()

response2 = urllib2.urlopen(queryUrl + "&rsz=2&start=8")
try:
    for result in json.load(response2)['responseData']['results']:
        url = CheckUrl.validifyUrl(result['unescapedUrl'])
        if url == -1:
            continue
        queue[url] = -1000
        dict[url] = -1000
finally:
    # The second response was previously never closed.
    response2.close()