Example #1
 def links(self, soup):
     # Resolve the href of every <a> tag against the url of the current page.
     links = [
         full_url_address(link.attrs.get('href'), self.crawler_url.url)
         for link in soup.find_all('a')
     ]
     # Also follow <meta http-equiv="refresh"> redirects whose content
     # attribute carries a 'url=' target.
     metas = filter(
         lambda meta: meta.attrs.get('http-equiv', '').lower() == 'refresh',
         soup.find_all('meta'))
     metas = filter(lambda meta: '=' in meta.attrs.get('content', ''),
                    metas)
     links += list(
         map(
             lambda meta: full_url_address(
                 meta.attrs['content'].split('=', 1)[1],
                 self.crawler_url.url), metas))
     for link in filter(bool, links):
         url = Url(link)
         if not url.is_valid():
             continue
         # Links outside the current domain or directory cost one level of depth.
         depth = self.crawler_url.depth
         if url.domain != self.crawler_url.url.domain or \
                 not url.path.startswith(self.crawler_url.url.directory_path):
             depth -= 1
         if depth <= 0:
             continue
         self.add_url(link, depth)
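
The meta-refresh branch above pulls the redirect target out of the content attribute by splitting on the first '='. A minimal self-contained sketch of that extraction with BeautifulSoup alone (no Url or full_url_address helpers needed):

from bs4 import BeautifulSoup

html = '<meta http-equiv="refresh" content="0; url=/new-location/">'
soup = BeautifulSoup(html, 'html.parser')
meta = soup.find('meta')
# Split only on the first '=' so the target itself may contain '=' characters.
target = meta.attrs['content'].split('=', 1)[1]
print(target)  # -> /new-location/
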
Example #2
 def get_links(self, text, soup=None):
     """
     :param text:
     :param soup:
     :return:
     """
     # Links and their metadata text nodes alternate as children of the <pre>.
     contents = list(
         filter(lambda x: isinstance(x, NavigableString) or is_link(x),
                soup.find('pre').contents))
     links = []
     for i, content in enumerate(contents):
         if not is_link(content) or '?' in content.attrs.get('href', ''):
             continue
         link = Url(
             full_url_address(content.attrs.get('href'),
                              self.processor.crawler_url.url))
         # The text node right after a link holds the entry's date and size.
         if i + 1 < len(contents) and isinstance(contents[i + 1],
                                                 NavigableString):
             extra = {}
             text = str(contents[i + 1])
             dt = DATETIME_PATTERN.findall(text)
             if dt:
                 extra['created_at'] = dt[0]
             size = FILESIZE_PATTERN.findall(text)
             if size:
                 extra['filesize'] = size[0].rstrip(' ')
             link.add_extra(extra)
         links.append(link)
     return links
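
DATETIME_PATTERN and FILESIZE_PATTERN are module constants not shown in the snippet. The regexes below are only illustrative assumptions for an Apache-style index line, sketching how the adjacent text node would be mined for the extras:

import re

# Hypothetical patterns; the real constants live elsewhere in the project.
DATETIME_PATTERN = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}')
FILESIZE_PATTERN = re.compile(r'(\d[\d.]*[KMG]?) *$')

text = '          2021-03-04 10:21  1.5M '
extra = {}
dt = DATETIME_PATTERN.findall(text)
if dt:
    extra['created_at'] = dt[0]
size = FILESIZE_PATTERN.findall(text)
if size:
    extra['filesize'] = size[0].rstrip(' ')
print(extra)  # -> {'created_at': '2021-03-04 10:21', 'filesize': '1.5M'}
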
Example #3
 def get_links(self, text, soup=None):
     links = [
         full_url_address(link.attrs.get('href'),
                          self.processor.crawler_url.url)
         for link in soup.find_all('a')
     ]
     return [Url(link) for link in links]
Example #4
 def links(self, soup):
     links = [
         full_url_address(link.attrs.get('href'), self.crawler_url.url)
         for link in soup.find_all('a')
     ]
     for link in filter(bool, links):
         url = Url(link)
         if not url.is_valid():
             continue
         # Links outside the current domain or directory cost one level of depth.
         depth = self.crawler_url.depth
         if url.domain != self.crawler_url.url.domain or \
                 not url.path.startswith(self.crawler_url.url.directory_path):
             depth -= 1
         if depth <= 0:
             continue
         self.add_url(link, depth)
Example #5
 def process(self, text, soup=None):
     links = [
         full_url_address(link.attrs.get('href'), self.crawler_url.url)
         for link in soup.find_all('a')
     ]
     # Hrefs ending in '/' are queued as directories; every link is also kept in self.files.
     for link in filter(lambda x: x.url.endswith('/'), links):
         self.add_url(link, type='directory')
     self.files = [Url(link) for link in links]
Example #6
    def __init__(self,
                 crawler,
                 url,
                 depth=3,
                 source=None,
                 exists=None,
                 type=None,
                 timeout=10):
        """

        :type crawler: Crawler
        :type depth: int Máxima recursión sin haber subido respecto esta url
        """
        self.flags = set()
        self.depth = depth
        if not isinstance(url, Url):
            url = Url(url)
        if url.is_valid():
            # Normalize: drop the query string and the fragment.
            url.query = ''
            url.fragment = ''
        self.url = url
        self.crawler = crawler
        self.source = source
        self.exists = exists
        self.type = type
        self.timeout = timeout
        if url.is_valid() and (not url.path or url.path == '/'):
            # An empty or root path can only be a directory.
            self.type = 'directory'
        self.resp = None
Example #7
def is_url_loop(url, ignore_end=True):
    url = url if isinstance(url, Url) else Url(url)
    directories = list(filter(bool, url.directories))
    directories.reverse()
    # Starting from the deepest directory, look for a group of names that
    # repeats at least MATCHS_LOOP_NUM times in a row (e.g. /foo/foo/foo/).
    for i in range(1, (len(directories) // MATCHS_LOOP_NUM) + 1):
        groups = [
            tuple(directories[j:j + i])
            for j in range(0, MATCHS_LOOP_NUM * i, i)
        ]
        if len(set(groups)) == 1 and len(groups) >= MATCHS_LOOP_NUM:
            return True
    if ignore_end:
        # Retry on the parent so a trailing file name does not mask the loop.
        return is_url_loop(url.parent(), False)
    return False
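
MATCHS_LOOP_NUM is a module constant not shown here. A self-contained sketch of the same repetition check, assuming a value of 3 and working on a plain path string instead of a Url object:

MATCHS_LOOP_NUM = 3  # assumed value, for illustration only

def looks_like_loop(path):
    # Start from the deepest directory and look for a group of names that
    # repeats MATCHS_LOOP_NUM times in a row.
    directories = [d for d in path.split('/') if d]
    directories.reverse()
    for i in range(1, (len(directories) // MATCHS_LOOP_NUM) + 1):
        groups = [tuple(directories[j:j + i])
                  for j in range(0, MATCHS_LOOP_NUM * i, i)]
        if len(set(groups)) == 1 and len(groups) >= MATCHS_LOOP_NUM:
            return True
    return False

print(looks_like_loop('/foo/foo/foo/'))      # True: 'foo' repeats three times
print(looks_like_loop('/a/b/a/b/a/b/'))      # True: the pair repeats three times
print(looks_like_loop('/images/2020/01/'))   # False
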
Example #8
def full_url_address(address, url):
    """

    :type url: Url
    :type address: str
    :rtype :Url

    """
    if address is None:
        return
    # Reject addresses whose scheme is not in ACCEPTED_PROTOCOLS.
    protocol_match = address.split(':', 1)[0] if ':' in address else ''
    protocol_match = re.match(r'^([A-Za-z0-9\-]+)$', protocol_match)
    if protocol_match and protocol_match.group(1) not in ACCEPTED_PROTOCOLS:
        return
    # TODO: improve this. Handle other protocols that should be rejected.
    if address.startswith('//'):
        # Protocol-relative address: reuse the protocol of the current page.
        address = address.replace('//', '{}://'.format(url.protocol), 1)
    if '://' not in address or address.startswith('/'):
        # Relative path: keep the current host and replace only the path.
        url = url.copy()
        url.path = address
        return url
    url = Url(address)
    if url.is_valid():
        return url
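
For intuition only: the standard library's urljoin shows comparable resolution behaviour for root-relative paths and protocol-relative '//host/...' addresses; the snippet above relies on the project's own Url class instead:

from urllib.parse import urljoin

base = 'http://example.com/files/'
print(urljoin(base, '/static/app.js'))             # http://example.com/static/app.js
print(urljoin(base, '//cdn.example.com/lib.js'))   # http://cdn.example.com/lib.js
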
Example #9
 def _get_url_info(self):
     return UrlInfo(Sessions(), Url(self.url))
Example #10
 def test_callback(self):
     with patch.object(UrlsInfo, '_get_url_info') as m:
         UrlsInfo([self.url], Sessions()).callback(len(self.url),
                                                   Url(self.url), 0)
         m.assert_called_once()
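
The test stubs out _get_url_info with unittest.mock.patch.object. A minimal standalone illustration of that pattern with a toy class (all names below are made up):

from unittest.mock import patch

class Reporter:
    def _get_url_info(self):
        return 'expensive lookup'

    def callback(self):
        return self._get_url_info()

with patch.object(Reporter, '_get_url_info') as m:
    Reporter().callback()     # hits the mock, not the real method
    m.assert_called_once()    # passes: callback called _get_url_info once
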