Exemplo n.º 1
0
    def _http_parse_result(self, result):
        '''
        Extract file and directory entries from the text of an HTTP listing.

        The regular expressions and the 1-based capture-group positions are
        read from ``self.http_parse`` (``dir_line``, ``dir_date``,
        ``dir_name``, ``file_line``, ``file_size``, ``file_date``,
        ``file_date_format``, ``file_name``). A ``file_size`` or ``file_date``
        of -1 means the listing does not carry that column: the size is then
        0 and the date is today's date.

        :param result: listing content to scan
        :return: tuple ``(files, directories)``, each a list of dicts with
                 keys permissions, group, user, size, month, day, year and
                 name; file dicts additionally carry an md5 ``hash`` built
                 from name, date text and size
        '''
        cfg = self.http_parse

        def _month_day_year(text):
            # Default layout is "19-Jul-2014 13:02": keep the part before the
            # first whitespace and split it on dashes.
            pieces = text.split()[0].split('-')
            month = Utils.month_to_num(pieces[1])
            day = int(pieces[0])
            year = int(pieces[2])
            return month, day, year

        # Directories: only the date and the name come from the listing.
        rdirs = []
        for dir_match in re.findall(cfg.dir_line, result):
            month, day, year = _month_day_year(dir_match[cfg.dir_date - 1])
            rdirs.append({
                'permissions': '',
                'group': '',
                'user': '',
                'size': 0,
                'month': month,
                'day': day,
                'year': year,
                'name': dir_match[cfg.dir_name - 1],
            })

        # Files: size and date are optional columns.
        rfiles = []
        for file_match in re.findall(cfg.file_line, result):
            if cfg.file_size == -1:
                size = 0
            else:
                size = humanfriendly.parse_size(file_match[cfg.file_size - 1])

            if cfg.file_date == -1:
                # No date column: fall back to the current date.
                now = datetime.now()
                stamp = '%s-%s-%s' % (now.year, now.month, now.day)
                month, day, year = now.month, now.day, now.year
            else:
                stamp = file_match[cfg.file_date - 1]
                date_format = cfg.file_date_format
                if not date_format:
                    month, day, year = _month_day_year(stamp)
                else:
                    # The configured format escapes percent signs as '%%'.
                    parsed = datetime.strptime(stamp, date_format.replace('%%', '%'))
                    month, day, year = parsed.month, parsed.day, parsed.year

            name = file_match[cfg.file_name - 1]
            digest_input = (name + str(stamp) + str(size)).encode('utf-8')
            rfiles.append({
                'permissions': '',
                'group': '',
                'user': '',
                'size': size,
                'month': month,
                'day': day,
                'year': year,
                'name': name,
                'hash': hashlib.md5(digest_input).hexdigest(),
            })
        return (rfiles, rdirs)
Exemplo n.º 2
0
    def list(self, directory=''):
        '''
        Try to get file headers to get last_modification and size

        A header-only request is performed for every entry of
        ``self.files_to_download``. ``Content-Length`` is stored in the entry
        as ``size``; ``Last-Modified`` is stored as ``day``/``month``/``year``
        together with a ``hash`` (md5 of the matched date text). Entries for
        which the server answers 405 are left without that metadata.

        :param directory: not used by this method
        :return: tuple ``(self.files_to_download, [])``
        :raises Exception: when the transfer fails, or when the response code
                           is neither 405 nor in ``self.ERRCODE_OK``
        '''
        self._network_configuration()
        # Specific configuration
        # With those options, cURL will issue a HEAD request. This may not be
        # supported especially on resources that are accessed using POST. In
        # this case, HTTP will return code 405. We explicitly handle this case
        # in this method.
        # Note also that in many cases, there is no Last-Modified field in
        # headers since this is usually dynamic content (Content-Length is
        # usually present).
        self.crl.setopt(pycurl.HEADER, True)
        self.crl.setopt(pycurl.NOBODY, True)
        for rfile in self.files_to_download:
            # NOTE(review): save_as is taken from the first file and then
            # reused for every following one - confirm a single file is
            # expected here.
            if self.save_as is None:
                self.save_as = rfile['name']

            rfile['save_as'] = self.save_as

            file_url = self._file_url(rfile)
            try:
                self.crl.setopt(pycurl.URL, file_url)
            except Exception:
                self.crl.setopt(pycurl.URL, file_url.encode('ascii', 'ignore'))

            # Create a buffer and assign it to the pycurl object
            output = BytesIO()
            self.crl.setopt(pycurl.WRITEFUNCTION, output.write)

            # Only the transfer itself is guarded, so that the status error
            # raised below is not caught and logged a second time.
            try:
                self.crl.perform()
                errcode = int(self.crl.getinfo(pycurl.RESPONSE_CODE))
            except Exception as e:
                msg = 'Error while listing ' + file_url + ' - ' + str(e)
                self.logger.error(msg)
                raise

            if errcode == 405:
                # HEAD not supported by the server for this URL so we can
                # skip the rest of the loop (we won't have metadata about
                # the file but biomaj should be fine).
                msg = 'Listing ' + file_url + ' not supported. This is fine, continuing.'
                self.logger.info(msg)
                continue
            if errcode not in self.ERRCODE_OK:
                msg = 'Error while listing ' + file_url + ' - ' + str(
                    errcode)
                self.logger.error(msg)
                raise Exception(msg)

            # Figure out what encoding was sent with the response, if any.
            # Check against lowercased header name.
            # Default encoding for HTML is iso-8859-1.
            # Other content types may have different default encoding,
            # or in case of binary data, may have no encoding at all.
            encoding = 'iso-8859-1'
            if 'content-type' in self.headers:
                content_type = self.headers['content-type'].lower()
                match = re.search(r'charset=(\S+)', content_type)
                if match:
                    encoding = match.group(1)

            # lets get the output in a string
            raw_headers = output.getvalue()
            try:
                result = raw_headers.decode(encoding)
            except (LookupError, UnicodeDecodeError):
                # Unknown or mismatching charset: iso-8859-1 maps every byte,
                # so the header lines can always be read.
                result = raw_headers.decode('iso-8859-1')

            self._store_header_metadata(rfile, result)
        return (self.files_to_download, [])

    @staticmethod
    def _store_header_metadata(rfile, header_text):
        '''
        Copy size and last-modification date from response headers to rfile.

        :param rfile: file description (dict), updated in place
        :param header_text: decoded response headers, one header per line
        :raises ValueError: if ``Content-Length`` is not an integer
        '''
        # Function-scope import: only needed for the two-digit year rule.
        import time

        # (pattern, day group, month group, year group)
        date_formats = (
            # Sun, 06 Nov 1994
            (r'(\w+),\s+(\d+)\s+(\w+)\s+(\d+)', 2, 3, 4),
            # Sunday, 06-Nov-94
            (r'(\w+),\s+(\d+)-(\w+)-(\d+)', 2, 3, 4),
            # Sun Nov  6 08:49:37 1994
            (r'(\w+)\s+(\w+)\s+(\d+)\s+\d{2}:\d{2}:\d{2}\s+(\d+)', 3, 2, 4),
        )

        for line in re.split(r'[\n\r]+', header_text):
            # Split on the first colon only: the date value itself contains
            # colons (hh:mm:ss) that must stay part of the value.
            field, colon, value = line.partition(':')
            if not colon:
                # Status line or blank line, not a header field.
                continue
            # Header field names are case-insensitive.
            field = field.strip().lower()
            value = value.strip()
            if field == 'content-length':
                rfile['size'] = int(value)
            elif field == 'last-modified':
                for pattern, day_group, month_group, year_group in date_formats:
                    res = re.match(pattern, value)
                    if not res:
                        continue
                    year = int(res.group(year_group))
                    if year < 100:
                        # Two-digit year: use the current century unless that
                        # lands more than 50 years in the future, in which
                        # case the previous century is meant.
                        current_year = time.gmtime().tm_year
                        year += (current_year // 100) * 100
                        if year > current_year + 50:
                            year -= 100
                    rfile['hash'] = hashlib.md5(
                        str(res.group(0)).encode('utf-8')).hexdigest()
                    rfile['day'] = int(res.group(day_group))
                    rfile['month'] = Utils.month_to_num(res.group(month_group))
                    rfile['year'] = year
                    break
Exemplo n.º 3
0
    def list(self, directory=''):
        '''
        Try to get file headers to get last_modification and size

        A header-only request (HEADER and NOBODY options set) is performed
        for every entry of ``self.files_to_download``. ``Content-Length`` is
        stored in the entry as ``size``; ``Last-Modified`` is stored as
        ``day``/``month``/``year`` together with a ``hash`` (md5 of the
        matched date text).

        :param directory: not used by this method
        :return: tuple ``(self.files_to_download, [])``
        '''
        self._basic_curl_configuration()
        # Specific configuration
        self.crl.setopt(pycurl.HEADER, True)
        self.crl.setopt(pycurl.NOBODY, True)
        for rfile in self.files_to_download:
            # NOTE(review): save_as is taken from the first file and then
            # reused for every following one - confirm a single file is
            # expected here.
            if self.save_as is None:
                self.save_as = rfile['name']

            rfile['save_as'] = self.save_as

            file_url = self._file_url(rfile)
            try:
                self.crl.setopt(pycurl.URL, file_url)
            except Exception:
                self.crl.setopt(pycurl.URL, file_url.encode('ascii', 'ignore'))

            # Create a buffer and assign it to the pycurl object
            output = BytesIO()
            self.crl.setopt(pycurl.WRITEFUNCTION, output.write)

            # NOTE(review): the response code is not checked here, so error
            # responses are parsed like successful ones.
            self.crl.perform()

            # Figure out what encoding was sent with the response, if any.
            # Check against lowercased header name.
            # Default encoding for HTML is iso-8859-1.
            # Other content types may have different default encoding,
            # or in case of binary data, may have no encoding at all.
            encoding = 'iso-8859-1'
            if 'content-type' in self.headers:
                content_type = self.headers['content-type'].lower()
                match = re.search(r'charset=(\S+)', content_type)
                if match:
                    encoding = match.group(1)

            # lets get the output in a string
            raw_headers = output.getvalue()
            try:
                result = raw_headers.decode(encoding)
            except (LookupError, UnicodeDecodeError):
                # Unknown or mismatching charset: iso-8859-1 maps every byte,
                # so the header lines can always be read.
                result = raw_headers.decode('iso-8859-1')

            self._store_header_metadata(rfile, result)
        return (self.files_to_download, [])

    @staticmethod
    def _store_header_metadata(rfile, header_text):
        '''
        Copy size and last-modification date from response headers to rfile.

        :param rfile: file description (dict), updated in place
        :param header_text: decoded response headers, one header per line
        :raises ValueError: if ``Content-Length`` is not an integer
        '''
        # Function-scope import: only needed for the two-digit year rule.
        import time

        # (pattern, day group, month group, year group)
        date_formats = (
            # Sun, 06 Nov 1994
            (r'(\w+),\s+(\d+)\s+(\w+)\s+(\d+)', 2, 3, 4),
            # Sunday, 06-Nov-94
            (r'(\w+),\s+(\d+)-(\w+)-(\d+)', 2, 3, 4),
            # Sun Nov  6 08:49:37 1994
            (r'(\w+)\s+(\w+)\s+(\d+)\s+\d{2}:\d{2}:\d{2}\s+(\d+)', 3, 2, 4),
        )

        for line in re.split(r'[\n\r]+', header_text):
            # Split on the first colon only: the date value itself contains
            # colons (hh:mm:ss) that must stay part of the value.
            field, colon, value = line.partition(':')
            if not colon:
                # Status line or blank line, not a header field.
                continue
            # Header field names are case-insensitive.
            field = field.strip().lower()
            value = value.strip()
            if field == 'content-length':
                rfile['size'] = int(value)
            elif field == 'last-modified':
                for pattern, day_group, month_group, year_group in date_formats:
                    res = re.match(pattern, value)
                    if not res:
                        continue
                    year = int(res.group(year_group))
                    if year < 100:
                        # Two-digit year: use the current century unless that
                        # lands more than 50 years in the future, in which
                        # case the previous century is meant.
                        current_year = time.gmtime().tm_year
                        year += (current_year // 100) * 100
                        if year > current_year + 50:
                            year -= 100
                    rfile['hash'] = hashlib.md5(
                        str(res.group(0)).encode('utf-8')).hexdigest()
                    rfile['day'] = int(res.group(day_group))
                    rfile['month'] = Utils.month_to_num(res.group(month_group))
                    rfile['year'] = year
                    break