def download(uri):
    """
    Download the given `uri` into a new temporary directory and return a
    `Download` describing the saved file.

    The returned `Download` carries the temporary directory path, the
    filename, the size in bytes of the response content, and the sha1 and md5
    checksums of the saved file.

    The filename is taken from the `Content-Disposition` response header when
    present, otherwise from the last path segment of the final response URL.
    In both cases only the base name is kept, so the file is always written
    inside the temporary directory.

    Raise a `requests.RequestException` when the response status code is not
    200.
    """
    response = requests.get(uri)

    if response.status_code != 200:
        raise requests.RequestException(
            "Unexpected HTTP status code {} while downloading {}".format(
                response.status_code, uri
            )
        )

    # NOTE(review): the `cgi` module is deprecated by PEP 594 and removed in
    # Python 3.13; this call will need a replacement before upgrading.
    content_disposition = response.headers.get("content-disposition", "")
    _, params = cgi.parse_header(content_disposition)

    # The header value is controlled by the remote server. Keep only the final
    # path component: an absolute path or a "../" prefix would otherwise make
    # `Path(download_directory, filename)` point outside of the directory.
    filename = os.path.basename(params.get("filename") or "")
    if not filename:
        # Using `response.url` in place of the provided `uri` since the former
        # will be more accurate in case of HTTP redirect.
        filename = os.path.basename(urlparse(response.url).path)

    download_directory = tempfile.mkdtemp()
    download_file = Path(download_directory, filename)

    file_content = response.content
    with open(download_file, "wb") as f:
        f.write(file_content)

    checksums = multi_checksums(download_file, ("md5", "sha1"))

    return Download(
        directory=download_directory,
        filename=filename,
        size=len(file_content),
        sha1=checksums["sha1"],
        md5=checksums["md5"],
    )
def get_file_info(location, **kwargs):
    """
    Return a mapping of file information collected for the file at `location`.

    The mapping holds, in this order: `date`, `size`, the `sha1`, `md5` and
    `sha256` checksums, `mime_type`, `file_type`, `programming_language` and
    the `is_binary`, `is_text`, `is_archive`, `is_media`, `is_source` and
    `is_script` boolean flags.

    Extra keyword arguments are accepted and ignored.
    """
    result = OrderedDict()

    # TODO: move date and size to the inventory collection step???
    result['date'] = get_last_modified_date(location) or None
    result['size'] = getsize(location) or 0

    # Look each digest up by name: unpacking `.values()` positionally would
    # silently swap the digests if the returned mapping were ever ordered
    # differently from the requested names.
    checksums = multi_checksums(location, ('sha1', 'md5', 'sha256'))
    result['sha1'] = checksums['sha1']
    result['md5'] = checksums['md5']
    result['sha256'] = checksums['sha256']

    collector = get_type(location)
    # Falsy type values are normalized to None.
    result['mime_type'] = collector.mimetype_file or None
    result['file_type'] = collector.filetype_file or None
    result['programming_language'] = collector.programming_language or None
    # Flags are always coerced to real booleans.
    result['is_binary'] = bool(collector.is_binary)
    result['is_text'] = bool(collector.is_text)
    result['is_archive'] = bool(collector.is_archive)
    result['is_media'] = bool(collector.is_media)
    result['is_source'] = bool(collector.is_source)
    result['is_script'] = bool(collector.is_script)
    return result
def test_multi_checksums_custom(self):
    # Requesting a single checksum name must return only that checksum.
    expected = {
        'sha512': u'5be9e01cd20ff288fd3c3fc46be5c2747eaa2c526197125330947a95cdb418222176b182a4680f0e435ba8f114363c45a67b30eed9a9222407e63ccbde46d3b4',
    }
    test_file = self.get_test_loc('hash/dir1/a.png')
    result = multi_checksums(test_file, ('sha512',))
    assert result == expected
def get_file_infos(location):
    """
    Collect details about the file or directory at `location` and return
    them as an ordered mapping.

    Type-related values and the `date` are only collected for files, and
    `files_count` only for directories; they are None (or False for the
    `is_*` flags) otherwise.
    """
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import multi_checksums
    from typecode import contenttype

    # The path goes through `path_to_bytes` on Linux and `path_to_unicode`
    # everywhere else.
    location = path_to_bytes(location) if on_linux else path_to_unicode(location)

    infos = OrderedDict()
    is_file = filetype.is_file(location)
    is_dir = filetype.is_dir(location)
    content = contenttype.get_type(location)

    infos['type'] = filetype.get_type(location, short=False)

    # Only files are split into a base name and an extension.
    name = fileutils.file_name(location)
    base_name, extension = fileutils.splitext(location) if is_file else (name, '')

    if on_linux:
        # Name parts are always reported through `path_to_unicode` on Linux.
        name, base_name, extension = [
            path_to_unicode(part) for part in (name, base_name, extension)
        ]
    infos['name'] = name
    infos['base_name'] = base_name
    infos['extension'] = extension

    infos['date'] = (
        (filetype.get_last_modified_date(location) or None) if is_file else None
    )
    infos['size'] = content.size
    infos.update(multi_checksums(location, ('sha1', 'md5')))
    infos['files_count'] = (
        (filetype.get_file_count(location) or None) if is_dir else None
    )

    # Falsy type values are normalized to None.
    described_attributes = (
        ('mime_type', 'mimetype_file'),
        ('file_type', 'filetype_file'),
        ('programming_language', 'programming_language'),
    )
    for key, attribute in described_attributes:
        infos[key] = (getattr(content, attribute) or None) if is_file else None

    # Flags are real booleans and always False for anything but a file.
    flags = (
        'is_binary',
        'is_text',
        'is_archive',
        'is_media',
        'is_source',
        'is_script',
    )
    for flag in flags:
        infos[flag] = bool(is_file and getattr(content, flag))

    return infos
def test_multi_checksums(self):
    # Several checksum names requested at once, passed as a list.
    checksum_names = 'md5 sha1 sha256'.split()
    test_file = self.get_test_loc('hash/dir1/a.png')
    result = multi_checksums(test_file, checksum_names)
    expected = {
        'md5': u'4760fb467f1ebf3b0aeace4a3926f1a4',
        'sha1': u'34ac5465d48a9b04fc275f09bc2230660df8f4f7',
        'sha256': u'1b598db6fee8f1ec7bb919c0adf68956f3d20af8c9934a9cf2db52e1347efd35',
    }
    assert result == expected
def test_multi_checksums_shattered1(self):
    # No checksum names are passed: the expected mapping lists every
    # checksum returned by default.
    expected = {
        'md5': 'ee4aa52b139d925f8d8884402b0a750c',
        'sha1': '38762cf7f55934b34d179ae6a4c80cadccbb7f0a',
        'sha256': '2bb787a73e37352f92383abe7e2902936d1059ad9f1ba6daaa9c1e58ee6970d0',
        'sha512': '3c19b2cbcf72f7f5b252ea31677b8f2323d6119e49bcc0fb55931d00132385f1e749bb24cbd68c04ac826ae8421802825d3587fe185abf709669bb9693f6b416',
        'sha1_git': 'ba9aaa145ccd24ef760cf31c74d8f7ca1a2e47b0',
    }
    test_file = self.get_test_loc('hash/sha1-collision/shattered-1.pdf')
    result = multi_checksums(test_file)
    assert result == expected
def get_file_infos(location):
    """
    Build and return an ordered mapping describing the file or directory
    found at `location`.

    `date` and the type-related entries are only filled in for files, and
    `files_count` only for directories.
    """
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import multi_checksums
    from typecode import contenttype

    # Pick the path conversion matching the platform.
    convert_path = path_to_bytes if on_linux else path_to_unicode
    location = convert_path(location)

    infos = OrderedDict()
    is_file = filetype.is_file(location)
    is_dir = filetype.is_dir(location)
    typ = contenttype.get_type(location)

    infos['type'] = filetype.get_type(location, short=False)

    name = fileutils.file_name(location)
    if not is_file:
        # Anything that is not a file has no extension to split off.
        base_name, extension = name, ''
    else:
        base_name, extension = fileutils.splitext(location)

    if not on_linux:
        infos['name'] = name
        infos['base_name'] = base_name
        infos['extension'] = extension
    else:
        # On Linux the name parts are passed through `path_to_unicode`.
        infos['name'] = path_to_unicode(name)
        infos['base_name'] = path_to_unicode(base_name)
        infos['extension'] = path_to_unicode(extension)

    if is_file:
        infos['date'] = filetype.get_last_modified_date(location) or None
    else:
        infos['date'] = None

    infos['size'] = typ.size
    infos.update(multi_checksums(location, ('sha1', 'md5')))

    if is_dir:
        infos['files_count'] = filetype.get_file_count(location) or None
    else:
        infos['files_count'] = None

    # Falsy type values are reported as None.
    infos['mime_type'] = (typ.mimetype_file or None) if is_file else None
    infos['file_type'] = (typ.filetype_file or None) if is_file else None
    infos['programming_language'] = (
        (typ.programming_language or None) if is_file else None
    )

    # Flags are plain booleans, False for anything that is not a file.
    infos['is_binary'] = bool(typ.is_binary) if is_file else False
    infos['is_text'] = bool(typ.is_text) if is_file else False
    infos['is_archive'] = bool(typ.is_archive) if is_file else False
    infos['is_media'] = bool(typ.is_media) if is_file else False
    infos['is_source'] = bool(typ.is_source) if is_file else False
    infos['is_script'] = bool(typ.is_script) if is_file else False
    return infos
def test_multi_checksums_shattered2(self):
    # No checksum names are passed: the expected mapping lists every
    # checksum returned by default.
    expected = {
        'md5': '5bd9d8cabc46041579a311230539b8d1',
        'sha1': '38762cf7f55934b34d179ae6a4c80cadccbb7f0a',
        'sha256': 'd4488775d29bdef7993367d541064dbdda50d383f89f0aa13a6ff2e0894ba5ff',
        'sha512': 'f39a04842e4b28e04558496beb7cb84654ded9c00b2f873c3ef64f9dfdbc760cd0273b816858ba5b203c0dd71af8b65d6a0c1032e00e48ace0b4705eedcc1bab',
        # Note: this is not the same as the sha1_git for shattered-1.pdf ;)
        'sha1_git': 'b621eeccd5c7edac9b7dcba35a8d5afd075e24f2',
    }
    test_file = self.get_test_loc('hash/sha1-collision/shattered-2.pdf')
    result = multi_checksums(test_file)
    assert result == expected
def get_file_infos(location, as_list=True):
    """
    Return information collected from the file or directory at `location`.

    The information is an ordered mapping. When `as_list` is true (the
    default), the mapping is returned wrapped in a one-item list; otherwise
    the mapping itself is returned.

    `extension`, `date` and the type-related entries are only collected for
    files, and `files_count` only for directories.
    """
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import multi_checksums
    from typecode import contenttype

    infos = OrderedDict()
    is_file = filetype.is_file(location)
    is_dir = filetype.is_dir(location)
    T = contenttype.get_type(location)

    infos['type'] = filetype.get_type(location, short=False)
    infos['name'] = fileutils.file_name(location)
    # Anything that is not a file gets an empty extension.
    infos['extension'] = (fileutils.file_extension(location) or '') if is_file else ''
    infos['date'] = (
        (filetype.get_last_modified_date(location) or None) if is_file else None
    )
    infos['size'] = T.size
    infos.update(multi_checksums(location, ('sha1', 'md5')))
    infos['files_count'] = (
        (filetype.get_file_count(location) or None) if is_dir else None
    )

    # Falsy type values are reported as None.
    infos['mime_type'] = (T.mimetype_file or None) if is_file else None
    infos['file_type'] = (T.filetype_file or None) if is_file else None
    infos['programming_language'] = (
        (T.programming_language or None) if is_file else None
    )

    # Flags are plain booleans, False for anything that is not a file.
    infos['is_binary'] = bool(is_file and T.is_binary)
    infos['is_text'] = bool(is_file and T.is_text)
    infos['is_archive'] = bool(is_file and T.is_archive)
    infos['is_media'] = bool(is_file and T.is_media)
    infos['is_source'] = bool(is_file and T.is_source)
    infos['is_script'] = bool(is_file and T.is_script)

    return [infos] if as_list else infos