def test_document_is_valid_when_using_or_later_licenses(self):
    """
    Validate a fully populated document whose file uses the
    'LGPL-2.1-or-later' identifier and expect no validation messages.
    """
    document = Document(
        Version(2, 1),
        License.from_identifier('CC0-1.0'),
        'Sample_Document-V2.1',
        spdx_id='SPDXRef-DOCUMENT',
        namespace='https://spdx.org/spdxdocs/spdx-example-444504E0-4F89-41D3-9A0C-0305E82C3301',
    )
    creation = document.creation_info
    creation.add_creator(Tool('ScanCode'))
    creation.set_created_now()

    # One package holding a single file.
    pkg = Package(name='some/path', download_location=NoAssert())
    document.package = pkg
    pkg.spdx_id = 'SPDXRef-Package'
    pkg.cr_text = 'Some copyrught'
    pkg.verif_code = 'SOME code'
    pkg.license_declared = NoAssert()
    pkg.conc_lics = NoAssert()

    entry = File('./some/path/tofile')
    entry.name = './some/path/tofile'
    entry.spdx_id = 'SPDXRef-File'
    entry.chk_sum = Algorithm('SHA1', 'SOME-SHA1')
    entry.conc_lics = NoAssert()
    entry.copyright = NoAssert()

    or_later_license = License.from_identifier('LGPL-2.1-or-later')
    entry.add_lics(or_later_license)
    pkg.add_lics_from_file(or_later_license)
    pkg.add_file(entry)

    # validate() hands back the message collection it was given.
    reported = document.validate(ErrorMessages())
    assert not reported
def test_document_validate_failures_returns_informative_messages(self):
    """
    Validate a document with no creation info and an incomplete package,
    and compare the returned messages against the exact expected list.
    """
    document = Document(
        Version(2, 1),
        License.from_identifier('CC0-1.0'),
        'Sample_Document-V2.1',
        spdx_id='SPDXRef-DOCUMENT',
        namespace='https://spdx.org/spdxdocs/spdx-example-444504E0-4F89-41D3-9A0C-0305E82C3301',
    )
    pkg = Package('some/path', NoAssert())
    document.package = pkg

    # The file is built but never added to the package.
    entry = File('./some/path/tofile')
    entry.name = './some/path/tofile'
    entry.spdx_id = 'SPDXRef-File'
    entry.chk_sum = Algorithm('SHA1', 'SOME-SHA1')

    lgpl_only = License.from_identifier('LGPL-2.1-only')
    entry.add_lics(lgpl_only)
    pkg.add_lics_from_file(lgpl_only)

    reported = document.validate([])

    wanted = [
        'No creators defined, must have at least one.',
        'Creation info missing created date.',
        'Package checksum must be instance of spdx.checksum.Algorithm',
        'Package download_location can not be None.',
        'Package verif_code can not be None.',
        'Package cr_text can not be None.',
        'Package must have at least one file.',
        'Package concluded license must be instance of spdx.utils.SPDXNone '
        'or spdx.utils.NoAssert or spdx.document.License',
        'Package declared license must be instance of spdx.utils.SPDXNone '
        'or spdx.utils.NoAssert or spdx.document.License'
    ]
    assert wanted == reported
def _get_lgpl_doc(self, or_later=False):
    """
    Return a document with one package and one file licensed under
    'LGPL-2.1', or under 'LGPL-2.1+' when ``or_later`` is true.
    """
    document = Document(Version(2, 1), License.from_identifier('CC0-1.0'))
    creation = document.creation_info
    creation.add_creator(Tool('ScanCode'))
    creation.set_created_now()

    pkg = Package(name='some/path', download_location=NoAssert())
    document.package = pkg
    pkg.cr_text = 'Some copyrught'
    pkg.verif_code = 'SOME code'
    pkg.license_declared = NoAssert()
    pkg.conc_lics = NoAssert()

    entry = File('./some/path/tofile')
    entry.name = './some/path/tofile'
    entry.chk_sum = Algorithm('SHA1', 'SOME-SHA1')
    entry.conc_lics = NoAssert()
    entry.copyright = NoAssert()

    # Start from the plain identifier and swap in the "+" form on request.
    lgpl = License.from_identifier('LGPL-2.1')
    if or_later:
        lgpl = License.from_identifier('LGPL-2.1+')

    entry.add_lics(lgpl)
    pkg.add_lics_from_file(lgpl)
    pkg.add_file(entry)
    return document
def _get_lgpl_doc(self, or_later=False):
    """
    Return a named, namespaced document with one package and one file
    licensed under 'LGPL-2.1', or under 'LGPL-2.1+' when ``or_later`` is
    true.
    """
    document = Document(
        Version(2, 1),
        License.from_identifier('CC0-1.0'),
        'Sample_Document-V2.1',
        spdx_id='SPDXRef-DOCUMENT',
        namespace='https://spdx.org/spdxdocs/spdx-example-444504E0-4F89-41D3-9A0C-0305E82C3301',
    )
    creation = document.creation_info
    creation.add_creator(Tool('ScanCode'))
    creation.set_created_now()

    pkg = Package(name='some/path', download_location=NoAssert())
    document.package = pkg
    pkg.cr_text = 'Some copyrught'
    pkg.verif_code = 'SOME code'
    pkg.license_declared = NoAssert()
    pkg.conc_lics = NoAssert()

    entry = File('./some/path/tofile')
    entry.name = './some/path/tofile'
    entry.spdx_id = 'SPDXRef-File'
    entry.chk_sum = Algorithm('SHA1', 'SOME-SHA1')
    entry.conc_lics = NoAssert()
    entry.copyright = NoAssert()

    # Start from the plain identifier and swap in the "+" form on request.
    lgpl = License.from_identifier('LGPL-2.1')
    if or_later:
        lgpl = License.from_identifier('LGPL-2.1+')

    entry.add_lics(lgpl)
    pkg.add_lics_from_file(lgpl)
    pkg.add_file(entry)
    return document
def test_document_is_valid_when_using_or_later_licenses(self):
    """
    Validate a populated document whose file uses 'LGPL-2.1+' and expect a
    truthy result with no messages collected.
    """
    document = Document(Version(2, 1), License.from_identifier('CC0-1.0'))
    creation = document.creation_info
    creation.add_creator(Tool('ScanCode'))
    creation.set_created_now()

    pkg = Package(name='some/path', download_location=NoAssert())
    document.package = pkg
    pkg.cr_text = 'Some copyrught'
    pkg.verif_code = 'SOME code'
    pkg.license_declared = NoAssert()
    pkg.conc_lics = NoAssert()

    entry = File('./some/path/tofile')
    entry.name = './some/path/tofile'
    entry.chk_sum = Algorithm('SHA1', 'SOME-SHA1')
    entry.conc_lics = NoAssert()
    entry.copyright = NoAssert()

    or_later_license = License.from_identifier('LGPL-2.1+')
    entry.add_lics(or_later_license)
    pkg.add_lics_from_file(or_later_license)
    pkg.add_file(entry)

    # validate() fills the list passed in and returns the verdict.
    collected = []
    verdict = document.validate(collected)
    assert verdict
    assert not collected
def test_document_validate_failures_returns_informative_messages(self):
    """
    Validate a document without creators and expect a falsy result plus
    the single 'No creators defined' message.
    """
    document = Document(Version(2, 1), License.from_identifier('CC0-1.0'))
    pkg = Package('some/path', NoAssert())
    document.package = pkg

    # The file is built but never added to the package.
    entry = File('./some/path/tofile')
    entry.name = './some/path/tofile'
    entry.chk_sum = Algorithm('SHA1', 'SOME-SHA1')

    lgpl = License.from_identifier('LGPL-2.1')
    entry.add_lics(lgpl)
    pkg.add_lics_from_file(lgpl)

    collected = []
    verdict = document.validate(collected)
    assert not verdict

    wanted = ['No creators defined, must have at least one.']
    assert wanted == collected
def test_document_validate_failures_returns_informative_messages(self):
    """
    Validate a named, namespaced document without creators and expect a
    falsy result plus the single 'No creators defined' message.
    """
    document = Document(
        Version(2, 1),
        License.from_identifier('CC0-1.0'),
        'Sample_Document-V2.1',
        spdx_id='SPDXRef-DOCUMENT',
        namespace='https://spdx.org/spdxdocs/spdx-example-444504E0-4F89-41D3-9A0C-0305E82C3301',
    )
    pkg = Package('some/path', NoAssert())
    document.package = pkg

    # The file is built but never added to the package.
    entry = File('./some/path/tofile')
    entry.name = './some/path/tofile'
    entry.spdx_id = 'SPDXRef-File'
    entry.chk_sum = Algorithm('SHA1', 'SOME-SHA1')

    lgpl = License.from_identifier('LGPL-2.1')
    entry.add_lics(lgpl)
    pkg.add_lics_from_file(lgpl)

    collected = []
    verdict = document.validate(collected)
    assert not verdict

    wanted = ['No creators defined, must have at least one.']
    assert wanted == collected
def generate_spdx_file(self) -> File:
    """Build the SPDX ``File`` entry for this source file.

    Example of the resulting SPDX File section:
        FileName: ./tests/test_mbed_targets.py
        SPDXID: SPDXRef-cb9cce30c285e6083c2d19a463cbe592
        FileChecksum: SHA1: d3db49873bd2b1cab45bf81e7d88617dea6caaff
        LicenseConcluded: NOASSERTION
        FileCopyrightText: NONE

    Returns:
        the populated SPDX file entry
    """
    spdx_file = File(determine_spdx_value(self.unix_relative_path))
    spdx_file.type = FileType.SOURCE
    spdx_file.comment = determine_spdx_value(None)
    spdx_file.chk_sum = Algorithm("SHA1", self.sha1_check_sum)

    # The concluded licence and the licence found in the file are both
    # derived from the same attribute, as two separate License objects.
    concluded_identifier = str(determine_spdx_value(self.licence))
    spdx_file.conc_lics = License.from_identifier(concluded_identifier)

    spdx_file.spdx_id = f"SPDXRef-{self.id}"
    spdx_file.copyright = determine_spdx_value(self.copyright)

    in_file_identifier = str(determine_spdx_value(self.licence))
    spdx_file.add_lics(License.from_identifier(in_file_identifier))
    return spdx_file
def create(self):
    """
    Write identifier scan results as SPDX Tag/value or RDF.

    Build ``self.spdx_document`` with a single package and one ``File``
    entry per non-skipped item of ``self.id_scan_results``, then serialize
    it to ``self.output_file``. When no file entry was added, only a
    "No results" comment line is written instead of a document.
    """
    self.get_output_file()
    self.spdx_document = Document(
        version=Version(2, 1),
        data_license=License.from_identifier(
            self.code_extra_params["lic_identifier"]))
    self.set_creation_info()

    package = self.spdx_document.package = Package(
        download_location=NoAssert(),
        version=self.get_package_version())
    self.set_package_info(package)

    file_license_list = []
    bar = Bar('Writing to spdx file', max=len(self.id_scan_results))
    if isPath(self.path_or_file):
        for idx, file_data in enumerate(self.id_scan_results):
            file_name = file_data["FileName"]
            # Open and immediately close the file: this keeps the early
            # failure on unreadable paths without leaking one file handle
            # per scanned file.
            with open(file_name, "r"):
                pass
            if not shouldSkipFile(file_name, self.output_file_name):
                name = file_name.replace(self.path_or_file, '.')
                file_entry = File(
                    name=name,
                    chk_sum=Algorithm(
                        'SHA1', get_file_hash(file_name) or ''))
                if self.doc_type == TAG_VALUE:
                    spdx_license = License.from_identifier(
                        file_data["SPDXID"])
                else:
                    # RDF output describes every identifier as an
                    # extracted license with placeholder details.
                    licenseref_id = ('SPDXID-Doc-Generator-'
                                     + file_data["SPDXID"])
                    spdx_license = ExtractedLicense(licenseref_id)
                    spdx_license.name = NoAssert()
                    comment = "N/A"
                    spdx_license.comment = comment
                    text = NoAssert()
                    # NOTE(review): this fallback only triggers if a
                    # NoAssert instance is falsy -- confirm.
                    if not text:
                        text = comment
                    spdx_license.text = text
                    self.spdx_document.add_extr_lic(spdx_license)
                    package.add_lics_from_file(spdx_license)
                file_entry.add_lics(spdx_license)
                file_license_list.append(spdx_license)
                file_entry.conc_lics = NoAssert()
                file_entry.copyright = SPDXNone()
                file_entry.spdx_id = self.code_extra_params[
                    "file_ref"].format(idx + 1)
                package.add_file(file_entry)
            bar.next()

    if self.doc_type == TAG_VALUE:
        # Tag/value output lists each distinct license once on the package.
        for spdx_license in set(file_license_list):
            package.add_lics_from_file(spdx_license)
    bar.finish()

    if len(package.files) == 0:
        if self.doc_type == TAG_VALUE:
            self.output_file.write(
                "# No results for package '{}'.\n".format(package.name))
        else:
            self.output_file.write(
                "<!-- No results for package '{}'. -->\n".format(
                    package.name))

    if self.doc_type == TAG_VALUE:
        from spdx.writers.tagvalue import write_document  # NOQA
    else:
        from spdx.writers.rdf import write_document  # NOQA

    if package.files:
        # Tag/value is produced as text and encoded before writing; RDF is
        # collected in a bytes buffer and written unchanged.
        if self.doc_type == TAG_VALUE:
            spdx_output = io.StringIO()
        else:
            spdx_output = io.BytesIO()
        write_document(self.spdx_document, spdx_output, validate=True)
        result = spdx_output.getvalue()
        if self.doc_type == TAG_VALUE:
            result = result.encode('utf-8')
        self.output_file.write(result)
def write_spdx(output_file, files, tool_name, tool_version, notice, input_file, as_tagvalue=True):
    """
    Write scan output as SPDX Tag/value or RDF.

    Build one SPDX document with a single package named after the base name
    of the ``input_file`` directory (or of its parent directory when
    ``input_file`` is not a directory), add one SPDX file entry per item of
    ``files`` whose 'type' is 'file', and write the serialized document to
    ``output_file``. Write RDF instead of Tag/value when ``as_tagvalue`` is
    False. ``notice`` becomes the document comment; ``tool_name`` (defaulting
    to 'ScanCode') and ``tool_version`` form the creator tool entry.
    """
    _patch_license_list()

    absinput = abspath(input_file)

    if isdir(absinput):
        input_path = absinput
    else:
        input_path = dirname(absinput)

    doc = Document(Version(2, 1), License.from_identifier('CC0-1.0'))
    doc.comment = notice

    tool_name = tool_name or 'ScanCode'
    doc.creation_info.add_creator(Tool(tool_name + ' ' + tool_version))
    doc.creation_info.set_created_now()

    package = doc.package = Package(
        name=basename(input_path),
        download_location=NoAssert()
    )

    # Use a set of unique copyrights for the package.
    package.cr_text = set()

    # Both flags start True and are cleared as soon as one file has license
    # (or copyright) data, or has a None value for it.
    all_files_have_no_license = True
    all_files_have_no_copyright = True

    # FIXME: this should walk the codebase instead!!!
    for file_data in files:
        # Skip directories.
        if file_data.get('type') != 'file':
            continue

        # Set a relative file name as that is what we want in
        # SPDX output (with explicit leading './').
        name = './' + file_data.get('path')
        file_entry = File(
            name=name,
            chk_sum=Algorithm('SHA1', file_data.get('sha1') or '')
        )

        file_licenses = file_data.get('licenses')
        if file_licenses:
            all_files_have_no_license = False
            for file_license in file_licenses:
                spdx_id = file_license.get('spdx_license_key')
                if spdx_id:
                    spdx_license = License.from_identifier(spdx_id)
                else:
                    # No SPDX key: describe the license as an extracted
                    # license under a 'LicenseRef-scancode-' identifier.
                    license_key = file_license.get('key')
                    licenseref_id = 'LicenseRef-scancode-' + license_key
                    spdx_license = ExtractedLicense(licenseref_id)
                    spdx_license.name = file_license.get('short_name')
                    comment = ('See details at https://github.com/nexB/scancode-toolkit'
                               '/blob/develop/src/licensedcode/data/licenses/%s.yml\n' % license_key)
                    spdx_license.comment = comment
                    text = file_license.get('matched_text')
                    # always set some text, even if we did not extract the matched text
                    if not text:
                        text = comment
                    spdx_license.text = text
                    doc.add_extr_lic(spdx_license)

                # Add licenses in the order they appear in the file. Maintaining the order
                # might be useful for provenance purposes.
                file_entry.add_lics(spdx_license)
                package.add_lics_from_file(spdx_license)

        elif file_licenses is None:
            # A None value is recorded as NOASSERTION rather than NONE.
            all_files_have_no_license = False
            file_entry.add_lics(NoAssert())

        else:
            file_entry.add_lics(SPDXNone())

        file_entry.conc_lics = NoAssert()

        file_copyrights = file_data.get('copyrights')
        if file_copyrights:
            all_files_have_no_copyright = False
            file_entry.copyright = []
            for file_copyright in file_copyrights:
                file_entry.copyright.append(file_copyright.get('value'))

            package.cr_text.update(file_entry.copyright)

            # Create a text of copyright statements in the order they appear in the file.
            # Maintaining the order might be useful for provenance purposes.
            file_entry.copyright = '\n'.join(file_entry.copyright) + '\n'

        elif file_copyrights is None:
            # A None value is recorded as NOASSERTION rather than NONE.
            all_files_have_no_copyright = False
            file_entry.copyright = NoAssert()

        else:
            file_entry.copyright = SPDXNone()

        package.add_file(file_entry)

    if len(package.files) == 0:
        # Nothing to report: emit a comment in the syntax of the target format.
        if as_tagvalue:
            msg = "# No results for package '{}'.\n".format(package.name)
        else:
            # rdf
            msg = "<!-- No results for package '{}'. -->\n".format(package.name)
        output_file.write(msg)

    # Remove duplicate licenses from the list for the package.
    unique_licenses = {(l.identifier, l.full_name): l for l in package.licenses_from_files}
    unique_licenses = list(unique_licenses.values())
    if not len(package.licenses_from_files):
        if all_files_have_no_license:
            package.licenses_from_files = [SPDXNone()]
        else:
            package.licenses_from_files = [NoAssert()]
    else:
        # List license identifiers alphabetically for the package.
        package.licenses_from_files = sorted(unique_licenses, key=lambda x: x.identifier)

    if len(package.cr_text) == 0:
        if all_files_have_no_copyright:
            package.cr_text = SPDXNone()
        else:
            package.cr_text = NoAssert()
    else:
        # Create a text of alphabetically sorted copyright
        # statements for the package.
        package.cr_text = '\n'.join(sorted(package.cr_text)) + '\n'

    package.verif_code = doc.package.calc_verif_code()
    package.license_declared = NoAssert()
    package.conc_lics = NoAssert()

    # The spdx-tools write_document returns either:
    # - unicode for tag values
    # - UTF8-encoded bytes for rdf because somehow the rdf and xml
    #   libraries do the encoding and do not return text but bytes
    # The file passed by ScanCode for output is opened in text mode. Therefore in
    # one case we do need to deal with bytes and decode before writing (rdf) and
    # in the other case we deal with text all the way.
    if package.files:

        if as_tagvalue:
            from spdx.writers.tagvalue import write_document  # NOQA
        else:
            from spdx.writers.rdf import write_document  # NOQA

        if as_tagvalue:
            # unicode text everywhere
            spdx_output = StringIO()
        else:
            # rdf as utf-encoded bytes on Py2
            spdx_output = BytesIO()

        write_document(doc, spdx_output, validate=False)
        result = spdx_output.getvalue()

        if as_tagvalue:
            # unicode text everywhere
            pass
        else:
            # rdf as utf-encoded bytes on Py2
            result = result.decode('utf-8')

        output_file.write(result)
# Document-level metadata: comment, data license, creator and creation date.
doc.comment = 'Example Document'
doc.data_license = License.from_identifier('CC0-1.0')
doc.creation_info.add_creator(Person('Alice', '*****@*****.**'))
doc.creation_info.set_created_now()
# Attach a review by a person created without an email, dated now.
review = Review(Person('Joe', None))
review.set_review_date_now()
review.comment = 'Joe reviewed this document'
doc.add_review(review)
# File
# First file: a binary with a BSD-2-Clause license, no copyright (NONE)
# and three artifact-of entries.
testfile1 = File('TestFile1')
testfile1.type = FileType.BINARY
testfile1.comment = 'This is a test file.'
testfile1.chk_sum = Algorithm('SHA1', 'c537c5d99eca5333f23491d47ededd083fefb7ad')
testfile1.conc_lics = License.from_identifier('BSD-2-Clause')
testfile1.add_lics(License.from_identifier('BSD-2-Clause'))
testfile1.copyright = SPDXNone()
testfile1.add_artifact('name', 'TagWriteTest')
testfile1.add_artifact('home', UnKnown())
testfile1.add_artifact('uri', 'http://tagwritetest.test')

# Second file: a source file with an Apache-2.0 license and a NOASSERTION
# copyright.
testfile2 = File('TestFile2')
testfile2.type = FileType.SOURCE
testfile2.comment = 'This is a test file.'
testfile2.chk_sum = Algorithm('SHA1', 'bb154f28d1cf0646ae21bb0bec6c669a2b90e113')
testfile2.conc_lics = License.from_identifier('Apache-2.0')
testfile2.add_lics(License.from_identifier('Apache-2.0'))
testfile2.copyright = NoAssert()

# Package
# Document-level metadata: version, comment, data license, creator and
# creation date.
doc.version = Version(1, 2)
doc.comment = 'Example Document'
doc.data_license = License.from_identifier('CC0-1.0')
doc.creation_info.add_creator(Person('Alice', '*****@*****.**'))
doc.creation_info.set_created_now()
# Attach a review by a person created without an email, dated now.
review = Review(Person('Joe', None))
review.set_review_date_now()
review.comment = 'Joe reviewed this document'
doc.add_review(review)
# File
# First file: a binary with a BSD-2-Clause license, no copyright (NONE)
# and three artifact-of entries.
testfile1 = File('TestFile1')
testfile1.type = FileType.BINARY
testfile1.comment = 'This is a test file.'
testfile1.chk_sum = Algorithm('SHA1', 'c537c5d99eca5333f23491d47ededd083fefb7ad')
testfile1.conc_lics = License.from_identifier('BSD-2-Clause')
testfile1.add_lics(License.from_identifier('BSD-2-Clause'))
testfile1.copyright = SPDXNone()
testfile1.add_artifact('name', 'TagWriteTest')
testfile1.add_artifact('home', UnKnown())
testfile1.add_artifact('uri', 'http://tagwritetest.test')

# Second file: a source file with an Apache-2.0 license and a NOASSERTION
# copyright.
testfile2 = File('TestFile2')
testfile2.type = FileType.SOURCE
testfile2.comment = 'This is a test file.'
testfile2.chk_sum = Algorithm('SHA1', 'bb154f28d1cf0646ae21bb0bec6c669a2b90e113')
testfile2.conc_lics = License.from_identifier('Apache-2.0')
testfile2.add_lics(License.from_identifier('Apache-2.0'))
testfile2.copyright = NoAssert()

# Package
# Document-level metadata: namespace, data license, creator and creation date.
doc.namespace = "spdx"
doc.data_license = License.from_identifier("CC0-1.0")
doc.creation_info.add_creator(Person("Alice", "*****@*****.**"))
doc.creation_info.set_created_now()
# Attach a review by a person created without an email, dated now.
review = Review(Person("Joe", None))
review.set_review_date_now()
review.comment = "Joe reviewed this document"
doc.add_review(review)
# File
# First file: a binary with a BSD-2-Clause license, no copyright (NONE)
# and three artifact-of entries.
testfile1 = File("TestFile1")
testfile1.type = FileType.BINARY
# NOTE(review): "TestFilet" looks like a typo for "TestFile1" (the second
# file uses "TestFile2#SPDXRef-FILE") -- confirm before changing the id.
testfile1.spdx_id = "TestFilet#SPDXRef-FILE"
testfile1.comment = "This is a test file."
testfile1.chk_sum = Algorithm("SHA1", "c537c5d99eca5333f23491d47ededd083fefb7ad")
testfile1.conc_lics = License.from_identifier("BSD-2-Clause")
testfile1.add_lics(License.from_identifier("BSD-2-Clause"))
testfile1.copyright = SPDXNone()
testfile1.add_artifact("name", "TagWriteTest")
testfile1.add_artifact("home", UnKnown())
testfile1.add_artifact("uri", "http://tagwritetest.test")

# Second file: a source file with an Apache-2.0 license and a NOASSERTION
# copyright.
testfile2 = File("TestFile2")
testfile2.type = FileType.SOURCE
testfile2.spdx_id = "TestFile2#SPDXRef-FILE"
testfile2.comment = "This is a test file."
testfile2.chk_sum = Algorithm("SHA1", "bb154f28d1cf0646ae21bb0bec6c669a2b90e113")
testfile2.conc_lics = License.from_identifier("Apache-2.0")
testfile2.add_lics(License.from_identifier("Apache-2.0"))
testfile2.copyright = NoAssert()

# Package
def write_spdx(version, notice, scanned_files, input, output_file, as_tagvalue=True):
    """
    Write scan output formatted as SPDX Tag/value or RDF.

    Build one SPDX document with a single package named after the base name
    of the ``input`` directory (or of its parent directory when ``input`` is
    not a directory), add one SPDX file entry per item of ``scanned_files``
    that has a SHA1 or is an existing file on disk, and write the serialized
    document to ``output_file``. Write RDF instead of Tag/value when
    ``as_tagvalue`` is False. ``notice`` becomes the document comment and
    ``version`` is appended to the 'ScanCode ' creator tool name.
    """
    absinput = abspath(input)

    if os.path.isdir(absinput):
        input_path = absinput
    else:
        input_path = os.path.dirname(absinput)

    doc = Document(Version(2, 1), License.from_identifier('CC0-1.0'))
    doc.comment = notice

    doc.creation_info.add_creator(Tool('ScanCode ' + version))
    doc.creation_info.set_created_now()

    package = doc.package = Package(
        name=os.path.basename(input_path),
        download_location=NoAssert()
    )

    # Use a set of unique copyrights for the package.
    package.cr_text = set()

    # Both flags start True and are cleared as soon as one file has license
    # (or copyright) data, or has a None value for it.
    all_files_have_no_license = True
    all_files_have_no_copyright = True

    for file_data in scanned_files:
        # Construct the absolute path in case we need to access the file
        # to calculate its SHA1.
        file_entry = File(os.path.join(input_path, file_data.get('path')))

        file_sha1 = file_data.get('sha1')
        if not file_sha1:
            if os.path.isfile(file_entry.name):
                # Calculate the SHA1 in case it is missing, e.g. for empty files.
                file_sha1 = file_entry.calc_chksum()
            else:
                # Skip directories.
                continue

        # Restore the relative file name as that is what we want in
        # SPDX output (with explicit leading './').
        file_entry.name = './' + file_data.get('path')
        file_entry.chk_sum = Algorithm('SHA1', file_sha1)

        file_licenses = file_data.get('licenses')
        if file_licenses:
            all_files_have_no_license = False
            for file_license in file_licenses:
                spdx_id = file_license.get('spdx_license_key')
                if spdx_id:
                    # spdx_id = spdx_id.rstrip('+')
                    spdx_license = License.from_identifier(spdx_id)
                else:
                    # No SPDX key: describe the license as an extracted
                    # license under a 'LicenseRef-' identifier.
                    license_key = file_license.get('key')
                    # FIXME: we should prefix this with ScanCode-
                    licenseref_id = 'LicenseRef-' + license_key
                    spdx_license = ExtractedLicense(licenseref_id)
                    spdx_license.name = file_license.get('short_name')
                    comment = 'See details at https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/licenses/%s.yml\n' % license_key
                    spdx_license.comment = comment
                    text = file_license.get('matched_text')
                    # always set some text, even if we did not extract the matched text
                    if not text:
                        text = comment
                    spdx_license.text = text
                    doc.add_extr_lic(spdx_license)

                # Add licenses in the order they appear in the file. Maintaining the order
                # might be useful for provenance purposes.
                file_entry.add_lics(spdx_license)
                package.add_lics_from_file(spdx_license)

        elif file_licenses is None:
            # A None value is recorded as NOASSERTION rather than NONE.
            all_files_have_no_license = False
            file_entry.add_lics(NoAssert())

        else:
            file_entry.add_lics(SPDXNone())

        file_entry.conc_lics = NoAssert()

        file_copyrights = file_data.get('copyrights')
        if file_copyrights:
            all_files_have_no_copyright = False
            file_entry.copyright = []
            for file_copyright in file_copyrights:
                file_entry.copyright.extend(file_copyright.get('statements'))

            package.cr_text.update(file_entry.copyright)

            # Create a text of copyright statements in the order they appear in the file.
            # Maintaining the order might be useful for provenance purposes.
            file_entry.copyright = '\n'.join(file_entry.copyright) + '\n'

        elif file_copyrights is None:
            # A None value is recorded as NOASSERTION rather than NONE.
            all_files_have_no_copyright = False
            file_entry.copyright = NoAssert()

        else:
            file_entry.copyright = SPDXNone()

        package.add_file(file_entry)

    if len(package.files) == 0:
        # Nothing to report: emit a comment in the syntax of the target format.
        if as_tagvalue:
            output_file.write("# No results for package '{}'.\n".format(package.name))
        else:
            output_file.write("<!-- No results for package '{}'. -->\n".format(package.name))

    # Remove duplicate licenses from the list for the package.
    unique_licenses = set(package.licenses_from_files)
    if not len(package.licenses_from_files):
        if all_files_have_no_license:
            package.licenses_from_files = [SPDXNone()]
        else:
            package.licenses_from_files = [NoAssert()]
    else:
        # List license identifiers alphabetically for the package.
        package.licenses_from_files = sorted(unique_licenses, key=lambda x: x.identifier)

    if len(package.cr_text) == 0:
        if all_files_have_no_copyright:
            package.cr_text = SPDXNone()
        else:
            package.cr_text = NoAssert()
    else:
        # Create a text of alphabetically sorted copyright
        # statements for the package.
        package.cr_text = '\n'.join(sorted(package.cr_text)) + '\n'

    package.verif_code = doc.package.calc_verif_code()
    package.license_declared = NoAssert()
    package.conc_lics = NoAssert()

    if as_tagvalue:
        from spdx.writers.tagvalue import write_document
    else:
        from spdx.writers.rdf import write_document

    # The spdx-tools write_document returns either:
    # - unicode for tag values
    # - UTF8-encoded bytes for rdf because somehow the rdf and xml
    #   libraries do the encoding
    # The file passed by ScanCode for output is always opened in binary
    # mode and needs to receive UTF8-encoded bytes.
    # Therefore in one case we do nothing (rdf) and in the other case we
    # encode to UTF8 bytes.

    # NOTE(review): the StringIO module only exists on Python 2.
    from StringIO import StringIO
    spdx_output = StringIO()
    write_document(doc, spdx_output, validate=True)
    result = spdx_output.getvalue()
    if as_tagvalue:
        result = result.encode('utf-8')
    output_file.write(result)
def create_spdx_document(self):
    """
    Write identifier scan results as SPDX Tag/value or RDF.

    Build ``self.spdx_document`` with a single package and one ``File``
    entry per non-skipped item of ``self.id_scan_results``, then serialize
    it to ``self.output_file``. When no file entry was added, only a
    "No results" comment line is written instead of a document.
    """
    logging.basicConfig(level=logging.INFO)
    logging.info("Creating spdx document")
    self.get_output_file()
    self.spdx_document = Document(
        version=Version(2, 1),
        data_license=License.from_identifier(
            self.code_extra_params["lic_identifier"]),
    )
    self.set_creation_info()

    package = self.spdx_document.package = Package(
        download_location=NoAssert(),
        version=self.get_package_version())
    self.set_package_info(package)

    file_license_list = []
    if is_dir(self.path_or_file):
        for idx, file_data in enumerate(self.id_scan_results):
            file_name = file_data["FileName"]
            # Open and immediately close the file: this keeps the early
            # failure on unreadable paths without leaking one file handle
            # per scanned file.
            with open(file_name, "r"):
                pass
            if not should_skip_file(file_name, self.output_file_name):
                name = file_name.replace(self.path_or_file, ".")
                file_entry = File(
                    name=name,
                    chk_sum=Algorithm(
                        "SHA1", get_file_hash(file_name) or ""),
                )
                if self.doc_type == TAG_VALUE:
                    spdx_license = License.from_identifier(
                        file_data["SPDXID"])
                else:
                    # RDF output describes every identifier as an
                    # extracted license with placeholder details.
                    licenseref_id = ("SPDXID-Doc-Generator-"
                                     + file_data["SPDXID"])
                    spdx_license = ExtractedLicense(licenseref_id)
                    spdx_license.name = NoAssert()
                    comment = "N/A"
                    spdx_license.comment = comment
                    text = NoAssert()
                    # NOTE(review): this fallback only triggers if a
                    # NoAssert instance is falsy -- confirm.
                    if not text:
                        text = comment
                    spdx_license.text = text
                    self.spdx_document.add_extr_lic(spdx_license)
                    package.add_lics_from_file(spdx_license)
                file_entry.add_lics(spdx_license)
                file_license_list.append(spdx_license)
                file_entry.conc_lics = NoAssert()
                file_entry.copyright = SPDXNone()
                file_entry.spdx_id = self.code_extra_params[
                    "file_ref"].format(idx + 1)
                package.add_file(file_entry)

    if self.doc_type == TAG_VALUE:
        # Tag/value output lists each distinct license once on the package.
        for spdx_license in set(file_license_list):
            package.add_lics_from_file(spdx_license)

    if len(package.files) == 0:
        if self.doc_type == TAG_VALUE:
            self.output_file.write(
                "# No results for package '{}'.\n".format(package.name))
        else:
            self.output_file.write(
                "<!-- No results for package '{}'. -->\n".format(
                    package.name))

    if self.doc_type == TAG_VALUE:
        from spdx.writers.tagvalue import write_document  # NOQA
    else:
        from spdx.writers.rdf import write_document  # NOQA

    if package.files:
        # NOTE(review): both formats are collected in a text buffer here, so
        # the RDF result is written without encoding while the Tag/value
        # result is encoded to UTF-8 bytes -- confirm the RDF writer and
        # the output file mode agree with that.
        spdx_output = io.StringIO()
        write_document(self.spdx_document, spdx_output, validate=False)
        if self.doc_type == TAG_VALUE:
            logging.info("SPDX Tag-Value Document created successfully.")
        else:
            logging.info("SPDX RDF Document created successfully.")
        result = spdx_output.getvalue()
        if self.doc_type == TAG_VALUE:
            result = result.encode("utf-8")
        self.output_file.write(result)
def write_spdx(
    output_file,
    files,
    tool_name,
    tool_version,
    notice,
    package_name='',
    download_location=NoAssert(),
    as_tagvalue=True,
    spdx_version=(2, 2),
    with_notice_text=False,
):
    """
    Write scan output as SPDX Tag/value to ``output_file`` file-like object
    using the ``files`` list of scanned file data.
    Write as RDF XML if ``as_tagvalue`` is False.

    Use the ``notice`` string as a notice included in a document comment.
    Include the ``tool_name`` and ``tool_version`` to indicate which tool is
    producing this SPDX document.
    Use ``package_name`` as a Package name and as a namespace prefix base.
    Use ``download_location`` as the Package download location and the
    ``spdx_version`` tuple as the document SPDX version.

    NOTE(review): ``with_notice_text`` is accepted but never read in this
    function body -- confirm whether it is still needed.
    """
    as_rdf = not as_tagvalue
    _patch_license_list()

    # Lower-case the package name and join its whitespace-separated parts
    # with underscores to form the namespace prefix.
    ns_prefix = '_'.join(package_name.lower().split())
    # NOTE(review): this ``comment`` value is not passed to Document below
    # (``comment=notice`` is used) and the name is reassigned inside the
    # license loop -- confirm which text the document comment should carry.
    comment = notice + f'\nSPDX License List: {scancode_config.spdx_license_list_version}'

    doc = Document(
        version=Version(*spdx_version),
        data_license=License.from_identifier('CC0-1.0'),
        comment=notice,
        namespace=f'http://spdx.org/spdxdocs/{ns_prefix}-{uuid.uuid4()}',
        license_list_version=scancode_config.spdx_license_list_version,
        name='SPDX Document created by ScanCode Toolkit'
    )

    tool_name = tool_name or 'ScanCode'
    doc.creation_info.add_creator(Tool(f'{tool_name} {tool_version}'))
    doc.creation_info.set_created_now()

    package_id = '001'
    package = doc.package = Package(
        name=package_name,
        download_location=download_location,
        spdx_id=f'SPDXRef-{package_id}',
    )

    # Use a set of unique copyrights for the package.
    package.cr_text = set()

    # Both flags start True and are cleared as soon as one file has license
    # (or copyright) data, or has a None value for it.
    all_files_have_no_license = True
    all_files_have_no_copyright = True

    # FIXME: this should walk the codebase instead!!!
    # ``sid`` counts every entry of ``files`` from 1, including skipped ones.
    for sid, file_data in enumerate(files, 1):

        # Skip directories.
        if file_data.get('type') != 'file':
            continue

        # Set a relative file name as that is what we want in
        # SPDX output (with explicit leading './').
        name = './' + file_data.get('path')
        file_entry = File(
            spdx_id=f'SPDXRef-{sid}',
            name=name,
            chk_sum=Algorithm('SHA1', file_data.get('sha1') or ''))

        file_licenses = file_data.get('licenses')
        if file_licenses:
            all_files_have_no_license = False
            for file_license in file_licenses:
                license_key = file_license.get('key')

                # Fall back to a 'LicenseRef-scancode-' identifier when the
                # license has no SPDX key.
                spdx_id = file_license.get('spdx_license_key')
                if not spdx_id:
                    spdx_id = f'LicenseRef-scancode-{license_key}'
                is_license_ref = spdx_id.lower().startswith('licenseref-')

                if not is_license_ref:
                    spdx_license = License.from_identifier(spdx_id)
                else:
                    spdx_license = ExtractedLicense(spdx_id)
                    spdx_license.name = file_license.get('short_name')
                    # FIXME: replace this with the licensedb URL
                    comment = (
                        f'See details at https://github.com/nexB/scancode-toolkit'
                        f'/blob/develop/src/licensedcode/data/licenses/{license_key}.yml\n'
                    )
                    spdx_license.comment = comment
                    text = file_license.get('matched_text')
                    # always set some text, even if we did not extract the
                    # matched text
                    if not text:
                        text = comment
                    spdx_license.text = text
                    doc.add_extr_lic(spdx_license)

                # Add licenses in the order they appear in the file. Maintaining
                # the order might be useful for provenance purposes.
                file_entry.add_lics(spdx_license)
                package.add_lics_from_file(spdx_license)

        elif file_licenses is None:
            # A None value is recorded as NOASSERTION rather than NONE.
            all_files_have_no_license = False
            file_entry.add_lics(NoAssert())

        else:
            file_entry.add_lics(SPDXNone())

        file_entry.conc_lics = NoAssert()

        file_copyrights = file_data.get('copyrights')
        if file_copyrights:
            all_files_have_no_copyright = False
            file_entry.copyright = []
            for file_copyright in file_copyrights:
                file_entry.copyright.append(file_copyright.get('copyright'))

            package.cr_text.update(file_entry.copyright)

            # Create a text of copyright statements in the order they appear in
            # the file. Maintaining the order might be useful for provenance
            # purposes.
            file_entry.copyright = '\n'.join(file_entry.copyright) + '\n'

        elif file_copyrights is None:
            # A None value is recorded as NOASSERTION rather than NONE.
            all_files_have_no_copyright = False
            file_entry.copyright = NoAssert()

        else:
            file_entry.copyright = SPDXNone()

        package.add_file(file_entry)

    if len(package.files) == 0:
        # Nothing to report: emit a comment in the syntax of the target format.
        if as_tagvalue:
            msg = "# No results for package '{}'.\n".format(package.name)
        else:
            # rdf
            msg = "<!-- No results for package '{}'. -->\n".format(
                package.name)
        output_file.write(msg)

    # Remove duplicate licenses from the list for the package.
    unique_licenses = {l.identifier: l for l in package.licenses_from_files}
    unique_licenses = list(unique_licenses.values())
    if not len(package.licenses_from_files):
        if all_files_have_no_license:
            package.licenses_from_files = [SPDXNone()]
        else:
            package.licenses_from_files = [NoAssert()]
    else:
        # List license identifiers alphabetically for the package.
        package.licenses_from_files = sorted(
            unique_licenses,
            key=lambda x: x.identifier,
        )

    if len(package.cr_text) == 0:
        if all_files_have_no_copyright:
            package.cr_text = SPDXNone()
        else:
            package.cr_text = NoAssert()
    else:
        # Create a text of alphabetically sorted copyright
        # statements for the package.
        package.cr_text = '\n'.join(sorted(package.cr_text)) + '\n'

    package.verif_code = doc.package.calc_verif_code()
    package.license_declared = NoAssert()
    package.conc_lics = NoAssert()

    # The spdx-tools write_document returns either:
    # - unicode for tag values
    # - UTF8-encoded bytes for rdf because somehow the rdf and xml
    #   libraries do the encoding and do not return text but bytes
    # The file passed by ScanCode for output is opened in text mode. Therefore in
    # one case we do need to deal with bytes and decode before writing (rdf) and
    # in the other case we deal with text all the way.
    if package.files:

        if as_tagvalue:
            from spdx.writers.tagvalue import write_document  # NOQA
        elif as_rdf:
            from spdx.writers.rdf import write_document  # NOQA

        if as_tagvalue:
            spdx_output = StringIO()
        elif as_rdf:
            # rdf is utf-encoded bytes
            spdx_output = BytesIO()

        write_document(doc, spdx_output, validate=False)
        result = spdx_output.getvalue()

        if as_rdf:
            # rdf is utf-encoded bytes
            result = result.decode('utf-8')

        output_file.write(result)
def write_formatted_output(scanners, files_count, version, notice, scanned_files,
                           format, options, input, output_file, _echo, _save):
    """
    Save scan results to file or screen.

    The output is selected by `format`:
    - 'html': write each chunk produced by `as_template` to `output_file`.
    - 'html-app': write the HTML app page, then create its assets.
    - 'json' / 'json-pp': dump compact or pretty-printed JSON.
    - 'spdx-tv' / 'spdx-rdf': write an SPDX Tag/value or RDF document.
    Any other value raises `Exception('Unknown format')`.

    `scanned_files` is iterated as mappings read with the keys 'path', 'sha1',
    'licenses' and 'copyrights' in the SPDX branch. `_echo` and `_save` are
    callables used to report problems: `_echo(text, fg=color)` and
    `_save(text, output_file)`. `scanners` is not used by this function.

    NOTE: `format` and `input` shadow builtins; the names are kept because
    they are part of the calling interface.
    """
    if format == 'html':
        for template_chunk in as_template(scanned_files, files_count, output_file):
            try:
                output_file.write(template_chunk)
            except Exception as e:
                extra_context = ('ERROR: Failed to write output to HTML for: '
                                 + repr(template_chunk))
                _echo(extra_context, fg='red')
                _save(extra_context + '\n', output_file)
                e.args += (extra_context,)
                # Bare raise re-raises the annotated exception and keeps the
                # original traceback (``raise e`` would reset it on Python 2).
                raise

    elif format == 'html-app':
        output_file.write(as_html_app(input, output_file))
        try:
            create_html_app_assets(scanned_files, output_file)
        except HtmlAppAssetCopyWarning:
            skipped_msg = '\nHTML app creation skipped when printing to stdout.'
            _echo(skipped_msg, fg='yellow')
            _save(skipped_msg, output_file)
        except HtmlAppAssetCopyError:
            failed_msg = '\nFailed to create HTML app.'
            _echo(failed_msg, fg='red')
            _save(failed_msg, output_file)

    elif format in ('json', 'json-pp'):
        import simplejson as json

        meta = OrderedDict()
        meta['scancode_notice'] = notice
        meta['scancode_version'] = version
        meta['scancode_options'] = options
        meta['files_count'] = files_count
        meta['files'] = scanned_files

        if format == 'json-pp':
            dumped = json.dumps(meta, indent=2 * ' ', iterable_as_array=True,
                                encoding='utf-8')
        else:
            dumped = json.dumps(meta, separators=(',', ':'),
                                iterable_as_array=True, encoding='utf-8')
        output_file.write(unicode(dumped))
        output_file.write('\n')

    elif format in ('spdx-tv', 'spdx-rdf'):
        from spdx.checksum import Algorithm
        from spdx.creationinfo import Tool
        from spdx.document import Document, License
        from spdx.file import File
        from spdx.package import Package
        from spdx.utils import NoAssert
        from spdx.utils import SPDXNone
        from spdx.version import Version

        input = abspath(input)

        if os.path.isdir(input):
            input_path = input
        else:
            input_path = os.path.dirname(input)

        doc = Document(Version(2, 1), License.from_identifier('CC0-1.0'))
        doc.creation_info.add_creator(Tool('ScanCode ' + version))
        doc.creation_info.set_created_now()

        doc.package = Package(os.path.basename(input_path), NoAssert())

        # Use a set of unique copyrights for the package.
        doc.package.cr_text = set()

        all_files_have_no_license = True
        all_files_have_no_copyright = True

        for file_data in scanned_files:
            # Construct the absolute path in case we need to access the file
            # to calculate its SHA1.
            file_entry = File(os.path.join(input_path, file_data.get('path')))

            file_sha1 = file_data.get('sha1')
            if not file_sha1:
                if os.path.isfile(file_entry.name):
                    # Calculate the SHA1 in case it is missing, e.g. for empty files.
                    file_sha1 = file_entry.calc_chksum()
                else:
                    # Skip directories.
                    continue

            # Restore the relative file name as that is what we want in
            # SPDX output (with explicit leading './').
            file_entry.name = './' + file_data.get('path')
            file_entry.chk_sum = Algorithm('SHA1', file_sha1)

            file_licenses = file_data.get('licenses')
            if file_licenses:
                all_files_have_no_license = False
                for file_license in file_licenses:
                    spdx_id = file_license.get('spdx_license_key')
                    if spdx_id:
                        spdx_license = License.from_identifier(spdx_id)
                    else:
                        license_key = 'LicenseRef-' + file_license.get('key')
                        spdx_license = License(file_license.get('short_name'), license_key)

                    # Add licenses in the order they appear in the file. Maintaining the order
                    # might be useful for provenance purposes.
                    file_entry.add_lics(spdx_license)
                    doc.package.add_lics_from_file(spdx_license)
            elif file_licenses is None:
                # No license data at all for this file: nothing can be asserted.
                all_files_have_no_license = False
                file_entry.add_lics(NoAssert())
            else:
                # License data is present but empty.
                file_entry.add_lics(SPDXNone())

            file_entry.conc_lics = NoAssert()

            file_copyrights = file_data.get('copyrights')
            if file_copyrights:
                all_files_have_no_copyright = False
                file_entry.copyright = []
                for file_copyright in file_copyrights:
                    file_entry.copyright.extend(file_copyright.get('statements'))

                doc.package.cr_text.update(file_entry.copyright)

                # Create a text of copyright statements in the order they appear in the file.
                # Maintaining the order might be useful for provenance purposes.
                file_entry.copyright = '\n'.join(file_entry.copyright) + '\n'
            elif file_copyrights is None:
                # No copyright data at all for this file: nothing can be asserted.
                all_files_have_no_copyright = False
                file_entry.copyright = NoAssert()
            else:
                # Copyright data is present but empty.
                file_entry.copyright = SPDXNone()

            doc.package.add_file(file_entry)

        if len(doc.package.files) == 0:
            # Nothing to report: emit a format-appropriate comment and stop.
            if format == 'spdx-tv':
                output_file.write("# No results for package '{}'.\n".format(
                    doc.package.name))
            else:
                output_file.write(
                    "<!-- No results for package '{}'. -->\n".format(
                        doc.package.name))
            return

        # Remove duplicate licenses from the list for the package.
        unique_licenses = set(doc.package.licenses_from_files)
        if len(doc.package.licenses_from_files) == 0:
            if all_files_have_no_license:
                doc.package.licenses_from_files = [SPDXNone()]
            else:
                doc.package.licenses_from_files = [NoAssert()]
        else:
            # List license identifiers alphabetically for the package.
            doc.package.licenses_from_files = sorted(
                unique_licenses, key=lambda x: x.identifier)

        if len(doc.package.cr_text) == 0:
            if all_files_have_no_copyright:
                doc.package.cr_text = SPDXNone()
            else:
                doc.package.cr_text = NoAssert()
        else:
            # Create a text of alphabetically sorted copyright statements for the package.
            doc.package.cr_text = '\n'.join(sorted(doc.package.cr_text)) + '\n'

        doc.package.verif_code = doc.package.calc_verif_code()
        doc.package.license_declared = NoAssert()
        doc.package.conc_lics = NoAssert()

        # As the spdx-tools package can only write the document to a "str" file
        # but ScanCode provides a "unicode" file, write to a "str" buffer first
        # and then manually write the value to a "unicode" file.
        from StringIO import StringIO

        str_buffer = StringIO()

        if format == 'spdx-tv':
            from spdx.writers.tagvalue import write_document
        else:
            from spdx.writers.rdf import write_document
        write_document(doc, str_buffer)

        output_file.write(str_buffer.getvalue())

    else:
        raise Exception('Unknown format')
def write_spdx(version, notice, scanned_files, format, input, output_file):
    """
    Write scan results to `output_file` as an SPDX 2.1 document.

    `format` selects the writer: 'spdx-tv' for Tag/value, 'spdx-rdf' for RDF.
    `version` is appended to 'ScanCode ' to name the creator tool and `notice`
    becomes the document comment. `input` is the scanned path; the package is
    named after its directory. `scanned_files` is iterated as mappings read
    with the keys 'path', 'sha1', 'licenses' and 'copyrights'.

    When no file ends up in the package, only a "No results" comment is
    written and no SPDX document is produced.

    NOTE: `format` and `input` shadow builtins; the names are kept because
    they are part of the calling interface.
    """
    from spdx.checksum import Algorithm
    from spdx.creationinfo import Tool
    from spdx.document import Document
    from spdx.document import License
    from spdx.document import ExtractedLicense
    from spdx.file import File
    from spdx.package import Package
    from spdx.utils import NoAssert
    from spdx.utils import SPDXNone
    from spdx.version import Version

    absinput = abspath(input)

    if os.path.isdir(absinput):
        input_path = absinput
    else:
        input_path = os.path.dirname(absinput)

    doc = Document(Version(2, 1), License.from_identifier('CC0-1.0'))
    doc.comment = notice

    doc.creation_info.add_creator(Tool('ScanCode ' + version))
    doc.creation_info.set_created_now()

    doc.package = Package(os.path.basename(input_path), NoAssert())

    # Use a set of unique copyrights for the package.
    doc.package.cr_text = set()

    all_files_have_no_license = True
    all_files_have_no_copyright = True

    for file_data in scanned_files:
        # Construct the absolute path in case we need to access the file
        # to calculate its SHA1.
        file_entry = File(os.path.join(input_path, file_data.get('path')))

        file_sha1 = file_data.get('sha1')
        if not file_sha1:
            if os.path.isfile(file_entry.name):
                # Calculate the SHA1 in case it is missing, e.g. for empty files.
                file_sha1 = file_entry.calc_chksum()
            else:
                # Skip directories.
                continue

        # Restore the relative file name as that is what we want in
        # SPDX output (with explicit leading './').
        file_entry.name = './' + file_data.get('path')
        file_entry.chk_sum = Algorithm('SHA1', file_sha1)

        file_licenses = file_data.get('licenses')
        if file_licenses:
            all_files_have_no_license = False
            for file_license in file_licenses:
                spdx_id = file_license.get('spdx_license_key')
                if spdx_id:
                    spdx_license = License.from_identifier(spdx_id)
                else:
                    license_key = file_license.get('key')
                    # FIXME: we should prefix this with ScanCode-
                    licenseref_id = 'LicenseRef-' + license_key
                    spdx_license = ExtractedLicense(licenseref_id)
                    spdx_license.name = file_license.get('short_name')
                    comment = 'See details at https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/licenses/%s.yml\n' % license_key
                    spdx_license.comment = comment
                    text = file_license.get('matched_text')
                    # always set some text, even if we did not extract the matched text
                    if not text:
                        text = comment
                    spdx_license.text = text
                    doc.add_extr_lic(spdx_license)

                # Add licenses in the order they appear in the file. Maintaining the order
                # might be useful for provenance purposes.
                file_entry.add_lics(spdx_license)
                doc.package.add_lics_from_file(spdx_license)

        elif file_licenses is None:
            # No license data at all for this file: nothing can be asserted.
            all_files_have_no_license = False
            file_entry.add_lics(NoAssert())

        else:
            # License data is present but empty.
            file_entry.add_lics(SPDXNone())

        file_entry.conc_lics = NoAssert()

        file_copyrights = file_data.get('copyrights')
        if file_copyrights:
            all_files_have_no_copyright = False
            file_entry.copyright = []
            for file_copyright in file_copyrights:
                file_entry.copyright.extend(file_copyright.get('statements'))

            doc.package.cr_text.update(file_entry.copyright)

            # Create a text of copyright statements in the order they appear in the file.
            # Maintaining the order might be useful for provenance purposes.
            file_entry.copyright = '\n'.join(file_entry.copyright) + '\n'

        elif file_copyrights is None:
            # No copyright data at all for this file: nothing can be asserted.
            all_files_have_no_copyright = False
            file_entry.copyright = NoAssert()

        else:
            # Copyright data is present but empty.
            file_entry.copyright = SPDXNone()

        doc.package.add_file(file_entry)

    if len(doc.package.files) == 0:
        if format == 'spdx-tv':
            output_file.write("# No results for package '{}'.\n".format(
                doc.package.name))
        elif format == 'spdx-rdf':
            output_file.write("<!-- No results for package '{}'. -->\n".format(
                doc.package.name))
        # Stop here: a package without files is not a valid SPDX package, so
        # the validating write below must not run for an empty result.
        # NOTE(review): presumably write_document(validate=True) raises in
        # that case -- confirm against the spdx-tools version in use.
        return

    # Remove duplicate licenses from the list for the package, keyed by
    # identifier so each identifier is listed once.
    unique_licenses = {}
    for lic in doc.package.licenses_from_files:
        unique_licenses[lic.identifier] = lic

    if not unique_licenses:
        if all_files_have_no_license:
            doc.package.licenses_from_files = [SPDXNone()]
        else:
            doc.package.licenses_from_files = [NoAssert()]
    else:
        # List license identifiers alphabetically for the package.
        doc.package.licenses_from_files = sorted(
            unique_licenses.values(), key=lambda x: x.identifier)

    if len(doc.package.cr_text) == 0:
        if all_files_have_no_copyright:
            doc.package.cr_text = SPDXNone()
        else:
            doc.package.cr_text = NoAssert()
    else:
        # Create a text of alphabetically sorted copyright
        # statements for the package.
        doc.package.cr_text = '\n'.join(sorted(doc.package.cr_text)) + '\n'

    doc.package.verif_code = doc.package.calc_verif_code()
    doc.package.license_declared = NoAssert()
    doc.package.conc_lics = NoAssert()

    if format == 'spdx-tv':
        from spdx.writers.tagvalue import write_document
    elif format == 'spdx-rdf':
        from spdx.writers.rdf import write_document

    # As the spdx-tools package can only write the document to a
    # "str" file but ScanCode provides a "unicode" file, write to a
    # "str" buffer first and then manually write the value to a
    # "unicode" file.
    from StringIO import StringIO

    str_buffer = StringIO()
    write_document(doc, str_buffer, validate=True)
    as_unicode = str_buffer.getvalue().decode('utf-8')
    output_file.write(as_unicode)
def write_spdx(version, notice, scanned_files, input, output_file, as_tagvalue=True):
    """
    Write scan output formatted as SPDX Tag/value or RDF.

    The document is written as Tag/value when `as_tagvalue` is true and as
    RDF otherwise. `version` is appended to 'ScanCode ' to name the creator
    tool and `notice` becomes the document comment. `input` is the scanned
    path; the package is named after its directory. `scanned_files` is
    iterated as mappings read with the keys 'path', 'sha1', 'licenses' and
    'copyrights'.

    When no file ends up in the package, only a "No results" comment is
    written and no SPDX document is produced.

    NOTE: `input` shadows a builtin; the name is kept because it is part of
    the calling interface.
    """
    absinput = abspath(input)

    if os.path.isdir(absinput):
        input_path = absinput
    else:
        input_path = os.path.dirname(absinput)

    doc = Document(Version(2, 1), License.from_identifier('CC0-1.0'))
    doc.comment = notice

    doc.creation_info.add_creator(Tool('ScanCode ' + version))
    doc.creation_info.set_created_now()

    package = doc.package = Package(name=os.path.basename(input_path),
                                    download_location=NoAssert())

    # Use a set of unique copyrights for the package.
    package.cr_text = set()

    all_files_have_no_license = True
    all_files_have_no_copyright = True

    for file_data in scanned_files:
        # Construct the absolute path in case we need to access the file
        # to calculate its SHA1.
        file_entry = File(os.path.join(input_path, file_data.get('path')))

        file_sha1 = file_data.get('sha1')
        if not file_sha1:
            if os.path.isfile(file_entry.name):
                # Calculate the SHA1 in case it is missing, e.g. for empty files.
                file_sha1 = file_entry.calc_chksum()
            else:
                # Skip directories.
                continue

        # Restore the relative file name as that is what we want in
        # SPDX output (with explicit leading './').
        file_entry.name = './' + file_data.get('path')
        file_entry.chk_sum = Algorithm('SHA1', file_sha1)

        file_licenses = file_data.get('licenses')
        if file_licenses:
            all_files_have_no_license = False
            for file_license in file_licenses:
                spdx_id = file_license.get('spdx_license_key')
                if spdx_id:
                    # spdx_id = spdx_id.rstrip('+')
                    spdx_license = License.from_identifier(spdx_id)
                else:
                    license_key = file_license.get('key')
                    # FIXME: we should prefix this with ScanCode-
                    licenseref_id = 'LicenseRef-' + license_key
                    spdx_license = ExtractedLicense(licenseref_id)
                    spdx_license.name = file_license.get('short_name')
                    comment = 'See details at https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/licenses/%s.yml\n' % license_key
                    spdx_license.comment = comment
                    text = file_license.get('matched_text')
                    # always set some text, even if we did not extract the matched text
                    if not text:
                        text = comment
                    spdx_license.text = text
                    doc.add_extr_lic(spdx_license)

                # Add licenses in the order they appear in the file. Maintaining the order
                # might be useful for provenance purposes.
                file_entry.add_lics(spdx_license)
                package.add_lics_from_file(spdx_license)

        elif file_licenses is None:
            # No license data at all for this file: nothing can be asserted.
            all_files_have_no_license = False
            file_entry.add_lics(NoAssert())

        else:
            # License data is present but empty.
            file_entry.add_lics(SPDXNone())

        file_entry.conc_lics = NoAssert()

        file_copyrights = file_data.get('copyrights')
        if file_copyrights:
            all_files_have_no_copyright = False
            file_entry.copyright = []
            for file_copyright in file_copyrights:
                file_entry.copyright.extend(file_copyright.get('statements'))

            package.cr_text.update(file_entry.copyright)

            # Create a text of copyright statements in the order they appear in the file.
            # Maintaining the order might be useful for provenance purposes.
            file_entry.copyright = '\n'.join(file_entry.copyright) + '\n'

        elif file_copyrights is None:
            # No copyright data at all for this file: nothing can be asserted.
            all_files_have_no_copyright = False
            file_entry.copyright = NoAssert()

        else:
            # Copyright data is present but empty.
            file_entry.copyright = SPDXNone()

        package.add_file(file_entry)

    if len(package.files) == 0:
        if as_tagvalue:
            output_file.write("# No results for package '{}'.\n".format(
                package.name))
        else:
            output_file.write("<!-- No results for package '{}'. -->\n".format(
                package.name))
        # Stop here: a package without files is not a valid SPDX package, so
        # the validating write below must not run for an empty result.
        # NOTE(review): presumably write_document(validate=True) raises in
        # that case -- confirm against the spdx-tools version in use.
        return

    # Remove duplicate licenses from the list for the package, keyed by
    # identifier so each identifier is listed once.
    unique_licenses = {}
    for lic in package.licenses_from_files:
        unique_licenses[lic.identifier] = lic

    if not unique_licenses:
        if all_files_have_no_license:
            package.licenses_from_files = [SPDXNone()]
        else:
            package.licenses_from_files = [NoAssert()]
    else:
        # List license identifiers alphabetically for the package.
        package.licenses_from_files = sorted(
            unique_licenses.values(), key=lambda x: x.identifier)

    if len(package.cr_text) == 0:
        if all_files_have_no_copyright:
            package.cr_text = SPDXNone()
        else:
            package.cr_text = NoAssert()
    else:
        # Create a text of alphabetically sorted copyright
        # statements for the package.
        package.cr_text = '\n'.join(sorted(package.cr_text)) + '\n'

    package.verif_code = package.calc_verif_code()
    package.license_declared = NoAssert()
    package.conc_lics = NoAssert()

    if as_tagvalue:
        from spdx.writers.tagvalue import write_document
    else:
        from spdx.writers.rdf import write_document

    # The spdx-tools write_document returns either:
    # - unicode for tag values
    # - UTF8-encoded bytes for rdf because somehow the rdf and xml
    #   libraries do the encoding
    # The file passed by ScanCode for output is always opened in binary
    # mode and needs to receive UTF8-encoded bytes.
    # Therefore in one case we do nothing (rdf) and in the other case we
    # encode to UTF8 bytes.
    from StringIO import StringIO

    spdx_output = StringIO()
    write_document(doc, spdx_output, validate=True)
    result = spdx_output.getvalue()
    if as_tagvalue:
        result = result.encode('utf-8')
    output_file.write(result)