def setUp(self):
    """Point Repoman at a fresh scratch directory for this test run."""
    scratch = mkdtemp(dir=SPEC_DATA_DIR, prefix='test-run-')
    self.temp_repos_dir = scratch
    Repoman.setup(
        storage_backend='dulwich.fsrepo.FsRepo',
        location=scratch,
    )
def deploy_project(name, apikey, changed_files=None, repo=None, branch='master'):
    """Archive a GIT project and upload it to Dash."""
    if repo is None:
        repo = Repoman.open_repo(name)
    archiver = GitProjectArchiver(
        repo,
        branch=branch,
        ignore_deleted=False,
        version=(0, 9),
        required_files=REQUIRED_FILES,
    )
    # When a change list is given, only re-deploy the touched spiders.
    spiders = None
    if changed_files is not None:
        spiders = set()
        for path in changed_files:
            if path.startswith('spiders/'):
                spiders.add(archiver._spider_name(path))
    zbuff = archiver.archive(spiders)
    response = requests.post(
        DASH_API_URL + 'as/import.json?version=portia',
        files=[('archive', ('archive', zbuff, 'application/zip'))],
        params={'apikey': apikey, 'project': name},
    )
    if response.status_code != 200:
        raise DeployError('Deploy to Dash failed: %s' % response.text)
    project_url = DASH_API_URL.rsplit('/', 2)[0] + '/p/' + name
    return {
        'status': 'ok',
        'schedule_url': project_url,
    }
def test_save_file(self):
    """A saved file is listable and readable on its branch."""
    repo = Repoman.create_repo(self.get_full_name('my_repo'))
    data = j({'a': 1})
    repo.save_file('f1', data, 'testbranch')
    self.assertEqual(['f1'], repo.list_files_for_branch('testbranch'))
    self.assertEqual(data,
                     repo.file_contents_for_branch('f1', 'testbranch'))
def test_two_interleaved_publishes_2(self):
    """Concurrent adds of the same path are auto-merged on publish."""
    repo = Repoman.create_repo(self.get_full_name('my_repo'))
    repo.save_file('f1', j({'a': 1, 'c': 3}), 'b1')
    self.assertTrue(repo.publish_branch('b1'))
    repo.delete_branch('b1')
    # b1 adds x/f2.
    repo.save_file('x/f2', j({'b': 2}), 'b1')
    # b2 adds a file with the same name but different content, and
    # deletes f1 as well.
    repo.save_file('x/f2', j({'a': 2, 'c': {'d': 1}}), 'b2')
    repo.delete_file('f1', 'b2')
    # Both publish their changes; the automerge should solve conflicts.
    self.assertTrue(repo.publish_branch('b1'))
    self.assertTrue(repo.publish_branch('b2'))
    merged = j({'a': 2, 'b': 2, 'c': {'d': 1}})
    self.assertEqual(merged,
                     repo.file_contents_for_branch('x/f2', 'master'))
    self.assertEqual(len(repo.get_published_revisions()), 3)
def test_branch_ops(self):
    """Branches can be created, queried and deleted."""
    repo = Repoman.create_repo(self.get_full_name('my_repo'))
    repo.create_branch('b1')
    self.assertTrue(repo.has_branch('b1'))
    # The branch ref resolves to a 40-character SHA-1 hex string.
    self.assertEqual(40, len(repo.get_branch('b1')))
    repo.delete_branch('b1')
    self.assertFalse(repo.has_branch('b1'))
def test_simple_publish(self):
    """Publishing a branch copies its surviving files to master."""
    repo = Repoman.create_repo(self.get_full_name('my_repo'))
    paths = ['f1', 'f2', 'x/f3']
    contents = [j({'a': 1}), j({'b': 2}), j({'c': 3})]
    for path, data in zip(paths, contents):
        repo.save_file(path, data, 'b1')
    # A file created and deleted on the branch should leave no trace.
    repo.save_file('f4', '{}', 'b1')
    repo.delete_file('f4', 'b1')
    self.assertTrue(repo.has_branch('b1'))
    self.assertTrue(repo.has_branch('master'))
    self.assertEqual([], repo.list_files_for_branch('master'))
    self.assertTrue(repo.publish_branch('b1'))
    self.assertItemsEqual(paths, repo.list_files_for_branch('master'))
    for branch in ('b1', 'master'):
        self.assertEqual(contents,
                         [repo.file_contents_for_branch(p, branch)
                          for p in paths])
    # Only one published revision.
    self.assertEqual(len(repo.get_published_revisions()), 1)
    # 6 checkpoints: one per operation (5) plus the original state.
    self.assertEqual(len(repo.get_branch_checkpoints('b1')), 6)
def test_unresolved_conflicts_both_add(self):
    """Two branches adding the same key leave the loser in conflict."""
    repo = Repoman.create_repo(self.get_full_name("my_repo"))
    # Both add the same file with a conflicting key.
    repo.save_file("f1", j({"a": 1}), "b1")
    repo.save_file("f1", j({"a": 2}), "b2")
    # The first publish wins, the second is rejected.
    self.assertTrue(repo.publish_branch("b1"))
    self.assertFalse(repo.publish_branch("b2"))
    # master carries b1's version of the file.
    self.assertEqual(j({"a": 1}),
                     repo.file_contents_for_branch("f1", "master"))
    # b2's copy is marked with an unresolved conflict.
    self.assertIn("__CONFLICT",
                  j(repo.file_contents_for_branch("f1", "b2")))
def test_sequential_publishes(self):
    """Publishes from successive branches accumulate on master."""
    repo = Repoman.create_repo(self.get_full_name("my_repo"))
    first, second = j({"a": 1}), j({"b": 2})
    repo.save_file("f1", first, "b1")
    repo.save_file("x/f2", second, "b1")
    repo.publish_branch("b1")
    repo.delete_branch("b1")
    # f1 is modified on a second branch and republished.
    first = j({"a": 3})
    repo.save_file("f1", first, "b2")
    self.assertTrue(repo.publish_branch("b2"))
    self.assertEqual([first, second],
                     [repo.file_contents_for_branch(p, "master")
                      for p in ("f1", "x/f2")])
    self.assertEqual(len(repo.get_published_revisions()), 2)
def test_unresolved_conflicts_both_add(self):
    """Conflicting adds of one file: first publish wins, second blocks."""
    repo = Repoman.create_repo(self.get_full_name('my_repo'))
    # Both branches add the same file with a conflicting key.
    repo.save_file('f1', j({'a': 1}), 'b1')
    repo.save_file('f1', j({'a': 2}), 'b2')
    self.assertTrue(repo.publish_branch('b1'))
    self.assertFalse(repo.publish_branch('b2'))
    # The file appears as published by b1 in the master branch.
    self.assertEqual(j({'a': 1}),
                     repo.file_contents_for_branch('f1', 'master'))
    # The file in b2 has an unresolved conflict marker.
    self.assertIn('__CONFLICT',
                  j(repo.file_contents_for_branch('f1', 'b2')))
def _archive_project(name, buff):
    """Archive a project stored in GIT into a zip file.

    Reads the project's ``master`` branch from the repo called *name* and
    writes a ZIP archive into the file-like object *buff*.  Spider JSON
    files get their templates inlined under a ``templates`` key, and any
    extractor ids that no longer exist in ``extractors.json`` are dropped
    from the templates.
    """
    repo = Repoman.open_repo(name)
    now = datetime.now().timetuple()[:6]
    archive = zipfile.ZipFile(buff, "w", zipfile.ZIP_DEFLATED)
    files_list = repo.list_files_for_branch('master')

    # Load the extractors first: template extractor ids are validated
    # against this mapping below.
    extractors = {}
    for file_path in files_list:
        if file_path == 'extractors.json':
            extractors = json.loads(
                repo.file_contents_for_branch(file_path, 'master'))

    seen_files = set()
    spiders = {}
    templates = defaultdict(list)
    for file_path in files_list:
        file_contents = repo.file_contents_for_branch(file_path, 'master')
        if file_path.startswith('spiders'):
            # Parse inside the try so malformed JSON is skipped instead of
            # aborting the whole archive.
            try:
                as_json = json.loads(file_contents)
                parts = file_path.split("/")
                if len(parts) == 2:
                    # spider json
                    spider_name = parts[1].rsplit(".", 1)[0]
                    spiders[spider_name] = file_path, as_json
                elif len(parts) == 3:
                    # template json: keep only extractor ids that exist
                    existing = {}
                    for field, eids in as_json.get('extractors', {}).items():
                        existing[field] = [
                            eid for eid in eids if eid in extractors
                        ]
                    as_json['extractors'] = existing
                    spider_name = parts[1]
                    templates[spider_name].append(as_json)
            except ValueError:
                continue
        else:
            _add_to_archive(archive, file_path, file_contents, now)
        seen_files.add(file_path)

    # Add empty placeholders for missing files required by dash.
    for file_path in {'extractors.json', 'items.json'} - seen_files:
        _add_to_archive(archive, file_path, '{}', now)

    # Avoid shadowing the *name* parameter; 'template_names' may be absent,
    # so pop with a default (matches the other archiver variants).
    for spider_name, (path, json_spec) in spiders.iteritems():
        json_spec.pop('template_names', None)
        json_spec['templates'] = templates[spider_name]
        _add_to_archive(archive, path, json.dumps(json_spec), now)
    archive.close()
def test_modify_delete(self):
    """A concurrent delete loses to a modification on publish."""
    # Although this is usually treated as a conflict, here we just keep
    # the modified version and ignore the delete.
    repo = Repoman.create_repo(self.get_full_name("my_repo"))
    repo.save_file("f1", j({"a": 1}), "b1")
    self.assertTrue(repo.publish_branch("b1"))
    repo.delete_branch("b1")
    # b1 deletes f1 while b2 modifies it.
    repo.delete_file("f1", "b1")
    repo.save_file("f1", j({"a": 2, "c": 3}), "b2")
    self.assertTrue(repo.publish_branch("b1"))
    self.assertTrue(repo.publish_branch("b2"))
    # master keeps the modified f1.
    self.assertEqual(["f1"], repo.list_files_for_branch("master"))
    self.assertEqual(j({"a": 2, "c": 3}),
                     repo.file_contents_for_branch("f1", "master"))
def test_sequential_publishes(self):
    """Sequential branch publishes layer their changes on master."""
    repo = Repoman.create_repo(self.get_full_name('my_repo'))
    content_a, content_b = j({'a': 1}), j({'b': 2})
    repo.save_file('f1', content_a, 'b1')
    repo.save_file('x/f2', content_b, 'b1')
    repo.publish_branch('b1')
    repo.delete_branch('b1')
    # f1 is modified in branch b2 and published again.
    content_a = j({'a': 3})
    repo.save_file('f1', content_a, 'b2')
    self.assertTrue(repo.publish_branch('b2'))
    self.assertEqual([content_a, content_b],
                     [repo.file_contents_for_branch(p, 'master')
                      for p in ('f1', 'x/f2')])
    self.assertEqual(len(repo.get_published_revisions()), 2)
def test_modify_delete(self):
    """Modify-vs-delete is resolved by keeping the modified file."""
    # Although this is usually treated as a conflict, here we just keep
    # the modified version and ignore the delete.
    repo = Repoman.create_repo(self.get_full_name('my_repo'))
    repo.save_file('f1', j({'a': 1}), 'b1')
    self.assertTrue(repo.publish_branch('b1'))
    repo.delete_branch('b1')
    # b1 deletes f1 and b2 modifies it.
    repo.delete_file('f1', 'b1')
    repo.save_file('f1', j({'a': 2, 'c': 3}), 'b2')
    self.assertTrue(repo.publish_branch('b1'))
    self.assertTrue(repo.publish_branch('b2'))
    # master still has f1, with b2's content.
    self.assertEqual(['f1'], repo.list_files_for_branch('master'))
    self.assertEqual(j({'a': 2, 'c': 3}),
                     repo.file_contents_for_branch('f1', 'master'))
def test_two_interleaved_publishes_1(self):
    """Concurrent edits of the same files auto-merge on publish."""
    repo = Repoman.create_repo(self.get_full_name("my_repo"))
    repo.save_file("f1", j({"a": 1}), "b1")
    repo.save_file("x/f2", j({"b": 2}), "b1")
    # Branch b2 modifies the same files concurrently.
    repo.save_file("f1", j({"c": 3}), "b2")
    repo.save_file("x/f2", j({"d": 4}), "b2")
    # Both publish their changes; the automerge should solve conflicts.
    self.assertTrue(repo.publish_branch("b1"))
    self.assertTrue(repo.publish_branch("b2"))
    self.assertEqual(j({"a": 1, "c": 3}),
                     repo.file_contents_for_branch("f1", "master"))
    self.assertEqual(j({"b": 2, "d": 4}),
                     repo.file_contents_for_branch("x/f2", "master"))
    self.assertEqual(len(repo.get_published_revisions()), 2)
def test_two_interleaved_publishes_1(self):
    """Both branches' edits to shared files survive interleaved publishes."""
    repo = Repoman.create_repo(self.get_full_name('my_repo'))
    repo.save_file('f1', j({'a': 1}), 'b1')
    repo.save_file('x/f2', j({'b': 2}), 'b1')
    # Branch b2 modifies the same files concurrently.
    repo.save_file('f1', j({'c': 3}), 'b2')
    repo.save_file('x/f2', j({'d': 4}), 'b2')
    # Both publish; the automerge should resolve the conflicts.
    self.assertTrue(repo.publish_branch('b1'))
    self.assertTrue(repo.publish_branch('b2'))
    self.assertEqual(j({'a': 1, 'c': 3}),
                     repo.file_contents_for_branch('f1', 'master'))
    self.assertEqual(j({'b': 2, 'd': 4}),
                     repo.file_contents_for_branch('x/f2', 'master'))
    self.assertEqual(len(repo.get_published_revisions()), 2)
def test_unresolved_conflicts_both_modify(self):
    """Conflicting edits to one key block publish until forced."""
    repo = Repoman.create_repo(self.get_full_name("my_repo"))
    repo.save_file("f1", j({"a": 1}), "b1")
    self.assertTrue(repo.publish_branch("b1"))
    repo.delete_branch("b1")
    # Both branches set the same key of the same file to different
    # values. This conflict must be manually resolved.
    repo.save_file("f1", j({"a": 2}), "b1")
    repo.save_file("f1", j({"a": 3}), "b2")
    self.assertTrue(repo.publish_branch("b1"))
    self.assertFalse(repo.publish_branch("b2"))
    # master carries b1's version of the file.
    self.assertEqual(j({"a": 2}),
                     repo.file_contents_for_branch("f1", "master"))
    # b2's copy is flagged with an unresolved conflict.
    self.assertIn("__CONFLICT",
                  j(repo.file_contents_for_branch("f1", "b2")))
    # b2 resolves the conflict, saves again and forces the publish.
    repo.save_file("f1", j({"a": 3}), "b2")
    self.assertTrue(repo.publish_branch("b2", force=True))
    self.assertEqual(j({"a": 3}),
                     repo.file_contents_for_branch("f1", "master"))
def test_simple_publish(self):
    """A branch's net changes land on master when published."""
    repo = Repoman.create_repo(self.get_full_name("my_repo"))
    expected = {"f1": j({"a": 1}), "f2": j({"b": 2}), "x/f3": j({"c": 3})}
    for path in ("f1", "f2", "x/f3"):
        repo.save_file(path, expected[path], "b1")
    # A file added and removed on the branch should leave no trace.
    repo.save_file("f4", "{}", "b1")
    repo.delete_file("f4", "b1")
    self.assertTrue(repo.has_branch("b1"))
    self.assertTrue(repo.has_branch("master"))
    self.assertEqual([], repo.list_files_for_branch("master"))
    self.assertTrue(repo.publish_branch("b1"))
    self.assertItemsEqual(["f1", "f2", "x/f3"],
                          repo.list_files_for_branch("master"))
    for branch in ("b1", "master"):
        self.assertEqual([expected[p] for p in ("f1", "f2", "x/f3")],
                         [repo.file_contents_for_branch(p, branch)
                          for p in ("f1", "f2", "x/f3")])
    # Only one published revision.
    self.assertEqual(len(repo.get_published_revisions()), 1)
    # 6 checkpoints: one per operation (5) plus the original state.
    self.assertEqual(len(repo.get_branch_checkpoints("b1")), 6)
def test_two_interleaved_publishes_3(self):
    """A rename plus concurrent edits merges into the renamed file."""
    repo = Repoman.create_repo(self.get_full_name("my_repo"))
    repo.save_file("f1", j({"a": 1, "c": 3, "d": 4, "e": 5}), "b1")
    self.assertTrue(repo.publish_branch("b1"))
    repo.delete_branch("b1")
    # b1 heavily edits f1.
    repo.save_file("f1", j({"b": 2, "e": 5}), "b1")
    # VERY tricky case: b2 renames f1 to f2 and tweaks it. The merge
    # algorithm detects the rename, so the merged output contains all of
    # b1's changes plus all of b2's, stored under b2's file name.
    repo.delete_file("f1", "b2")
    repo.save_file("f2", j({"a": 1, "c": 3, "d": 4, "e": 6}), "b2")
    # Both publish; the automerge should resolve the conflicts.
    self.assertTrue(repo.publish_branch("b1"))
    self.assertTrue(repo.publish_branch("b2"))
    self.assertEqual(j({"b": 2, "e": 6}),
                     repo.file_contents_for_branch("f2", "master"))
    self.assertEqual(len(repo.get_published_revisions()), 3)
def test_two_interleaved_publishes_2(self):
    """Same-named additions on two branches merge; deletes are honoured."""
    repo = Repoman.create_repo(self.get_full_name("my_repo"))
    repo.save_file("f1", j({"a": 1, "c": 3}), "b1")
    self.assertTrue(repo.publish_branch("b1"))
    repo.delete_branch("b1")
    # b1 adds x/f2.
    repo.save_file("x/f2", j({"b": 2}), "b1")
    # Branch b2 adds a file with the same name but different content,
    # and deletes f1.
    repo.save_file("x/f2", j({"a": 2, "c": {"d": 1}}), "b2")
    repo.delete_file("f1", "b2")
    # Both publish their changes; the automerge should solve conflicts.
    self.assertTrue(repo.publish_branch("b1"))
    self.assertTrue(repo.publish_branch("b2"))
    self.assertEqual(j({"a": 2, "b": 2, "c": {"d": 1}}),
                     repo.file_contents_for_branch("x/f2", "master"))
    self.assertEqual(len(repo.get_published_revisions()), 3)
def test_two_interleaved_publishes_3(self):
    """Rename detection merges edits from both branches into one file."""
    repo = Repoman.create_repo(self.get_full_name('my_repo'))
    repo.save_file('f1', j({'a': 1, 'c': 3, 'd': 4, 'e': 5}), 'b1')
    self.assertTrue(repo.publish_branch('b1'))
    repo.delete_branch('b1')
    # b1 heavily edits f1.
    repo.save_file('f1', j({'b': 2, 'e': 5}), 'b1')
    # This case is VERY tricky: branch 2 renames f1 to f2 and changes it
    # a bit. The merge algorithm detects the rename, so the merged output
    # ends up containing all b1 changes + all b2 changes, and the file is
    # stored under the name given by branch 2.
    repo.delete_file('f1', 'b2')
    repo.save_file('f2', j({'a': 1, 'c': 3, 'd': 4, 'e': 6}), 'b2')
    # Both publish; the automerge should resolve the conflicts.
    self.assertTrue(repo.publish_branch('b1'))
    self.assertTrue(repo.publish_branch('b2'))
    self.assertEqual(j({'b': 2, 'e': 6}),
                     repo.file_contents_for_branch('f2', 'master'))
    self.assertEqual(len(repo.get_published_revisions()), 3)
def test_two_interleaved_publishes_2(self):
    """Interleaved publishes merge same-named additions and apply deletes."""
    repo = Repoman.create_repo(self.get_full_name('my_repo'))
    repo.save_file('f1', j({'a': 1, 'c': 3}), 'b1')
    self.assertTrue(repo.publish_branch('b1'))
    repo.delete_branch('b1')
    # b1 adds x/f2.
    repo.save_file('x/f2', j({'b': 2}), 'b1')
    # b2 adds a file with the same name but different content, and also
    # deletes f1.
    repo.save_file('x/f2', j({'a': 2, 'c': {'d': 1}}), 'b2')
    repo.delete_file('f1', 'b2')
    # Both publish; the automerge should resolve the conflicts.
    self.assertTrue(repo.publish_branch('b1'))
    self.assertTrue(repo.publish_branch('b2'))
    self.assertEqual(j({'a': 2, 'b': 2, 'c': {'d': 1}}),
                     repo.file_contents_for_branch('x/f2', 'master'))
    self.assertEqual(len(repo.get_published_revisions()), 3)
def test_unresolved_conflicts_both_modify(self):
    """A manual conflict blocks publish; a forced publish resolves it."""
    repo = Repoman.create_repo(self.get_full_name('my_repo'))
    repo.save_file('f1', j({'a': 1}), 'b1')
    self.assertTrue(repo.publish_branch('b1'))
    repo.delete_branch('b1')
    # Both branches update the same key of the same file with different
    # values; this conflict must be manually resolved.
    repo.save_file('f1', j({'a': 2}), 'b1')
    repo.save_file('f1', j({'a': 3}), 'b2')
    self.assertTrue(repo.publish_branch('b1'))
    self.assertFalse(repo.publish_branch('b2'))
    # The file appears as published by b1 in the master branch.
    self.assertEqual(j({'a': 2}),
                     repo.file_contents_for_branch('f1', 'master'))
    # The file in b2 has an unresolved conflict.
    self.assertIn('__CONFLICT',
                  j(repo.file_contents_for_branch('f1', 'b2')))
    # b2 solves the conflict, saves again and forces the publish.
    repo.save_file('f1', j({'a': 3}), 'b2')
    self.assertTrue(repo.publish_branch('b2', force=True))
    self.assertEqual(j({'a': 3}),
                     repo.file_contents_for_branch('f1', 'master'))
def test_simple_publish(self):
    """Saved files (minus deletions) reach master on first publish."""
    repo = Repoman.create_repo(self.get_full_name('my_repo'))
    data = {'f1': j({'a': 1}), 'f2': j({'b': 2}), 'x/f3': j({'c': 3})}
    for path in ('f1', 'f2', 'x/f3'):
        repo.save_file(path, data[path], 'b1')
    # f4 is added and deleted again, so it must not be published.
    repo.save_file('f4', '{}', 'b1')
    repo.delete_file('f4', 'b1')
    self.assertTrue(repo.has_branch('b1'))
    self.assertTrue(repo.has_branch('master'))
    self.assertEqual([], repo.list_files_for_branch('master'))
    self.assertTrue(repo.publish_branch('b1'))
    self.assertItemsEqual(['f1', 'f2', 'x/f3'],
                          repo.list_files_for_branch('master'))
    for branch in ('b1', 'master'):
        self.assertEqual([data[p] for p in ('f1', 'f2', 'x/f3')],
                         [repo.file_contents_for_branch(p, branch)
                          for p in ('f1', 'f2', 'x/f3')])
    # Only one published revision.
    self.assertEqual(len(repo.get_published_revisions()), 1)
    # 6 checkpoints: 1 per operation (5) + 1 for the original state.
    self.assertEqual(len(repo.get_branch_checkpoints('b1')), 6)
def edit_project(self, name, revision):
    """Lazily create and import the project repo, then delegate the edit."""
    if Repoman.repo_exists(name):
        return GitProjectsManager.edit_project(self, name, revision)
    # First access: create the repo and seed it from dash before editing.
    repo = Repoman.create_repo(name, author=self.user)
    import_project(name, self.auth_info['service_token'], repo)
    return GitProjectsManager.edit_project(self, name, revision)
def _init_project(self, name):
    """Create the GIT repo for *name* and populate it from dash."""
    new_repo = Repoman.create_repo(name, author=self.user)
    token = self.auth_info['service_token']
    import_project(name, token, new_repo)
def _archive_project(name, buff, files=None, repo=None, branch='master',
                     ignore_deleted=False):
    """Archive a project stored in GIT into a zip file.

    name: project/repo name, used to open the repo when *repo* is None.
    buff: file-like object the ZIP archive is written into.
    files: optional subset of paths to archive; defaults to every file on
        *branch*.
    repo: optional already-open repo.
    branch: branch to read file contents from.
    ignore_deleted: when True, spiders whose content can no longer be read
        are skipped instead of being archived as {'deleted': True} stubs.
    """
    if repo is None:
        repo = Repoman.open_repo(name)
    now = datetime.now().timetuple()[:6]
    archive = zipfile.ZipFile(buff, "w", zipfile.ZIP_DEFLATED)
    # files_list: what we iterate/archive; all_files: the full branch
    # listing used to discover template paths (whichever of the two was
    # not supplied by the caller).
    files_list = files if files is not None else \
        repo.list_files_for_branch(branch)
    all_files = files_list if files is None else \
        repo.list_files_for_branch(branch)
    # Map spider name -> template file paths (paths nested deeper than
    # 'spiders/<spider>').
    template_paths = defaultdict(list)
    for file_path in all_files:
        split_file_path = file_path.split('/')
        if len(split_file_path) > 2:
            template_paths[split_file_path[1]].append(file_path)
    # Extractor ids referenced by templates are validated against this.
    extractors = json.loads(repo.file_contents_for_branch('extractors.json',
                                                          branch) or '{}')
    seen_files = set()
    spiders = set()
    for file_path in files_list:
        if file_path.startswith('spiders'):
            try:
                parts = file_path.split("/")
                if len(parts) >= 2:
                    spider_name = parts[1]
                    if spider_name.endswith('.json'):
                        spider_name = spider_name[:-5]
                    if spider_name not in spiders:
                        # Load spider if necessary (a template path implies
                        # the spider json itself, so redirect to it).
                        if len(parts) > 2:
                            file_path = 'spiders/' + spider_name + '.json'
                        file_contents = repo.file_contents_for_branch(
                            file_path, branch)
                        as_json = json.loads(file_contents)
                        templates = []
                        # Load all spider templates
                        spider_templates = template_paths.get(spider_name, [])
                        for template_path in spider_templates:
                            seen_files.add(template_path)
                            existing = {}
                            # Ignore deleted templates
                            try:
                                templ_contents = repo.file_contents_for_branch(
                                    template_path, branch)
                            except (TypeError, ValueError):
                                continue
                            json_template = json.loads(templ_contents)
                            # Validate extractors: drop ids that no longer
                            # exist in extractors.json.
                            template_extractors = json_template.get(
                                'extractors', {})
                            for field, eids in template_extractors.items():
                                existing[field] = [eid for eid in eids
                                                   if eid in extractors]
                            json_template['extractors'] = existing
                            # NOTE(review): reassigns spider_name to the raw
                            # path segment (may keep a '.json' suffix for
                            # 2-part paths) — looks intentional but verify.
                            spider_name = parts[1]
                            templates.append(json_template)
                        spiders.add(spider_name)
                        # Templates are inlined; the name list is dropped.
                        as_json.pop('template_names', None)
                        as_json['templates'] = templates
                        _add_to_archive(archive, file_path,
                                        json.dumps(as_json), now)
            # TypeError here signals unreadable (deleted) spider content.
            except TypeError:
                if ignore_deleted:
                    continue
                # Handle Deleted Spiders: archive a {'deleted': True} stub.
                # Content is looked up on 'master' (not *branch*) —
                # presumably to detect files that existed before deletion.
                file_contents = repo.file_contents_for_branch(file_path,
                                                              'master')
                file_info = {'deleted': True}
                if file_contents:
                    as_json = json.loads(file_contents)
                _add_to_archive(archive, file_path,
                                json.dumps(file_info), now)
        else:
            file_contents = repo.file_contents_for_branch(file_path, branch)
            if file_contents:
                _add_to_archive(archive, file_path, file_contents, now)
        seen_files.add(file_path)
    # Add empty placeholders for missing files required by dash
    for file_path in {'extractors.json', 'items.json'} - seen_files:
        _add_to_archive(archive, file_path, '{}', now)
    archive.close()
def test_create(self): repoman = Repoman.open_repo('new_project_5', self._connection) print repoman.list_files_for_branch('marcos') print repoman.get_branch_changed_files('marcos')
def test_delete_file(self):
    """Deleting a saved file removes it from the branch listing."""
    repo = Repoman.create_repo(self.get_full_name("my_repo"))
    data = j({"a": 1})
    repo.save_file("f1", data, "testbranch")
    repo.delete_file("f1", "testbranch")
    self.assertEqual([], repo.list_files_for_branch("testbranch"))
def setUp(self):
    """Configure Repoman to store repos under a per-run temp directory."""
    self.temp_repos_dir = mkdtemp(dir=SPEC_DATA_DIR, prefix='test-run-')
    Repoman.setup(
        storage_backend='dulwich.fsrepo.FsRepo',
        location=self.temp_repos_dir,
    )
def test_create(self):
    """A freshly created repo is reported as existing."""
    full_name = self.get_full_name('my_repo')
    Repoman.create_repo(full_name)
    self.assertTrue(Repoman.repo_exists(full_name))
def edit_project(self, name, revision):
    """Create and import the project repo on first access, then edit it.

    Fix: propagate the base implementation's return value — the other
    edit_project variants in this file return it, but this one dropped
    it, so callers always received None.
    """
    if not Repoman.repo_exists(name):
        repo = Repoman.create_repo(name, author=self.user)
        import_project(name, self.auth_info['service_token'], repo)
    return GitProjectsManager.edit_project(self, name, revision)
def copy_data(self, source, destination, spiders, items):
    """Copy spiders/items between projects, creating the target if needed."""
    destination_exists = Repoman.repo_exists(destination)
    if not destination_exists:
        self._init_project(destination)
    return GitProjectsManager.copy_data(
        self, source, destination, spiders, items)
def _archive_project(name, buff, files=None, repo=None, branch='master'):
    """Archive a project stored in GIT into a zip file.

    name: project/repo name, used to open the repo when *repo* is None.
    buff: file-like object the ZIP archive is written into.
    files: optional subset of paths to archive; defaults to every file on
        *branch*.
    repo: optional already-open repo.
    branch: branch to read file contents from.

    Fix: non-spider files are only archived when their contents are
    truthy, matching the other _archive_project variant — previously a
    missing file's None contents were passed straight to _add_to_archive.
    """
    if repo is None:
        repo = Repoman.open_repo(name)
    now = datetime.now().timetuple()[:6]
    archive = zipfile.ZipFile(buff, "w", zipfile.ZIP_DEFLATED)
    # files_list: what we iterate/archive; all_files: the full branch
    # listing used to discover template paths.
    files_list = files if files is not None else \
        repo.list_files_for_branch(branch)
    all_files = files_list if files is None else \
        repo.list_files_for_branch(branch)
    # Map spider name -> template file paths (paths deeper than
    # 'spiders/<spider>').
    template_paths = defaultdict(list)
    for file_path in all_files:
        split_file_path = file_path.split('/')
        if len(split_file_path) > 2:
            template_paths[split_file_path[1]].append(file_path)
    # Extractor ids referenced by templates are validated against this.
    extractors = json.loads(
        repo.file_contents_for_branch('extractors.json', branch) or '{}')
    seen_files = set()
    spiders = set()
    for file_path in files_list:
        if file_path.startswith('spiders'):
            try:
                parts = file_path.split("/")
                if len(parts) >= 2:
                    spider_name = parts[1]
                    if spider_name.endswith('.json'):
                        spider_name = spider_name[:-5]
                    if spider_name not in spiders:
                        # Load spider if necessary (a template path implies
                        # the spider json itself, so redirect to it).
                        if len(parts) > 2:
                            file_path = 'spiders/' + spider_name + '.json'
                        file_contents = repo.file_contents_for_branch(
                            file_path, branch)
                        as_json = json.loads(file_contents)
                        templates = []
                        # Load all spider templates.
                        spider_templates = template_paths.get(spider_name, [])
                        for template_path in spider_templates:
                            seen_files.add(template_path)
                            existing = {}
                            # Ignore deleted templates.
                            try:
                                templ_contents = repo.file_contents_for_branch(
                                    template_path, branch)
                            except (TypeError, ValueError):
                                continue
                            json_template = json.loads(templ_contents)
                            # Validate extractors: drop ids that no longer
                            # exist in extractors.json.
                            template_extractors = json_template.get(
                                'extractors', {})
                            for field, eids in template_extractors.items():
                                existing[field] = [
                                    eid for eid in eids if eid in extractors
                                ]
                            json_template['extractors'] = existing
                            # NOTE(review): reassigns spider_name to the raw
                            # path segment — kept as-is to preserve behavior;
                            # verify intent.
                            spider_name = parts[1]
                            templates.append(json_template)
                        spiders.add(spider_name)
                        # Templates are inlined; the name list is dropped.
                        as_json.pop('template_names', None)
                        as_json['templates'] = templates
                        _add_to_archive(archive, file_path,
                                        json.dumps(as_json), now)
            # TypeError signals unreadable (deleted) spider content.
            except TypeError:
                if not ALLOW_DELETE:
                    continue
                # Handle deleted spiders: archive a {'deleted': True} stub.
                # Content is looked up on 'master' (not *branch*) —
                # presumably to detect files that existed before deletion.
                file_contents = repo.file_contents_for_branch(
                    file_path, 'master')
                file_info = {'deleted': True}
                if file_contents:
                    as_json = json.loads(file_contents)
                _add_to_archive(archive, file_path,
                                json.dumps(file_info), now)
        else:
            file_contents = repo.file_contents_for_branch(file_path, branch)
            if file_contents:
                _add_to_archive(archive, file_path, file_contents, now)
        seen_files.add(file_path)
    # Add empty placeholders for missing files required by dash.
    for file_path in {'extractors.json', 'items.json'} - seen_files:
        _add_to_archive(archive, file_path, '{}', now)
    archive.close()
def edit_project(self, name, revision):
    """Initialise the project repo on first use, then delegate the edit."""
    repo_exists = Repoman.repo_exists(name)
    if not repo_exists:
        self._init_project(name)
    return GitProjectsManager.edit_project(self, name, revision)