def deploy_project(name, apikey, changed_files=None, repo=None, branch='master'):
    """Archive a GIT project and upload it to Dash.

    :param name: project name, used both to open the repo and as the Dash
        project id.
    :param apikey: Dash API key sent with the upload request.
    :param changed_files: optional iterable of repo paths; when given, only
        the spiders under ``spiders/`` among them are archived.
    :param repo: already-open repo; opened via ``Repoman`` when ``None``.
    :param branch: branch to archive (defaults to ``'master'``).
    :returns: dict with ``status`` and the project's ``schedule_url``.
    :raises DeployError: when Dash answers with a non-200 status.
    """
    repo = Repoman.open_repo(name) if repo is None else repo
    packer = GitProjectArchiver(repo, branch=branch, ignore_deleted=False,
                                version=(0, 9),
                                required_files=REQUIRED_FILES)
    # Restrict the archive to the touched spiders when a change list is given.
    selected_spiders = None
    if changed_files is not None:
        selected_spiders = set()
        for changed_path in changed_files:
            if changed_path.startswith('spiders/'):
                selected_spiders.add(packer._spider_name(changed_path))
    archive_buffer = packer.archive(selected_spiders)
    response = requests.post(
        DASH_API_URL + 'as/import.json?version=portia',
        files=[('archive', ('archive', archive_buffer, 'application/zip'))],
        params={'apikey': apikey, 'project': name})
    if response.status_code != 200:
        raise DeployError('Deploy to Dash failed: %s' % response.text)
    # DASH_API_URL ends with an API suffix; strip two path components to get
    # the dashboard root, then point at this project's page.
    return {
        'status': 'ok',
        'schedule_url': DASH_API_URL.rsplit('/', 2)[0] + '/p/' + name
    }
def _archive_project(name, buff):
    """Archive a project stored in GIT into a zip file.

    Reads every file on ``master``, validates template extractor references
    against ``extractors.json``, inlines each spider's templates into the
    spider JSON, and writes the result into *buff* as a zip archive.

    :param name: project/repo name opened via ``Repoman``.
    :param buff: writable file-like object that receives the zip data.
    """
    repo = Repoman.open_repo(name)
    now = datetime.now().timetuple()[:6]
    archive = zipfile.ZipFile(buff, "w", zipfile.ZIP_DEFLATED)
    files_list = repo.list_files_for_branch('master')
    # Load extractor definitions first so template references can be pruned.
    extractors = {}
    for file_path in files_list:
        if file_path == 'extractors.json':
            extractors = json.loads(
                repo.file_contents_for_branch(file_path, 'master'))
    seen_files = set()
    spiders = {}
    templates = defaultdict(list)
    for file_path in files_list:
        file_contents = repo.file_contents_for_branch(file_path, 'master')
        if file_path.startswith('spiders'):
            as_json = json.loads(file_contents)
            try:
                parts = file_path.split("/")
                if len(parts) == 2:
                    # spider json
                    spider_name = parts[1].rsplit(".", 1)[0]
                    spiders[spider_name] = file_path, as_json
                elif len(parts) == 3:
                    # template json: drop references to extractors that no
                    # longer exist in extractors.json
                    existing = {}
                    for field, eids in as_json.get('extractors', {}).items():
                        existing[field] = [
                            eid for eid in eids
                            if eid in extractors
                        ]
                    as_json['extractors'] = existing
                    spider_name = parts[1]
                    templates[spider_name].append(as_json)
            except ValueError:
                continue
        else:
            _add_to_archive(archive, file_path, file_contents, now)
            seen_files.add(file_path)
    # Add empty placeholders for missing files required by dash
    for file_path in {'extractors.json', 'items.json'} - seen_files:
        _add_to_archive(archive, file_path, '{}', now)
    # Embed each spider's templates in the spider JSON before archiving.
    # .items() (not the Python-2-only .iteritems()) and a pop() default keep
    # this consistent with the newer _archive_project revisions and avoid a
    # KeyError when a spider has no 'template_names' key.
    for spider_name, (path, json_spec) in spiders.items():
        json_spec.pop('template_names', None)
        json_spec['templates'] = templates[spider_name]
        _add_to_archive(archive, path, json.dumps(json_spec), now)
    archive.close()
def _archive_project(name, buff, files=None, repo=None, branch='master'):
    """Archive a project stored in GIT into a zip file.

    Inlines each spider's templates (with stale extractor references pruned
    against ``extractors.json``) into the spider JSON and writes everything
    into *buff* as a zip archive.

    :param name: project/repo name opened via ``Repoman`` when *repo* is None.
    :param buff: writable file-like object that receives the zip data.
    :param files: optional subset of paths to archive; templates are still
        resolved against the full branch listing.
    :param repo: already-open repo, opened from *name* when ``None``.
    :param branch: branch to read file contents from.
    """
    if repo is None:
        repo = Repoman.open_repo(name)
    now = datetime.now().timetuple()[:6]
    archive = zipfile.ZipFile(buff, "w", zipfile.ZIP_DEFLATED)
    # files_list: what gets archived; all_files: full listing used to find
    # every template belonging to a spider even when only a subset changed.
    files_list = files if files is not None else \
        repo.list_files_for_branch(branch)
    all_files = files_list if files is None else \
        repo.list_files_for_branch(branch)
    template_paths = defaultdict(list)
    for file_path in all_files:
        split_file_path = file_path.split('/')
        if len(split_file_path) > 2:
            template_paths[split_file_path[1]].append(file_path)
    extractors = json.loads(
        repo.file_contents_for_branch('extractors.json', branch) or '{}')
    seen_files = set()
    spiders = set()
    for file_path in files_list:
        if file_path.startswith('spiders'):
            try:
                parts = file_path.split("/")
                if len(parts) >= 2:
                    spider_name = parts[1]
                    if spider_name.endswith('.json'):
                        spider_name = spider_name[:-5]
                    if spider_name not in spiders:
                        # Load spider if necessary (a template path is
                        # redirected to its owning spider's json file)
                        if len(parts) > 2:
                            file_path = 'spiders/' + spider_name + '.json'
                        file_contents = repo.file_contents_for_branch(
                            file_path, branch)
                        as_json = json.loads(file_contents)
                        templates = []
                        # Load all spider templates
                        spider_templates = template_paths.get(spider_name, [])
                        for template_path in spider_templates:
                            seen_files.add(template_path)
                            existing = {}
                            # Ignore deleted templates
                            try:
                                templ_contents = repo.file_contents_for_branch(
                                    template_path, branch)
                            except (TypeError, ValueError):
                                continue
                            json_template = json.loads(templ_contents)
                            # Validate extractors: keep only ids that still
                            # exist in extractors.json
                            template_extractors = json_template.get(
                                'extractors', {})
                            for field, eids in template_extractors.items():
                                existing[field] = [
                                    eid for eid in eids
                                    if eid in extractors
                                ]
                            json_template['extractors'] = existing
                            spider_name = parts[1]
                            templates.append(json_template)
                        spiders.add(spider_name)
                        as_json.pop('template_names', None)
                        as_json['templates'] = templates
                        _add_to_archive(archive, file_path,
                                        json.dumps(as_json), now)
            except TypeError:
                # A missing file raises TypeError; ALLOW_DELETE is a
                # module-level flag — presumably gates whether deletions are
                # propagated to Dash (TODO confirm at module scope).
                if not ALLOW_DELETE:
                    continue
                # Handle Deleted Spiders
                file_contents = repo.file_contents_for_branch(file_path,
                                                              'master')
                file_info = {'deleted': True}
                if file_contents:
                    # NOTE(review): result unused — kept only so malformed
                    # JSON still raises; possibly meant to merge into
                    # file_info. Confirm intent.
                    as_json = json.loads(file_contents)
                _add_to_archive(archive, file_path,
                                json.dumps(file_info), now)
        else:
            file_contents = repo.file_contents_for_branch(file_path, branch)
            # Guard against deleted/empty files: contents may be None, which
            # would otherwise be written into the archive and crash.
            if file_contents:
                _add_to_archive(archive, file_path, file_contents, now)
        seen_files.add(file_path)
    # Add empty placeholders for missing files required by dash
    for file_path in {'extractors.json', 'items.json'} - seen_files:
        _add_to_archive(archive, file_path, '{}', now)
    archive.close()
def _archive_project(name, buff, files=None, repo=None, branch='master',
                     ignore_deleted=False):
    """Archive a project stored in GIT into a zip file.

    Writes every requested file to *buff* as a zip archive, inlining each
    spider's templates into the spider JSON and pruning template extractor
    references that no longer exist in ``extractors.json``.

    :param name: project/repo name, opened via ``Repoman`` when *repo* is None.
    :param buff: writable file-like object receiving the zip bytes.
    :param files: optional subset of paths to archive; templates are still
        resolved against the full branch listing.
    :param repo: already-open repo object, opened from *name* when ``None``.
    :param branch: branch whose file contents are read.
    :param ignore_deleted: when True, spiders whose files are missing
        (TypeError on read) are skipped instead of being archived as deleted.
    """
    if repo is None:
        repo = Repoman.open_repo(name)
    now = datetime.now().timetuple()[:6]
    archive = zipfile.ZipFile(buff, "w", zipfile.ZIP_DEFLATED)
    # files_list: what gets archived; all_files: the full branch listing,
    # needed so a spider picks up all of its templates even when only a
    # subset of files changed.
    files_list = files if files is not None else \
        repo.list_files_for_branch(branch)
    all_files = files_list if files is None else \
        repo.list_files_for_branch(branch)
    # Map spider directory name -> template file paths (paths with >2 parts).
    template_paths = defaultdict(list)
    for file_path in all_files:
        split_file_path = file_path.split('/')
        if len(split_file_path) > 2:
            template_paths[split_file_path[1]].append(file_path)
    extractors = json.loads(repo.file_contents_for_branch('extractors.json',
                                                          branch) or '{}')
    seen_files = set()
    spiders = set()
    for file_path in files_list:
        if file_path.startswith('spiders'):
            try:
                parts = file_path.split("/")
                if len(parts) >= 2:
                    spider_name = parts[1]
                    if spider_name.endswith('.json'):
                        spider_name = spider_name[:-5]
                    if spider_name not in spiders:
                        # Load spider if necessary
                        # (a template path redirects to its spider's json)
                        if len(parts) > 2:
                            file_path = 'spiders/' + spider_name + '.json'
                        file_contents = repo.file_contents_for_branch(
                            file_path, branch)
                        as_json = json.loads(file_contents)
                        templates = []
                        # Load all spider templates
                        spider_templates = template_paths.get(spider_name, [])
                        for template_path in spider_templates:
                            seen_files.add(template_path)
                            existing = {}
                            # Ignore deleted templates
                            try:
                                templ_contents = repo.file_contents_for_branch(
                                    template_path, branch)
                            except (TypeError, ValueError):
                                continue
                            json_template = json.loads(templ_contents)
                            # Validate extractors: keep only ids still
                            # present in extractors.json
                            template_extractors = json_template.get(
                                'extractors', {})
                            for field, eids in template_extractors.items():
                                existing[field] = [eid for eid in eids
                                                   if eid in extractors]
                            json_template['extractors'] = existing
                            spider_name = parts[1]
                            templates.append(json_template)
                        spiders.add(spider_name)
                        as_json.pop('template_names', None)
                        as_json['templates'] = templates
                        _add_to_archive(archive, file_path,
                                        json.dumps(as_json), now)
            except TypeError:
                # Reading a missing file raises TypeError.
                if ignore_deleted:
                    continue
                # Handle Deleted Spiders
                file_contents = repo.file_contents_for_branch(file_path,
                                                              'master')
                file_info = {'deleted': True}
                if file_contents:
                    # NOTE(review): as_json is never used here — possibly
                    # meant to be merged into file_info; confirm intent.
                    as_json = json.loads(file_contents)
                _add_to_archive(archive, file_path,
                                json.dumps(file_info), now)
        else:
            file_contents = repo.file_contents_for_branch(file_path, branch)
            # Skip deleted/empty files (contents may be None).
            if file_contents:
                _add_to_archive(archive, file_path, file_contents, now)
        seen_files.add(file_path)
    # Add empty placeholders for missing files required by dash
    for file_path in {'extractors.json', 'items.json'} - seen_files:
        _add_to_archive(archive, file_path, '{}', now)
    archive.close()
def test_create(self):
    """Smoke-test: open an existing repo and dump its branch file listing.

    NOTE(review): no assertions — this only checks the calls don't raise.
    """
    repoman = Repoman.open_repo('new_project_5', self._connection)
    # print() with a single argument is valid in both Python 2 and 3,
    # unlike the Python-2-only `print x` statement form.
    print(repoman.list_files_for_branch('marcos'))
    print(repoman.get_branch_changed_files('marcos'))
def test_create(self):
    """Smoke-test: open an existing repo and dump its branch file listing.

    NOTE(review): duplicate definition of test_create in this file — the
    later definition shadows the earlier one; consider renaming or removing.
    """
    repoman = Repoman.open_repo('new_project_5', self._connection)
    # print() with a single argument is valid in both Python 2 and 3,
    # unlike the Python-2-only `print x` statement form.
    print(repoman.list_files_for_branch('marcos'))
    print(repoman.get_branch_changed_files('marcos'))