def __init__(self, spider, n_git_threads, n_file_threads, worker_file=None):
    # Pipeline constructor; assumes module-level imports of queue.Queue and
    # tools.Logger.init_main in the surrounding file.
    self.spider = spider
    self.n_git_threads = n_git_threads
    self.n_file_threads = n_file_threads
    self.queue_repositories = Queue()
    self.queue_out = Queue()
    self.workers_git = list()
    # Only instantiate the file worker when a class was passed in;
    # calling the default None would raise a TypeError.
    self.worker_file = worker_file(self.queue_out) if worker_file is not None else None
    self.repositories = []
    self.logger = init_main()
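# --- Illustrative sketch (not part of the original source) ---
# The constructor above wires two queues: queue_repositories feeds the git
# workers and queue_out collects their results for the file worker. The
# self-contained snippet below shows that producer/consumer pattern with the
# standard library only; consume() and the upper-casing step are stand-ins
# for the project's real worker logic, not part of this repository.
from queue import Queue
from threading import Thread


def consume(queue_in, queue_out):
    while True:
        item = queue_in.get()
        if item is None:  # sentinel value: no more work
            queue_in.task_done()
            break
        queue_out.put(item.upper())  # stand-in for real repository processing
        queue_in.task_done()


queue_repositories, queue_out = Queue(), Queue()
worker = Thread(target=consume, args=(queue_repositories, queue_out))
worker.start()
for repo in ["octocat/Hello-World", None]:
    queue_repositories.put(repo)
worker.join()
print(list(queue_out.queue))  # ['OCTOCAT/HELLO-WORLD']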
import json
import logging

from Config import Config
from concurrency.Pipeline import Pipeline
from concurrency.workers.WorkerFile import WorkerFile
from spiders.GitHubSpider import GitHubSpider
from tools.Logger import init_main

# Logging
logger = init_main()
logger.setLevel(logging.DEBUG)

# Pipeline
pipeline = Pipeline(GitHubSpider, Config.get_n_git_threads(), Config.get_n_file_threads(), WorkerFile)


def pull_collection(url):
    # Load repositories from collection
    repo_urls = pipeline.search_collections([url])

    # Save repositories
    with open(Config.get_dir_out() + "repos_" + url.replace('/', '_') + '.json', 'w', encoding='utf-8') as outfile:
        outfile.write(json.dumps(repo_urls, indent=2))

    return repo_urls


def pull_repository(url, file_ending_whitelist):
def __init__(self, spider):
    super().__init__()
    # Start in the stopped state; the pipeline presumably resumes the worker
    # once work is queued.
    self.stop()
    self.spider = spider()
    self.name = 'Worker Collections'
    self.logger = init_main()
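# --- Illustrative sketch (not part of the original source) ---
# The constructor above calls self.stop() and assigns self.name, which suggests
# the collections worker inherits from a Thread-based base class exposing a
# stop flag. The class below is an assumed, minimal reconstruction of such a
# base; the repository's actual Worker base class may differ.
import threading


class Worker(threading.Thread):
    def __init__(self):
        super().__init__()
        self._stopped = threading.Event()

    def stop(self):
        # Signal the run() loop to pause/exit; subclasses check is_stopped().
        self._stopped.set()

    def resume(self):
        self._stopped.clear()

    def is_stopped(self):
        return self._stopped.is_set()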