def __init__(self, *sc_queues, **kwargs):
    """Initialise the spider process from optional queues and keyword settings.

    Positional arguments:
        *sc_queues: stored unchanged on ``self.scqs``; an empty list is
            stored when none are given.

    Keyword arguments (each one is popped from ``kwargs``):
        spider_spoof_id: value sent as the ``User-Agent`` header. A
            Mozilla/5.0 string is used when it is omitted or ``None``.
        kill_evt: event object stored on ``self.kill_evt``. When omitted
            or ``None`` a new ``multiprocessing.Event`` is created.
        seed: stored as the only element of ``self.urls_l`` (``None`` when
            omitted).
        xtrees_q: queue stored on ``self.xtrees_q``; a new ``Queue()`` is
            created only when the caller does not supply one.
        ext_due_q: queue of URL links for an external DUE-Unit, stored on
            ``self.ext_url_q`` (default ``None``).
        base_url_drop_none: stored on ``self.base_url_drop_none``
            (default ``True``).
        urls_number_stop: stored on ``self.urls_number`` (default 1000).
        webpg_vect_tu: stored on ``self.webpg_vect_tu`` (default ``None``).
        save_path: stored on ``self.save_path`` (default ``None``); when
            set and not already a directory, it is created with
            ``os.mkdir``.
    """
    Process.__init__(self)

    # Every instance takes the next value of the class-wide counter.
    SCSpider.Num += 1
    self.pnum = SCSpider.Num

    self.scqs = sc_queues if sc_queues else list()
    self.due = DUEUnit()
    self.link_extro = LinkExtractorTPool(feed=False)

    # self.headers keeps the HTTP User-Agent information used for masking
    # the crawler.
    user_agent = kwargs.pop("spider_spoof_id", None)
    if user_agent is None:
        user_agent = 'Mozilla/5.0 (X11; U; Linux x86_64; en-GB; rv:1.9.1.9)'
    self.headers = {'User-Agent': user_agent}

    # Event.clear() returns None, so passing its result as the default left
    # kill_evt set to None. A newly created Event already starts unset.
    self.kill_evt = kwargs.pop("kill_evt", None)
    if self.kill_evt is None:
        self.kill_evt = multiprocessing.Event()

    self.urls_l = [kwargs.pop("seed", None)]

    # Use an external Queue only for interprocess communication, if any.
    # The fallback Queue is built only when the caller did not pass one,
    # instead of always constructing (and then discarding) a default.
    if "xtrees_q" in kwargs:
        self.xtrees_q = kwargs.pop("xtrees_q")
    else:
        self.xtrees_q = Queue()

    # ext_due_q is a Queue of URL links for an external DUE-Unit.
    self.ext_url_q = kwargs.pop("ext_due_q", None)
    self.base_url_drop_none = kwargs.pop("base_url_drop_none", True)

    # urls_number_stop: per the original author's note, the default limit
    # is there for politeness and because more samples of the same site
    # are presumably of little use.
    self.urls_number = kwargs.pop("urls_number_stop", 1000)
    self.webpg_vect_tu = kwargs.pop("webpg_vect_tu", None)

    self.save_path = kwargs.pop("save_path", None)
    if self.save_path and not os.path.isdir(self.save_path):
        os.mkdir(self.save_path)
    self.file_counter = 0
def __init__(self):
    """Create the empty bookkeeping tables, ensure the delegation token
    directory exists, parse the configuration and schedule the periodic
    handler.
    """
    # Shells, keyed by (username, shell_id) tuples. Each user has his/her
    # own set of shell ids.
    self._shells = dict()
    # File descriptor -> Shell instance whose output it represents.
    self._shells_by_fds = dict()
    # Username -> utils.UserMetadata object.
    self._meta = dict()

    # Short name -> command (e.g. ["pig", "-l", "/dev/null"]).
    self._command_by_short_name = dict()
    # Short name -> dictionary holding the environment for shells of that
    # type.
    self._env_by_short_name = dict()

    # Hue Instance ID (HID) -> greenlet currently fetching output for that
    # HID.
    self._greenlets_by_hid = dict()
    # Process ID (PID) -> HID whose greenlet is currently doing a "select"
    # on the process's output fd.
    self._hids_by_pid = dict()
    # PID -> set of greenlets that are also interested in the output from
    # that process, but are not doing the select.
    self._greenlets_to_notify = dict()
    # Greenlet -> whether it can be safely interrupted.
    self._greenlet_interruptable = dict()

    token_dir = shell.conf.SHELL_DELEGATION_TOKEN_DIR.get()
    self._delegation_token_dir = token_dir
    if not os.path.exists(token_dir):
        os.mkdir(token_dir)

    self._parse_configs()
    # Hand the periodic handler to eventlet with a delay argument of 1.
    eventlet.spawn_after(1, self._handle_periodic)
def __init__(self, path=None):
    """Set up empty URL bookkeeping and make sure the storage directory
    exists.

    path -- directory stored on ``self.filespath``. When it is falsy a
            hard-coded default directory is used instead. The chosen
            directory is created with ``os.mkdir`` if it is not already a
            directory.
    """
    self.id = None
    # Keeps the hash and the Base URL.
    self.base_url = {}
    # Keeps the URLs with or without the Base part.
    self.seen = {}
    self.filelist = []
    self.conditonal_var = threading.Condition()
    self.green_pool = GreenPool(100)

    # Either branch yields a non-empty value, so only the directory test is
    # needed before creating it.
    self.filespath = path or "/home/dimitrios/Documents/Synergy-Crawler/seen_urls/"
    if not os.path.isdir(self.filespath):
        os.mkdir(self.filespath)
def __init__(self):
    """Create the empty bookkeeping tables, ensure the delegation token
    directory exists, parse the configuration and schedule the periodic
    handler.
    """
    # Shells, keyed by (username, shell_id) tuples. Each user has his/her
    # own set of shell ids.
    self._shells = dict()
    # File descriptor -> Shell instance whose output it represents.
    self._shells_by_fds = dict()
    # Username -> utils.UserMetadata object.
    self._meta = dict()

    # Short name -> command (e.g. ["pig", "-l", "/dev/null"]).
    self._command_by_short_name = dict()
    # Short name -> dictionary holding the environment for shells of that
    # type.
    self._env_by_short_name = dict()

    # Hue Instance ID (HID) -> greenlet currently fetching output for that
    # HID.
    self._greenlets_by_hid = dict()
    # Process ID (PID) -> HID whose greenlet is currently doing a "select"
    # on the process's output fd.
    self._hids_by_pid = dict()
    # PID -> set of greenlets that are also interested in the output from
    # that process, but are not doing the select.
    self._greenlets_to_notify = dict()
    # Greenlet -> whether it can be safely interrupted.
    self._greenlet_interruptable = dict()

    token_dir = shell.conf.SHELL_DELEGATION_TOKEN_DIR.get()
    self._delegation_token_dir = token_dir
    if not os.path.exists(token_dir):
        os.mkdir(token_dir)

    self._parse_configs()
    # Hand the periodic handler to eventlet with a delay argument of 1.
    eventlet.spawn_after(1, self._handle_periodic)
def __init__(self):
    """Create the empty bookkeeping tables, ensure the delegation token
    directory exists, record the configured shell types and schedule the
    periodic handler.
    """
    # Shells, keyed by (username, shell_id) tuples. Each user has his/her
    # own set of shell ids.
    self._shells = dict()
    # File descriptor -> Shell instance whose output it represents.
    self._shells_by_fds = dict()
    # Username -> utils.UserMetadata object.
    self._meta = dict()
    # Short name -> command (e.g. ["pig", "-l", "/dev/null"]).
    self._command_by_short_name = dict()

    # Hue Instance ID (HID) -> greenlet currently fetching output for that
    # HID.
    self._greenlets_by_hid = dict()
    # Process ID (PID) -> HID whose greenlet is currently doing a "select"
    # on the process's output fd.
    self._hids_by_pid = dict()
    # PID -> set of greenlets that are also interested in the output from
    # that process, but are not doing the select.
    self._greenlets_to_notify = dict()
    # Greenlet -> whether it can be safely interrupted.
    self._greenlet_interruptable = dict()

    token_dir = shell.conf.SHELL_DELEGATION_TOKEN_DIR.get()
    self._delegation_token_dir = token_dir
    if not os.path.exists(token_dir):
        os.mkdir(token_dir)

    # List of available shell types. Each entry carries a nice name
    # (e.g. "Python Shell"), a short name (e.g. "python") and whether the
    # executable was found. Every configured type is listed, but only the
    # ones whose executable exists get a command registered.
    available_types = []
    for short_name in shell.conf.SHELL_TYPES.keys():
        type_conf = shell.conf.SHELL_TYPES[short_name]
        command = type_conf.command.get().strip().split()
        nice_name = type_conf.nice_name.get().strip()
        found = utils.executable_exists(command)
        if found:
            self._command_by_short_name[short_name] = command
        available_types.append({
            constants.NICE_NAME: nice_name,
            constants.KEY_NAME: short_name,
            constants.EXISTS: found,
        })
    self.shell_types = available_types

    # Hand the periodic handler to eventlet with a delay argument of 1.
    eventlet.spawn_after(1, self._handle_periodic)