def _initialize(self):
    """Load and index the corpus data file, unless already done.

    The reader counts as initialized when ``self._basedir`` equals the
    current value of ``get_basedir()``. Otherwise the data file is read
    from ``<basedir>/<self._original_rootdir>/<self._data_file>``, the
    license and description are extracted with ``self.LICENSE_RE`` and
    ``self.DESCRIPTION_RE``, ``<-- ... -->`` markers are stripped, and
    the remaining text is split into items.

    On success this sets ``_rootdir``, ``_license``, ``_description``,
    ``_items`` (normalized key -> stripped contents), ``_itemlist``
    (tuple of keys in file order) and finally ``_basedir``. Nothing is
    assigned until parsing has succeeded, so a failed attempt does not
    make the reader look initialized on the next call.

    Raises:
        IOError: if the corpus directory does not exist, or the data
            file cannot be opened.
        AttributeError: if ``LICENSE_RE`` or ``DESCRIPTION_RE`` finds no
            match (``search`` returns ``None``).
        ValueError: if an item contains no ``'--'`` separator.
    """
    basedir = get_basedir()

    # If we're already initialized for this base directory, do nothing.
    if self._basedir == basedir:
        return

    # Make sure the corpus is installed.
    rootdir = os.path.join(basedir, self._original_rootdir)
    if not os.path.isdir(rootdir):
        raise IOError('%s is not installed' % self._name)

    # Read in the data file; the context manager closes the handle even
    # if reading fails.
    datapath = os.path.join(rootdir, self._data_file)
    with open(datapath) as stream:
        data = stream.read()

    # Extract the license and the description before comments are removed.
    license_text = self.LICENSE_RE.search(data).group(1)
    description = self.DESCRIPTION_RE.search(data).group(1)

    # Remove line number markings and other comments.
    data = re.sub(r'<--\s+.*?-->', '', data)

    # Divide the data into items. The first chunk precedes the first
    # item marker, so it is skipped.
    # NOTE(review): the separator literal is reproduced exactly as it
    # appears in this view; confirm its whitespace against the layout of
    # the data file.
    itemlist = []
    items = {}
    for item in re.split('\n #', data)[1:]:
        key, contents = item.split('--', 1)
        key = ' '.join(key.split())  # Normalize whitespace in the key.
        itemlist.append(key)
        items[key] = contents.strip()

    # Commit the new state only now that everything has been parsed.
    # _basedir goes last because it doubles as the "initialized" flag.
    self._rootdir = rootdir
    self._license = license_text
    self._description = description
    self._items = items
    self._itemlist = tuple(itemlist)
    self._basedir = basedir
def _initialize(self):
    """Make sure that we're initialized.

    Does nothing if ``self._initialized`` is already true. Otherwise:

    1. Resolves the corpus root. A relative ``self._original_rootdir``
       is looked up under ``get_basedir()``; an absolute one is used
       as-is, with ``self._basedir`` set to the empty string.
    2. Resets ``self._group_items`` and calls ``self._find_items(group)``
       for every group in ``self._groups`` (presumably each call fills
       in ``self._group_items`` -- confirm against ``_find_items``).
       If no ``'combined'`` group results, sets
       ``self._virtual_merged = 1`` and calls
       ``self._find_virtual_merged_items()``.
    3. Builds ``self._items`` by concatenating every group's items.
    4. Reads the description, license and copyright texts from their
       files under the root directory, for each one that is still
       ``None`` and has a file name configured.

    ``self._initialized`` is set only after all steps succeed.

    Raises:
        IOError: if the corpus directory does not exist, or a metadata
            file cannot be opened.
    """
    # If we're already initialized, then do nothing.
    if self._initialized:
        return

    # Make sure the corpus is installed.
    basedir = get_basedir()
    if not os.path.isabs(self._original_rootdir):
        rootdir = os.path.join(basedir, self._original_rootdir)
        if not os.path.isdir(rootdir):
            raise IOError('%s is not installed' % self._name)
        self._basedir = basedir
        self._rootdir = rootdir
    else:
        if not os.path.isdir(self._original_rootdir):
            raise IOError('%s is not installed' % self._name)
        self._basedir = ''  # An absolute root has no base directory.
        self._rootdir = self._original_rootdir

    # Get the list of items in each group.
    self._group_items = {}
    for group in self._groups:
        self._find_items(group)
    # Use the `in` operator rather than dict.has_key(), which no longer
    # exists in Python 3.
    if 'combined' not in self._group_items:
        self._virtual_merged = 1
        self._find_virtual_merged_items()

    # Get the overall list of items.
    self._items = []
    for items in self._group_items.values():
        self._items.extend(items)

    def read_metadata(filename):
        # Return the full text of a file under the corpus root, closing
        # the handle promptly.
        path = os.path.join(self._rootdir, filename)
        with open(path) as stream:
            return stream.read()

    # Read metadata from files, without overriding values already set.
    if self._description is None and self._description_file is not None:
        self._description = read_metadata(self._description_file)
    if self._license is None and self._license_file is not None:
        self._license = read_metadata(self._license_file)
    if self._copyright is None and self._copyright_file is not None:
        self._copyright = read_metadata(self._copyright_file)

    self._initialized = True