def _save_stemdict(filename: str) -> None:
    logger.debug('Saving stemming dictionary...')
    f = File(filename).openBin(mode='w')
    global stemdict
    global unstemdict
    pickle.dump((stemdict, unstemdict), f)
    f.close()
def getTerms(self, filename, filters=[], relaxed=False, overwrite=False):
    """Input file, output a FreqDist of terms"""
    filterfname = os.path.join(os.path.dirname(filename), 'filter.save')

    # Reuse a previously pickled FreqDist if it was built with the same filters.
    if os.path.exists(filename + '.nps') and os.path.exists(filterfname):
        f = File(filename + '.nps').openBin(mode='r')
        old_filters, fd = pickle.load(f)
        f.close()
        if old_filters == filters:
            if not Filter.unstemdict:
                Filter._get_stemdict(filterfname)
            return fd

    NPs = self.getNPs(filename)
    fd = FreqDist()
    for NP in NPs:
        # get the possible terms for each NP
        terms = self.extractPossibleTerms(NP, relaxed)
        # filter each term by some given criteria;
        # this requires keeping case information until this point
        # filt = Filter.Filter()  # class containing all filters
        for t in terms:
            for f in filters:
                t = Filter.criteria[f](t)  # @semanticbeeng @todo global state mutation
            if t:
                fd[t] += 1

    if overwrite or (not os.path.isfile(filename + '.nps')):
        f = File[CHUNKNPS](filename + '.nps').openBin('w')
        pickle.dump((filters, fd), f)
        f.close()
        if os.path.exists(filterfname):
            os.remove(filterfname)
    return fd
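# Hedged usage sketch for getTerms(). The class name TermExtractor, the input
# path and the filter names below are illustrative assumptions, not taken from
# the original source; it is assumed the enclosing class also provides getNPs()
# and extractPossibleTerms(), and that Filter.criteria maps filter names to callables.
extractor = TermExtractor()                       # hypothetical class name
fd = extractor.getTerms('corpus/patent_0001.txt', # hypothetical input path
                        filters=['stem', 'lowercase'],  # hypothetical filter names
                        overwrite=True)
for term, count in fd.most_common(10):            # fd is an nltk FreqDist
    print(term, count)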
def _get_stemdict(filename: str) -> None:
    logger.debug('Loading stemming dictionary...')
    f = File(filename).openBin(mode='r')
    global stemdict
    global unstemdict
    stemdict, unstemdict = pickle.load(f)
    f.close()
    # stemdict = dictionary.freeze_dict(stemdict)  # @semanticbeeng @todo global state initialization : this fails
    unstemdict = dictionary.freeze_dict(unstemdict)  # @semanticbeeng @todo global state initialization
def _get_stops() -> None:
    """Import stop words either from a text file or the stopwords corpus"""
    global stops
    import Settings
    filename = Settings.dir_name + 'patentstops.txt'
    if os.path.exists(filename):  # fall back to the NLTK stopwords corpus when the file is missing
        f = File(filename).openText()
        for line in f.readlines():
            stops += line.split()
        f.close()
    else:
        stops = stopwords.words('english')
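# A minimal sketch of the module-level scaffolding the helpers above appear to
# assume (inferred from their bodies; `File`, `dictionary.freeze_dict` and
# `Settings` are project-specific helpers defined elsewhere and not shown here).
import logging
import os
import pickle

from nltk.corpus import stopwords

logger = logging.getLogger(__name__)

# Shared module state read and written by _get_stemdict, _save_stemdict and _get_stops.
stemdict = {}    # stemming lookup table, persisted via pickle
unstemdict = {}  # reverse lookup table, frozen after loading
stops = []       # stop words filled in by _get_stops()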