示例#1
0
def _save_stemdict(filename: str) -> None:
    """Pickle the module-level stemming dictionaries to ``filename``.

    The globals ``stemdict`` and ``unstemdict`` are written as one
    ``(stemdict, unstemdict)`` tuple to the binary handle returned by
    ``File(filename).openBin(mode='w')``.

    Args:
        filename: Path of the file to write.
    """
    global stemdict
    global unstemdict
    logger.debug('Saving stemming dictionary...')
    f = File(filename).openBin(mode='w')
    try:
        pickle.dump((stemdict, unstemdict), f)
    finally:
        # Close the handle even when pickling fails so it is never leaked.
        f.close()
示例#2
0
 def getTerms(self, filename, filters=[], relaxed=False, overwrite=False):
     """Return a FreqDist counting the filtered terms found in ``filename``.

     A cached result is used when both ``filename + '.nps'`` and a
     ``filter.save`` file in the same directory exist and the filter list
     stored in the cache equals ``filters``. In that case the stemming
     dictionary is loaded from ``filter.save`` if ``Filter.unstemdict`` is
     empty, and the cached distribution is returned.

     Otherwise the noun phrases from ``self.getNPs(filename)`` are expanded
     with ``self.extractPossibleTerms``, each candidate is passed through
     every filter in order, and the surviving (truthy) terms are counted.
     The result is pickled to ``filename + '.nps'`` when ``overwrite`` is
     true or that file does not exist yet, and ``filter.save`` is removed
     if present.

     Args:
         filename: Path of the input file.
         filters: Keys into ``Filter.criteria``, applied in order. The
             shared default list is only read here, never mutated.
         relaxed: Forwarded to ``self.extractPossibleTerms``.
         overwrite: Rewrite the ``.nps`` cache even if it already exists.

     Returns:
         The term frequency distribution.
     """
     filterfname = os.path.join(os.path.dirname(filename), 'filter.save')
     cache_name = filename + '.nps'
     if os.path.exists(cache_name) and os.path.exists(filterfname):
         cache_file = File(cache_name).openBin(mode='r')
         try:
             # NOTE: pickle.load can run arbitrary code; the cache file must
             # come from a trusted source.
             old_filters, fd = pickle.load(cache_file)
         finally:
             # Release the handle even if the cache is corrupt.
             cache_file.close()
         if old_filters == filters:
             if not Filter.unstemdict:
                 Filter._get_stemdict(filterfname)
             return fd
     NPs = self.getNPs(filename)
     fd = FreqDist()
     for NP in NPs:
         # get the possible terms for each NP
         terms = self.extractPossibleTerms(NP, relaxed)
         # filter each term by some given criteria
         # this requires keeping case information until this point
         for t in terms:
             for criterion in filters:
                 # @semanticbeeng @todo global state mutation
                 t = Filter.criteria[criterion](t)
             if t:
                 fd[t] += 1
     if overwrite or (not os.path.isfile(cache_name)):
         cache_file = File[CHUNKNPS](cache_name).openBin('w')
         try:
             pickle.dump((filters, fd), cache_file)
         finally:
             # Release the handle even if pickling fails.
             cache_file.close()
     if os.path.exists(filterfname):
         os.remove(filterfname)
     return fd
示例#3
0
def _get_stemdict(filename: str) -> None:
    """Load the module-level stemming dictionaries from ``filename``.

    Unpickles a ``(stemdict, unstemdict)`` pair from the binary handle
    returned by ``File(filename).openBin(mode='r')``, rebinds both globals,
    and then replaces ``unstemdict`` with ``dictionary.freeze_dict(unstemdict)``.

    Args:
        filename: Path of the pickle file to read.
    """
    global stemdict
    global unstemdict
    logger.debug('Loading stemming dictionary...')
    f = File(filename).openBin(mode='r')
    try:
        # NOTE: pickle.load can run arbitrary code; only load trusted files.
        stemdict, unstemdict = pickle.load(f)
    finally:
        # Close the handle even when unpickling fails so it is never leaked.
        f.close()

    # @semanticbeeng @todo global state initialization: only unstemdict is
    # frozen; per the original note, freezing stemdict the same way fails.
    unstemdict = dictionary.freeze_dict(unstemdict)
示例#4
0
def _get_stops() -> None:
    """Import stop words either from a text file or stopwords corpus.

    Builds the path ``Settings.dir_name + 'patentstops.txt'``, reads it via
    ``File(...).openText()``, and appends every whitespace-separated token of
    every line to the module-level ``stops``. The ``else`` branch would
    instead rebind ``stops`` to ``stopwords.words('english')``.
    """
    global stops
    import Settings
    filename = Settings.dir_name + 'patentstops.txt'

    # NOTE(review): filename always ends with a non-empty literal, so if
    # Settings.dir_name is a str this condition is always true and the corpus
    # fallback below never runs. Confirm whether a file-existence check was
    # intended.
    if filename:
        f = File(filename).openText()
        try:
            for line in f.readlines():
                stops += line.split()
        finally:
            # Close the handle even when reading fails so it is never leaked.
            f.close()
    else:
        stops = stopwords.words('english')