Example #1
def build_test_data():

    dawg.CompletionDAWG(['f', 'bar', 'foo',
                         'foobar']).save('dev_data/small/completion.dawg')
    dawg.CompletionDAWG([]).save('dev_data/small/completion-empty.dawg')

    bytes_data = (('foo', b'data1'), ('bar', b'data2'), ('foo', b'data3'),
                  ('foobar', b'data4'))
    dawg.BytesDAWG(bytes_data).save('dev_data/small/bytes.dawg')

    record_data = (('foo', (3, 2, 256)), ('bar', (3, 1, 0)),
                   ('foo', (3, 2, 1)), ('foobar', (6, 3, 0)))
    dawg.RecordDAWG(str(">3H"), record_data).save('dev_data/small/record.dawg')

    int_data = {'foo': 1, 'bar': 5, 'foobar': 3}
    dawg.IntDAWG(int_data).save('dev_data/small/int_dawg.dawg')
    dawg.IntCompletionDAWG(int_data).save(
        'dev_data/small/int_completion_dawg.dawg')

    dawg.DAWG(TestPrediction.DATA).save('dev_data/small/prediction.dawg')
    dawg.RecordDAWG(str("=H"), [
        (k, (len(k), )) for k in TestPrediction.DATA
    ]).save('dev_data/small/prediction-record.dawg')

    create_dawg().save('dev_data/large/dawg.dawg')
    create_bytes_dawg().save('dev_data/large/bytes_dawg.dawg')
    create_record_dawg().save('dev_data/large/record_dawg.dawg')
    create_int_dawg().save('dev_data/large/int_dawg.dawg')
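
For context, a short sketch (not part of the original helper) of how the record DAWG saved above could be read back; the path and the ">3H" format string mirror the call in build_test_data():

import dawg

d = dawg.RecordDAWG(">3H")
d.load('dev_data/small/record.dawg')

assert 'foo' in d
# a key may map to several records; lookups return a list of tuples
assert sorted(d['foo']) == [(3, 2, 1), (3, 2, 256)]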
Example #2
    def createIndex(self):
        '''
        Truncated Deletion Neighborhoods.
        Like the deletion neighborhood technique above, except each dictionary
            entry is truncated to k characters.
        Every entry in the index therefore represents a range of dictionary
            words with that prefix.
        k = 7 and delta = 3 gives an index roughly the same size as the
            dictionary with identical running time to the full-size deletion
            neighborhood.
        k = 6 and delta = 3 gives an index smaller than the dictionary at the
            cost of approximately double the runtime.
        Investigate: using different values of k for different words.

        Changes from the full-word algorithm: build a trie instead. To search
            for substrings of length i, we do a traversal from the last trie
            node (the trie tool handles this automatically).
        '''
        prefixes = self.getRanges()  # takes the form {prefix:rangeid}

        # each range is an array of length 2, [index, size]
        # turn the arrays into tuples, and use them as the values for the trie
        # first: get all of the deletion neighborhood for the keys, arrange
        #    into two arrays for feeding into the trie

        keys = []
        values = []
        fmt = "<HH"
        # little-endian two unsigned short tuple Max number and
        # size of ranges, 65000
        # if that's not enough, use I for unsigned int

        i = 0
        for prefix in prefixes.keys():
            substringset = n_deletion_neighborhood(prefix, self.delta)
            # get deletion neighborhood
            for sub in substringset:
                # lemma 3.2.5: in order to index we only need sequences of
                #   length k - delta
                # TODO: decide whether or not to make this line != or >
                #   > should result in a bigger index but somehow doesn't?
                #       I'm confused.
                #   != causes loss of recall for words shorter than k - delta
                if len(sub) > self.k - self.delta or (len(sub) <
                                                      self.minimum_prefix):
                    continue
                keys.append(sub)
                values.append(tuple(self.ranges[prefixes[prefix]]))
                i += 1
        try:
            self.index = dawg.RecordDAWG(fmt, zip(keys, values))
        except Exception:
            print("Big dictionary, trying to index with 4 bytes rather than 2")
            self.index = dawg.RecordDAWG("<II", zip(keys, values))

        del prefixes
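
A side note on the "<HH"/"<II" fallback above: RecordDAWG packs each value tuple with a plain struct format string, so the two-unsigned-short layout caps each field at 65535. A minimal sketch showing that limit with struct alone:

import struct

struct.pack("<HH", 65535, 65535)   # fits: an unsigned short tops out at 65535
try:
    struct.pack("<HH", 70000, 0)   # a range index past 65535 overflows
except struct.error:
    pass                           # hence the "<II" fallback above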
Example #3
    def __sents__(self):

        encoding = self._encoding
        sentencizer = self._prep.sentencizer
        clean = self._clean
        path = self._input

        if self.loadas == 'txt' and self._path:
            path = datapath(self._path,
                            datadir=self.datadir,
                            ext=".tagged.txt").full
            if os.path.exists(path):
                encoding = 'utf-8'
                sentencizer = None
                clean = None

        stream = Stream(path, encoding=encoding)
        self.encoding = stream._encoding
        for num, sent in enumerate(stream(sentencizer, clean)):
            tagged_sent = TaggedSentence(sent.strip(), num, self._prep,
                                         self._filters)
            lemmas = tagged_sent.lemmas()
            # all lemmas go into this dictionary,
            # since nothing is filtered at this point
            self._vocab += FreqDist(lemmas)
            self._nwords += tagged_sent.nwords
            self._sents.append(tagged_sent)
            #self._words.extend(tagged_sent.words())

            yield tagged_sent

        data = ((token.word, (token.nsent, token.idx))
                for sent in self.sents() for token in sent.tokens(lower=True))
        self._trie = dawg.RecordDAWG(">IH", data)
Example #4
    def _build_dawg(self) -> dawg.RecordDAWG:
        words = self._filter_words(self._words)
        freqs = set(freq for _, (_, _, freq) in words)
        freq_to_index = self._quantize_freqs(freqs)
        words = ((word, (freq_to_index[freq], lem_rule, gr_val))
                 for word, (gr_val, lem_rule, freq) in words)

        return dawg.RecordDAWG('>HHH', words)
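
The ">HHH" record has no room for a float frequency, so _quantize_freqs (not shown here) evidently maps each distinct frequency to a small integer index. A purely hypothetical sketch of such a mapping:

# Hypothetical: replace each distinct frequency by its rank so it fits into
# an unsigned short ('H'); the real _quantize_freqs may differ.
def _quantize_freqs(freqs):
    return {freq: i for i, freq in enumerate(sorted(freqs))}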
Example #5
    def handle(self, *args, **options):
        print('Emptying table...')
        Term.objects.all().delete()

        for timeframe, dates in TIMEFRAMES.items():
            print('Retrieving documents for timeframe {}...'.format(
                timeframe))
            exclude_dist = Distribution.objects.exclude(
                name='Landelijk').values_list('id', flat=True)
            date_range = daterange2dates(dates)

            total_documents = count_search_results(settings.ES_INDEX,
                                                   settings.ES_DOCTYPE, None,
                                                   date_range, exclude_dist,
                                                   [], []).get('count')
            print('Total documents: {}'.format(total_documents))

            sets = document_id_chunks(10000,
                                      settings.ES_INDEX,
                                      settings.ES_DOCTYPE,
                                      None,
                                      date_range,
                                      dist=exclude_dist)

            print('Counting terms...')
            counter = Counter()
            for n, s in enumerate(sets):
                start_time = time.time()
                counter += termvector_wordcloud(settings.ES_INDEX,
                                                settings.ES_DOCTYPE,
                                                s,
                                                min_length=2,
                                                add_freqs=False)
                print('Completed set {} in {} seconds...'.format(
                    n + 1,
                    time.time() - start_time))

            print('Calculating IDFs...')
            terms = []
            for term, count in counter.items():
                if count > 1:  # don't add single occurrences
                    idf = math.log10(total_documents / float(count))
                    terms.append(
                        Term(timeframe=timeframe,
                             word=term,
                             count=count,
                             idf=idf))

            print('Transferring to database...')
            Term.objects.bulk_create(terms, batch_size=10000)

            print('Creating RecordDAWG')
            d = dawg.RecordDAWG(
                '<d', zip([t.word for t in terms], [(t.idf, ) for t in terms]))
            d.save(os.path.join(settings.PROJECT_PARENT, timeframe + '.dawg'))
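
The "<d" format stores a single little-endian double (the idf) per term; Example #12 below loads these per-timeframe .dawg files back to compute tf-idf scores.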
        """ Test code below.
Example #6
    def __init__(self, filename, featname, format=None):
        import dawg

        if format is None:
            self.data = dawg.CompletionDAWG()
        else:
            self.data = dawg.RecordDAWG(format)
        self.data.load(filename)

        self.filename = filename
        super(DAWGGlobalFeature, self).__init__(self.data, featname)
Example #7
 def __init__(self,
              value_format: str = 'h',
              threshold: float = 0.8,
              num_perm: int = 128,
              num_part: int = 32,
              tokenizer: Tokenizer = Tokenizer('zh')):
     self.value_format = value_format
     self.threshold = threshold
     self.num_perm = num_perm
     self.num_part = num_part
     self.tokenizer = tokenizer
     self.lsh = MinHashLSHEnsemble(threshold=self.threshold,
                                   num_perm=self.num_perm)
     self.record_dawg = dawg.RecordDAWG(self.value_format)
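
Here value_format='h' means each record is a single signed 16-bit integer. The empty RecordDAWG built in the constructor is only a placeholder: DAWGs are immutable once built, so add() in Example #11 below recreates self.record_dawg from the accumulated key/value pairs.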
Example #8
    def loaddawg(self, name, path=None):
        path_ = path or datapath(self._path, datadir=self.datadir).short
        path = '{}.{}.dawg'.format(path_, name)

        if self._validpath(path):
            if self.verbose:
                print('loading dawg:'.ljust(16),
                      path.replace(nlptk.MODULEDIR, '..'))

            d = dawg.RecordDAWG(">IH")
            obj = d.load(path)
        else:
            obj = None
        return obj
Example #9
    def __init__(self, dict_path: str):
        with TemporaryDirectory('dict') as temp_dir:
            with ZipFile(dict_path) as zip_file:
                zip_file.extractall(temp_dir)

            self._dawg = dawg.RecordDAWG('>HHH')
            self._dawg.load(os.path.join(temp_dir, 'dict.dawg'))

            with open(os.path.join(temp_dir, 'dict.info'), 'rb') as f:
                (self._categories, self._grammemes_mappings,
                 self._grammar_value_mapping, self._lemmatize_rule_mapping,
                 self._alphabet, self._similar_letters,
                 self._quantized_freqs_mapping) = pickle.load(f)

        self._similar_letters_replacements = self._compile_replacements()
        self._grammemes_matrix = self._build_grammemes_matrix()
Example #10
def to_dawg(df, columns=None, format=None):
    """
    Encode ``pandas.DataFrame`` with GeoNames data
    (loaded using :func:`read_geonames` and maybe filtered in some way)
    to ``dawg.CompletionDAWG`` or ``dawg.RecordDAWG``. ``dawg.CompletionDAWG``
    is created if ``columns`` and ``format`` are both None.
    """
    import dawg
    if columns is None:
        assert format is None
        df = _split_names_into_rows(df)
        return dawg.CompletionDAWG(iter(df.name))

    return dawg.RecordDAWG(format, _iter_geonames_items(df, columns))
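
A hypothetical call, purely for illustration (the 'population' column and the '>I' format string are assumptions, not part of the original module):

# df is assumed to come from read_geonames(); one record field per column,
# packed here as a big-endian unsigned int.
record_dawg = to_dawg(df, columns=['population'], format='>I')
record_dawg.save('geonames_population.dawg')

completion_dawg = to_dawg(df)   # no columns/format: a CompletionDAWG of names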
Example #11
 def add(self, text_list: List[str], value_list: List[tuple]):
     len_text = len(text_list)
     len_value = len(value_list)
     assert len_text == len_value
     data = {}
     entries = []
     for i, text in enumerate(text_list):
         entry = self.text_to_lsh_entry(text)
         key = entry[0]
         if key in data:
             continue
         value = value_list[i]
         self.__check_value_format(value)
         data[key] = value
         entries.append(entry)
     self.lsh.index(entries)
     self.record_dawg = dawg.RecordDAWG(self.value_format, data.items())
Example #12
def normalize_cloud(cloud_data, idf_timeframe=''):
    """
    Normalizes cloud data:
    - if necessary, calculates the tf-idf-scores
    - sort and return the maximum allowed number of words
    """
    # If IDF is set, multiply term frequencies by inverse document frequencies
    if idf_timeframe:
        d = dawg.RecordDAWG('<d')
        d.load(os.path.join(settings.PROJECT_PARENT, idf_timeframe + '.dawg'))
        result = [{
            'term': t,
            'count': c,
            'tfidf': round(tfidf(d, t, c), 2)
        } for t, c in cloud_data.items()]
        result = sorted(result, key=lambda k: k['tfidf'], reverse=True)
    else:
        result = [{'term': t, 'count': c} for t, c in cloud_data.items()]
        result = sorted(result, key=lambda k: k['count'], reverse=True)

    return result[:settings.WORDCLOUD_MAX_WORDS]
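
The tfidf helper is not shown here; since the '<d' DAWG stores one double (the idf) per term, it presumably does something along these lines (hypothetical sketch):

# Hypothetical: look up the stored idf for the term and scale it by the term
# frequency from the cloud; terms missing from the DAWG score 0.0.
def tfidf(idf_dawg, term, count):
    if term not in idf_dawg:
        return 0.0
    return count * idf_dawg[term][0][0]   # first record, single 'd' field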
Example #13
    def _filter_words(self, words: Iterable[Tuple[str, Tuple[int, int,
                                                             float]]]):
        base_dawg = dawg.RecordDAWG('>HHf', words)

        filtered_words = []
        for key in set(base_dawg.keys()):
            values = sorted(base_dawg.get_value(key), key=lambda x: x[0])
            prev_val, prev_lemmatize_rule = None, None
            prev_val_freq = 0.
            for val, lemmatize_rule, freq in values:
                if val == prev_val and lemmatize_rule == prev_lemmatize_rule:
                    prev_val_freq += freq
                else:
                    if prev_val is not None:
                        filtered_words.append(
                            (key, (prev_val, prev_lemmatize_rule,
                                   prev_val_freq)))
                    prev_val, prev_lemmatize_rule, prev_val_freq = val, lemmatize_rule, freq
            if prev_val is not None:
                filtered_words.append(
                    (key, (prev_val, prev_lemmatize_rule, prev_val_freq)))

        return filtered_words
Example #14
 def test_record_dawg_items_values(self, word, prediction):
     d = dawg.RecordDAWG(str("=H"), self.LENGTH_DATA)
     assert d.similar_item_values(word, self.REPLACES) == prediction
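
For context, self.REPLACES in tests like this is typically built with dawg.DAWG.compile_replaces; a small sketch of the pattern (the data and the 'е'/'ё' mapping are illustrative, taken from the library's documentation rather than from this test class):

import dawg

# treat 'е' and 'ё' as interchangeable during lookups
REPLACES = dawg.DAWG.compile_replaces({'е': 'ё'})
d = dawg.RecordDAWG("=H", [('тест', (4,)), ('тёст', (5,))])

# returns the record lists of every key that matches 'тест'
# once the replacement table is applied (here: 'тест' and 'тёст')
print(d.similar_item_values('тест', REPLACES))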
Example #15
    def __init__(
            self,
            intake,
            prep: Prep = None,
            clean: TextCleaner = None,
            filters: TokenFilter = None,
            inplace=False,
            datadir=None,
            encoding=chardetector,
            verbose=True,
            rewrite=False,
            loadas="pickle",
            saveas=("txt", "pickle"),
            input='filename'  # str {'filename', 'file', 'text'}
    ):

        self._path = ''
        self.filename = ''
        self.name = ''
        self.inplace = inplace
        self.verbose = verbose
        self.rewrite = rewrite
        self.loadas = loadas
        self.saveas = saveas
        self.encoding = 'unknown'

        if not datadir:
            self.datadir = os.path.join(
                os.path.abspath(os.path.dirname(__file__)), "data")
        else:
            self.datadir = os.path.abspath(datadir)

        self._encoding = None
        self._nwords = 0
        self._sents = []
        self._vocab = FreqDist()
        self._trie = dawg.RecordDAWG(">IH")

        if input == 'filename':
            self._path = intake
            #self.filename = os.path.basename(os.path.splitext(self._path)[0])
            self.filename = os.path.basename(self._path)
            self.name = os.path.splitext(self.filename)[0]
            self._encoding = encoding
            self._input = intake

            if not self.rewrite:
                if self.loadas == 'pickle':
                    # all sentences from the text
                    self._sents = self.loadpickle('sents') or []
                    # all unique normalized words from the text
                    self._vocab = self.loadpickle('vocab') or FreqDist()
                    # building the vocabulary iteratively takes several
                    # seconds, so it is faster (a fraction of a second)
                    # to read it from pickle
                    #for sent in self._sents:
                    #    self._vocab += FreqDist(sent.lemmas())

                    self._trie = self.loaddawg('trie') or dawg.RecordDAWG(
                        ">IH")  # prefix tree

        elif input == "text":
            self._input = io.StringIO(intake)
            self._path = ''
            self.filename = self._input.__class__.__name__
            self.name = self.filename

        elif input == "file":
            self._input = intake
            self._path = ''
            self.filename = self._input.__class__.__name__
            self.name = self.filename

        if self._sents:
            self._nwords = sum(map(lambda s: s.nwords, self._sents))

        self._prep = prep
        self._clean = clean
        self._filters = filters

        self._iter = self.__sents__()
        # close the generator if data is loaded
        if self._sents:
            self._iter.close()

        if self.inplace:
            if not self._sents:
                list(self._iter)
Example #16
File: speed.py Project: yyht/DAWG
def create_record_dawg():
    words = words100k()
    values = [[len(word)] for word in words]
    return dawg.RecordDAWG(str('<H'), zip(words, values))
Example #17
 def dawg(self):
     return dawg.RecordDAWG(">3H", self.STRUCTURED_DATA)