Пример #1
0
 def compute(self,
             ngram_db,
             embeddings_db,
             table: str,
             field: str,
             save_every=-1):
     """Fill the ``comp_<self.fun.__name__>`` float field for each order n.

     A ``DatabaseManager`` is built over ``table`` reading ``rowid`` and
     ``field``. For every n from 2 to ``db_manager.n_max`` (inclusive) the
     rows are passed one by one to ``self._upd`` and the accumulated
     output of ``self._get_list()`` is handed to the manager for saving.

     Args:
         ngram_db: Database handle passed to ``DatabaseManager``.
         embeddings_db: Object providing ``connect()``, ``to_dict(n)`` and
             ``disconnect()``; presumably stores embedding vectors per
             order n -- confirm against its definition.
         table: Name of the table to read from.
         field: Name of the column read alongside ``rowid``.
         save_every: Forwarded to ``DatabaseManager`` as ``save_every``.
     """
     in_fld = ['rowid', field]
     db_manager = DatabaseManager(ngram_db,
                                  table,
                                  in_fld,
                                  save_every=save_every)
     embeddings_db.connect()
     # The connection is released in ``finally`` so that a failure while
     # reading or saving does not leave ``embeddings_db`` connected.
     try:
         new_fld = [('comp_' + self.fun.__name__, 'float')]
         db_manager.new_fields(new_fld)
         # Order-1 entries are loaded once and reused for every n.
         self.sw_vecs = embeddings_db.to_dict(1)
         for n in messages.pbar(range(2, db_manager.n_max + 1)):
             messages.computing_measure(new_fld[0][0], n)
             ngrams, _ = db_manager.get_iterator(n)
             self.ngram_vecs = embeddings_db.to_dict(n)
             for i, (rowid, ngram) in enumerate(ngrams):
                 self._upd(rowid, ngram)
                 db_manager.save_every(self._get_list(), i)
             # Flush whatever is left for this n.
             db_manager.save(self._get_list())
     finally:
         embeddings_db.disconnect()
     db_manager.finalize(self)
Пример #2
0
 def compute(self, ngram_db, table: str, field: str, v_table: str):
     """Fill the ``syn_entropy`` float field of ``table``.

     Rows are read from ``v_table`` (columns ``field`` and ``freq``)
     through a ``DatabaseManager``; the new field is created on ``table``.
     For each n from 2 to ``db_manager.n_max`` (inclusive), a dictionary
     built from ``table`` is stored in ``self.base_frqs``, every row of
     ``v_table`` is passed to ``self._upd`` and the result of
     ``self._get_list()`` is saved.

     Args:
         ngram_db: Database handle; also queried directly via ``to_dict``.
         table: Table that receives the new field.
         field: Column used as key in ``to_dict`` and read from ``v_table``.
         v_table: Table iterated row by row.
     """
     out_field = ('syn_entropy', 'float')
     db_manager = DatabaseManager(ngram_db, v_table, [field, 'freq'])
     db_manager.new_fields([out_field], table=table)
     orders = range(2, db_manager.n_max + 1)
     for n in messages.pbar(orders):
         messages.computing_measure(out_field[0], n)
         self.base_frqs = ngram_db.to_dict(table,
                                           [field],
                                           ['freq', 'rowid'],
                                           n)
         rows, _ = db_manager.get_iterator(n)
         for v_base, v_freq in rows:
             self._upd(v_base, v_freq)
         # One save per n; no intermediate saving in this variant.
         db_manager.save(self._get_list())
     db_manager.finalize(self)
Пример #3
0
 def compute(self, ngram_db, table: str, field: str, save_every=-1):
     """Fill the ``c_value`` float field, walking n downwards.

     Orders are visited from ``db_manager.n_max`` down to 2. For each n
     the rows (``rowid``, ``field``, ``freq``) are passed to ``self._upd``,
     results from ``self._get_list()`` are saved, and then
     ``self._swich_super()`` is called before moving to the next n.

     Args:
         ngram_db: Database handle passed to ``DatabaseManager``.
         table: Table to read from and add the field to.
         field: Column read between ``rowid`` and ``freq``.
         save_every: Passed positionally as the fourth argument of
             ``DatabaseManager``.
     """
     columns = ['rowid', field, 'freq']
     db_manager = DatabaseManager(ngram_db, table, columns, save_every)
     measure = 'c_value'
     db_manager.new_fields([(measure, 'float')])
     descending = range(db_manager.n_max, 1, -1)
     for n in messages.pbar(descending):
         messages.computing_measure(measure, n)
         rows, _ = db_manager.get_iterator(n)
         self.n = n
         for idx, (rowid, ngram, freq) in enumerate(rows):
             self._upd(rowid, ngram, freq)
             db_manager.save_every(self._get_list(), idx)
         db_manager.save(self._get_list())
         # Called once per n after saving; presumably prepares state for
         # the next (shorter) order -- confirm in _swich_super.
         self._swich_super()
     db_manager.finalize(self)
Пример #4
0
def entropy(ngram_db, variant_dict: dict, table: str, field: str):
    """Fill the ``lex_entropy`` float field of ``table``.

    An output database and ``n_max`` are obtained from ``initialize``. A
    ``LexEntCalculator`` built from ``variant_dict`` receives, for each n
    from 2 to ``n_max`` (inclusive), a frequency dictionary and then every
    ``(rowid, ngram)`` pair yielded by ``ngram_db[n]``. Intermediate
    results go through ``save_every``; a single ``update_data`` call runs
    after all orders have been processed.

    Args:
        ngram_db: Database handle; indexed by n and queried via ``to_dict``.
        variant_dict: Passed unchanged to ``LexEntCalculator``.
        table: Table to read from and update.
        field: Column read alongside ``rowid`` and used as ``to_dict`` key.
    """
    measure = 'lex_entropy'
    output_db, n_max = initialize(ngram_db, table, ['rowid', field])
    output_db.new_fields(table, [(measure, 'float')])
    calculator = LexEntCalculator(variant_dict)
    for n in range(2, n_max + 1):
        messages.computing_measure(measure, n)
        calculator.set_var_frqs(
            ngram_db.to_dict(table, [field], ['freq'], n))
        rows, total = ngram_db[n]
        progress = messages.pbar(rows, total=total)
        for idx, (rowid, ngram) in enumerate(progress):
            calculator.upd_stats(rowid, ngram)
            save_every(output_db, calculator.get_list(), table, measure, idx)
    # Final write happens once, after the loop over n.
    output_db.update_data(calculator.get_list(), table, measure)
    finalize(ngram_db, output_db)
Пример #5
0
 def compute(self, ngram_db, table: str, field: str, save_every=-1):
     """Fill the ``max_<field>`` float field of ``table``.

     For each n from 2 to ``db_manager.n_max`` (inclusive), the
     ``max_<field>`` values of order ``n - 1`` are loaded into
     ``self.pre`` keyed by ``word`` and ``skipgram``; then every row of
     order n is passed to ``self._upd`` and results are saved.

     Args:
         ngram_db: Database handle passed to ``DatabaseManager``.
         table: Table to read from and add the field to.
         field: Source column; also the suffix of the new field name.
         save_every: Forwarded to ``DatabaseManager`` as ``save_every``.
     """
     out_col = 'max_' + field
     db_manager = DatabaseManager(ngram_db,
                                  table,
                                  ['rowid', 'word', 'skipgram', field],
                                  save_every=save_every)
     db_manager.new_fields([(out_col, 'float')])
     for n in messages.pbar(range(2, db_manager.n_max + 1)):
         messages.computing_measure(out_col, n)
         # Values written for the previous order feed the current one.
         self.pre = db_manager.to_dict(table,
                                       ['word', 'skipgram'],
                                       [out_col],
                                       n - 1)
         rows, _ = db_manager.get_iterator(n)
         for idx, (rowid, w, skipgram, value) in enumerate(rows):
             self._upd(rowid, w, skipgram, value)
             db_manager.save_every(self._get_list(), idx)
         db_manager.save(self._get_list())
     db_manager.finalize(self)
Пример #6
0
 def compute(self, ngram_db, table: str, field: str):
     """Fill one ``fdp_<name>`` float field per callable in ``self.measure``.

     For each n from 2 to ``db_manager.n_max`` (inclusive) and each
     ``i`` from 1 to ``n // 2`` (inclusive), a fresh row iterator is
     obtained, ``self._set_freq_dicts`` is called with ``i``, and every
     row is passed to ``self._upd`` together with ``i``. Results are saved
     once per n, after all values of ``i`` have been processed.

     Args:
         ngram_db: Database handle passed to ``DatabaseManager``.
         table: Table to read from and add the fields to.
         field: Column read between ``rowid`` and ``freq``.
     """
     db_manager = DatabaseManager(ngram_db, table, ['rowid', field, 'freq'])
     new_fld = [('fdp_' + m.__name__, 'float') for m in self.measure]
     db_manager.new_fields(new_fld)
     fld_str = ', '.join(name for name, _ in new_fld)
     for n in messages.pbar(range(2, db_manager.n_max + 1)):
         messages.computing_measure(fld_str, n)
         self.n = n
         for split in range(1, n // 2 + 1):
             # A new iterator is requested for every value of ``split``.
             rows, self.N = db_manager.get_iterator(n)
             self._set_freq_dicts(db_manager, split,
                                  [table, [field], ['freq']])
             for rowid, ngram, freq in rows:
                 self._upd(rowid, ngram, freq, split)
         db_manager.save(self._get_list())
     db_manager.finalize(self)
Пример #7
0
 def compute(self, ngram_db, save_every=-1):
     """Fill the ``lpr`` float field of ``lex_context_counts``.

     For each n from 2 to ``db_manager.n_max`` (inclusive), ``pred``
     values from ``syn_context_counts`` (keyed by ``word`` and
     ``skipos``) are loaded into ``self.syn_prob``; every row of
     ``lex_context_counts`` is then passed to ``self._upd`` and the
     results are saved.

     Args:
         ngram_db: Database handle passed to ``DatabaseManager``.
         save_every: Forwarded to ``DatabaseManager`` as ``save_every``.
     """
     db_manager = DatabaseManager(ngram_db,
                                  'lex_context_counts',
                                  ['rowid', 'word', 'skipgram', 'pred'],
                                  save_every=save_every)
     db_manager.new_fields([('lpr', 'float')])
     orders = range(2, db_manager.n_max + 1)
     for n in messages.pbar(orders):
         messages.computing_measure('word lpr', n)
         self.syn_prob = db_manager.to_dict('syn_context_counts',
                                            ['word', 'skipos'],
                                            ['pred'],
                                            n)
         rows, _ = db_manager.get_iterator(n)
         for idx, (rowid, word, skipgram, prob) in enumerate(rows):
             self._upd(rowid, word, skipgram, prob)
             db_manager.save_every(self._get_list(), idx)
         db_manager.save(self._get_list())
     db_manager.finalize(self)
Пример #8
0
 def compute(self, ngram_db, save_every=-1):
     """Fill the ``pred`` float field of ``syn_context_counts``.

     For each n from 2 to ``db_manager.n_max`` (inclusive), ``freq``
     values from ``skipos_counts`` keyed by ``skipos`` are loaded into
     ``self.skipos_freqs``; the second value returned by
     ``get_iterator`` is stored in ``self.N``. Every row is passed to
     ``self._upd`` and the results are saved.

     Args:
         ngram_db: Database handle passed to ``DatabaseManager``.
         save_every: Forwarded to ``DatabaseManager`` as ``save_every``.
     """
     db_manager = DatabaseManager(ngram_db,
                                  'syn_context_counts',
                                  ['rowid', 'skipos', 'freq'],
                                  save_every=save_every)
     db_manager.new_fields([('pred', 'float')])
     orders = range(2, db_manager.n_max + 1)
     for n in messages.pbar(orders):
         messages.computing_measure('syntactic predictability', n)
         self.skipos_freqs = db_manager.to_dict('skipos_counts',
                                                ['skipos'],
                                                ['freq'],
                                                n)
         rows, self.N = db_manager.get_iterator(n)
         for idx, (rowid, skipos, freq) in enumerate(rows):
             self._upd(rowid, skipos, freq)
             db_manager.save_every(self._get_list(), idx)
         db_manager.save(self._get_list())
     db_manager.finalize(self)
Пример #9
0
 def compute(self, ngram_db, table: str, field: str, save_every=-1):
     """Create and populate the ``syn_context_counts`` table.

     The new table has columns ``word`` (text), ``skipos`` (text),
     ``length`` (int) and ``freq`` (int). For each n from 2 to
     ``db_manager.n_max`` (inclusive), every ``(field, freq)`` row of
     ``table`` is passed to ``self._upd`` and the output of
     ``self._get_list(add_n=True)`` is saved.

     Args:
         ngram_db: Database handle passed to ``DatabaseManager``.
         table: Table to read from.
         field: Column read together with ``freq``.
         save_every: Forwarded to ``DatabaseManager`` as ``save_every``.
     """
     db_manager = DatabaseManager(ngram_db,
                                  table,
                                  [field, 'freq'],
                                  save_every=save_every)
     schema = [
         ('word', 'text'),
         ('skipos', 'text'),
         ('length', 'int'),
         ('freq', 'int'),
     ]
     db_manager.new_table('syn_context_counts', schema)
     for n in messages.pbar(range(2, db_manager.n_max + 1)):
         messages.computing_measure('syntactic context freq', n)
         rows, _ = db_manager.get_iterator(n)
         self.n = n
         for idx, (ngram, freq) in enumerate(rows):
             self._upd(ngram, freq)
             db_manager.save_every(self._get_list(add_n=True), idx)
         db_manager.save(self._get_list(add_n=True))
     db_manager.finalize(self)
Пример #10
0
 def compute(self, ngram_db, table: str, field: str, save_every=-1):
     """Fill one float field per callable in ``self.measure``.

     Field names are the ``__name__`` of each callable. Order-1
     frequencies are loaded once into ``self.freq_1``. For each n from 2
     to ``db_manager.n_max`` (inclusive), ``self.n`` and ``self.N`` are
     set, every row is passed to ``self._upd`` and results are saved.

     Args:
         ngram_db: Database handle passed to ``DatabaseManager``.
         table: Table to read from and add the fields to.
         field: Column read between ``rowid`` and ``freq``.
         save_every: Forwarded to ``DatabaseManager`` as ``save_every``.
     """
     db_manager = DatabaseManager(ngram_db,
                                  table,
                                  ['rowid', field, 'freq'],
                                  save_every=save_every)
     new_fld = [(m.__name__, 'float') for m in self.measure]
     db_manager.new_fields(new_fld)
     self.freq_1 = db_manager.to_dict(table, [field], ['freq'], 1)
     fld_str = ', '.join(name for name, _ in new_fld)
     for n in messages.pbar(range(2, db_manager.n_max + 1)):
         messages.computing_measure(fld_str, n)
         rows, total = db_manager.get_iterator(n)
         self.n = n
         self.N = total
         for idx, (rowid, ngram, freq) in enumerate(rows):
             self._upd(rowid, ngram, freq)
             db_manager.save_every(self._get_list(), idx)
         db_manager.save(self._get_list())
     db_manager.finalize(self)
Пример #11
0
 def compute(self, ngram_db, table: str, save_every=-1):
     """Fill the ``<agg_fun>_<measure>`` float field of ``table``.

     Input rows come from ``lex_context_counts`` (column
     ``max_<measure>``). For each n from 2 to ``db_manager.n_max``
     (inclusive), the rowids of ``table`` are listed, the input iterator
     is stored in ``self.w_lpr``, and ``self._upd`` is called with the
     first element of each rowid entry; results are then saved.

     Args:
         ngram_db: Database handle passed to ``DatabaseManager``.
         table: Table that receives the new field and supplies rowids.
         save_every: Forwarded to ``DatabaseManager`` as ``save_every``.
     """
     db_manager = DatabaseManager(ngram_db,
                                  'lex_context_counts',
                                  ['max_' + self.measure],
                                  save_every=save_every)
     out_col = self.agg_fun + '_' + self.measure
     db_manager.new_fields([(out_col, 'float')], table=table)
     for n in messages.pbar(range(2, db_manager.n_max + 1)):
         messages.computing_measure(out_col, n)
         id_rows = db_manager.to_list(table, ['rowid'], n)
         # Progress is reported over the rowids, not over this iterator.
         self.w_lpr, _ = db_manager.get_iterator(n, pbar=False)
         self.n = n
         for idx, id_row in enumerate(messages.pbar(id_rows)):
             self._upd(id_row[0])
             db_manager.save_every(self._get_list(), idx)
         db_manager.save(self._get_list())
     db_manager.finalize(self)
Пример #12
0
 def compute(self, ngram_db, table: str, field: str, save_every=-1):
     """Create and populate the ``lex_context_counts`` table.

     The new table has columns ``word`` (text), ``skipgram`` (text),
     ``length`` (int) and ``pred`` (float). For each n from 2 to
     ``db_manager.n_max`` (inclusive), ``self.N`` and ``self.n`` are set,
     ``freq`` values from ``skipgram_counts`` keyed by ``skipgram`` are
     loaded into ``self.skipgram_freqs``, every ``(field, freq)`` row is
     passed to ``self._upd`` and results are saved.

     Args:
         ngram_db: Database handle passed to ``DatabaseManager``.
         table: Table to read from.
         field: Column read together with ``freq``.
         save_every: Forwarded to ``DatabaseManager`` as ``save_every``.
     """
     db_manager = DatabaseManager(ngram_db,
                                  table,
                                  [field, 'freq'],
                                  save_every=save_every)
     schema = [
         ('word', 'text'),
         ('skipgram', 'text'),
         ('length', 'int'),
         ('pred', 'float'),
     ]
     db_manager.new_table('lex_context_counts', schema)
     for n in messages.pbar(range(2, db_manager.n_max + 1)):
         messages.computing_measure('lexical predictability', n)
         rows, self.N = db_manager.get_iterator(n)
         self.n = n
         self.skipgram_freqs = db_manager.to_dict('skipgram_counts',
                                                  ['skipgram'],
                                                  ['freq'],
                                                  n)
         for idx, (ngram, freq) in enumerate(rows):
             self._upd(ngram, freq)
             db_manager.save_every(self._get_list(), idx)
         db_manager.save(self._get_list())
     db_manager.finalize(self)