示例#1
0
 def frequency_threshold(self, table, threshold):
     """Delete from *table* every n-gram whose frequency is below *threshold*."""
     messages.msg("Filtering ngrams...")
     stmt = sql.DELETE.format(table=table)
     self.execute(stmt, threshold)
     self.commit()
     messages.done()
     # row counts changed; refresh the cached table info
     self.upd_info(table)
示例#2
0
def lex_fixedness(ngram_db, variant_dict: dict, table: str, field: str,
                  m_field: str, save_every: int = 10000):
    """Compute a lexical-fixedness score for every n-gram in *table*.

    For each n-gram, the score is ``(pmi - mean(variant PMIs)) /
    var(variant PMIs)`` where variants come from ``sim(ngram,
    variant_dict)``.  Scores are written to a new float column named
    ``lex_fixedness_<m_field>`` in a copy of the database, which then
    replaces the original via ``del_and_rename``.

    Parameters
    ----------
    ngram_db
        source n-gram database (indexable by n-gram length, exposes
        ``to_dict`` and ``disconnect``).
    variant_dict: dict
        mapping used by ``sim`` to produce lexical variants of an n-gram.
    table: str
        table to read rows from and to extend with the new column.
    field: str
        field read alongside the measure field for each row.
    m_field: str
        measure field whose PMI value is compared with the variants'.
    save_every: int, default 10000
        number of processed rows between intermediate database writes.
    """
    in_fld = ['rowid', field, m_field]
    output_db, n_max = initialize(ngram_db, table, in_fld)
    out_field = 'lex_fixedness_' + m_field
    output_db.new_fields(table, [(out_field, 'float')])
    # NOTE(review): the original referenced undefined names (num_rows,
    # neighbours_dict, in_table, save_every) and wrote to a column named
    # 'lex_fixedness' that was never created; they are resolved here to
    # the obvious in-scope equivalents -- confirm against callers.
    for n in range(2, n_max + 1):
        messages.msg("Collecting statistics from {}grams...".format(n))
        lemma_pmis = ngram_db.to_dict('lemma_counts', ['lemma'], ['pmi'], n)
        scores = list()
        for i, (ngram, pmi, rowid) in enumerate(messages.pbar(ngram_db[n])):
            variant_pmis = list()
            for v in sim(ngram, variant_dict):
                try:
                    variant_pmis.append(lemma_pmis[v])
                except KeyError:
                    pass  # variant never observed in the corpus; skip
            if len(variant_pmis) > 1:
                fixedness = (pmi - np.mean(variant_pmis)) / np.var(variant_pmis)
            else:
                # fewer than two comparable variants: no meaningful statistic
                fixedness = 0
            scores.append((rowid, fixedness))
            if (i + 1) % save_every == 0:
                output_db.update_data(scores, table, out_field)
                scores = list()
        # flush the residue for THIS length; the original flushed only once
        # after the loop, silently dropping results for every n < n_max
        output_db.update_data(scores, table, out_field)
    ngram_db.disconnect()
    output_db.disconnect()
    del_and_rename(ngram_db, output_db)
    messages.done()
示例#3
0
 def copy_db(ngram_db):
     """Duplicate *ngram_db*'s .db and .info.json files under a '_' suffix root.

     Returns the duplicate loaded as an NgramDb. An existing duplicate
     (detected via its info file) is reused instead of being overwritten.
     """
     messages.msg('Copying database...')
     target_root = ngram_db.fileroot + '_'
     already_copied = os.path.exists(target_root + '.info.json')
     if not already_copied:
         copyfile(ngram_db.db, target_root + '.db')
         copyfile(ngram_db.info_file, target_root + '.info.json')
     messages.done()
     return NgramDb.load(target_root)
示例#4
0
 def _aggregate_tmp(self, sum_fld: list, n_keys=1):
     """Aggregate the temporary table into its final table.

     Sums the *sum_fld* fields, grouping by the first *n_keys* remaining
     new fields (the rest are carried along), then drops the temporary
     table and renames ``self.new_tb`` to the aggregated table's name.

     Parameters
     ----------
     sum_fld: list
         fields whose values are summed during aggregation; they are
         removed from ``self.new_flds``.
     n_keys: int, default 1
         number of leading non-summed fields used as grouping keys.
     """
     # single space in the message, matching the public aggregate_by()
     # (the original had a stray double space)
     messages.msg('Aggregating values...')
     for f in sum_fld:
         self.new_flds.remove(f)
     grp_f, other_f = self.new_flds[:n_keys], self.new_flds[n_keys:]
     # self.new_tb[4:] strips a 4-char temp-table prefix -- presumably
     # 'tmp_'; TODO confirm against the code that names self.new_tb.
     self.output_db.aggregate_by(self.new_tb[4:], self.new_tb, sum_fld,
                                 other_f, grp_f)
     self.output_db.drop_table(self.new_tb)
     self.new_tb = self.new_tb[4:]
示例#5
0
def gen_skipos(ngram_db, display=True):
    """Build the 'skipos_counts' table by aggregating 'skipgram_counts'.

    Frequencies are summed per 'skipos' value, carrying 'length' along,
    and the database's table info is refreshed afterwards.

    Parameters
    ----------
    ngram_db
        database exposing connect/aggregate_by/upd_info/disconnect.
    display: bool, default True
        whether to open a fresh message display before reporting.
    """
    if display:
        messages.new_display()
    messages.msg('Generating skipos statistics...')
    ngram_db.connect()
    sum_fields, carry_fields, group_fields = ['freq'], ['length'], ['skipos']
    ngram_db.aggregate_by('skipos_counts', 'skipgram_counts', sum_fields,
                          carry_fields, group_fields)
    ngram_db.upd_info('skipos_counts')
    ngram_db.disconnect()
    messages.done()
示例#6
0
 def _gen_info(self, dim: int):
     """Write the info file: n_max, *dim*, and per-length row counts.

     Only the first table returned by get_tables() is inspected.
     """
     messages.msg("Generating info file...")
     table = self.get_tables()[0]
     n_max = self.query(sql.MAX_LEN.format(table=table))[0][0]
     counts = dict()
     for length in range(1, n_max + 1):
         res = self.query(sql.ROW_COUNT.format(table=table), length)
         counts.setdefault(table, dict())[length] = res[0][0]
     info = {'n_max': n_max, 'dim': dim, 'num_rows': counts}
     utils.save_json(info, self.info_file)
     messages.done()
示例#7
0
 def _counts_to_db(self, counter, sep, commit_each=10000):
     """Insert every (ngram, freq) pair from *counter* into 'ngram_counts'.

     Each row holds the n-gram length, its frequency, then one
     space-joined string per feature in ``self.feats``.

     NOTE(review): commit_each is accepted but never used -- no
     intermediate commits are issued; confirm whether that is intended.
     """
     messages.msg("Saving counter to sql database...")
     self.connect()
     rows = list()
     for _, (ngram, freq) in messages.pbar(enumerate(counter.items())):
         words = ngram.split(' ')
         row = [len(words), freq]
         for feat in self.feats:
             joined = ' '.join([self._get_feat(w, feat, sep) for w in words])
             row.append(joined)
         rows.append(row)
     self.insert_data(messages.pbar(rows), 'ngram_counts')
示例#8
0
 def _gen_info(self):
     """Write the info file: n_max plus per-table, per-length row counts.

     An empty database is recorded as ``n_max = 0`` with no counts.
     """
     messages.msg("Generating info file...")
     tables = self.get_tables()
     if not tables:
         info = {'n_max': 0, 'num_rows': None}
     else:
         n_max = self.query(sql.MAX_LEN.format(table=tables[0]))[0][0]
         num_rows = dict()
         for tb in tables:
             per_length = num_rows.setdefault(tb, dict())
             for length in range(1, n_max + 1):
                 res = self.query(sql.ROW_COUNT.format(table=tb), length)
                 per_length[length] = res[0][0]
         info = {'n_max': n_max, 'num_rows': num_rows}
     utils.save_json(info, self.info_file)
     messages.done()
示例#9
0
    def count_ngrams(self,
                     sentences,
                     n,
                     use_bounter=True,
                     sep='\t',
                     **bounterargs):
        """Counts n-gram occurrences in a corpus.

        Counts n-gram occurrences in a corpus and inserts the output in an
        SQLite database.

        Parameters
        ----------
        sentences: Iterable
            Iterable of sentences. Each sentence must be a list of strings
            representing word features separated with the character that
            is passed to the 'sep' argument of this function.
        n: int or iterable of int
            length of the n-grams. A list, tuple or set counts every
            listed length.
        use_bounter: bool, default=True
            If True, the counts are performed via bounter, a probabilistic and
            memory efficient counter. If false, they are performed via regular
            Counter. The use of bounter is strongly recommended when working
            with a large corpus.
        sep: str, default '\t'
            The character that separates the features of each word in the
            input.
        **bounterargs
            keyword arguments passed to the bounter constructor if used.

        """
        messages.msg("Counting ngrams of length {}...".format(n))
        if use_bounter:
            bounterargs.setdefault('size_mb', 1024)
            counter = bounter(**bounterargs)
        else:
            counter = Counter()
        for sent in sentences:
            # isinstance instead of the brittle `type(n) == list`, and
            # tuples/sets of lengths are accepted as well (backward
            # compatible: plain ints and lists behave exactly as before)
            if isinstance(n, (list, tuple, set)):
                ngrams = list()
                for length in n:
                    ngrams += NgramCounter._gen_ngrams(sent, length)
            else:
                ngrams = NgramCounter._gen_ngrams(sent, n)
            counter.update(ngrams)
        messages.done()
        self._counts_to_db(counter, sep)
示例#10
0
    def aggregate_by(self, key: str):
        """Sum frequency counts over a given field.

        Parameters
        ----------
        key: str
            the field over which to compute the sum.

        """
        messages.msg("Aggregating values...")
        target = '{key}_counts'.format(key=key)
        self.connect()
        # delegate the actual SQL aggregation to the parent class
        super().aggregate_by(target,
                             'ngram_counts', ['freq'],
                             other_f=['length'],
                             grp_f=[key])
        self.disconnect()
        messages.done()