def get_n_count_6grams(domain, minimum=3):
    """
    Extracts all 6-grams from the texts of the domain, counts their
    frequency and stores the frequency together with the association
    metrics (logDice, PMI, t-score) for this domain.

    A 6-gram is scored as its first five tokens (the "pattern") followed by
    the sixth token: the metrics are computed from the 6-gram frequency, the
    pattern frequency, the frequency of the last token and the domain size.

    :param domain: int, domain_id from which you want to extract n-grams.
        It is interpolated into column names (``d<domain>_freq``,
        ``freq<domain>`` ...), so it must come from trusted code.
    :param minimum: int, frequency threshold; 6-grams that occur fewer times
        are skipped.
    :return: None. Rows are written to the ``6grams`` table through the
        module-level cursor; no commit is issued here.
    """
    cursor.execute(sql('getting_6grams.sql'), (domain, ))
    data = Counter(cursor.fetchall())
    log.debug('Counted!')
    n = get_domain_size(domain)

    # All statements depend on the domain only, so they are prepared once
    # instead of being re-loaded and re-formatted for every 6-gram.
    select_pattern = sql('select_to_count_6grams.sql').format(domain)
    select_last_word = (
        "SELECT freq{} FROM unigrams WHERE id_unigram = %s".format(domain))
    insert_6gram = """
        INSERT INTO 6grams
            (5gram, token, d{}_freq, d{}_logdice, d{}_pmi, d{}_tsc)
        VALUES (%s, %s, %s, %s, %s, %s)
        """.format(*[domain] * 4)
    update_6gram = """
        UPDATE 6grams
        SET d{}_freq = %s, d{}_logdice = %s, d{}_pmi = %s, d{}_tsc = %s
        WHERE 5gram = %s AND token = %s
        """.format(*[domain] * 4)

    verbose = True
    for ngram, colloc_freq in data.items():
        if colloc_freq < minimum:
            continue
        w1, w2, w3, w4, w5, w6 = ngram
        if verbose:
            log.debug("Started! %s", ngram)

        # Id and frequency of the leading 5-gram (the pattern).
        cursor.execute(select_pattern, (w1, w2, w3, w4, w5))
        id_5gram, pattern_freq = cursor.fetchone()

        # Frequency of the last token in this domain.
        cursor.execute(select_last_word, (w6, ))
        lw_freq = cursor.fetchone()[0]

        pmisc = pmi(colloc_freq, pattern_freq, lw_freq, n)
        tsc = t_score(colloc_freq, pattern_freq, lw_freq, n)
        logdsc = logDice(colloc_freq, pattern_freq, lw_freq)

        try:
            cursor.execute(
                insert_6gram,
                (id_5gram, w6, colloc_freq, logdsc, pmisc, tsc))
        except IntegrityError:
            # The INSERT was rejected (presumably the (5gram, token) row
            # already exists from another domain), so only this domain's
            # columns are updated.
            cursor.execute(
                update_6gram,
                (colloc_freq, logdsc, pmisc, tsc, id_5gram, w6))

        if verbose:
            log.debug('Counted and inserted!')
            # Detailed tracing is wanted for the first processed 6-gram only.
            verbose = False
def count_all_domains_bigr(minimum=1):
    """
    Counts logDice, PMI and t-score for bigrams over the whole corpus,
    writes them to the ``2grams`` table and commits the transaction.

    Only bigrams whose frequency, normalised per million corpus words, is at
    least ``minimum`` get their metrics updated.

    :param minimum: n occurrences per million
    :return: None
    """
    log.info('Counting metrics!')
    cursor.execute(
        """
        SELECT COUNT(*) FROM
        (SELECT id_word FROM words) as a
        """)
    n = cursor.fetchone()[0]
    log.info('Total corpus size %s', n)
    if n == 0:
        # Per-million normalisation is undefined for an empty corpus; the
        # division below would raise ZeroDivisionError.
        log.warning('Corpus is empty, bigram metrics are not counted')
        return
    # Loop-invariant divisor: corpus size expressed in millions of words.
    millions = n / 1000000

    cursor.execute(sql('selecting_all_2grams.sql'))
    log.debug('Selected!')

    # The same cursor is reused for the UPDATEs below, so the computed rows
    # are collected first and written only after the result set is consumed.
    data = set()
    for _id, colloc_freq, pattern_freq, lw_freq in cursor:
        if colloc_freq / millions >= minimum:
            pmisc = pmi(colloc_freq, pattern_freq, lw_freq, n)
            tsc = t_score(colloc_freq, pattern_freq, lw_freq, n)
            logdsc = logDice(colloc_freq, pattern_freq, lw_freq)
            data.add((logdsc, pmisc, tsc, _id))

    log.info('Ready to insert')
    update_bigram = """
        UPDATE 2grams
        SET logdice = %s, pmi = %s, tscore = %s
        WHERE id_bigram = %s
        """
    for row in data:
        cursor.execute(update_bigram, row)
    cnx.commit()
    log.info('Metrics for bigram in all corpus are commited!')
def get_n_count_3grams(domain, minimum=3):
    """
    Extracts all 3-grams from the texts of the domain, counts their
    frequency and stores the frequency together with the association
    metrics (logDice, PMI, t-score) for this domain.

    A trigram is scored as its leading bigram (looked up in ``2grams``)
    followed by the third token (looked up in ``unigrams``).

    :param domain: int, domain_id from which you want to extract n-grams.
        It is interpolated into column names (``d<domain>_freq``,
        ``freq<domain>`` ...), so it must come from trusted code.
    :param minimum: int, frequency threshold; trigrams that occur fewer
        times are skipped.
    :return: None. Rows are written to the ``3grams`` table through the
        module-level cursor; no commit is issued here.
    """
    log.info('Counting trigrams for domain %s', domain)
    cursor.execute(sql('getting_3grams.sql'), (domain, ))
    data = Counter(cursor.fetchall())
    log.debug('Counted!')
    n = get_domain_size(domain)

    # The statements depend on the domain only; format them once.
    select_bigram = """
        SELECT id_bigram, d{}_freq FROM 2grams
        WHERE wordform_1 = %s AND wordform_2 = %s
        """.format(domain)
    select_last_word = (
        "SELECT freq{} FROM unigrams WHERE id_unigram = %s".format(domain))
    insert_3gram = """
        INSERT INTO 3grams
            (bigram, token, d{}_freq, d{}_logdice, d{}_pmi, d{}_tsc)
        VALUES (%s, %s, %s, %s, %s, %s)
        """.format(*[domain] * 4)
    update_3gram = """
        UPDATE 3grams
        SET d{}_freq = %s, d{}_logdice = %s, d{}_pmi = %s, d{}_tsc = %s
        WHERE bigram = %s AND token = %s
        """.format(*[domain] * 4)

    for trigram, colloc_freq in data.items():
        if colloc_freq < minimum:
            continue
        w1, w2, w3 = trigram
        log.debug("Started! %s", trigram)
        if w1 % 100 == 0:
            # Coarse progress marker keyed on the first word id.
            log.debug("working with %s", w1)

        cursor.execute(select_bigram, (w1, w2))
        id_bigram, pattern_freq = cursor.fetchone()
        if pattern_freq == 0:
            # An observed trigram whose leading bigram has no recorded
            # frequency points at inconsistent counts; report it in the log.
            log.warning(
                'Zero frequency for bigram (%s, %s); trigram %s occurs %s times',
                w1, w2, trigram, colloc_freq)

        cursor.execute(select_last_word, (w3, ))
        lw_freq = cursor.fetchone()[0]
        if lw_freq == 0:
            log.warning('Zero frequency for unigram %s', w3)

        pmisc = pmi(colloc_freq, pattern_freq, lw_freq, n)
        tsc = t_score(colloc_freq, pattern_freq, lw_freq, n)
        logdsc = logDice(colloc_freq, pattern_freq, lw_freq)

        try:
            cursor.execute(
                insert_3gram,
                (id_bigram, w3, colloc_freq, logdsc, pmisc, tsc))
        except IntegrityError:
            # The INSERT was rejected (presumably the (bigram, token) row
            # already exists from another domain), so only this domain's
            # columns are updated.
            cursor.execute(
                update_3gram,
                (colloc_freq, logdsc, pmisc, tsc, id_bigram, w3))

    log.debug('3-grams are counted and inserted!')
def count_2metrics(domain, minimum=3):
    """
    Counts all metrics (logDice, PMI, t-score) for already extracted
    2-grams with counted frequency and writes them to the domain's columns
    of the ``2grams`` table.

    The metrics are computed from the bigram frequency in the domain, the
    domain frequencies of its first and second word and the domain size.

    :param domain: int, domain_id, which size will be used to calculate
        related metrics. It is interpolated into column names
        (``d<domain>_freq``, ``freq<domain>`` ...), so it must come from
        trusted code.
    :param minimum: int, frequency threshold.
    :return: None
    """
    log.info('Counting metrics!')
    cursor.execute(
        """
        SELECT COUNT(*) FROM
        (SELECT id_word FROM words, metadata
         WHERE words.id_text = metadata.id_text
           AND metadata.id_domain = (%s)
        ) AS sd
        """, (domain, ))
    n = cursor.fetchone()[0]
    log.info('Domain size %s', n)

    cursor.execute(
        """
        SELECT id_bigram AS id,
               d{}_freq AS freq,
               w1.freq{} AS w1_freq,
               w2.freq{} AS w2_freq
        FROM 2grams, unigrams AS w1, unigrams AS w2
        WHERE 2grams.wordform_1 = w1.id_unigram
          AND 2grams.wordform_2 = w2.id_unigram
        """.format(domain, domain, domain))
    log.debug('Selected!')

    # The same cursor is reused for the UPDATEs below, so the computed rows
    # are collected first and written only after the result set is consumed.
    data = set()
    for _id, colloc_freq, pattern_freq, lw_freq in cursor:
        if colloc_freq >= minimum:
            pmisc = pmi(colloc_freq, pattern_freq, lw_freq, n)
            tsc = t_score(colloc_freq, pattern_freq, lw_freq, n)
            logdsc = logDice(colloc_freq, pattern_freq, lw_freq)
            data.add((logdsc, pmisc, tsc, _id))

    log.info('Ready to insert22')
    # The statement depends on the domain only; format it once, not per row.
    update_bigram = """
        UPDATE 2grams
        SET d{}_logdice = %s, d{}_pmi = %s, d{}_tsc = %s
        WHERE id_bigram = %s
        """.format(domain, domain, domain)
    for row in data:
        if row[-1] % 1000 == 0:
            # Coarse progress marker: every bigram id divisible by 1000.
            log.debug('Updating bigram metrics %s', row)
        cursor.execute(update_bigram, row)

    # NOTE(review): this function does not call cnx.commit() (the call was
    # disabled in the original), yet the message below says 'Commited!'.
    # Presumably the caller commits -- confirm before relying on it.
    log.info('Commited!')