def wiki_stats(env): """ Basic statistics from a wiki. Note: not tested with larger wikis! """ from pyspell._utils import non_sk_words from simplewiki import wiki wiki_input = os.path.join(env["start_dir"], env["input"]["dir"], env["input"]["wiki_xml"]) wiki_out_freqs = os.path.join(env["start_dir"], env["output"]["dir"], env["output"]["wiki_freqs"]) if not os.path.exists(wiki_input): raise Exception("Wiki input not found [%s]" % wiki_input) w = wiki(wiki_input) freqs = defaultdict(int) all_words = 0 log_every_n = env["log_every_n"] for pos, (wordorig, word, sentence_start, page_id) in enumerate(w.words(True)): if 0 == len(word) or non_sk_words(word): continue word = word.lower() all_words += 1 freqs[word] += 1 if 0 == all_words % log_every_n: perc = round((100. * float(len(freqs))) / all_words, 3) _logger.info( "done [%8d] words ... [%8d][%.2f%%] unique words ... [%5d] pages", all_words, len(freqs), perc, page_id ) print " # of all words: %6d" % all_words print "# of unique words: %6d" % len(freqs) import heapq nth = 100 nth = 100 too_few_occurrences = 20 baseline = heapq.nlargest(nth, freqs.values())[-1] d = defaultdict(list) min_occurs_cnt = 0 min_occurrences = [None, 100, 90, 80, 70, 60, 40, 20] min_occurrences_freq = defaultdict(int) json.dump(freqs, open(wiki_out_freqs, "w+"), encoding="utf-8") for k, v in freqs.iteritems(): too_few_occurrences = min_occurrences[min(len(min_occurrences) - 1, len(k))] if v < too_few_occurrences: min_occurrences_freq[len(k)] += 1 min_occurs_cnt += 1 if v >= baseline: d[v].append(k) print "# of unique words that occurred < %d times: %6d" % ( too_few_occurrences, min_occurs_cnt ) for k, v in min_occurrences_freq.iteritems(): print "Words with len [%3d] occurred < too_few_occurrences [%4d] times" % (k, v) for k in sorted(d.keys(), reverse=True): for v in d[k]: print "%6s: %4d" % (v, k)
def wiki_stats(env): """ Basic statistics from a wiki. Note: not tested with larger wikis! """ from pyspell._utils import non_sk_words from simplewiki import wiki wiki_input = os.path.join(env["start_dir"], env["input"]["dir"], env["input"]["wiki_xml"]) wiki_out_freqs = os.path.join(env["start_dir"], env["output"]["dir"], env["output"]["wiki_freqs"]) if not os.path.exists(wiki_input): raise Exception("Wiki input not found [%s]" % wiki_input) w = wiki(wiki_input) freqs = defaultdict(int) all_words = 0 log_every_n = env["log_every_n"] for pos, (wordorig, word, sentence_start, page_id) in enumerate(w.words(True)): if 0 == len(word) or non_sk_words(word): continue word = word.lower() all_words += 1 freqs[word] += 1 if 0 == all_words % log_every_n: perc = round((100. * float(len(freqs))) / all_words, 3) _logger.info( "done [%8d] words ... [%8d][%.2f%%] unique words ... [%5d] pages", all_words, len(freqs), perc, page_id ) print " # of all words: %6d" % all_words print "# of unique words: %6d" % len(freqs) import heapq nth = 100 too_few_occurrences = 20 baseline = heapq.nlargest(nth, freqs.values())[-1] d = defaultdict(list) min_occurs_cnt = 0 min_occurrences = [None, 100, 90, 80, 70, 60, 40, 20] min_occurrences_freq = defaultdict(int) json.dump(freqs, open(wiki_out_freqs, "w+"), encoding="utf-8") for k, v in freqs.iteritems(): too_few_occurrences = min_occurrences[min(len(min_occurrences) - 1, len(k))] if v < too_few_occurrences: min_occurrences_freq[len(k)] += 1 min_occurs_cnt += 1 if v >= baseline: d[v].append(k) print "# of unique words that occurred < %d times: %6d" % ( too_few_occurrences, min_occurs_cnt ) for k, v in min_occurrences_freq.iteritems(): print "Words with len [%3d] occurred < too_few_occurrences [%4d] times" % (k, v) for k in sorted(d.keys(), reverse=True): for v in d[k]: print "%6s: %4d" % (v, k)
def _is_important_valid_word(word, f, non_sk_words):
    """ Return (important, valid): a word is important once its frequency f
    reaches the length-based threshold from min_occurrences; validity is then
    checked with non_sk_words. """
    dyn_min_occurrences = min_occurrences[min(len(min_occurrences) - 1, len(word))]
    if dyn_min_occurrences <= f:
        if non_sk_words(word):
            # important and not valid
            return True, False
        # important and valid
        return True, True
    # not important for now and we do not know if it is valid
    return False, None
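

# Hypothetical helper, not in the original module: prints the effective
# per-length thresholds that _is_important_valid_word() applies, e.g. a
# 1-letter word needs at least 100 occurrences while any word of length
# 7 or more needs only 20.
def _print_thresholds():
    for length in range(1, 9):
        threshold = min_occurrences[min(len(min_occurrences) - 1, length)]
        print "word length %d -> needs at least %s occurrences" % (length, threshold)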
def wiki_words(env): """ Gather most used words according to a specific definition Note: not tested with larger wikis! """ from simplewiki import wiki from pyspell._utils import non_sk_words wiki_input = os.path.join(env["start_dir"], env["input"]["dir"], env["wiki_xml"]) wiki_words_output = os.path.join(env["start_dir"], env["output"]["dir"], env["output"]["wiki_words"]) log_every_n = env["log_every_n"] if not os.path.exists(wiki_input): raise Exception("Wiki input not found [%s]" % wiki_input) w = wiki(wiki_input) done_occurrence = 1234567 freqs = defaultdict(int) capital_freqs = defaultdict(int) with codecs.open(wiki_words_output, mode="w+", encoding="utf-8") as fout: for pos, (wordorig, word, sentence_start, page_id) in enumerate(w.words(True)): if 0 == len(word): continue f = freqs[word] # skip already done or strange if done_occurrence == f or 0 > f: continue if sentence_start and word[0].isupper(): capital_freqs[word] += 1 continue freqs[word] = f + 1 is_important, is_valid = _is_important_valid_word(w, f, non_sk_words) if is_important: if not is_valid: # do not bother with that one again freqs[word] = -1 continue freqs[word] = done_occurrence # have we already output the same but lowercase? if not word.islower() and done_occurrence == freqs[word.lower()]: _logger.warn(u"Processing non-lower word [%s] but lower has been already processed", word) if word.islower() and done_occurrence == freqs[word[0].upper() + word[1:]]: _logger.warn(u"Processing lower word [%s] but non-lower has been already processed", word) if not word.islower(): iword = word.lower() iwordfreq = freqs[iword] if 0 < iwordfreq and float(f) / float(iwordfreq) <= 2.: _logger.warn( u"Capital first being processed [%s][%d] but non capital is not 0 [%d]", word, f, iwordfreq ) if word in capital_freqs: del capital_freqs[word] fout.write(word + u"\n") if 0 == pos % log_every_n: _logger.info("done [%8d] words ... [%5d] pages", pos, page_id) _logger.info("Could not get capitals right:") for k, v in capital_freqs.iteritems(): if v > min_occurrences[-1] / 3: if non_sk_words(k): continue if done_occurrence == freqs[k.lower()]: continue _logger.info("%10s: %2d", k, v)