import argparse
import logging
import os
import subprocess
from collections import Counter, defaultdict
from itertools import combinations

import numpy as np

# NOTE: project-specific helpers (stream_sents, get_netloc, get_toplevel_domains,
# print_title_with_border, print_stats, print_count_gt_threshold, map_ULI_langs_to_paths,
# data_to_string, string_to_data, compute_sampling_probs, log_title_with_border, log_stats)
# and the language sets (ALL_LANGS, RELEVANT_LANGS, IRRELEVANT_LANGS,
# IRRELEVANT_URALIC_LANGS) are assumed to be imported from the project's utility module.


def analyze_urls(langs, dir_training_data):
    urls_per_lang = []
    for i, lang in enumerate(langs):
        urls_per_lang.append([u for t, _, u in stream_sents(lang, dir_training_data)])
        print("{}/{}. {}".format(i + 1, len(langs), lang))

    # Map URLs to langs. Do the same for netlocs and top-level domains
    # (generic and country code).
    url2langs = {}
    netloc2langs = {}
    domain2langs = {}
    suffix2langs = {}
    for lang, urls in zip(langs, urls_per_lang):
        uniq_urls = set(urls)
        for url in uniq_urls:
            netloc = get_netloc(url)
            domain, suffix = get_toplevel_domains(url)
            for key, dct in [(url, url2langs), (netloc, netloc2langs),
                             (domain, domain2langs), (suffix, suffix2langs)]:
                if key not in dct:
                    dct[key] = []
                dct[key].append(lang)

    # Show some results
    for keyname, dct in [("Netlocs", netloc2langs), ("Domains", domain2langs),
                         ("Suffixes", suffix2langs)]:
        print_title_with_border(keyname)
        for i, (key, key_langs) in enumerate(
                sorted(dct.items(), key=lambda x: len(x[1]), reverse=True)):
            lang_fd = Counter(key_langs)
            lang_str = ", ".join("%s (%d)" % (l, f) for l, f in
                                 sorted(lang_fd.items(), key=lambda x: x[1], reverse=True))
            print(" %d. %s: %s" % (i + 1, key, lang_str))
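
# The two URL helpers used above are defined elsewhere in the project. As a
# rough, hypothetical sketch of what they are assumed to return, the standard
# library's urllib.parse and the third-party tldextract package could be used
# as below; the names _get_netloc_sketch/_get_toplevel_domains_sketch are
# illustrative only and the actual project helpers may differ.

from urllib.parse import urlparse


def _get_netloc_sketch(url):
    # e.g. "https://news.example.co.uk/x" -> "news.example.co.uk"
    return urlparse(url).netloc


def _get_toplevel_domains_sketch(url):
    # Using tldextract (pip install tldextract); for the URL above this
    # returns ("example", "co.uk"): the registered domain and its public suffix.
    import tldextract
    ext = tldextract.extract(url)
    return ext.domain, ext.suffix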

def analyze_duplicate_texts(langs, dir_training_data):
    max_lengths = [None, 256, 128]
    all_nb_dups = [0 for _ in max_lengths]
    total_sents = 0
    for i, lang in enumerate(langs):
        print("\n{}/{}. {}".format(i + 1, len(langs), lang))
        nb_sents = 0
        nb_dups = [0 for _ in max_lengths]
        sents = [set() for _ in max_lengths]
        for (text, _, url) in stream_sents(lang, dir_training_data):
            nb_sents += 1
            for k, m in enumerate(max_lengths):
                if m is None:
                    truncated = text
                else:
                    truncated = text[:m]
                if truncated in sents[k]:
                    nb_dups[k] += 1
                else:
                    sents[k].add(truncated)
        for j, m in enumerate(max_lengths):
            all_nb_dups[j] += nb_dups[j]
            print("# dups (max_length=%s): %d/%d" % (str(m), nb_dups[j], nb_sents))
        total_sents += nb_sents
    print_title_with_border("Summary")
    for j, m in enumerate(max_lengths):
        print("# dups (max_length=%s): %d/%d" % (str(m), all_nb_dups[j], total_sents))
    return

def analyze_alphabet_sizes(langs, dir_training_data):
    alphabet_sizes = []
    super_alphabet_fd = {}
    print()
    for i, lang in enumerate(langs):
        print("{}/{}. {}".format(i + 1, len(langs), lang))
        # Count character frequencies without concatenating the whole corpus
        alphabet_fd = Counter()
        for (t, _, url) in stream_sents(lang, dir_training_data):
            alphabet_fd.update(t)
        alphabet_sizes.append(len(alphabet_fd))
        for char, freq in alphabet_fd.items():
            if char not in super_alphabet_fd:
                super_alphabet_fd[char] = freq
            else:
                super_alphabet_fd[char] += freq
    print_title_with_border("Summary of alphabet sizes")
    print_stats(alphabet_sizes)
    print("- Size of super-alphabet: %d" % len(super_alphabet_fd))
    nb_hapax = sum(1 for c, f in super_alphabet_fd.items() if f == 1)
    print("- Nb chars in super-alphabet with freq == 1: %d/%d" % (nb_hapax, len(super_alphabet_fd)))
    for max_freq in [2, 5, 10, 20]:
        n = sum(1 for c, f in super_alphabet_fd.items() if f <= max_freq)
        print("- Nb chars in super-alphabet with freq <= %d: %d/%d" % (max_freq, n, len(super_alphabet_fd)))

def analyze_corpus_sizes(langs, dir_training_data):
    corpus_sizes = []
    print()
    for i, lang in enumerate(langs):
        print("{}/{}. {}".format(i + 1, len(langs), lang))
        nb_sents = sum(1 for (text, text_id, url) in stream_sents(lang, dir_training_data))
        corpus_sizes.append(nb_sents)
    size_fd = Counter(corpus_sizes)
    print_title_with_border("Corpus size (freq)")
    for (size, count) in sorted(size_fd.items(), key=lambda x: x[0], reverse=True):
        print("- {} ({})".format(size, count))
    print_title_with_border("Summary of corpus sizes")
    print_stats(corpus_sizes)

def analyze_text_lengths(langs, dir_training_data):
    text_lengths = []
    max_length_thresholds = [64, 128, 256, 512]
    for i, lang in enumerate(langs):
        lengths = [len(text) for (text, _, url) in stream_sents(lang, dir_training_data)]
        title = "{}/{}. {}".format(i + 1, len(langs), lang)
        print_title_with_border(title)
        print_stats(lengths, max_thresholds=max_length_thresholds)
        text_lengths.append(lengths)

    # Print some summary stats
    all_text_lengths = []
    for x in text_lengths:
        all_text_lengths += x
    print_title_with_border("Summary")
    print_stats(all_text_lengths)
    for threshold in max_length_thresholds:
        print_count_gt_threshold(all_text_lengths, threshold)

def compute_sampling_probs_for_subgroup(lang_list, data_dir, alpha=1.0, logger=None):
    assert 0 <= alpha <= 1
    if len(lang_list) == 1:
        return [1]
    lang2freq = {}
    for lang in lang_list:
        if logger:
            logger.info(" %s" % lang)
        lang2freq[lang] = sum(1 for (sent, text_id, url)
                              in stream_sents(lang, data_dir, input_format="text-only"))
    # np.float was removed in recent NumPy versions; use the builtin float dtype
    counts = np.array([lang2freq[k] for k in lang_list], dtype=float)
    probs = counts / counts.sum()
    # Dampen with exponent alpha (alpha < 1 flattens the distribution)
    probs_damp = probs ** alpha
    probs = probs_damp / probs_damp.sum()
    return probs
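
# Quick illustration (not part of the original pipeline) of how the alpha
# exponent flattens the sampling distribution, in the spirit of the
# exponential smoothing commonly used for multilingual corpus sampling.
# The corpus counts below are made up.

def _demo_alpha_damping():
    counts = np.array([100000.0, 10000.0, 100.0])
    for alpha in [1.0, 0.5, 0.25]:
        probs = counts / counts.sum()
        damped = probs ** alpha
        damped /= damped.sum()
        # alpha=1.0 keeps the raw proportions; smaller alpha boosts the
        # low-resource languages at the expense of the high-resource one.
        print(alpha, np.round(damped, 3))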

def analyze_words_chars_urls(langs, dir_training_data):
    vocab_sizes = []
    alphabet_sizes = []
    spu_ratios = []
    for i, lang in enumerate(langs):
        char2freq = defaultdict(int)
        word2freq = defaultdict(int)
        uniq_urls = set()
        nb_sents = 0
        for (text, _, url) in stream_sents(lang, dir_training_data):
            nb_sents += 1
            if url:
                uniq_urls.add(url)
            # Whitespace-tokenize text
            for word in text.split(" "):
                word2freq[word] += 1
                for char in word:
                    char2freq[char] += 1
        if len(uniq_urls):
            spu_ratio = nb_sents / len(uniq_urls)
            spu_ratios.append(spu_ratio)
        nb_tokens = sum(word2freq.values())
        vocab_size = len(word2freq)
        vocab_sizes.append(vocab_size)
        alphabet_size = len(char2freq)
        alphabet_sizes.append(alphabet_size)
        print("\n--- {}/{}. {} ---".format(i + 1, len(langs), lang))
        print("Nb tokens: {}".format(nb_tokens))
        print("Vocab size: {}".format(vocab_size))
        print("Alphabet size: {}".format(alphabet_size))
        if len(uniq_urls):
            print("Nb unique URLs: {}".format(len(uniq_urls)))
            print("Sents/URL ratio: {:f}".format(spu_ratio))

    # Print some summary stats
    to_analyze = [("Vocab sizes", vocab_sizes), ("Alphabet sizes", alphabet_sizes)]
    if len(spu_ratios):
        to_analyze.append(("Sents/URL ratios", spu_ratios))
    for (statname, vals) in to_analyze:
        print_title_with_border(statname)
        print_stats(vals)
    return

def disambig(dir_in, dir_out, max_length=None):
    # Check args
    if not os.path.exists(dir_out):
        os.makedirs(dir_out)
    lang2path = map_ULI_langs_to_paths(dir_in)
    lang2filename = {x: os.path.split(y)[-1] for (x, y) in lang2path.items()}
    langs = sorted(lang2path.keys())

    # Write labeled data. Store class frequencies
    path_tmp = os.path.join(dir_out, "data.labeled.tmp")
    path_lang_fd = os.path.join(dir_out, "data.lang_fd.tsv")
    lang_fd = None
    if not os.path.exists(path_tmp):
        f = open(path_tmp, 'w')
        line_count = 0
        lang_fd = {}
        for i, lang in enumerate(langs):
            print("{}/{}. {}".format(i + 1, len(langs), lang))
            # Apply length cutoff and deduplicate
            uniq_sents = set()
            data = []
            for (text, text_id, url) in stream_sents(lang, dir_in):
                if max_length is not None:
                    text = text[:max_length]
                if text not in uniq_sents:
                    uniq_sents.add(text)
                    data.append((text, text_id, url))
            for (text, text_id, url) in data:
                line = data_to_string(text, lang, "custom", url=url, text_id=text_id, label=lang)
                f.write(line)
                line_count += 1
            lang_fd[lang] = len(data)
        f.close()
        with open(path_lang_fd, 'w') as f:
            for (lang, freq) in lang_fd.items():
                f.write("%s\t%d\n" % (lang, freq))

    # Sort labeled dataset in alphabetical order of texts
    path_sorted = os.path.join(dir_out, "data.sorted.tmp")
    if not os.path.exists(path_sorted):
        cmd = ["sort", path_tmp]
        print("\nSorting %d texts... " % line_count)
        with open(path_sorted, 'w') as outfile:
            subprocess.run(cmd, stdout=outfile)
        print("Done.")

    # Check if we skipped labeling and sorting
    if lang_fd is None:
        lang_fd = {}
        line_count = 0
        with open(path_lang_fd) as f:
            for line in f:
                elems = line.strip().split("\t")
                lang = elems[0]
                freq = int(elems[1])
                lang_fd[lang] = freq
                line_count += freq

    # Read in sorted dataset, look for duplicate texts, write disambiguated dataset
    lang2outfile = {lang: open(os.path.join(dir_out, lang2filename[lang]), 'w') for lang in langs}
    confusion = {}

    def write_disambiguated(group_text, group_info):
        # Disambiguate a group of identical texts: assign the text to the
        # candidate language with the lowest corpus frequency, then update
        # the pairwise confusion counts.
        ix = None
        min_lang_freq = 1e10
        for j, (x, y, z) in enumerate(group_info):
            freq = lang_fd[x]
            if freq < min_lang_freq:
                min_lang_freq = freq
                ix = j
        (slang, stext_id, surl) = group_info[ix]
        output = data_to_string(group_text, slang, "source", url=surl, text_id=stext_id, label=None)
        lang2outfile[slang].write(output)
        for (x, y) in combinations([x for (x, y, z) in group_info], 2):
            if (x, y) not in confusion:
                confusion[(x, y)] = 0
            confusion[(x, y)] += 1

    prev_text = None
    prev_info = []
    print("\nDisambiguating... ")
    with open(path_sorted) as f_in:
        for i, line in enumerate(f_in):
            if not len(line.strip()):
                continue
            (text, text_id, url, lang) = string_to_data(line, "custom", lang=None)
            if text == prev_text:
                prev_info.append((lang, text_id, url))
            else:
                if prev_text is not None:
                    # Disambiguate previous text and write it to the output
                    # file of the language we picked
                    write_disambiguated(prev_text, prev_info)
                prev_text = text
                prev_info = [(lang, text_id, url)]
            if (i + 1) % 1000000 == 0:
                pct = 100 * (i + 1) / line_count
                print("# texts processed: %d/%d (%.1f%%)" % (i + 1, line_count, pct))
    if prev_text is not None:
        # Flush the last group, which the loop above never reaches
        write_disambiguated(prev_text, prev_info)
    print("# texts processed: %d/%d" % (line_count, line_count))
    for (lang, outfile) in lang2outfile.items():
        outfile.close()

    # Clean up
    for path in [path_tmp, path_sorted, path_lang_fd]:
        subprocess.run(["rm", path])

    # Print some stats on pairwise confusion
    print("\n\nConfusion frequencies:")
    if not len(confusion):
        print("(none)")
    for ((lang1, lang2), freq) in sorted(confusion.items(), key=lambda x: x[1], reverse=True):
        msg = "- (%s, %s): %d" % (lang1, lang2, freq)
        extra = []
        for x in [lang1, lang2]:
            if x in RELEVANT_LANGS:
                extra.append("%s is relevant" % x)
            elif x in IRRELEVANT_URALIC_LANGS:
                extra.append("%s is confounding" % x)
        if len(extra):
            msg += " " * 10
            msg += ", ".join(extra)
        print(msg)
    print()
    return
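
# Small, self-contained illustration (made-up frequencies, not part of the
# original pipeline) of the disambiguation rule used above: a text shared by
# several corpora is assigned to the candidate language with the smallest
# corpus, and every unordered pair of candidates increments a confusion count.

def _demo_disambig_rule():
    lang_fd = {"fin": 500000, "krl": 8000, "olo": 3000}
    candidates = ["fin", "krl", "olo"]
    winner = min(candidates, key=lambda lang: lang_fd[lang])
    pairs = list(combinations(candidates, 2))
    print(winner)  # -> "olo", the smallest corpus
    print(pairs)   # -> [('fin', 'krl'), ('fin', 'olo'), ('krl', 'olo')]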

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--sampling_alpha",
                        type=float,
                        default=1.0,
                        help="Frequency dampening factor used for computing language sampling probabilities")
    parser.add_argument("--weight_relevant",
                        type=float,
                        default=1.0,
                        help=("Relative sampling frequency of relevant languages wrt irrelevant languages."
                              " Default is 1, which produces a balanced mix of relevant and irrelevant."))
    parser.add_argument("dev_size",
                        type=int,
                        help="Number of examples in dev set (must be greater than 0)")
    parser.add_argument("test_size",
                        type=int,
                        help="Number of examples in test set (can be 0)")
    parser.add_argument("input_dir",
                        help=("Path of directory containing training data (n files named <lang>.train,"
                              " containing unlabeled text only; no labels, URLs or text IDs)"))
    parser.add_argument("output_dir")
    args = parser.parse_args()

    # Check args
    assert args.dev_size > 0
    assert args.test_size >= 0
    assert 0 <= args.sampling_alpha <= 1
    assert not os.path.exists(args.output_dir)
    os.makedirs(args.output_dir)
    outdir_train = os.path.join(args.output_dir, "Training")
    outdir_test = os.path.join(args.output_dir, "Test")
    os.makedirs(outdir_train)
    os.makedirs(outdir_test)

    # Set up logging
    logger = logging.getLogger(__name__)
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.DEBUG)

    # We expect the input dir to contain n files called <lang>.train,
    # which contain unlabeled text (without labels, URLs or text IDs)
    filenames = [n for n in os.listdir(args.input_dir) if n[-6:] == ".train"]
    logger.info("Nb training files found: %d" % len(filenames))
    for n in filenames:
        lang = n[:-6]
        assert lang in ALL_LANGS

    # Seed RNG
    np.random.seed(91500)

    # Get language sampling probabilities
    lang2prob = compute_sampling_probs(args.input_dir,
                                       alpha=args.sampling_alpha,
                                       rel_weight=args.weight_relevant,
                                       logger=logger)

    # Sample languages and count how many dev/test examples each one gets
    all_langs = sorted(ALL_LANGS)
    sampling_probs = [lang2prob[k] for k in all_langs]
    dev_sample = np.random.choice(np.arange(len(all_langs)),
                                  size=args.dev_size,
                                  replace=True,
                                  p=sampling_probs)
    dev_counts = [0 for k in all_langs]
    for lang_id in dev_sample:
        dev_counts[lang_id] += 1
    # Initialize test counts to zero so the code below also works when test_size is 0
    test_counts = [0 for k in all_langs]
    if args.test_size > 0:
        test_sample = np.random.choice(np.arange(len(all_langs)),
                                       size=args.test_size,
                                       replace=True,
                                       p=sampling_probs)
        for lang_id in test_sample:
            test_counts[lang_id] += 1

    # Print stats on the distributions of the dev and test sets (min, max,
    # mean and median), overall and for the relevant, irrelevant and
    # confounding (irrelevant Uralic) language groups.
    title = "Stats on # dev samples (all languages)"
    log_title_with_border(title, logger)
    log_stats(dev_counts, logger)
    title = "Stats on # dev samples (relevant languages)"
    log_title_with_border(title, logger)
    rel_counts = [dev_counts[i] for i in range(len(dev_counts)) if all_langs[i] in RELEVANT_LANGS]
    log_stats(rel_counts, logger)
    title = "Stats on # dev samples (irrelevant languages)"
    log_title_with_border(title, logger)
    irr_counts = [dev_counts[i] for i in range(len(dev_counts)) if all_langs[i] in IRRELEVANT_LANGS]
    log_stats(irr_counts, logger)
    title = "Stats on # dev samples (irrelevant Uralic languages)"
    log_title_with_border(title, logger)
    con_counts = [dev_counts[i] for i in range(len(dev_counts)) if all_langs[i] in IRRELEVANT_URALIC_LANGS]
    log_stats(con_counts, logger)
    if args.test_size > 0:
        title = "Stats on # test samples (all languages)"
        log_title_with_border(title, logger)
        log_stats(test_counts, logger)
        title = "Stats on # test samples (relevant languages)"
        log_title_with_border(title, logger)
        rel_counts = [test_counts[i] for i in range(len(test_counts)) if all_langs[i] in RELEVANT_LANGS]
        log_stats(rel_counts, logger)
        title = "Stats on # test samples (irrelevant languages)"
        log_title_with_border(title, logger)
        irr_counts = [test_counts[i] for i in range(len(test_counts)) if all_langs[i] in IRRELEVANT_LANGS]
        log_stats(irr_counts, logger)
        title = "Stats on # test samples (irrelevant Uralic languages)"
        log_title_with_border(title, logger)
        con_counts = [test_counts[i] for i in range(len(test_counts)) if all_langs[i] in IRRELEVANT_URALIC_LANGS]
        log_stats(con_counts, logger)

    # Write training data in separate, unlabeled text files. Store dev and
    # test examples (to shuffle later, so they are not written in order of
    # language)
    dev_set = []
    test_set = []
    logger.info("Writing training data in %s..." % outdir_train)
    for lang_id, lang in enumerate(all_langs):
        logger.info(" %s" % lang)
        # Get number of examples
        nb_examples = sum(1 for (sent, text_id, url)
                          in stream_sents(lang, args.input_dir, input_format="text-only"))
        # Sample dev and test indices
        indices = np.arange(nb_examples)
        np.random.shuffle(indices)
        nb_dev = dev_counts[lang_id]
        nb_test = test_counts[lang_id]
        dev_indices = set(indices[:nb_dev])
        test_indices = set(indices[nb_dev:nb_dev + nb_test])
        # Stream sents, write training examples, store the others
        outpath = os.path.join(outdir_train, "%s.train" % lang)
        with open(outpath, 'w') as outfile:
            for ix, (sent, text_id, url) in enumerate(
                    stream_sents(lang, args.input_dir, input_format="text-only")):
                if ix in dev_indices:
                    dev_set.append((sent, lang))
                elif ix in test_indices:
                    test_set.append((sent, lang))
                else:
                    outfile.write(sent + "\n")

    # Shuffle and write dev and test sets
    logger.info("Writing dev and test data in %s..." % outdir_test)
    np.random.shuffle(dev_set)
    ptexts = os.path.join(outdir_test, "dev.txt")
    plabels = os.path.join(outdir_test, "dev-gold-labels.txt")
    ptuples = os.path.join(outdir_test, "dev-labeled.tsv")
    with open(ptexts, 'w') as ftexts, open(plabels, 'w') as flabels, open(ptuples, 'w') as ftuples:
        for (text, lang) in dev_set:
            ftexts.write(text + "\n")
            flabels.write(lang + "\n")
            ftuples.write("%s\t%s\n" % (text, lang))
    if len(test_set):
        np.random.shuffle(test_set)
        ptexts = os.path.join(outdir_test, "test.txt")
        plabels = os.path.join(outdir_test, "test-gold-labels.txt")
        ptuples = os.path.join(outdir_test, "test-labeled.tsv")
        with open(ptexts, 'w') as ftexts, open(plabels, 'w') as flabels, open(ptuples, 'w') as ftuples:
            for (text, lang) in test_set:
                ftexts.write(text + "\n")
                flabels.write(lang + "\n")
                ftuples.write("%s\t%s\n" % (text, lang))
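
# main() is presumably invoked when this file is run as a script; a standard
# guard would be:
#
#     if __name__ == "__main__":
#         main()
#
# Assuming the file is saved as make_dataset.py (hypothetical name), a typical
# invocation might look like:
#
#     python make_dataset.py --sampling_alpha 0.25 --weight_relevant 1.0 \
#         10000 10000 path/to/training_data path/to/output_dir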