import argparse
import logging
import os
import subprocess
from collections import Counter, defaultdict
from itertools import combinations

import numpy as np

# NOTE: project-specific helpers and constants (stream_sents, get_netloc,
# get_toplevel_domains, map_ULI_langs_to_paths, data_to_string, string_to_data,
# print_stats, print_count_gt_threshold, print_title_with_border, log_stats,
# log_title_with_border, compute_sampling_probs, ALL_LANGS, RELEVANT_LANGS,
# IRRELEVANT_LANGS, IRRELEVANT_URALIC_LANGS) are assumed to be defined or
# imported elsewhere in this module.


def analyze_urls(langs, dir_training_data):
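    """Map URLs, netlocs, domains and suffixes found in the training data to
    languages, then print netlocs, domains and suffixes sorted by frequency,
    with the number of unique URLs per language for each key."""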
    urls = []
    for i, lang in enumerate(langs):
        urls.append([u for t,_,u in stream_sents(lang, dir_training_data)])
        print("{}/{}. {}".format(i+1, len(langs), lang))

    # Map URLs to languages. Do the same for netlocs and top-level domains (generic and country code).
    url2langs = {}
    netloc2langs = {}
    domain2langs = {}
    suffix2langs = {}
    for lang, lang_urls in zip(langs, urls):
        uniq_urls = set(lang_urls)
        for url in uniq_urls:
            netloc = get_netloc(url)
            domain, suffix = get_toplevel_domains(url)
            for key, dct in [(url, url2langs),
                             (netloc, netloc2langs),
                             (domain, domain2langs),
                             (suffix, suffix2langs)]:
                if key not in dct:
                    dct[key] = []
                dct[key].append(lang)

    # Show some results
    for keyname, dct in [("Netlocs", netloc2langs),
                         ("Domains", domain2langs),
                         ("Suffixes", suffix2langs)]:
        print_title_with_border(keyname)
        for i, (key, lang_list) in enumerate(sorted(dct.items(), key=lambda x: len(x[1]), reverse=True)):
            lang_fd = Counter(lang_list)
            lang_str = ", ".join("%s (%d)" % (l, f) for l, f in sorted(lang_fd.items(), key=lambda x: x[1], reverse=True))
            print(" %d. %s: %s" % (i+1, key, lang_str))


def analyze_duplicate_texts(langs, dir_training_data):
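    """Count duplicate texts within each language's training data, using the
    full text and the text truncated to 256 and 128 characters, and print
    per-language and overall duplicate counts."""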
    max_lengths = [None, 256, 128]    
    all_nb_dups = [0 for _ in max_lengths]
    total_sents = 0
    for i, lang in enumerate(langs):
        print("\n{}/{}. {}".format(i+1, len(langs), lang))
        nb_sents = 0
        nb_dups = [0 for _ in max_lengths]
        sents = [set() for _ in max_lengths]
        for j, (text, _, url) in enumerate(stream_sents(lang, dir_training_data)):
            nb_sents += 1
            for k, m in enumerate(max_lengths):
                if m is None:
                    truncated = text
                else:
                    truncated = text[:m]
                if truncated in sents[k]:
                    nb_dups[k] += 1
                else:
                    sents[k].add(truncated)
        for j, m in enumerate(max_lengths):
            all_nb_dups[j] += nb_dups[j]
            print("# dups (max_length=%s): %d/%d" % (str(m), nb_dups[j], nb_sents))
        total_sents += nb_sents
        
    print_title_with_border("Summary")
    for j, m in enumerate(max_lengths):
        print("# dups (max_length=%s): %d/%d" % (str(m), all_nb_dups[j], total_sents))
    return


def analyze_alphabet_sizes(langs, dir_training_data):
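    """Compute the alphabet (set of distinct characters) of each language's
    training data, print summary statistics of the alphabet sizes, and report
    how many characters of the combined super-alphabet are rare (frequency 1,
    or at most 2, 5, 10, 20)."""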
    alphabet_sizes = []
    super_alphabet_fd = Counter()
    print()
    for i, lang in enumerate(langs):            
        print("{}/{}. {}".format(i+1, len(langs), lang))

        text = ""
        for (t, _, url) in stream_sents(lang, dir_training_data):
            text += t
        alphabet_fd = Counter(text)            
        alphabet_sizes.append(len(alphabet_fd))
        for char, freq in alphabet_fd.items():
            if char not in super_alphabet_fd:
                super_alphabet_fd[char] = freq
            else:
                super_alphabet_fd[char] += freq
    print_title_with_border("Summary of alphabet sizes")    
    print_stats(alphabet_sizes)    
    print("- Size of super-alphabet: %d" % len(super_alphabet_fd))
    nb_hapax = sum(1 for c,f in super_alphabet_fd.items() if f == 1)
    print("- Nb chars in super-alphabet with freq == 1: %d/%d" % (nb_hapax, len(super_alphabet_fd)))
    for max_freq in [2,5,10,20]:
        n = sum(1 for c,f in super_alphabet_fd.items() if f <= max_freq)
        print("- Nb chars in super-alphabet with freq <= %d: %d/%d" % (max_freq, n, len(super_alphabet_fd)))


def analyze_corpus_sizes(langs, dir_training_data):
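    """Count the number of sentences in each language's training data and
    print the frequency distribution and summary statistics of corpus sizes."""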
    corpus_sizes = []
    print()
    for i, lang in enumerate(langs):            
        print("{}/{}. {}".format(i+1, len(langs), lang))
        nb_sents = sum(1 for (text, text_id, url) in stream_sents(lang, dir_training_data))
        corpus_sizes.append(nb_sents)
    size_fd = Counter(corpus_sizes)
    print_title_with_border("Corpus size (freq)")    
    for (size, count) in sorted(size_fd.items(), key=lambda x:x[0], reverse=True):
        print("- {} ({})".format(size, count))
    print_title_with_border("Summary of corpus sizes")    
    print_stats(list(corpus_sizes))    


def analyze_text_lengths(langs, dir_training_data):
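    """Print per-language and overall statistics on text lengths (in
    characters), including counts relative to the 64/128/256/512 character
    thresholds."""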
    text_lengths = []
    max_length_thresholds = [64, 128, 256, 512]
    for i, lang in enumerate(langs):            
        lengths = [len(text) for (text, _, url) in stream_sents(lang, dir_training_data)]
        title = "{}/{}. {}".format(i+1, len(langs), lang)
        print_title_with_border(title)
        print_stats(lengths, max_thresholds=max_length_thresholds)        
        text_lengths.append(lengths)

    # Print some summary stats
    all_text_lengths = []
    for x in text_lengths:
        all_text_lengths += x
    print_title_with_border("Summary")    
    print_stats(all_text_lengths)
    for threshold in max_length_thresholds:
        print_count_gt_threshold(all_text_lengths, threshold)


def compute_sampling_probs_for_subgroup(lang_list,
                                        data_dir,
                                        alpha=1.0,
                                        logger=None):
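    """Compute damped sampling probabilities for the languages in lang_list:
    p_i = c_i**alpha / sum_j c_j**alpha, where c_i is the number of sentences
    for language i. alpha=1 keeps the empirical distribution, alpha=0 gives a
    uniform distribution. For example, counts [90, 10] with alpha=0.5 yield
    probabilities [0.75, 0.25]."""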
    assert alpha >= 0 and alpha <= 1
    if len(lang_list) == 1:
        return [1]
    lang2freq = {}
    for lang in lang_list:
        if logger:
            logger.info("  %s" % lang)
        lang2freq[lang] = sum(
            1 for (sent, text_id, url) in stream_sents(
                lang, data_dir, input_format="text-only"))
    counts = np.array([lang2freq[k] for k in lang_list], dtype=float)
    probs = counts / counts.sum()
    probs_damp = probs**alpha
    probs = probs_damp / probs_damp.sum()
    return probs


def analyze_words_chars_urls(langs, dir_training_data):
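    """For each language, report the number of whitespace-separated tokens,
    the vocabulary size, the alphabet size and, when URLs are available, the
    number of unique URLs and the sentences-per-URL ratio; then print summary
    statistics across languages."""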
    vocab_sizes = []
    alphabet_sizes = []
    spu_ratios = []
    for i, lang in enumerate(langs):
        char2freq = defaultdict(int)
        word2freq = defaultdict(int)
        uniq_urls = set()
        nb_sents = 0
        for (text, _, url) in stream_sents(lang, dir_training_data):
            nb_sents += 1
            if url:
                uniq_urls.add(url)
            # Whitespace-tokenize text
            for word in text.split():
                word2freq[word] += 1
                for char in word:
                    char2freq[char] += 1
        if len(uniq_urls):
            spu_ratio = nb_sents/len(uniq_urls)
            spu_ratios.append(spu_ratio)
        nb_tokens = sum(word2freq.values())
        vocab_size = len(word2freq)
        vocab_sizes.append(vocab_size)        
        alphabet_size = len(char2freq)
        alphabet_sizes.append(alphabet_size)
        print("\n--- {}/{}. {} ---".format(i+1, len(langs), lang))            
        print("Nb tokens: {}".format(nb_tokens))
        print("Vocab size: {}".format(vocab_size))
        print("Alphabet size: {}".format(alphabet_size))
        if len(uniq_urls):
            print("Nb unique URLs: {}".format(len(uniq_urls)))
            print("Sents/URL ratio: {:f}".format(spu_ratio))

    # Print some summary stats
    to_analyze = [("Vocab sizes", vocab_sizes), ("Alphabet sizes", alphabet_sizes)]
    if len(spu_ratios):
        to_analyze.append(("Sents/URL ratios", spu_ratios))
    for (statname, vals) in to_analyze:
        print_title_with_border(statname) 
        print_stats(vals)
    return


def disambig(dir_in, dir_out, max_length=None):
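    """Deduplicate and disambiguate the training data in dir_in: label each
    text with its language (truncating to max_length if given), deduplicate
    within each language, sort all labeled texts, and assign any text that
    occurs in several languages to the least frequent of those languages.
    Writes one output file per language in dir_out and prints pairwise
    confusion counts."""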
    # Check args
    if not os.path.exists(dir_out):
        os.makedirs(dir_out)

    lang2path = map_ULI_langs_to_paths(dir_in)
    lang2filename = {x: os.path.split(y)[-1] for (x, y) in lang2path.items()}
    langs = sorted(lang2path.keys())

    # Write labeled data. Store class frequencies
    path_tmp = os.path.join(dir_out, "data.labeled.tmp")
    path_lang_fd = os.path.join(dir_out, "data.lang_fd.tsv")
    lang_fd = None
    if not os.path.exists(path_tmp):
        f = open(path_tmp, 'w')
        line_count = 0
        lang_fd = {}
        for i, lang in enumerate(langs):
            print("{}/{}. {}".format(i + 1, len(langs), lang))

            # Apply length cutoff and deduplicate
            uniq_sents = set()
            data = []
            for (text, text_id, url) in stream_sents(lang, dir_in):
                if max_length is not None:
                    text = text[:max_length]
                if text not in uniq_sents:
                    uniq_sents.add(text)
                    data.append((text, text_id, url))
            for (text, text_id, url) in data:
                line = data_to_string(text,
                                      lang,
                                      "custom",
                                      url=url,
                                      text_id=text_id,
                                      label=lang)
                f.write(line)
                line_count += 1
            lang_fd[lang] = len(data)
        f.close()
        with open(path_lang_fd, 'w') as f:
            for (lang, freq) in lang_fd.items():
                f.write("%s\t%d\n" % (lang, freq))

    # If the labeling step was skipped (data.labeled.tmp already existed),
    # reload the class frequencies and line count from file
    if lang_fd is None:
        lang_fd = {}
        line_count = 0
        with open(path_lang_fd) as f:
            for line in f:
                elems = line.strip().split("\t")
                lang = elems[0]
                freq = int(elems[1])
                lang_fd[lang] = freq
                line_count += freq

    # Sort labeled dataset in alphabetical order of texts
    path_sorted = os.path.join(dir_out, "data.sorted.tmp")
    if not os.path.exists(path_sorted):
        cmd = ["sort", path_tmp]
        print("\nSorting %d texts... " % line_count)
        with open(path_sorted, 'w') as outfile:
            subprocess.run(cmd, stdout=outfile)
        print("Done.")

    # Read in sorted dataset, look for duplicate texts, write disambiguated dataset
    lang2outfile = {
        lang: open(os.path.join(dir_out, lang2filename[lang]), 'w')
        for lang in langs
    }
    prev_text = None
    prev_info = []
    lines_processed = 0
    confusion = {}
    print("\nDisambiguating... ")
    with open(path_sorted) as f_in:
        for i, line in enumerate(f_in):
            if not len(line.strip()):
                continue
            (text, text_id, url, lang) = string_to_data(line,
                                                        "custom",
                                                        lang=None)
            if text == prev_text:
                prev_info.append((lang, text_id, url))
            else:
                if prev_text is not None:
                    # Disambiguate previous text and write to output file for the language we picked
                    ix = None
                    min_lang_freq = 1e10
                    for j, (x, y, z) in enumerate(prev_info):
                        freq = lang_fd[x]
                        if freq < min_lang_freq:
                            min_lang_freq = freq
                            ix = j
                    (slang, stext_id, surl) = prev_info[ix]
                    output = data_to_string(prev_text,
                                            slang,
                                            "source",
                                            url=surl,
                                            text_id=stext_id,
                                            label=None)
                    lang2outfile[slang].write(output)
                    # Store confusion counts
                    for (x, y) in combinations([x for (x, y, z) in prev_info],
                                               2):
                        if (x, y) not in confusion:
                            confusion[(x, y)] = 0
                        confusion[(x, y)] += 1
                prev_text = text
                prev_info = [(lang, text_id, url)]
            if (i + 1) % 1000000 == 0:
                pct = 100 * (i + 1) / line_count
                print("# texts processed: %d/%d (%.1f%%)" %
                      (i + 1, line_count, pct))
    print("# texts processed: %d/%d" % (line_count, line_count))
    for (lang, outfile) in lang2outfile.items():
        outfile.close()

    # Clean up temporary files
    for path in [path_tmp, path_sorted, path_lang_fd]:
        os.remove(path)

    # Print some stats on pairwise confusion
    print("\n\nConfusion frequencies:")
    if not len(confusion):
        print("(none)")
    for ((lang1, lang2), freq) in sorted(confusion.items(),
                                         key=lambda x: x[1],
                                         reverse=True):
        msg = "- (%s, %s): %d" % (lang1, lang2, freq)
        extra = []
        for x in [lang1, lang2]:
            if x in RELEVANT_LANGS:
                extra.append("%s is relevant" % x)
            elif x in IRRELEVANT_URALIC_LANGS:
                extra.append("%s is confounding" % x)
        if len(extra):
            msg += " " * 10
            msg += ", ".join(extra)
        print(msg)
    print()
    return


def main():
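    """Build a train/dev/test split from unlabeled <lang>.train files: compute
    per-language sampling probabilities (damped by --sampling_alpha and
    reweighted by --weight_relevant), sample dev_size and test_size examples
    accordingly, log statistics on the sampled counts, then write the
    remaining sentences as unlabeled <lang>.train files plus shuffled dev/test
    texts, gold labels and labeled TSVs."""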
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--sampling_alpha",
        type=float,
        default=1.0,
        help=
        "Frequency dampening factor used for computing language sampling probabilities"
    )
    parser.add_argument(
        "--weight_relevant",
        type=float,
        default=1.0,
        help=
        ("Relative sampling frequency of relevant languages wrt irrelevant languages."
         " Default is 1, which produces a balanced mix of relevant and irrelevant."
         ))
    parser.add_argument(
        "dev_size",
        type=int,
        help="Number of examples in dev set (must be greater than 0)")
    parser.add_argument("test_size",
                        type=int,
                        help="Number of examples in test set (can be 0)")
    parser.add_argument(
        "input_dir",
        help=
        ("Path of directory containing training data (n files named <lang>.train,"
         " containing unlabeled text only (no labels, URLS or text IDs)"))
    parser.add_argument("output_dir")
    args = parser.parse_args()

    # Check args
    assert args.dev_size > 0
    assert args.test_size >= 0
    assert args.sampling_alpha >= 0 and args.sampling_alpha <= 1
    assert not os.path.exists(args.output_dir)
    os.makedirs(args.output_dir)
    outdir_train = os.path.join(args.output_dir, "Training")
    outdir_test = os.path.join(args.output_dir, "Test")
    os.makedirs(outdir_train)
    os.makedirs(outdir_test)

    # Set up logging
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.DEBUG)

    # We expect that the input dir contains n files called lang.train,
    # which contain unlabeled text (without labels, URLs or text IDs)
    filenames = [n for n in os.listdir(args.input_dir) if n.endswith(".train")]
    logger.info("Nb training files found: %d" % len(filenames))
    for n in filenames:
        lang = n[:-len(".train")]
        assert lang in ALL_LANGS

    # Seed RNG
    np.random.seed(91500)

    # Get language sampling probabilities
    lang2prob = compute_sampling_probs(args.input_dir,
                                       alpha=args.sampling_alpha,
                                       rel_weight=args.weight_relevant,
                                       logger=logger)

    # Sample languages and count
    all_langs = sorted(ALL_LANGS)
    sampling_probs = [lang2prob[k] for k in all_langs]
    dev_sample = np.random.choice(np.arange(len(all_langs)),
                                  size=args.dev_size,
                                  replace=True,
                                  p=sampling_probs)
    dev_counts = [0 for k in all_langs]
    for lang_id in dev_sample:
        dev_counts[lang_id] += 1
    test_counts = [0 for k in all_langs]
    if args.test_size > 0:
        test_sample = np.random.choice(np.arange(len(all_langs)),
                                       size=args.test_size,
                                       replace=True,
                                       p=sampling_probs)
        for lang_id in test_sample:
            test_counts[lang_id] += 1

    # Print stats on the distributions of the dev and test sets (min, max,
    # mean and median), overall and broken down into relevant, irrelevant
    # and confounding (irrelevant Uralic) languages.
    title = "Stats on # dev samples (all languages)"
    log_title_with_border(title, logger)
    log_stats(dev_counts, logger)
    title = "Stats on # dev samples (relevant languages)"
    log_title_with_border(title, logger)
    rel_counts = [
        dev_counts[i] for i in range(len(dev_counts))
        if all_langs[i] in RELEVANT_LANGS
    ]
    log_stats(rel_counts, logger)
    title = "Stats on # dev samples (irrelevant languages)"
    log_title_with_border(title, logger)
    irr_counts = [
        dev_counts[i] for i in range(len(dev_counts))
        if all_langs[i] in IRRELEVANT_LANGS
    ]
    log_stats(irr_counts, logger)
    title = "Stats on # dev samples (irrelevant Uralic languages)"
    log_title_with_border(title, logger)
    con_counts = [
        dev_counts[i] for i in range(len(dev_counts))
        if all_langs[i] in IRRELEVANT_URALIC_LANGS
    ]
    log_stats(con_counts, logger)
    if args.test_size > 0:
        title = "Stats on # test samples (all languages)"
        log_title_with_border(title, logger)
        log_stats(test_counts, logger)
        title = "Stats on # test samples (relevant languages)"
        log_title_with_border(title, logger)
        rel_counts = [
            test_counts[i] for i in range(len(test_counts))
            if all_langs[i] in RELEVANT_LANGS
        ]
        log_stats(rel_counts, logger)
        title = "Stats on # test samples (irrelevant languages)"
        log_title_with_border(title, logger)
        irr_counts = [
            test_counts[i] for i in range(len(test_counts))
            if all_langs[i] in IRRELEVANT_LANGS
        ]
        log_stats(irr_counts, logger)
        title = "Stats on # test samples (irrelevant Uralic languages)"
        log_title_with_border(title, logger)
        con_counts = [
            test_counts[i] for i in range(len(test_counts))
            if all_langs[i] in IRRELEVANT_URALIC_LANGS
        ]
        log_stats(con_counts, logger)

    # Write training data in separate, unlabeled text files. Store dev
    # and test examples (to shuffle later, to avoid writing them in
    # order of language)
    dev_set = []
    test_set = []
    logger.info("Writing training data in %s..." % (outdir_train))
    for lang_id, lang in enumerate(all_langs):
        logger.info("  %s" % lang)
        # Get number of examples
        nb_examples = sum(1 for (sent, text_id, url) in stream_sents(
            lang, args.input_dir, input_format="text-only"))

        # Sample dev and test indices
        indices = np.arange(nb_examples)
        np.random.shuffle(indices)
        nb_dev = dev_counts[lang_id]
        nb_test = test_counts[lang_id]
        dev_indices = set(indices[:nb_dev])
        test_indices = set(indices[nb_dev:nb_dev + nb_test])

        # Stream sents, write training examples, store others
        outpath = os.path.join(outdir_train, "%s.train" % (lang))
        with open(outpath, 'w') as outfile:
            for ix, (sent, text_id, url) in enumerate(
                    stream_sents(lang,
                                 args.input_dir,
                                 input_format="text-only")):
                if ix in dev_indices:
                    dev_set.append((sent, lang))
                elif ix in test_indices:
                    test_set.append((sent, lang))
                else:
                    outfile.write(sent + "\n")

    # Shuffle and write dev and test sets
    logger.info("Writing test data in %s..." % (outdir_test))
    np.random.shuffle(dev_set)
    ptexts = os.path.join(outdir_test, "dev.txt")
    plabels = os.path.join(outdir_test, "dev-gold-labels.txt")
    ptuples = os.path.join(outdir_test, "dev-labeled.tsv")
    with open(ptexts, 'w') as ftexts, \
         open(plabels, 'w') as flabels, \
         open(ptuples, 'w') as ftuples:
        for (text, lang) in dev_set:
            ftexts.write(text + "\n")
            flabels.write(lang + "\n")
            ftuples.write("%s\t%s\n" % (text, lang))
    if len(test_set):
        np.random.shuffle(test_set)
        ptexts = os.path.join(outdir_test, "test.txt")
        plabels = os.path.join(outdir_test, "test-gold-labels.txt")
        ptuples = os.path.join(outdir_test, "test-labeled.tsv")
        with open(ptexts, 'w') as ftexts, \
             open(plabels, 'w') as flabels, \
             open(ptuples, 'w') as ftuples:
            for (text, lang) in test_set:
                ftexts.write(text + "\n")
                flabels.write(lang + "\n")
                ftuples.write("%s\t%s\n" % (text, lang))