Example #1
    unicomp = compare_bigrams.compare_Ngrams(unigrams_unpermuted,
                                             generated_corpus_unigrams)
    unicomp = dict([("unigrams.%s" % k, v) for (k, v) in unicomp.items()])

    comp = dict(list(comp.items()) + list(unicomp.items()))  # merge the dicts
    comp["n.unique.bigrams"] = len(generated_corpus_bigrams.keys())
    comp["n.unique.unigrams"] = len(generated_corpus_unigrams.keys())

    if True:

        print("%i" % (i + 1), )
        sys.stdout.flush()

        # Quantify the copying
        cop = quantify_copying.corpus(words)

        corpus_stats = pd.concat([
            corpus_stats,
            pd.DataFrame(dict(list(comp.items()) + list(cop.items())),
                         index=[i + 1])
        ])

        # Now also shuffle the word list, so that we don't always have the same lengths in the train and test
        # corpus after the following split.
        #random.shuffle(words)

        # Split into train and test corpus
        #train_corpus = words[:N_TRAIN]
        #test_corpus  = words[N_TRAIN:]
    comp["permutation"]=i+1
    
    unicomp = compare_bigrams.compare_Ngrams( unigrams_unpermuted, generated_corpus_unigrams )
    unicomp = dict([ ("unigrams.%s"%k,v) for (k,v) in unicomp.items() ])

    comp = {**comp,**unicomp} # merge the dicts
    comp["n.unique.bigrams"]  = len(generated_corpus_bigrams.keys())
    comp["n.unique.unigrams"] = len(generated_corpus_unigrams.keys())


    if True:

        print ("%i"%(i+1),end=" ",flush=True)

        # Quantify the copying
        cop = quantify_copying.corpus(corpus)

        corpus_stats = pd.concat([corpus_stats,
                                  pd.DataFrame({**comp,**cop},
                                               index=[i+1])])

        # Now also shuffle the word list, so that we don't always have the same lengths in the train and test
        # corpus after the following split.
        #random.shuffle(words)
        
        # Split into train and test corpus
        #train_corpus = words[:N_TRAIN]
        #test_corpus  = words[N_TRAIN:]

        # Write the shuffled corpora to file
        #f = open('%s/permutation_%05i_numbers.txt'%(OUTPUT_DIR,i+1),'w')
def generate_bigram_corpus(i):

    print(i, end=" ", flush=True)

    # First make a candidate corpus
    corpus = []
    total_length = 0
    while total_length < N_WORDS_IN_CORPUS:  # we obtain a sentence of a particular length

        # Generate one item
        sent, _ = list(
            compare_bigrams.generate_words_from_bigrams(
                bigrams_unpermuted, 1, maxlength,
                progress_output=False).values())[0][0]
        sent = sent.split(".")
        corpus += [sent]
        total_length += len(sent)

    # Ok, so now we have a corpus that is just slightly bigger in number of words than the target corpus.
    # We consider discarding the last sentence, and seeing if we end up closer to the target length.
    dlength = abs(total_length - N_WORDS_IN_CORPUS)
    dlength_discard = abs(
        (total_length - len(corpus[-1])) - N_WORDS_IN_CORPUS
    )  # the length of the corpus if we were to discard the final addition

    if dlength_discard < dlength:  # if discarding the last sentence brings us closer to the target length, discard it
        corpus = corpus[:-1]

    total_length = sum([len(s) for s in corpus])

    generated_corpus_bigrams = compare_bigrams.bigram_counts(corpus)
    generated_corpus_unigrams = compare_bigrams.unigram_counts(corpus)

    comp = compare_bigrams.compare_Ngrams(bigrams_unpermuted,
                                          generated_corpus_bigrams)
    comp = dict([("bigrams.%s" % k, v) for (k, v) in comp.items()])

    unicomp = compare_bigrams.compare_Ngrams(unigrams_unpermuted,
                                             generated_corpus_unigrams)
    unicomp = dict([("unigrams.%s" % k, v) for (k, v) in unicomp.items()])

    # Also quantify copying
    cop = quantify_copying.corpus(corpus)

    # Combine all metrics we've gathered about this corpus
    comp = {
        **comp,
        **unicomp,
        **cop
    }  # merge the dicts
    comp["n.unique.bigrams"] = len(generated_corpus_bigrams.keys())
    comp["n.unique.unigrams"] = len(generated_corpus_unigrams.keys())

    if True:

        comp["randomisation"] = i
        corpus_stats = pd.DataFrame(comp, index=[i])

        # Write the shuffled corpora to file

        # Now also shuffle the word list, so that we don't always have the same lengths in the train and test
        # corpus after the following split.
        #random.shuffle(corpus)

        # Split into train and test corpus
        #train_corpus = corpus[:N_TRAIN]
        #test_corpus  = corpus[N_TRAIN:]

        # Write the shuffled corpora to file
        f = open('%s/bigramfree_%05i_corpus.txt' % (OUTPUT_DIR, i + 1), 'w')
        corpus = "\n".join([" ".join(w) for w in corpus])
        f.write(corpus)
        f.close()

        #f = open('%s/bigramfree_%05i_test_corpus.txt'%(OUTPUT_DIR,i+1),'w')
        #corpus = "\n".join([ " ".join(w) for w in test_corpus ])
        #f.write(corpus)
        #f.close()

        return corpus_stats
    return None
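
The function above generates one corpus, writes it to OUTPUT_DIR, and returns a one-row DataFrame of statistics (or None). Below is a minimal driver sketch for collecting the statistics across randomisations; N_RANDOMISATIONS and the output CSV name are illustrative assumptions, not taken from the original source.

import pandas as pd

N_RANDOMISATIONS = 1000  # illustrative value, not from the original source

all_stats = []
for i in range(N_RANDOMISATIONS):
    stats = generate_bigram_corpus(i)
    if stats is not None:  # the function returns None when no corpus was kept
        all_stats.append(stats)

# Stack the one-row DataFrames and write them out, mirroring the
# to_csv() step used in the bootstrap example below.
corpus_stats = pd.concat(all_stats)
corpus_stats.to_csv('interim/bigramfree_corpora_stats.csv')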
Example #4
        # So now we have a bootstrap resampled corpus with the same number
        # of sentences but not necessarily of the same lengths, nor necessarily
        # with the same unigram or bigram distributions.
        generated_corpus_bigrams = compare_bigrams.bigram_counts(
            bootstrap_corpus)
        generated_corpus_unigrams = compare_bigrams.unigram_counts(
            bootstrap_corpus)

        comp = compare_bigrams.compare_Ngrams(bigrams_unpermuted,
                                              generated_corpus_bigrams)
        comp = dict([("bigrams.%s" % k, v) for (k, v) in comp.items()])
        comp["permutation"] = i

        unicomp = compare_bigrams.compare_Ngrams(unigrams_unpermuted,
                                                 generated_corpus_unigrams)
        unicomp = dict([("unigrams.%s" % k, v) for (k, v) in unicomp.items()])
        comp = {**comp, **unicomp}  # merge the dicts
        comp["n.unique.bigrams"] = len(generated_corpus_bigrams.keys())
        comp["n.unique.unigrams"] = len(generated_corpus_unigrams.keys())

        # Quantify the copying
        cop = quantify_copying.corpus(bootstrap_corpus)

        corpus_stats = pd.concat([
            corpus_stats,
            pd.DataFrame({**comp, **cop}, index=[i])
        ])

    corpus_stats.to_csv('interim/bootstrap_corpora_stats.csv')
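
The fragment above computes statistics for a bootstrap_corpus that is built further up in the loop. A minimal sketch of how such a sentence-level resample could be drawn, assuming the original corpus is available as a list of tokenised sentences (the names corpus_sentences and bootstrap_resample are illustrative):

import random

def bootstrap_resample(corpus_sentences):
    # Sample the same number of sentences, with replacement, from the
    # original corpus: the sentence count is preserved, but the sentence
    # lengths and the unigram/bigram distributions need not be.
    return [random.choice(corpus_sentences)
            for _ in range(len(corpus_sentences))]

# e.g. inside the permutation loop:
#   bootstrap_corpus = bootstrap_resample(corpus_sentences)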
def generate_bigram_corpus(i):

    # This generates one bigram corpus (randomisation i)
    # outputs it to a file, and returns the corpus stats.

    print(i, end=" ")
    sys.stdout.flush()
    
    # First make a candidate corpus
    corpus = []
    for l in lengths: # we obtain a sentence of a particular length

        # Choose a sentence of that length from the randomly generated sentences
        #sent = random.choice(supercorpus[l])
        sent = compare_bigrams.weighted_choice(supercorpus[l])
        # Sent will be encoded as a.b.c.d, so now we turn it back into a list, and add it to the corpus:
        corpus.append(sent.split("."))


    # So now we should have a corpus with the same number of words and the
    # same distribution of word lengths as the original corpus. What we still
    # need to check is whether the distribution of unigrams and bigrams is
    # approximately correct. As a metric we use Cramér's V (a sketch of this
    # metric follows this example).

    generated_corpus_bigrams  = compare_bigrams.bigram_counts ( corpus )
    generated_corpus_unigrams = compare_bigrams.unigram_counts( corpus )
    
    comp = compare_bigrams.compare_Ngrams( bigrams_unpermuted, generated_corpus_bigrams )
    comp = dict([ ("bigrams.%s"%k,v) for (k,v) in comp.items() ])

    unicomp = compare_bigrams.compare_Ngrams( unigrams_unpermuted, generated_corpus_unigrams )
    unicomp = dict([ ("unigrams.%s"%k,v) for (k,v) in unicomp.items() ])

    comp["n.unique.bigrams"]  = len(generated_corpus_bigrams.keys())
    comp["n.unique.unigrams"] = len(generated_corpus_unigrams.keys())

    # Also quantify copying
    cop = quantify_copying.corpus(corpus)
    
    # Combine all metrics we've gathered about this corpus
    comp = {**comp,**unicomp,**cop} # merge the dicts
    

    # Hope this happens reasonably often
    if True:

        comp["permutation"]=i

        stats = pd.DataFrame(comp,index=[i])
    
        # Write the shuffled corpora to file

        # Now also shuffle the word list, so that we don't always have the same lengths in the train and test
        # corpus after the following split.
        #random.shuffle(corpus)
        
        # Split into train and test corpus
        #train_corpus = corpus[:N_TRAIN]
        #test_corpus  = corpus[N_TRAIN:]

        # Write the shuffled corpora to file
        f = open('%s/permutation_%05i.txt'%(OUTPUT_DIR,i),'w')
        corpus = "\n".join([ " ".join(w) for w in corpus ])
        f.write(corpus)
        f.close()


        #f = open('%s/bigramgen_%05i_test_corpus.txt'%(OUTPUT_DIR,i+1),'w')
        #corpus = "\n".join([ " ".join(w) for w in test_corpus ])
        #f.write(corpus)
        #f.close()


        return stats

    return None
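
The comments in this example note that Cramér's V is used to compare the generated n-gram distributions against the originals. Below is a minimal illustration of that metric for two n-gram count dictionaries; it is a sketch of the idea, not the actual implementation of compare_bigrams.compare_Ngrams.

import numpy as np
from scipy.stats import chi2_contingency

def cramers_v(counts_a, counts_b):
    # Build a 2 x K contingency table over the union of n-grams seen in
    # either distribution, then derive Cramér's V from the chi-square
    # statistic; for a two-row table this reduces to sqrt(chi2 / n).
    keys = sorted(set(counts_a) | set(counts_b))
    table = np.array([[counts_a.get(k, 0) for k in keys],
                      [counts_b.get(k, 0) for k in keys]])
    chi2, _, _, _ = chi2_contingency(table)
    n = table.sum()
    return np.sqrt(chi2 / (n * (min(table.shape) - 1)))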