unicomp = compare_bigrams.compare_Ngrams(unigrams_unpermuted,
                                         generated_corpus_unigrams)
unicomp = {"unigrams.%s" % k: v for (k, v) in unicomp.items()}
comp = {**comp, **unicomp}  # merge the dicts
comp["n.unique.bigrams"] = len(generated_corpus_bigrams)
comp["n.unique.unigrams"] = len(generated_corpus_unigrams)

# Progress output
print("%i" % (i + 1), end=" ", flush=True)

# Quantify the copying
cop = quantify_copying.corpus(words)
corpus_stats = pd.concat([
    corpus_stats,
    pd.DataFrame({**comp, **cop}, index=[i + 1])
])

# Now also shuffle the word list, so that we don't always have the same
# lengths in the train and test corpus after the following split.
#random.shuffle(words)

# Split into train and test corpus
#train_corpus = words[:N_TRAIN]
#test_corpus = words[N_TRAIN:]
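# For orientation: a minimal sketch of what the counting helpers in
# compare_bigrams might look like. This is an assumption, not the source's
# implementation; the code above only relies on the results behaving like
# dicts mapping each n-gram to its count.
from collections import Counter

def unigram_counts_sketch(corpus):
    """Count word occurrences in a corpus given as a list of word lists."""
    return Counter(w for sent in corpus for w in sent)

def bigram_counts_sketch(corpus):
    """Count adjacent word pairs within each sentence."""
    return Counter((a, b) for sent in corpus for a, b in zip(sent, sent[1:]))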
comp["permutation"]=i+1 unicomp = compare_bigrams.compare_Ngrams( unigrams_unpermuted, generated_corpus_unigrams ) unicomp = dict([ ("unigrams.%s"%k,v) for (k,v) in unicomp.items() ]) comp = {**comp,**unicomp} # merge the dicts comp["n.unique.bigrams"] = len(generated_corpus_bigrams.keys()) comp["n.unique.unigrams"] = len(generated_corpus_unigrams.keys()) if True: print ("%i"%(i+1),end=" ",flush=True) # Quantify the copying cop = quantify_copying.corpus(corpus) corpus_stats = pd.concat([corpus_stats, pd.DataFrame({**comp,**cop}, index=[i+1])]) # Now also shuffle the word list, so that we don't always have the same lengths in the train and test # corpus after the following split. #random.shuffle(words) # Split into train and test corpus #train_corpus = words[:N_TRAIN] #test_corpus = words[N_TRAIN:] # Write the shuffled corpora to file #f = open('%s/permutation_%05i_numbers.txt'%(OUTPUT_DIR,i+1),'w')
def generate_bigram_corpus(i):
    print(i, end=" ", flush=True)

    # First make a candidate corpus: keep generating sentences until we
    # reach the target word count.
    corpus = []
    total_length = 0
    while total_length < N_WORDS_IN_CORPUS:
        # Generate one item
        sent, _ = list(
            compare_bigrams.generate_words_from_bigrams(
                bigrams_unpermuted, 1, maxlength,
                progress_output=False).values())[0][0]
        sent = sent.split(".")
        corpus += [sent]
        total_length += len(sent)

    # Ok, so now we have a corpus that is just slightly bigger in number of
    # words than the target corpus. We consider discarding the last sentence,
    # and seeing if we end up closer to the target length.
    dlength = abs(total_length - N_WORDS_IN_CORPUS)
    # The length of the corpus if we would discard the final addition
    dlength_discard = abs((total_length - len(corpus[-1])) - N_WORDS_IN_CORPUS)
    if dlength_discard < dlength:
        # If, discarding the last sentence, we are closer to the target
        # length, do the discard.
        corpus = corpus[:-1]
        total_length = sum([len(s) for s in corpus])

    generated_corpus_bigrams = compare_bigrams.bigram_counts(corpus)
    generated_corpus_unigrams = compare_bigrams.unigram_counts(corpus)

    comp = compare_bigrams.compare_Ngrams(bigrams_unpermuted,
                                          generated_corpus_bigrams)
    comp = {"bigrams.%s" % k: v for (k, v) in comp.items()}
    unicomp = compare_bigrams.compare_Ngrams(unigrams_unpermuted,
                                             generated_corpus_unigrams)
    unicomp = {"unigrams.%s" % k: v for (k, v) in unicomp.items()}

    # Also quantify copying
    cop = quantify_copying.corpus(corpus)

    # Combine all metrics we've gathered about this corpus
    comp = {**comp, **unicomp, **cop}  # merge the dicts
    comp["n.unique.bigrams"] = len(generated_corpus_bigrams)
    comp["n.unique.unigrams"] = len(generated_corpus_unigrams)
    comp["randomisation"] = i
    corpus_stats = pd.DataFrame(comp, index=[i])

    # Now also shuffle the word list, so that we don't always have the same
    # lengths in the train and test corpus after the following split.
    #random.shuffle(corpus)

    # Split into train and test corpus
    #train_corpus = corpus[:N_TRAIN]
    #test_corpus = corpus[N_TRAIN:]

    # Write the generated corpus to file
    f = open('%s/bigramfree_%05i_corpus.txt' % (OUTPUT_DIR, i + 1), 'w')
    f.write("\n".join([" ".join(w) for w in corpus]))
    f.close()
    #f = open('%s/bigramfree_%05i_test_corpus.txt'%(OUTPUT_DIR,i+1),'w')
    #corpus = "\n".join([ " ".join(w) for w in test_corpus ])
    #f.write(corpus)
    #f.close()

    return corpus_stats
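# Since generate_bigram_corpus takes a randomisation index and returns a
# one-row stats DataFrame, a plausible driver maps it over all indices and
# concatenates the results. This is a sketch: N_RANDOMISATIONS and the CSV
# path are assumptions, and the parallel map relies on the module-level
# globals being inherited by the worker processes (fork start method).
import multiprocessing
import pandas as pd

N_RANDOMISATIONS = 1000  # hypothetical; not specified in the source

if __name__ == '__main__':
    with multiprocessing.Pool() as pool:
        stats = pool.map(generate_bigram_corpus, range(N_RANDOMISATIONS))
    corpus_stats = pd.concat(stats)
    corpus_stats.to_csv('interim/bigramfree_corpora_stats.csv')  # assumed path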
# So now we have a bootstrap resampled corpus with the same number
# of sentences but not necessarily of the same lengths, nor necessarily
# with the same unigram or bigram distributions.
generated_corpus_bigrams = compare_bigrams.bigram_counts(bootstrap_corpus)
generated_corpus_unigrams = compare_bigrams.unigram_counts(bootstrap_corpus)

comp = compare_bigrams.compare_Ngrams(bigrams_unpermuted,
                                      generated_corpus_bigrams)
comp = {"bigrams.%s" % k: v for (k, v) in comp.items()}
comp["permutation"] = i
unicomp = compare_bigrams.compare_Ngrams(unigrams_unpermuted,
                                         generated_corpus_unigrams)
unicomp = {"unigrams.%s" % k: v for (k, v) in unicomp.items()}
comp = {**comp, **unicomp}  # merge the dicts
comp["n.unique.bigrams"] = len(generated_corpus_bigrams)
comp["n.unique.unigrams"] = len(generated_corpus_unigrams)

# Quantify the copying
cop = quantify_copying.corpus(bootstrap_corpus)
corpus_stats = pd.concat([
    corpus_stats,
    pd.DataFrame({**comp, **cop}, index=[i])
])

corpus_stats.to_csv('interim/bootstrap_corpora_stats.csv')
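# For context, the bootstrap corpus used above could be drawn like this
# (a sketch; draw_bootstrap_corpus is a hypothetical helper, not from the
# source): sample whole sentences with replacement, keeping the sentence
# count but letting total word count and n-gram distributions vary.
import random

def draw_bootstrap_corpus(original_corpus):
    """Resample sentences with replacement from the original corpus."""
    return random.choices(original_corpus, k=len(original_corpus))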
def generate_bigram_corpus(i):
    # This generates one bigram corpus (randomisation i),
    # outputs it to a file, and returns the corpus stats.
    print(i, end=" ", flush=True)

    # First make a candidate corpus
    corpus = []
    for l in lengths:  # we obtain a sentence of each required length
        # Choose a sentence of that length from the randomly generated sentences
        #sent = random.choice(supercorpus[l])
        sent = compare_bigrams.weighted_choice(supercorpus[l])
        # Sent is encoded as a.b.c.d, so we turn it back into a list
        # and add it to the corpus:
        corpus.append(sent.split("."))

    # So now we should have a corpus with the same number of words and the
    # same distribution of sentence lengths as the original corpus. What we
    # need to check is whether the distribution of unigrams and bigrams is
    # approximately correct. As a metric we use Cramér's V.
    generated_corpus_bigrams = compare_bigrams.bigram_counts(corpus)
    generated_corpus_unigrams = compare_bigrams.unigram_counts(corpus)
    comp = compare_bigrams.compare_Ngrams(bigrams_unpermuted,
                                          generated_corpus_bigrams)
    comp = {"bigrams.%s" % k: v for (k, v) in comp.items()}
    unicomp = compare_bigrams.compare_Ngrams(unigrams_unpermuted,
                                             generated_corpus_unigrams)
    unicomp = {"unigrams.%s" % k: v for (k, v) in unicomp.items()}
    comp["n.unique.bigrams"] = len(generated_corpus_bigrams)
    comp["n.unique.unigrams"] = len(generated_corpus_unigrams)

    # Also quantify copying
    cop = quantify_copying.corpus(corpus)

    # Combine all metrics we've gathered about this corpus
    comp = {**comp, **unicomp, **cop}  # merge the dicts
    comp["permutation"] = i
    stats = pd.DataFrame(comp, index=[i])

    # Now also shuffle the word list, so that we don't always have the same
    # lengths in the train and test corpus after the following split.
    #random.shuffle(corpus)

    # Split into train and test corpus
    #train_corpus = corpus[:N_TRAIN]
    #test_corpus = corpus[N_TRAIN:]

    # Write the generated corpus to file
    f = open('%s/permutation_%05i.txt' % (OUTPUT_DIR, i), 'w')
    f.write("\n".join([" ".join(w) for w in corpus]))
    f.close()
    #f = open('%s/bigramgen_%05i_test_corpus.txt'%(OUTPUT_DIR,i+1),'w')
    #corpus = "\n".join([ " ".join(w) for w in test_corpus ])
    #f.write(corpus)
    #f.close()

    return stats
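# The comparison metric named above is Cramér's V. A minimal sketch of how
# compare_Ngrams might compute it from two n-gram count dicts (an assumption
# about its internals, not the source's code): build a 2xK contingency table
# over the union of n-gram types and normalise the chi-square statistic.
import numpy as np
from scipy.stats import chi2_contingency

def cramers_v_sketch(counts_a, counts_b):
    """Cramér's V between two n-gram frequency distributions."""
    keys = sorted(set(counts_a) | set(counts_b))
    table = np.array([[counts_a.get(k, 0) for k in keys],
                      [counts_b.get(k, 0) for k in keys]], dtype=float)
    table = table[:, table.sum(axis=0) > 0]  # drop all-zero columns
    chi2, _, _, _ = chi2_contingency(table)
    n = table.sum()
    r, k = table.shape
    return np.sqrt(chi2 / (n * (min(r, k) - 1)))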