""" from ngrampy.LineFile import * import os import argparse import glob ASSERT_SORTED = True # if you want an extra check on sorting parser = argparse.ArgumentParser(description='Compute average surprisal from google style data') parser.add_argument('--in', dest='in', type=str, default="/home/piantado/Desktop/mit/Corpora/GoogleNGrams/2/*", nargs="?", help='The directory with google files (e.g. Google/3gms/)') parser.add_argument('--path', dest='path', type=str, default="/tmp/GoogleSurprisal", nargs="?", help='Where the database file lives') args = vars(parser.parse_args()) print "# Loading files" G = LineFile( glob.glob(args['in']), header=["w1", "w2", "cnt12"], path=args['path']) print "# Cleaning" G.clean(columns=3) # Since we collapsed case, go through and re-sum the triple counts print "# Resumming for case collapsing" G.sort(keys="w1 w2") G.resum_equal("w1 w2", "cnt12", assert_sorted=ASSERT_SORTED ) # in collapsing case, etc., we need to re-sum # Now go through and print "# Making marginal counts" G.make_marginal_column("cnt1", "w1", "cnt12") # and compute surprisal print "# Sorting by word" G.sort("w2")
from ngrampy.LineFile import *
import os

GOOGLE_ENGLISH_DIR = "/home/piantado/Desktop/mit/Corpora/GoogleNGrams/3/"
VOCAB_FILE = "Vocabulary/EnglishVocabulary.txt"

# Read the vocabulary file
vocabulary = [l.strip() for l in open(VOCAB_FILE, "r")]

#rawG = LineFile(["test3.txt"], header=["w1", "w2", "w3", "cnt123"]) # for debugging
rawG = LineFile([GOOGLE_ENGLISH_DIR + x for x in os.listdir(GOOGLE_ENGLISH_DIR)], header=["w1", "w2", "w3", "cnt123"])
rawG.clean() # already done!
rawG.restrict_vocabulary("w1 w2 w3", vocabulary) # restrict fields w1, w2, and w3 to our vocabulary
rawG.sort(keys="w1 w2 w3") # since we collapsed case, etc. This could also be rawG.sort(keys=["w1","w2","w3"]) in the other format.
rawG.resum_equal("w1 w2 w3", "cnt123")

# Where we store all lines
G = rawG.copy()

# Now go through and compute what we want
G1 = rawG.copy()                       # start with a copy
G1.delete_columns("w2 w3")             # delete the columns we don't want
G1.sort("w1")                          # sort by the one we do want
G1.resum_equal("w1", "cnt123")         # re-sum equal keys
G1.rename_column("cnt123", "cnt1")     # rename the column, since it's now a sum over w1 alone

G.sort("w1")                           # sort our target by w1
G.merge(G1, keys1="w1", tocopy="cnt1") # merge in the marginal count
G1.delete()                            # and delete this temporary

G2 = rawG.copy()
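# The copy/delete/sort/resum/merge sequence above is a disk-based group-by
# and join. A toy in-memory equivalent (plain Python, illustrative only;
# the data here are made up, not ngrampy structures):
from collections import defaultdict

# a tiny stand-in for the trigram table: (w1, w2, w3) -> cnt123
trigrams = {("the", "old", "man"): 50, ("the", "old", "dog"): 30, ("a", "new", "car"): 20}

# G1: marginalize w2 and w3 away, i.e. cnt1[w1] = sum of cnt123 over (w2, w3)
cnt1 = defaultdict(int)
for (w1, w2, w3), c in trigrams.items():
    cnt1[w1] += c

# G.merge(G1, ...): attach cnt1 to every trigram entry sharing its w1
merged = dict((k, (c, cnt1[k[0]])) for k, c in trigrams.items())
assert merged[("the", "old", "man")] == (50, 80)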
""" from ngrampy.LineFile import * import os import argparse import glob ASSERT_SORTED = True # if you want an extra check on sorting parser = argparse.ArgumentParser(description='Compute average surprisal from google style data') parser.add_argument('--in', dest='in', type=str, default="/home/piantado/Desktop/mit/Corpora/GoogleNGrams/3/*", nargs="?", help='The directory with google files (e.g. Google/3gms/') parser.add_argument('--path', dest='path', type=str, default="/tmp/GoogleSurprisal", nargs="?", help='Where the database file lives') args = vars(parser.parse_args()) print "# Loading files" G = LineFile( glob.glob(args['in']), header=["w1", "w2", "cnt12"], path=args['path']) print "# Cleaning" G.clean(columns=3) # Since we collapsed case, go through and re-sum the triple counts print "# Resumming for case collapsing" G.sort(keys="w1 w2") G.resum_equal("w1 w2", "cnt12", assert_sorted=ASSERT_SORTED ) # in collapsing case, etc., we need to re-sum # Now go through and Gcontext = G.copy() #print "# Sorting by context" #Gcontext.sort("w1 w2") # sort this by the one we do want print "# Computing context sum" Gcontext.resum_equal( "w1", "cnt12", assert_sorted=ASSERT_SORTED ) # resum equal Gcontext.rename_column("cnt12", "cnt1") # rename the column since its now a sum of 1
""" from ngrampy.LineFile import * import os SUBSAMPLE_N = 50000000 tolerance = 0.01 BAD_WORD_FILE = "badwords.txt" def check_tolerance(x,y): """ A handy function to check if some variables are within tolerance percent of each other """ return abs(x-y) / ((x+y)/2.) < tolerance # This will copy the file, make a new one, and then print out possible lines G = LineFile(files=["/ssd/trigram-stats"], path="/ssd/subsampled-stimuli", header="w1 w2 w3 c123 c1 c2 c3 c12 c23 unigram bigram trigram") # Now throw out the porno words #porno_vocabulary = [ l.strip() for l in open(BAD_WORD_FILE, "r") ] #G.restrict_vocabulary("w1 w2 w3", porno_vocabulary, invert=True) # draw a subsample #if SUBSAMPLE_N is not None: #G.subsample_lines(N=SUBSAMPLE_N) # we need to resort this so that we can have w1 and w3 equal and then all the n-grams matched G.sort("w1 w3 unigram bigram trigram", lines=1000000) G.head() item_number = 0 line_stack = []
""" from ngrampy.LineFile import * import os SUBSAMPLE_N = 15000 tolerance = 0.001 BAD_WORD_FILE = "badwords.txt" def check_tolerance(x,y): """ A handy function to check if some variables are within tolerance percent of each other """ return abs(x-y) / ((x+y)/2.) < tolerance # This will copy the file, make a new one, and then print out possible lines G = LineFile(files=["/ssd/trigram-stats"], path="/ssd/subsampled-stimuli", header="w1 w2 w3 c123 c1 c2 c3 c12 c23 unigram bigram trigram") # Now throw out the porno words porno_vocabulary = [ l.strip() for l in open(BAD_WORD_FILE, "r") ] G.restrict_vocabulary("w1 w2 w3", porno_vocabulary, invert=True) # and then subsample G.subsample_lines(N=SUBSAMPLE_N) # and make sure we are sorted for the below G.sort("unigram bigram trigram", dtype=float) G.head() # just a peek item_number = 0 line_stack = [] for l in G.lines(tmp=False, parts=False):