def create_sample(outfile, n_pos, n_neg):
    '''
    Creates a dict describing a specific sample of rows and saves it at
    ARTIFACTS with form {filename : label}.

    Rewrites label to a python int (it comes in as numpy.int64) so that
    we can use this dict with pypy.

    args:
        outfile - the dict is written at ARTIFACTS/<outfile>.pkl
        n_pos   - approximate number of positive instances in sample
        n_neg   - approximate number of negative instances in sample
    return:
        nothing, but dict is pickled at ARTIFACTS/<outfile>.pkl
    '''
    tr = load_train(False)
    tr_pos = tr[tr.sponsored == 1]
    tr_pos = tr_pos.sample(n_pos)
    tr_neg = tr[tr.sponsored == 0]
    # BUG FIX: the original sampled from the full training set (`tr`), so
    # the "negative" sample could contain sponsored rows and overlap the
    # positive sample. Sample from the negative subset instead.
    tr_neg = tr_neg.sample(n_neg)
    sample_df = tr_pos.append(tr_neg)
    # We need this to prevent the pickled sample dict from containing a
    # numpy.int64, which prevents using pypy
    labels = [int(x) for x in sample_df.sponsored]
    sample = dict(zip(sample_df.file, labels))
    artifacts.put_artifact(sample, outfile)
import os
import util
import artifacts

# This script must run in Python, not Pypy.

# This creates a dict like {filename: label} for the whole training set.
train = util.load_train(True)
artifacts.put_artifact(train, 'train_dict')

# This makes a similar dict, holding a sample of 20k positive
# and 20k negative instances.
# This is used for determining frequent tags, tokens, etc. for features.
# The dict is saved as artifacts/sample_20_20.pkl.
# NOTE: create_sample pickles its result itself and returns nothing, so
# the previous `sample = ...` binding only ever held None; dropped it.
util.create_sample('sample_20_20', 20000, 20000)
"tag_attrs": tag_attrs, "tag_attr_vals": tag_attr_vals, "urls": urls, "paths": paths, "script": script, "style": style, } return out if __name__ == "__main__": text = """ Collect document frequencies for tags, attributes, urls, etc. from a sample specified in <sample> and write results at artifacts/<outfile>.pkl Example: pypy counts.py stats sample20_20 Collect statistics for files in artifacts/sample20_20.pkl and write output at artifacts/stats.pkl. """ start = datetime.now() parser = argparse.ArgumentParser(description=text) parser.add_argument("outfile", help="bare name of output file, without path or extension") parser.add_argument("sample", help="bare name of sample") args = parser.parse_args() out = get_counts(args.sample) artifacts.put_artifact(out, args.outfile) finish = datetime.now() print "Elapsed time: %d sec." % (finish - start).seconds
        # NOTE(review): this is the tail of get_counts(); the function's def
        # line, `out` dict opening, and the counter variables referenced here
        # are built above this chunk.
        'tag_attr_vals': tag_attr_vals,
        'urls': urls,
        'paths': paths,
        'script': script,
        'style': style
    }
    return out


if __name__ == '__main__':
    # Usage text shown by argparse --help.
    text = '''
    Collect document frequencies for tags, attributes, urls, etc.
    from a sample specified in <sample> and write results at
    artifacts/<outfile>.pkl

    Example:

    pypy counts.py stats sample20_20

    Collect statistics for files in artifacts/sample20_20.pkl
    and write output at artifacts/stats.pkl.
    '''
    start = datetime.now()
    parser = argparse.ArgumentParser(description=text)
    parser.add_argument(
        'outfile',
        help='bare name of output file, without path or extension')
    parser.add_argument('sample',
                        help='bare name of sample')
    args = parser.parse_args()
    # Collect the document-frequency stats and pickle them under artifacts/.
    out = get_counts(args.sample)
    artifacts.put_artifact(out, args.outfile)
    finish = datetime.now()
    # Python 2 print statement — this script is run under pypy (Python 2).
    print 'Elapsed time: %d sec.' % (finish - start).seconds