示例#1
0
def create_sample(outfile, n_pos, n_neg):
    '''
    Creates a dict describing a specific sample of rows and saves
    it at ARTIFACTS with form {filename : label}. Rewrites label
    to a python int (it comes in as numpy.int64) so that we can
    use this dict with pypy.

    args:
      outfile - the dict is written at ARTIFACTS/<outfile>.pkl
      n_pos - approximate number of positive instances in sample
      n_neg - approximate number of negative instances in sample

    return:
      nothing, but dict is pickled at ARTIFACTS/<outfile>.pkl
    '''
    tr = load_train(False)
    tr_pos = tr[tr.sponsored == 1]
    tr_pos = tr_pos.sample(n_pos)
    tr_neg = tr[tr.sponsored == 0]
    # BUG FIX: previously sampled from `tr` (the whole training set), which
    # let positive rows leak into the "negative" half of the sample. Sample
    # from the negative subset instead.
    tr_neg = tr_neg.sample(n_neg)
    sample_df = tr_pos.append(tr_neg)
    # We need this to prevent the pickled sample dict from containing a
    # numpy.int64, which prevents using pypy
    labels = [int(x) for x in sample_df.sponsored]
    sample = dict(zip(sample_df.file, labels))
    artifacts.put_artifact(sample, outfile)
import os
import util
import artifacts

# This script must run in Python, not Pypy.

# Dump a {filename: label} dict covering every row of the training set.
train_labels = util.load_train(True)
artifacts.put_artifact(train_labels, 'train_dict')

# Do the same for a sample of 20k positive and 20k negative instances;
# it is used for determining frequent tags, tokens, etc. for features.
# The dict is saved as artifacts/sample_20_20.pkl.
sampled = util.create_sample('sample_20_20', 20000, 20000)
示例#3
0
import os
import util
import artifacts

# This script must run in Python, not Pypy.

# Persist the full training set as a {filename: label} dict.
whole_train = util.load_train(True)
artifacts.put_artifact(whole_train, 'train_dict')

# Also persist a sampled variant holding 20k positive and 20k negative
# instances, used for determining frequent tags, tokens, etc. for features.
# The dict is saved as artifacts/sample_20_20.pkl.
small_sample = util.create_sample('sample_20_20', 20000, 20000)
示例#4
0
        "tag_attrs": tag_attrs,
        "tag_attr_vals": tag_attr_vals,
        "urls": urls,
        "paths": paths,
        "script": script,
        "style": style,
    }

    return out


if __name__ == "__main__":
    # CLI entry point: compute document-frequency counts for a sample and
    # pickle them under artifacts/.
    description = """
  Collect document frequencies for tags, attributes, urls, etc. from 
  a sample specified in <sample> and write results at artifacts/<outfile>.pkl
  
  Example: pypy counts.py stats sample20_20
  
  Collect statistics for files in artifacts/sample20_20.pkl and write output
  at artifacts/stats.pkl.
  """
    t0 = datetime.now()
    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("outfile", help="bare name of output file, without path or extension")
    cli.add_argument("sample", help="bare name of sample")
    parsed = cli.parse_args()
    counts = get_counts(parsed.sample)
    artifacts.put_artifact(counts, parsed.outfile)
    t1 = datetime.now()
    # Parenthesized single expression: identical output under the Python 2
    # print statement, and also valid Python 3 syntax.
    print ("Elapsed time: %d sec." % (t1 - t0).seconds)
示例#5
0
        'tag_attr_vals': tag_attr_vals,
        'urls': urls,
        'paths': paths,
        'script': script,
        'style': style
    }

    return out


if __name__ == '__main__':
    # Script entry point: gather tag/attribute/url statistics for the named
    # sample and write the result to artifacts/<outfile>.pkl.
    usage_text = '''
  Collect document frequencies for tags, attributes, urls, etc. from 
  a sample specified in <sample> and write results at artifacts/<outfile>.pkl
  
  Example: pypy counts.py stats sample20_20
  
  Collect statistics for files in artifacts/sample20_20.pkl and write output
  at artifacts/stats.pkl.
  '''
    began = datetime.now()
    arg_parser = argparse.ArgumentParser(description=usage_text)
    arg_parser.add_argument(
        'outfile', help='bare name of output file, without path or extension')
    arg_parser.add_argument('sample', help='bare name of sample')
    opts = arg_parser.parse_args()
    result = get_counts(opts.sample)
    artifacts.put_artifact(result, opts.outfile)
    ended = datetime.now()
    # Parenthesized single expression: same output under Python 2's print
    # statement, and also parses under Python 3.
    print ('Elapsed time: %d sec.' % (ended - began).seconds)