Exemplo n.º 1
0
    # One bucket path per line of the bucket list; in Py2, map() returns a
    # list of stripped strings.
    # NOTE(review): this open() handle is never closed (contrast the
    # with-block used elsewhere in this file).
    bucketlist = map(str.strip, open(bucketlist_path))
    features = read_features(feature_path)

    # Select which event IG is computed over: domain labels or language
    # labels. Exactly one of the two flags must be given.
    if args.domain:
        index_path = os.path.join(args.model, "domain_index")
        suffix = ".domain"
    elif args.lang:
        index_path = os.path.join(args.model, "lang_index")
        suffix = ".lang"
    else:
        raise ValueError("no event specified")

    # Output path: an explicit --weights wins; otherwise derive a default
    # inside the model dir, tagging binarized runs with a ".bin" suffix.
    if args.weights:
        weights_path = args.weights
    else:
        weights_path = os.path.join(args.model, "IGweights" + suffix + (".bin" if args.binarize else ""))

    # display paths
    print "model path:", args.model
    print "buckets path:", bucketlist_path
    print "features path:", feature_path
    print "weights path:", weights_path
    print "index path:", index_path
    print "suffix:", suffix

    print "computing information gain"
    dist = read_dist(index_path)
    ig = compute_IG(bucketlist, features, dist, args.binarize, suffix, args.jobs)

    write_weights(ig, weights_path)
Exemplo n.º 2
0
    print "features output path:", feature_path
    # Two selection modes: a per-ngram-order token budget, or one global
    # token budget.
    if args.tokens_per_order:
        print "max ngram order:", args.max_order
        print "tokens per order:", args.tokens_per_order
    else:
        print "tokens:", args.tokens

    # One bucket path per line; strip trailing newline/whitespace.
    with open(bucketlist_path) as f:
        bucketlist = map(str.strip, f)

    # doc_count maps feature -> count tallied across all buckets
    # (len() below treats it as the set of unique features).
    doc_count = tally(bucketlist, args.jobs)
    print "unique features:", len(doc_count)
    if args.doc_count:
        # The constant true is used to indicate output to default location
        # (a bare flag yields the bool True; any other value is taken as a
        # user-supplied path — hence the deliberate "== True" rather than
        # plain truthiness).
        doc_count_path = os.path.join(
            args.model, 'DF_all') if args.doc_count == True else args.doc_count
        write_weights(doc_count, doc_count_path)
        print "wrote DF counts for all features to:", doc_count_path

    if args.tokens_per_order:
        # Choose a number of features for each length of token
        feats = ngram_select(doc_count, args.max_order, args.tokens_per_order)
    else:
        # Choose a number of features overall.
        # Inner sort: top args.tokens features by count, descending.
        # Outer sort: lexicographic, for a stable, reproducible output order.
        feats = sorted(
            sorted(doc_count, key=doc_count.get, reverse=True)[:args.tokens])
    print "selected features: ", len(feats)

    write_features(feats, feature_path)
    print 'wrote features to "%s"' % feature_path
Exemplo n.º 3
0
  # Compute P(t|C)
  print "learning P(t|C)"
  # Extract the third column of items (Py2 zip returns an indexable list).
  # Presumably each row is a (?, ?, path) tuple — confirm against caller.
  paths = zip(*items)[2]
  nb_ptc = learn_ptc(paths, tk_nextmove, tk_output, cm, buckets_dir, args)
  # Reshape the flat result to a (features x languages) matrix.
  nb_ptc = np.array(nb_ptc).reshape(len(feats), len(langs))

  # Normalize to 1 on the term axis
  print "renormalizing P(t|C)"
  for i in range(nb_ptc.shape[1]):
    # had to de-vectorize this due to memory consumption
    newval = np.empty_like(nb_ptc[:,i])
    for j in range(newval.shape[0]):
      # Log-space softmax: exp(x_j)/sum(exp(x)) computed as
      # 1/sum(exp(x - x_j)) so exp() never sees the raw magnitudes.
      newval[j] = (1/np.exp(nb_ptc[:,i] - nb_ptc[j,i]).sum())
    nb_ptc[:,i] = newval
    # NOTE(review): no abs() — a column summing to MORE than 1 makes the
    # left side negative and passes silently; abs(1.0 - newval.sum()) was
    # presumably intended.
    assert (1.0 - newval.sum()) < 0.0001

  print "doing per-pair output"
  for lang1, lang2 in pairs:
    # Where to do output
    if args.no_norm:
      weights_path = os.path.join(out_dir, ('BLfeats.no_norm.{0}.{1}'.format(lang1, lang2)))
    else:
      weights_path = os.path.join(out_dir, ('BLfeats.{0}.{1}'.format(lang1, lang2)))

    i1 = indexer.lang_index[lang1]
    i2 = indexer.lang_index[lang2]

    # Pairwise feature weight: |P(t|C1) - P(t|C2)|, scaled per-term by the
    # row total across all classes unless --no_norm was given.
    w = dict(zip(feats, np.abs((nb_ptc[:,i1] - nb_ptc[:,i2]) / (nb_ptc.sum(1) if not args.no_norm else 1))))
    write_weights(w, weights_path)
    print "wrote weights to {0}".format(weights_path)
Exemplo n.º 4
0
  print "features output path:", feature_path
  # Two selection modes: a per-ngram-order token budget, or one global
  # token budget.
  if args.tokens_per_order:
    print "max ngram order:", args.max_order
    print "tokens per order:", args.tokens_per_order
  else:
    print "tokens:", args.tokens

  # One bucket path per line; strip trailing newline/whitespace.
  with open(bucketlist_path) as f:
    bucketlist = map(str.strip, f)

  # doc_count maps feature -> count tallied across all buckets
  # (len() below treats it as the set of unique features).
  doc_count = tally(bucketlist, args.jobs)
  print "unique features:", len(doc_count)
  if args.doc_count:
    # The constant true is used to indicate output to default location
    # (a bare flag yields the bool True; any other value is taken as a
    # user-supplied path — hence the deliberate "== True").
    doc_count_path = os.path.join(args.model, 'DF_all') if args.doc_count == True else args.doc_count
    write_weights(doc_count, doc_count_path)
    print "wrote DF counts for all features to:", doc_count_path

  if args.tokens_per_order:
    # Choose a number of features for each length of token
    feats = ngram_select(doc_count, args.max_order, args.tokens_per_order)
  else:
    # Choose a number of features overall.
    # Inner sort: top args.tokens features by count, descending.
    # Outer sort: lexicographic, for a stable, reproducible output order.
    feats = sorted( sorted(doc_count, key=doc_count.get, reverse=True)[:args.tokens] )
  print "selected features: ", len(feats)

  write_features(feats, feature_path)
  print 'wrote features to "%s"' % feature_path 

  
Exemplo n.º 5
0
    # Fragment begins mid-if: the earlier (unseen) branches set index_path
    # and suffix from the requested event type.
    else:
        raise ValueError("no event specified")

    # Output path: an explicit --weights wins; otherwise derive a default
    # inside the model dir, tagging binarized runs with a ".bin" suffix.
    if args.weights:
        weights_path = args.weights
    else:
        weights_path = os.path.join(
            args.model,
            'IGweights' + suffix + ('.bin' if args.binarize else ''))

    # display paths
    print "model path:", args.model
    print "buckets path:", bucketlist_paths
    print "features path:", feature_path
    print "weights path:", weights_path
    print "index path:", index_path
    print "suffix:", suffix

    print "computing information gain"
    # Compile buckets together
    # Each input file yields one stripped path per line; zip transposes
    # them so each bucketlist entry is a tuple of parallel bucket paths.
    # NOTE(review): the open() handles here are never closed.
    bucketlist = zip(*(map(str.strip, open(p)) for p in bucketlist_paths))

    # Check that each bucketlist has the same number of buckets
    # NOTE(review): zip() truncates to the shortest input and every tuple
    # it yields has the same length, so this assert can only fire on an
    # empty result — it does not actually catch unequal-length bucketlists.
    assert len(set(map(len, bucketlist))) == 1, "incompatible bucketlists!"

    dist = read_dist(index_path)
    ig = compute_IG(bucketlist, features, dist, args.binarize, suffix,
                    args.jobs)

    write_weights(ig, weights_path)
Exemplo n.º 6
0
  # Compute P(t|C)
  print "learning P(t|C)"
  # Extract the third column of items (Py2 zip returns an indexable list).
  # Presumably each row is a (?, ?, path) tuple — confirm against caller.
  paths = zip(*items)[2]
  nb_ptc = learn_ptc(paths, tk_nextmove, tk_output, cm, buckets_dir, args)
  # Reshape the flat result to a (features x languages) matrix.
  nb_ptc = np.array(nb_ptc).reshape(len(feats), len(langs))

  # Normalize to 1 on the term axis
  print "renormalizing P(t|C)"
  for i in range(nb_ptc.shape[1]):
    # had to de-vectorize this due to memory consumption
    newval = np.empty_like(nb_ptc[:,i])
    for j in range(newval.shape[0]):
      # Log-space softmax: exp(x_j)/sum(exp(x)) computed as
      # 1/sum(exp(x - x_j)) so exp() never sees the raw magnitudes.
      newval[j] = (1/np.exp(nb_ptc[:,i] - nb_ptc[j,i]).sum())
    nb_ptc[:,i] = newval
    # NOTE(review): no abs() — a column summing to MORE than 1 makes the
    # left side negative and passes silently; abs(1.0 - newval.sum()) was
    # presumably intended.
    assert (1.0 - newval.sum()) < 0.0001

  print "doing per-pair output"
  for lang1, lang2 in pairs:
    # Where to do output
    if args.no_norm:
      weights_path = os.path.join(out_dir, ('BLfeats.no_norm.{0}.{1}'.format(lang1, lang2)))
    else:
      weights_path = os.path.join(out_dir, ('BLfeats.{0}.{1}'.format(lang1, lang2)))

    i1 = indexer.lang_index[lang1]
    i2 = indexer.lang_index[lang2]

    # Pairwise feature weight: |P(t|C1) - P(t|C2)|, scaled per-term by the
    # row total across all classes unless --no_norm was given.
    w = dict(zip(feats, np.abs((nb_ptc[:,i1] - nb_ptc[:,i2]) / (nb_ptc.sum(1) if not args.no_norm else 1))))
    write_weights(w, weights_path)
    print "wrote weights to {0}".format(weights_path)