Exemplo n.º 1
0
def DoTest(classA,
           classB,
           unsupervised=False,
           balance=False,
           use_baseline_segmenter=False):
    """Train a two-class classifier, tune a threshold, and print test accuracy.

    Both inputs are copied into lists, optionally cut to the same length,
    shuffled, and handed to Partition with the fractions (0.2, 0.1, 0.7),
    whose three results are used as the test, validation and training
    portions respectively.

    Args:
        classA: iterable of samples for the first class.
        classB: iterable of samples for the second class.
        unsupervised: when true, run three extra rounds of
            BinaryClassifier.TrainSemiSupervised on the module-level
            `snapchat_names`, printing the accuracy after each round.
        balance: when true, keep only the first min(len(A), len(B)) items of
            each class (taken before shuffling).
        use_baseline_segmenter: when true, use segmenter.baseline_segmenter;
            otherwise build one with segmenter.morph_segmenter from the
            module-level `model`.

    Returns:
        None; results are reported with print.
    """
    samples_a = list(classA)
    samples_b = list(classB)

    if balance:
        # Truncation happens on the copies, in their incoming order.
        keep = min(len(samples_a), len(samples_b))
        samples_a, samples_b = samples_a[:keep], samples_b[:keep]

    random.shuffle(samples_a)
    random.shuffle(samples_b)

    split = (0.2, 0.1, 0.7)
    a_test, a_valid, a_train = Partition(samples_a, split)
    b_test, b_valid, b_train = Partition(samples_b, split)

    seg_func = (segmenter.baseline_segmenter if use_baseline_segmenter
                else segmenter.morph_segmenter(model))

    def evaluate(clf):
        # Threshold is chosen on the validation split, accuracy is measured
        # on the test split.
        cutoff = GetOptimalThreshold(clf, a_valid, b_valid)
        return TestAccuracy(clf, a_test, b_test, cutoff)

    current = BinaryClassifier.Train(seg_func, a_train, b_train)
    current.GetTopRatios()
    print('test accuracy {0}'.format(evaluate(current)))

    if not unsupervised:
        return

    # Each round re-fits the confidence estimator on the validation split and
    # then replaces the classifier with its semi-supervised successor.
    for round_num in range(3):
        print('semi-sup iter {0}'.format(round_num))
        current.TrainConfidenceEstimator(a_valid, b_valid)
        current = BinaryClassifier.TrainSemiSupervised(snapchat_names, current)
        print('accuracy {0}'.format(evaluate(current)))
Exemplo n.º 2
0
def DoTest(classA, classB, unsupervised=False, balance=False,
           use_baseline_segmenter=False):
  """Fit a binary classifier on two groups of samples and print its accuracy.

  The groups are copied, optionally trimmed to equal size, shuffled, and split
  by Partition using the fractions (0.2, 0.1, 0.7); the three pieces it
  returns serve as test, validation and training data, in that order.

  Args:
    classA: iterable holding the first group's samples.
    classB: iterable holding the second group's samples.
    unsupervised: if true, follow up with three rounds of
      BinaryClassifier.TrainSemiSupervised over the module-level
      `snapchat_names`, printing accuracy after every round.
    balance: if true, drop everything past the first min(len(A), len(B))
      items of each group before shuffling.
    use_baseline_segmenter: if true, segment with
      segmenter.baseline_segmenter; otherwise with
      segmenter.morph_segmenter(model), `model` being a module-level name.

  Returns:
    None. All output goes through print.
  """
  group_a, group_b = list(classA), list(classB)

  if balance:
    # The lists are private copies, so trimming them in place is safe.
    shortest = min(len(group_a), len(group_b))
    del group_a[shortest:]
    del group_b[shortest:]

  for group in (group_a, group_b):
    random.shuffle(group)

  fractions = (0.2, 0.1, 0.7)
  test_a, dev_a, train_a = Partition(group_a, fractions)
  test_b, dev_b, train_b = Partition(group_b, fractions)

  if not use_baseline_segmenter:
    seg_func = segmenter.morph_segmenter(model)
  else:
    seg_func = segmenter.baseline_segmenter

  clf = BinaryClassifier.Train(seg_func, train_a, train_b)
  clf.GetTopRatios()

  # Tune the decision threshold on validation data, then score on test data.
  cutoff = GetOptimalThreshold(clf, dev_a, dev_b)
  print('test accuracy {0}'.format(TestAccuracy(clf, test_a, test_b, cutoff)))

  if not unsupervised:
    return

  for step in range(3):
    print('semi-sup iter {0}'.format(step))
    clf.TrainConfidenceEstimator(dev_a, dev_b)

    # The semi-supervised trainer returns the classifier used from here on.
    clf = BinaryClassifier.TrainSemiSupervised(snapchat_names, clf)
    cutoff = GetOptimalThreshold(clf, dev_a, dev_b)
    print('accuracy {0}'.format(TestAccuracy(clf, test_a, test_b, cutoff)))
Exemplo n.º 3
0
			acc = TestAccuracy(semisup_classifier, classA_test, classB_test, thresh)
			print 'accuracy {0}'.format(acc)

			segfun = semisup_classifier.segfun
			with open("../models/init_nonum_semi%i.pkl" % (iter_num + 1), 'w') as f:
				semisup_classifier.segfun = None
				cPickle.dump(semisup_classifier, f)
			semisup_classifier.segfun = segfun
		return semisup_classifier


# Script entry point: load a segmentation model, fetch users from the
# database, split their ids by gender, and hand both id lists to DoTest.
if __name__ == '__main__':
	target = 'gender'
	# Only filter out rows with an empty gender when gender is the target;
	# any other target queries without a where clause.
	whereclause = "where gender is not ''" if target == 'gender' else ''
	model = segmenter.load_model('../models/idmorphs_naworl.model')
	# The segmenter is built with a match pattern of lowercase ASCII letters.
	segfun = segmenter.morph_segmenter(model, match='[a-z]+')

	# Disabled: a second model/segmenter pair loaded from idmorphs.model.
	# model_semi = segmenter.load_model('../models/idmorphs.model')
	# segfun_semi = segmenter.morph_segmenter(model_semi, match='[a-z]+')

	users = segmenter.get_users_from_db(whereclause=whereclause)
	# Users whose gender is neither 'M' nor 'F' end up in neither list.
	male_ids = [user.id for user in users if user.gender == 'M']
	female_ids = [user.id for user in users if user.gender == 'F']

	# Disabled: ids of users from the 'naver' table. With this off,
	# unknown_ids is passed to DoTest as None.
	# unlabeled_users = segmenter.get_users_from_db(tablename='naver')
	# unknown_ids = [user.id for user in unlabeled_users]
	unknown_ids = None
	# NOTE(review): DoTest's signature is not visible from here; confirm that
	# its third and fourth positional parameters are the segmenter function
	# and the unlabeled ids.
	cls = DoTest(male_ids, female_ids, segfun, unknown_ids, balance=False)

	# Disabled: load a previously pickled classifier instead of training one,
	# then attach a segmenter to it.
	# cls = cPickle.load(open("../models/init_nonum_semi3.pkl"))
	# cls.segfun = segfun_semi
Exemplo n.º 4
0
    result = classifier.Classify(name)
    result['lang'] = lang
    result['name'] = name
    results.append(result)
  return pandas.DataFrame(results)


def get_preds(baseline, morph, weight):
  """Choose 'True' or 'False' per row from a weighted blend of two tables.

  Args:
    baseline: table with 'True' and 'False' columns (presumably a pandas
      DataFrame of baseline-classifier scores -- confirm with the caller).
    morph: table with the same two columns for the morph classifier.
    weight: factor applied to `baseline`; `morph` is scaled by 1.0 - weight.

  Returns:
    A numpy array holding, for every row, the name of the column ('True' or
    'False') with the larger blended value. On a tie 'True' is chosen because
    it is listed first.
  """
  label_names = numpy.array(['True', 'False'])
  baseline_part = weight * baseline[label_names]
  morph_part = (1.0 - weight) * morph[label_names]
  blended = baseline_part + morph_part
  winners = blended.values.argmax(axis=1)
  return label_names[winners]

# Module-level segmenters handed to BayesClassifier.Train further down.
# The baseline one is used as provided by the segmenter module; the morph one
# is constructed from Classifier.model.
base_segmenter = segmenter.baseline_segmenter
morph_segmenter = segmenter.morph_segmenter(Classifier.model)

def getMetrics(truelabels, predlabels):
  """Return (precision, recall) for the label 'True'.

  Args:
    truelabels: reference labels.
    predlabels: predicted labels, parallel to `truelabels`.

  Returns:
    A 2-tuple of the values produced by metrics.precision_score and
    metrics.recall_score, both called with pos_label='True'.
  """
  positive = {'pos_label': 'True'}
  return (metrics.precision_score(truelabels, predlabels, **positive),
          metrics.recall_score(truelabels, predlabels, **positive))


all_langs = train.lang.unique()
for lang in all_langs:
  labels = [str(x) for x in train.lang == lang]
  testlabels = [str(x) for x in test.lang == lang]
  baseline_classifier = BayesClassifier.Train(base_segmenter,
                                              train.name_lower,
                                              labels)
  morph_classifier = BayesClassifier.Train(morph_segmenter,
Exemplo n.º 5
0
        result = classifier.Classify(name)
        result['lang'] = lang
        result['name'] = name
        results.append(result)
    return pandas.DataFrame(results)


def get_preds(baseline, morph, weight):
    """Blend two score tables and report the winning label for each row.

    Args:
        baseline: table exposing 'True' and 'False' columns (presumably a
            pandas DataFrame from the baseline classifier -- to be confirmed).
        morph: table exposing the same columns for the morph classifier.
        weight: share given to `baseline`; `morph` receives 1.0 - weight.

    Returns:
        numpy array of 'True'/'False' strings, one per row, naming the column
        whose blended value is largest (the first column, 'True', on ties).
    """
    choices = numpy.array(['True', 'False'])
    mixed = weight * baseline[choices] + (1.0 - weight) * morph[choices]
    return choices[numpy.argmax(mixed.values, axis=1)]


# Module-level segmenters passed to BayesClassifier.Train in the loop below.
# base_segmenter is segmenter.baseline_segmenter unchanged; morph_segmenter is
# built by segmenter.morph_segmenter from Classifier.model.
base_segmenter = segmenter.baseline_segmenter
morph_segmenter = segmenter.morph_segmenter(Classifier.model)


def getMetrics(truelabels, predlabels):
    """Compute precision and recall, treating the label 'True' as positive.

    Args:
        truelabels: reference labels.
        predlabels: predicted labels, parallel to `truelabels`.

    Returns:
        (precision, recall) as returned by metrics.precision_score and
        metrics.recall_score with pos_label='True'.
    """
    scorers = (metrics.precision_score, metrics.recall_score)
    precision, recall = [
        scorer(truelabels, predlabels, pos_label='True') for scorer in scorers
    ]
    return precision, recall


all_langs = train.lang.unique()
for lang in all_langs:
    labels = [str(x) for x in train.lang == lang]
    testlabels = [str(x) for x in test.lang == lang]
    baseline_classifier = BayesClassifier.Train(base_segmenter,
                                                train.name_lower, labels)
    morph_classifier = BayesClassifier.Train(morph_segmenter, train.name_lower,