Example #1
def test_pipeline(pipeline):
    """
		Support function used to test a pipeline using the specified testSet
	"""
    if not isinstance(pipeline, Pipeline):
        raise ValueError("pipeline must be an instance of Pipeline")

    timer.start()
    if not useConllFile:
        labeled_featuresets = read_tweets_file(originalFile, pipeline).values()
    else:
        labeled_featuresets = read_conll_file(originalFile, conllFile,
                                              pipeline).values()

    validator = CrossValidator(labeled_featuresets)
    print "Elapsed time for data set processing: %.0fs\n" % (timer.stop() /
                                                             1000)

    # test the classifiers
    for classifierName in classifiers:
        timer.start()
        print "- %s " % classifierName,
        print "accuracy:	%f" % validator.validate(classifiers[classifierName],
                                                  numOfBins)[0]
        print "  Elapsed time: %.0fs\n" % (timer.stop() / 1000)
Example #2
def test_pipeline(pipeline):
	"""
		Support function used to test a pipeline using the specified testSet
	"""
	if not isinstance(pipeline, Pipeline):
		raise ValueError("pipeline must be an instance of Pipeline")

	timer.start()
	if not useConllFile:
		labeled_featuresets = read_tweets_file(originalFile, pipeline).values()
	else:
		labeled_featuresets = read_conll_file(originalFile, conllFile, pipeline).values()

	validator = CrossValidator(labeled_featuresets)
	print "Elapsed time for data set processing: %.0fs\n" % (timer.stop()/1000)

	# test the classifiers
	for classifierName in classifiers:
		timer.start()
		print "- %s " % classifierName,
		print "accuracy:	%f" % validator.validate(classifiers[classifierName], numOfBins)[0]
		print "  Elapsed time: %.0fs\n" % (timer.stop()/1000)
Example #3
    from adapter.tagRemover import TagRemover
    tagrm = TagRemover()
    postfilters.append(tagrm)
    print "		TagRemover."

print "]"
print ""
print ""

pipeline = Pipeline(tokenizer, tagger, prefilters, postfilters)

file = ["tweeti-b", "tweeti-b.dev"]
if not args.n:
    # Load standard tweet file
    trainingfile = map(lambda path: args.datasetpath + path + ".tsv", file)
    labeled_featuresets = read_tweets_file(trainingfile, pipeline).values()
else:
    # If the not adapter filter has to be used, the program loads the *.conll files instead;
    # the conll files must be in the same dataset path specified by the user.
    trainingfile = map(lambda path: args.datasetpath + path + ".tsv", file)
    conllfile = map(lambda path: args.datasetpath + path + ".conll", file)
    labeled_featuresets = read_conll_file(trainingfile, conllfile,
                                          pipeline).values()

if not args.predict:
    ############ Cross Validation
    validator = CrossValidator(labeled_featuresets)
    timer.start()
    (acc, conf_matr, prec, recall,
     f_measure) = validator.validate(classifier, args.v)
    print "Accuracy:		%f" % acc
Example #4
# classifier
from classifier.shortTextClassifier import ShortTextClassifier
classifier = ShortTextClassifier( )

# file paths
originalFile = map(lambda path: get_project_dir() + "resources/tweeti/" + path + ".tsv", ["tweeti-b", "tweeti-b.dev"])
testingFile = map(lambda path: get_project_dir() + "resources/tweeti/" + path + ".tsv", ["twitter-test-input-B"])

# initialize the pipeline used to transform the tweets
tokenizer = POSTokenizer()
tagger = POSTagger()
pipeline = Pipeline( tokenizer, tagger, [], [] )

# read the training file
labeled_featuresets = read_tweets_file( originalFile, pipeline ).values( )

# training
classifier.train( labeled_featuresets )

# read the test file
labeled_featuresets_test = read_tweets_file( testingFile, pipeline )
for key in labeled_featuresets_test:
	labeled_featuresets_test[key] = labeled_featuresets_test[key][0]

# classification
labeled_featuresets_test = classifier.classify_dict( labeled_featuresets_test )

# output generation
output = open( get_project_dir() + "resources/twitter-test-input-B.out", 'w')
for key, label in labeled_featuresets_test.iteritems():
Example #5
	from adapter.tagRemover import TagRemover
	tagrm = TagRemover()
	postfilters.append(tagrm)
	print "		TagRemover."

print "]"
print ""
print ""

pipeline = Pipeline( tokenizer, tagger, prefilters, postfilters )

file = ["tweeti-b", "tweeti-b.dev"]
if not args.n:
	# Load standard tweet file
	trainingfile = map(lambda path: args.datasetpath + path + ".tsv", file)
	labeled_featuresets = read_tweets_file(trainingfile, pipeline).values()
else:
	# If the not adapter filter has to be used, the program loads the *.conll files instead;
	# the conll files must be in the same dataset path specified by the user.
	trainingfile = map(lambda path: args.datasetpath + path + ".tsv", file)
	conllfile = map(lambda path: args.datasetpath + path + ".conll", file)
	labeled_featuresets = read_conll_file(trainingfile, conllfile, pipeline).values()

if not args.predict:
############ Cross Validation
	validator = CrossValidator(labeled_featuresets)
	timer.start()
	(acc, conf_matr, prec, recall, f_measure) = validator.validate(classifier, args.v)
	print "Accuracy:		%f" % acc
	print "Confusion Matrix:"
	for prec_label in conf_matr:
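The excerpt is cut off inside the confusion-matrix loop. Assuming conf_matr maps each predicted label to a dictionary of counts per true label (its real structure is not shown here), the remainder of the loop might print one row per label along these lines:

for prec_label in conf_matr:
    row = conf_matr[prec_label]
    cells = "\t".join("%s=%d" % (true_label, row[true_label]) for true_label in sorted(row))
    print("  %s:\t%s" % (prec_label, cells))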
Example #6
# file paths
originalFile = map(
    lambda path: get_project_dir() + "resources/tweeti/" + path + ".tsv",
    ["tweeti-b", "tweeti-b.dev"])
testingFile = map(
    lambda path: get_project_dir() + "resources/tweeti/" + path + ".tsv",
    ["twitter-test-input-B"])

# initialize the pipeline used to transform the tweets
tokenizer = POSTokenizer()
tagger = POSTagger()
pipeline = Pipeline(tokenizer, tagger, [], [])

# read the training file
labeled_featuresets = read_tweets_file(originalFile, pipeline).values()

# training
classifier.train(labeled_featuresets)

# read the test file
labeled_featuresets_test = read_tweets_file(testingFile, pipeline)
for key in labeled_featuresets_test:
    labeled_featuresets_test[key] = labeled_featuresets_test[key][0]

# classification
labeled_featuresets_test = classifier.classify_dict(labeled_featuresets_test)

# output generation
output = open(get_project_dir() + "resources/twitter-test-input-B.out", 'w')
for key, label in labeled_featuresets_test.iteritems():
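Both Example #4 and Example #6 stop at the loop that writes the classified tweets. A minimal sketch of what such a loop could do, assuming one tab-separated "<key>\t<label>" line per tweet (the actual output format is not shown in the excerpts):

for key, label in labeled_featuresets_test.iteritems():
    # hypothetical output format: tweet key, a tab, and the predicted label
    output.write("%s\t%s\n" % (key, label))
output.close()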