Example #1
tag_filter = TagFilter(['CC', 'CD', 'DT', 'EX', 'IN', 'LS', 'TO', 'WDT'])
tagrm = TagRemover()
url = URLAdapter(preprocess)  # `preprocess` is defined elsewhere in the original script
notadapt = NotAdapter()

# classifiers to test
classifiers = {"Bayes": BayesianClassifier(), "SVM": SVMClassifier(), "ShortTextClassifier": ShortTextClassifier()}

# to perform a quick test
#file = ["tweeti-b.dev"]
classifiers = {"ShortTextClassifier": ShortTextClassifier()}


# adjust all paths (`file` is the corpus list defined elsewhere, cf. Example #3)
useConllFile = True
originalFile = [get_project_dir() + "resources/tweeti/" + path + ".tsv" for path in file]
conllFile = [get_project_dir() + "resources/conll/" + path + ".conll" for path in file]


# support function
def test_pipeline(pipeline):
    """
    Support function used to test a pipeline on the configured test set.
    """
    if not isinstance(pipeline, Pipeline):
        raise ValueError("pipeline must be an instance of Pipeline")

    timer.start()
    if not useConllFile:
        labeled_featuresets = read_tweets_file(originalFile, pipeline).values()
    else:
Example #2
import sys
sys.path.append('../')

from util.path import get_project_dir
from util.read_file import read_tweets_file
from util.pipeline import Pipeline

from tokenizer.posTokenizer import POSTokenizer
from tagger.posTagger import POSTagger

# classifier
from classifier.shortTextClassifier import ShortTextClassifier
classifier = ShortTextClassifier()

# file paths
originalFile = [get_project_dir() + "resources/tweeti/" + path + ".tsv"
                for path in ["tweeti-b", "tweeti-b.dev"]]
testingFile = [get_project_dir() + "resources/tweeti/" + path + ".tsv"
               for path in ["twitter-test-input-B"]]

# initialize the pipeline used to transform the tweets
tokenizer = POSTokenizer()
tagger = POSTagger()
pipeline = Pipeline(tokenizer, tagger, [], [])  # the two empty lists appear to be filter/adapter stages (cf. Example #1)

# read the training file
labeled_featuresets = read_tweets_file(originalFile, pipeline).values()

# training
classifier.train(labeled_featuresets)

# read the test file
labeled_featuresets_test = read_tweets_file(testingFile, pipeline)
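
# The fragment ends after loading the test file. A plausible continuation,
# using classify_set() from the ShortTextClassifier in Example #4 and
# assuming read_tweets_file returns a dict of (featureset, label) pairs
# (as the training call above suggests):
featuresets_test = [fs for fs, label in labeled_featuresets_test.values()]
predicted_labels = classifier.classify_set(featuresets_test)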
Example #3
from util.path import get_project_dir
from util.read_file import read_conll_file
from util.pipeline import Pipeline
from validator.crossValidator import CrossValidator
from classifier.shortTextClassifier import ShortTextClassifier
from classifier.bayesianClassifier import BayesianClassifier
from classifier.svmClassifier import SVMClassifier
#from classifier.linearClassifier import LinearClassifier

# parameters
file = ["tweeti-b", "tweeti-b.dev"]
#file = ["tweeti-b.dev" ]
numOfBins = 10

# adjust all paths
originalFile = [get_project_dir() + "resources/tweeti/" + path + ".tsv" for path in file]
conllFile = [get_project_dir() + "resources/conll/" + path + ".conll" for path in file]

# timer for measuring run time (Timer's import is omitted in this fragment)
timer = Timer()

# classifiers to test
classifiers = {"ShortTextClassifier": ShortTextClassifier(), "SVMClassifier": SVMClassifier(), "Bayes": BayesianClassifier()}
#classifiers = {"LinearClassifier": LinearClassifier()}

#classifiers = {"Bayes": BayesianClassifier()}
# loading and processing data set
timer.start()
labeled_featuresets = read_conll_file(originalFile, conllFile, Pipeline()).values()

validator = CrossValidator(labeled_featuresets)
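
# The fragment ends here. A hypothetical continuation (the validate() and
# timer.stop() signatures below are assumed, not taken from the source)
# would cross-validate each classifier over numOfBins folds:
#   for name, classifier in classifiers.items():
#       timer.start()
#       accuracy = validator.validate(classifier, numOfBins)
#       print("%s: %.3f (%.2fs)" % (name, accuracy, timer.stop()))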
Example #4
# Fragment of the project's ShortTextClassifier, which wraps LibShortText
# (Text2svmConverter, train_converted_text, predict); the imports and the
# Classifier base class are defined elsewhere in the original module.
class ShortTextClassifier(Classifier):
    # static fields
    __tmpPath = (get_project_dir() + 'tmp/').replace(' ', '\\ ')  # shell-escape spaces in the path
    __svmFilePath = __tmpPath + 'svmFile'

    # library parameters
    __converter_arguments = '-stopword 0 -stemming 0 -feature 0'
    """
    Preprocessor options.
    -stopword : 0=no stopword removal | 1=stopword removal
    -stemming : 0=no stemming | 1=stemming
    -feature  : 0=unigram | 1=bigram
    """
    __grid_arguments = '0'
    """
    Grid search for the penalty parameter in linear classifiers.
    "0" : disable grid search (faster)
    "1" : enable grid search (slightly better results)
    """
    __feature_arguments = ''
    """
    Feature representation. (default "-D 1 -N 1")
    "-D 1" : binary features
    "-D 0" : word count
    "-T 1" : term frequency
    "-I 1" : IDF (to use TF-IDF put "-D 0 -T 1 -I 1")

    "-N 1" : instance-wise normalization before training/test
    """
    __liblinear_arguments = '-q'
    """
    Classifier. (default "-s 4")
    "-s 4" : support vector classification by Crammer and Singer
    "-s 3" : L1-loss support vector classification
    "-s 1" : L2-loss support vector classification
    "-s 7" : logistic regression

    "-q" : quiet mode
    """
    def train(self, labeled_featuresets):
        """
        Trains the classifier on the specified training set. Multiple calls
        to this method invalidate the previous ones.

        The labeled_featuresets parameter must have the format [([feature], label)]
        """

        # generate training file from labeled_featuresets
        self.__text_converter = Text2svmConverter(self.__converter_arguments)
        self.__convert_labeled_featuresets(labeled_featuresets,
                                           self.__svmFilePath)

        # train the model
        self.__model = train_converted_text(
            self.__svmFilePath,
            self.__text_converter,
            grid_arguments=self.__grid_arguments,
            feature_arguments=self.__feature_arguments,
            train_arguments=self.__liblinear_arguments)

    def classify_set(self, featuresets):
        """
        Classifies the specified featuresets.

        The featuresets parameter must have the format [ [feature] ]
        Returns the most probable label of each item according to this
        classifier, where the returned value has the format [label]
        """

        # generate the test file from the featuresets
        self.__convert_featuresets(featuresets, self.__svmFilePath)

        # classify the featuresets
        p_labels = predict(self.__svmFilePath, self.__model.svm_model,
                           self.__liblinear_arguments)[0]
        p_labels = [
            self.__text_converter.getClassName(int(label))
            for label in p_labels
        ]

        return p_labels

    def __convert_featuresets(self, featuresets, output):
        """
        Converts text data to LIBSVM-format data.

        The featuresets parameter must have the format [ [feature] ]
        The output parameter is the file path where the result will be stored
        """

        if isinstance(output, str):
            output = open(output, 'w')
        elif not hasattr(output, 'write'):
            raise TypeError('output must be a str or a writable file object.')

        for featureset in featuresets:
            feat = self.__text_converter.toSVM(" ".join(featureset))
            feat = ''.join(' {0}:{1}'.format(f, feat[f]) for f in sorted(feat))

            output.write('-1 ' + feat + '\n')
        output.close()

    def __convert_labeled_featuresets(self, labeled_featuresets, output):
        """
        Converts labeled text data to LIBSVM-format data.

        The labeled_featuresets parameter must have the format [([feature], label)]
        The output parameter is the file path where the result will be stored
        """

        if isinstance(output, str):
            output = open(output, 'w')
        elif not hasattr(output, 'write'):
            raise TypeError('output must be a str or a writable file object.')

        for featureset, label in labeled_featuresets:
            feat, label = self.__text_converter.toSVM(" ".join(featureset),
                                                      label)
            feat = ''.join(' {0}:{1}'.format(f, feat[f]) for f in sorted(feat))
            if label is None:
                label = -1
            output.write(str(label) + ' ' + feat + '\n')
        output.close()
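
# Illustrative usage of the class above, with made-up tokens and labels
# (not from the source): train() takes [([feature], label)] and
# classify_set() takes [ [feature] ].
clf = ShortTextClassifier()
clf.train([(["good", "movie"], "positive"),
           (["boring", "plot"], "negative")])
print(clf.classify_set([["great", "movie"]]))  # e.g. ['positive']
# Each line the converters write is one LIBSVM sparse-format instance,
# "label index:value ...", with -1 marking unlabeled test instances,
# e.g. "-1 3:1 17:1 42:1".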
Example #5
import sys
sys.path.append('../')

from util.path import get_project_dir
from util.read_file import read_tweets_file
from util.pipeline import Pipeline

from tokenizer.posTokenizer import POSTokenizer
from tagger.posTagger import POSTagger

# classifier
from classifier.shortTextClassifier import ShortTextClassifier
classifier = ShortTextClassifier()

# file paths
originalFile = [
    get_project_dir() + "resources/tweeti/" + path + ".tsv"
    for path in ["tweeti-b", "tweeti-b.dev"]
]
testingFile = [
    get_project_dir() + "resources/tweeti/" + path + ".tsv"
    for path in ["twitter-test-input-B"]
]

# initialize the pipeline used to transform the tweets
tokenizer = POSTokenizer()
tagger = POSTagger()
pipeline = Pipeline(tokenizer, tagger, [], [])

# read the training file
labeled_featuresets = read_tweets_file(originalFile, pipeline).values()

# training
classifier.train(labeled_featuresets)
Example #6
# classifiers to test
classifiers = {
    "Bayes": BayesianClassifier(),
    "SVM": SVMClassifier(),
    "ShortTextClassifier": ShortTextClassifier()
}

# to perform a quick test
#file = ["tweeti-b.dev"]
classifiers = {"ShortTextClassifier": ShortTextClassifier()}

# adjust all paths
useConllFile = True
originalFile = [
    get_project_dir() + "resources/tweeti/" + path + ".tsv" for path in file
]
conllFile = [
    get_project_dir() + "resources/conll/" + path + ".conll" for path in file
]


# support function
def test_pipeline(pipeline):
    """
		Support function used to test a pipeline using the specified testSet
	"""
    if not isinstance(pipeline, Pipeline):
        raise ValueError("pipeline must be an instance of Pipeline")

    timer.start()
    if not useConllFile: