def main():
    """Train on per-category text folders, then classify the test files.

    For every category in the hard-coded folder map, all ``.txt`` files in
    ``<folder>/train/`` are loaded and concatenated into one training text,
    which is registered with a ``ClassBank`` together with the number of
    files read.  After ``bank.train()``, every ``.txt`` file in
    ``<folder>/test/`` is tokenized and classified, and its file name is
    printed next to the name of the class it was assigned to.
    """
    folders = {
        "politik": "data/politik",
        "sport": "data/sport",
        "wirtschaft": "data/wirtschaft",
    }

    bank = ClassBank()
    loader = Loader()

    # train data
    for classname, folder in folders.items():
        train_dir = folder + "/train/"
        texts = [
            loader.load_txt(train_dir + filename)
            for filename in os.listdir(train_dir)
            if filename.endswith(".txt")
        ]
        # Every document is prefixed with a single space, exactly as the
        # former incremental concatenation produced; joining once avoids
        # re-copying the growing string for each file.
        content = "".join(" " + text for text in texts)
        bank.addClass(Class(classname, content, len(texts)))

    bank.train()
    classifier = Classifier()

    # test data
    for classname, folder in folders.items():
        # Single-argument print() with %-formatting emits the same text on
        # Python 2 and Python 3.
        print("\n=== Testing %s ===\n" % classname)
        test_dir = folder + "/test/"
        for filename in os.listdir(test_dir):
            if filename.endswith(".txt"):
                tokenizer = Tokenizer(loader.load_txt(test_dir + filename))
                predicted = classifier.classify(tokenizer.getTokens(), bank)
                print("%s = %s" % (filename, predicted.getName()))
Exemplo n.º 2
0
    def test_selfClass(self):
        """Check for good conversions"""

        testData = u"""\
A	000041	Lu
B	000042	Lu
C	000043	Lu
a	000061	Ll
b	000062	Ll
c	000063	Ll
0	000030	Nd
1	000031	Nd
2	000032	Nd
$	000024	Sc
=	00003d	Sm
*	00002a	Po
愚	00611a	Lo
公	00516c	Lo
移	0079fb	Lo
山	005c71	Lo
"""

        for line in testData.split('\n'):
            line = line.strip()
            if not line:
                break
            A, B, C = line.split()
            if A and B and C:
                codepoint = ord(A)
                self.assertEqual(codepoint, int(B, 0x10), Self.doc())
                self.assertEqual(Class.classify(codepoint), C, Self.doc())
Exemplo n.º 3
0
    def __init__(self):
        """Initialize grammars for antlr4.

        Attributes built here:

        * ``self.uniclass``  -- a ``Class`` instance.
        * ``self.labelled``  -- for every label in ``Class.label``, a list of
          ``[start, end]`` pairs, one per run of consecutive code points that
          ``Class.classify`` maps to that label.
        * ``self.prop``      -- table filled from the lines of
          ``local/PropertyValueAliases.txt`` that start with ``'gc ; '``
          (second ``;``-separated field -> third field), plus the fixed
          entry ``'__' -> 'Error'``.
        * ``self.block``     -- block name (spaces replaced by underscores)
          -> ``[first_hex, last_hex]`` strings from ``local/Blocks.txt``.
        * ``self.noblock``   -- placeholder name for "no block".
        * ``self.blockname`` -- ``self.noblock`` followed by the sorted block
          names.
        * ``self.identify``  -- one entry per code point: the index into
          ``self.blockname`` of the block containing it, or 0 when no block
          from ``Blocks.txt`` covers it.

        Raises:
            IOError/OSError: if either data file cannot be opened.
        """
        self.uniclass = Class()
        self.labelled = {label: [] for label in Class.label}
        top = 0x110000
        that = None
        for codepoint in xrange(top):
            this = Class.classify(codepoint)
            if that != this:
                if that is not None:
                    # Close the previous run; the stored end is the first
                    # code point of the run that starts here.
                    self.labelled[that][-1].append(codepoint)
                self.labelled[this].append([codepoint])
                that = this
        # `that` still names the class of the last code point, so this closes
        # the final run.
        # NOTE(review): the end stored here is top - 1 (the last member of
        # the run), whereas every other run stores the first code point of
        # the following run -- confirm which convention consumers expect.
        self.labelled[that][-1].append(top - 1)

        with open("local/PropertyValueAliases.txt") as source:
            find = 'gc ; '
            self.prop = {'__': 'Error'}
            for line in source:
                if line.startswith(find):
                    part = line.split(';')
                    # NOTE(review): anything after the second ';' up to the
                    # next ';' is kept verbatim, including a trailing
                    # '# ...' comment if the line has one -- confirm intended.
                    self.prop[part[1].strip()] = part[2].strip()

        self.identify = [0] * top
        with open("local/Blocks.txt") as source:
            pattern = re.compile(r"([0-9A-F]{4,6})\.\.([0-9A-F]{4,6}); (.*)")
            self.block = {}

            for line in source:
                found = pattern.match(line)
                if found:
                    self.block[found.group(3).replace(' ', '_')] = [
                        found.group(1), found.group(2)
                    ]
        self.noblock = '(Absent from Blocks.txt)'
        names = sorted(self.block)
        self.blockname = [self.noblock] + names
        # Index 0 is reserved for self.noblock, so real blocks start at 1.
        for index, name in enumerate(names, 1):
            first, last = (int(s, 0x10) for s in self.block[name])
            # Blocks.txt gives inclusive "first..last" ranges, so the last
            # code point must be tagged too (hence last + 1).
            for codepoint in xrange(first, last + 1):
                self.identify[codepoint] = index