Пример #1
0
	def __init__(self, options=None):
		"""
		Set up empty token/feature tables and the default valid feature set.

		Parameters
		----------
		options : dict, optional
			Settings passed on to Extract.__init__ and stored on self.options.
			When omitted (or None), a new empty dict is created for this
			instance, so instances never share one options object by accident.

		Attributes
		----------
		tokens : List of String
			tokens[k] -> TOKEN STRING with token id 'k'
		idf : List of int
			idf[k] -> document frequency of token id 'k'
		features : List of String
			features[k] -> FEATURE STRING with feature id 'k'
		doc_tokens : dict
			tmp document represented by token strings and their counts
		doc_features : dict
			tmp document represented by feature strings and their counts
		options : dict
			the options this instance was created with
		rootDir : str
			resolved path of this source file with its last four
			'/'-separated components removed
		valid_features : dict
			*IMPORTANT* valid features to be considered for the svm classification
		"""
		# A literal `{}` default is evaluated once at definition time and would be
		# shared (and mutated) across every instance created without arguments.
		if options is None:
			options = {}

		Extract.__init__(self, options)
		self.tokens = []	# tokens[k] : TOKEN STRING with token id 'k'
		self.idf = []		# idf[k] : document frequency of token id 'k'
		self.features = []	# features[k] : FEATURE STRING with feature id 'k'
		# Both tmp documents start with a single '0000' entry whose count is 0.
		self.doc_tokens = {'0000': 0}	# tmp document: token string -> count
		self.doc_features = {'0000': 0}	# tmp document: feature string -> count
		self.options = options

		# Drop the file name and the three directories above it from the resolved
		# path of this file. NOTE(review): splitting on '/' assumes POSIX-style
		# separators, and paths with fewer than four components slice differently
		# (negative stop index) -- confirm the expected project layout.
		path_parts = os.path.realpath(__file__).split('/')
		self.rootDir = "/".join(path_parts[:len(path_parts) - 4])

		# Every valid feature starts with a value of 0.
		self.valid_features = {
			'punc': 0, 'nopunc': 0, 'onepunc': 0, 'twopunc': 0, 'nonumbers': 0, 'dash': 0,
			'noinitial': 0, 'startinitial': 0, 'posspage': 0, 'weblink': 0, 'posseditor': 0, 'italic': 0,
		}
Пример #2
0
	def __init__(self, options={}):
		Extract.__init__(self, options)