def __init__(self, options=None):
    """
    Initialise the extractor state and the set of valid features.

    Parameters
    ----------
    options : dict, optional
        Options forwarded to ``Extract.__init__`` and stored on the
        instance. When omitted, a fresh empty dict is created for this
        instance (a shared mutable default would leak changes between
        instances).

    Attributes
    ----------
    tokens : List of String
        tokens[k] -> TOKEN STRING with token id 'k'
    idf : List of int
        idf[k] -> document frequency of token id 'k'
    features : List of String
        features[k] -> FEATURE STRING with feature id 'k'
    doc_tokens : dict
        tmp document represented by token strings and their counts
    doc_features : dict
        tmp document represented by feature strings and their counts
    options : dict
        the options this instance was created with
    rootDir : str
        resolved path of this file with its last four '/'-separated
        components removed
    valid_features : dict
        *IMPORTANT* valid features to be considered for the svm
        classification
    """
    # Never share one dict between instances: build a new one per call.
    if options is None:
        options = {}

    Extract.__init__(self, options)

    self.tokens = []    # tokens[k]   : TOKEN STRING with token id 'k'
    self.idf = []       # idf[k]      : document frequency of token id 'k'
    self.features = []  # features[k] : FEATURE STRING with feature id 'k'

    # Temporary per-document counters, seeded with a '0000' placeholder key.
    self.doc_tokens = {'0000': 0}    # token strings and their counts
    self.doc_features = {'0000': 0}  # feature strings and their counts

    self.options = options

    # NOTE: splits on '/', so this assumes POSIX-style path separators.
    main = os.path.realpath(__file__).split('/')
    self.rootDir = "/".join(main[:len(main) - 4])

    # Every valid feature starts with a value of 0.
    self.valid_features = {
        'punc': 0,
        'nopunc': 0,
        'onepunc': 0,
        'twopunc': 0,
        'nonumbers': 0,
        'dash': 0,
        'noinitial': 0,
        'startinitial': 0,
        'posspage': 0,
        'weblink': 0,
        'posseditor': 0,
        'italic': 0,
    }
def __init__(self, options=None):
    """
    Initialise the instance by delegating to ``Extract.__init__``.

    Parameters
    ----------
    options : dict, optional
        Options forwarded to ``Extract.__init__``. When omitted, a fresh
        empty dict is created for this call, so instances never share one
        mutable default object.
    """
    # A `{}` default would be a single dict reused by every call.
    if options is None:
        options = {}

    Extract.__init__(self, options)