def __init__(self,
             encoding='utf8',
             sourceType='string',
             contentType='text',
             lowercase=True,
             keepTexts=True,
             keepTokens=True,
             keepPositions=True):
     """
     sourceType - document created from string passed as parameter/from filename passed as parameter
     contentType - type of text (used for domain knowledge in tokenization etc)
     """
     from tokenizers import TokenizerFactory
     self.tf = TokenizerFactory()
     self.tokenizer = self.tf.createTokenizer(contentType, encoding,
                                              lowercase)
     self.sourceType = sourceType
     self.contentType = contentType
     self.keepTexts = keepTexts
     self.keepTokens = keepTokens
     self.keepPositions = keepPositions
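
    # A minimal usage sketch, assuming this constructor belongs to a class
    # named Document (the class name is an assumption; the argument values
    # below are just the defaults from the signature above):
    #
    #   doc = Document(sourceType='string', contentType='text')
    #   # doc.tokenizer is whatever TokenizerFactory.createTokenizer()
    #   # returned for the 'text' content type.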