Exemplo n.º 1
0
    def getTokenizedCorpus(self, tokenizer, corpus = '', language = 'automatic', languages = None):
        """Convert a corpus to plain text, settle its language and tokenize it.

        Args:
            tokenizer: Tokenizer instance, forwarded unchanged to
                ``self.stripStopWords``.
            corpus: Raw corpus; it is passed through
                ``ParserHelper.getPlainText`` before any further processing.
            language: Language of the corpus. The special value
                ``'automatic'`` triggers a guess via
                ``BayesClassifier.guessLanguage``.
            languages: Forwarded to ``guessLanguage``; presumably the
                candidate languages to choose from (confirm against the
                classifier). ``None`` (the default) means an empty list.

        Returns:
            A dict with the keys ``corpus`` (the plain-text corpus),
            ``tokenizedCorpus`` (result of ``self.stripStopWords``) and
            ``language`` (the supplied or guessed language).
        """
        import helper
        import retriever
        import classifier
        # NOTE(review): analyser is not used in this method; the import is
        # kept in case the module has import-time side effects.
        import analyser

        # use a fresh list per call instead of a shared mutable default
        if languages is None:
            languages = []

        # create parser helper
        parserHelper = helper.ParserHelper()

        # parse corpus
        corpus = parserHelper.getPlainText(corpus)

        # stop word list
        stopWords = retriever.StopWords()

        # initialise classifier (named so that it does not shadow the module)
        bayesClassifier = classifier.BayesClassifier()

        # if language is set to 'automatic', try to guess language by Bayesian classification
        if language == 'automatic':
            language = bayesClassifier.guessLanguage(stopWords, corpus, languages)

        # strip stop words
        tokenizedCorpus = self.stripStopWords(tokenizer, stopWords, language, corpus)

        # return corpus information
        return dict(corpus = corpus,
                    tokenizedCorpus = tokenizedCorpus,
                    language = language)
Exemplo n.º 2
0
    def stripStopWords(self, tokenizer, stopWords = '', language = '', corpus = ''):
        """Tokenize ``corpus`` using the stop word list for ``language``.

        Args:
            tokenizer: Object providing
                ``processWhitespaces(corpus, stopWordList, flag)``.
            stopWords: Object providing ``getStopWordList(language)``. When
                left at the default ``''`` a new ``retriever.StopWords()`` is
                created.
            language: Language name used to look up the stop word list.
            corpus: Text to tokenize.

        Returns:
            The result of ``tokenizer.processWhitespaces`` materialised as a
            list.
        """
        import retriever

        # fall back to a fresh stop word source when the caller supplied none
        if stopWords == '':
            stopWords = retriever.StopWords()

        # stop words that apply to the requested language
        wordsToStrip = stopWords.getStopWordList(language)

        # the tokenizer receives 1 for German and 0 for every other language
        # NOTE(review): presumably toggles German-specific handling - confirm
        # against Tokenizer.processWhitespaces
        germanFlag = 1 if language == 'german' else 0

        return list(tokenizer.processWhitespaces(corpus, wordsToStrip, germanFlag))
Exemplo n.º 3
0
# Example pipeline: fetch a page, strip its markup, tokenize the text with the
# German stop word list and compute N-grams from the tokens.

# get corpus
# alternative test page:
# corpus = urlRetriever.retrieveURL('http://www.krohne-mar.com/Schwebekoerper-Durchflussmessgeraete_nass_kalibriert.11121.0.html')
corpusSet = urlRetriever.retrieveURL('http://linguistik-fachschaft.de/info.html')
corpus = corpusSet['corpus']
charset = corpusSet['charset']

# ML tag stripper
mlStripper = retriever.MLStripper()

# remove ML tags
mlStripper.feed(corpus)
corpus = mlStripper.getStripped()

# stop word list
stopWords = retriever.StopWords()
stopWordList = stopWords.getStopWordList('german')

# tokenizer
tokenizer = analyser.Tokenizer()

# tokenize; the result is materialised as a list so that len() below is valid
# even if processWhitespaces returns a lazy iterable
# NOTE(review): return type of processWhitespaces is not visible here - confirm
tokens = list(tokenizer.processWhitespaces(corpus, stopWordList, 1))
tokenCount = len(tokens)

# analyse text structure
textStructure = analyser.TextStructure()

# get N-grams
ngrams = textStructure.getNGrams(tokens, tokenCount)