Example #1
File: fwiktr.py Project: qdot/fwiktr
    def RunPOSTagger(self):
        twitter_messages = self.tapi.GetPublicTimeline()
        for message in twitter_messages:
            try:

                cmd = 'echo "' + message.text + '" | treetagger/cmd/tree-tagger-english > ./twitter_message_output.txt'
                os.system(cmd)
                self.pos_file = open('twitter_message_output.txt', 'r')
                tokens = []
                self.parse_string = ""
                for line in self.pos_file:
                    current_line = []
                    self.parse_string += line + "<BR>"
                    for value in tokenize.whitespace(line):
                        current_line.append(value)
                    tokens.append(current_line)

                filename = uuid.uuid4()                
                self.output_file = open(str(filename)+".html", 'w')
                self.output_file.write(file_header % (message.text))
                self.output_file.write(message.text + "<BR>")
                
                self.RetreiveFlickrURLs(tokens)

                self.output_file.write(file_footer)
                self.output_file.close()
                self.output_file = open(str(filename)+".html", 'r')
                self.ftp_socket.storlines("STOR "+str(filename)+".html", self.output_file)
                self.output_file.close()
                self.pos_file.close()
                time.sleep(30)
            except UnicodeEncodeError:
                print "Twitter Message not ascii, skipping"        
            except AttributeError:
                print "Weird XML error. I wish it'd stop doing that"
Example #2
    def processWhitespacesWithoutStopWords(self, corpus, caseSensitive):
        from nltk import tokenize
        import re

        # initialise token list
        tokens = []

        # get tokens separated by whitespaces
        tokenizedCorpus = tokenize.whitespace(corpus)

        # compile regular expression for matching whitespaces and &nbsp; entities
        whitespaces = re.compile(r'\s|\&nbsp\;')

        # go through each token in corpus
        for token in tokenizedCorpus:
            # if case-sensitive handling of tokens
            if caseSensitive == 1:
                pass
            else:
                token = token.lower()

            # remove white spaces at beginning
            token = whitespaces.sub('', token)

            # append token to list
            tokens.append(token)

        # return tokens
        return tokens
Example #3
    def processWhitespaces(self,
                           corpus,
                           stopWordList,
                           caseSensitive,
                           minimumTokenLength=3,
                           maximumTokenLength=25):
        from nltk import tokenize
        import re

        # initialise token list
        tokens = []

        # initialise token buffer
        tokenBuffer = ''

        # get tokens separated by whitespaces
        tokenizedCorpus = tokenize.whitespace(corpus)

        # compile regular expression for matching special characters
        specialCharacters = re.compile(r'\&.+\;')

        # compile regular expression for matching whitespaces
        whitespaces = re.compile(r'\s|\&nbsp\;')

        # compile regular expression for sentence-boundary matching
        sentenceBoundary = re.compile(r'[\.\:\!\?\,]')

        # go through each token in corpus
        for token in tokenizedCorpus:
            # get token length
            tokenLength = len(token)

            # see, if token contains special character
            specialCharacterMatches = specialCharacters.findall(token)

            # reduce special characters to size one
            if specialCharacterMatches:
                for match in specialCharacterMatches:
                    tokenLength -= (len(match) - 1)

            # if case-sensitive handling of tokens
            if caseSensitive == 1:
                pass
            else:
                token = token.lower()

            # remove white spaces at beginning and end
            token = whitespaces.sub('', token)

            # write token to buffer and remove punctuation
            tokenBuffer = sentenceBoundary.sub('', token)

            # mark stop words
            if (tokenLength < minimumTokenLength
                    or tokenLength > maximumTokenLength
                    or tokenBuffer in stopWordList
                    or tokenBuffer.lower() in stopWordList):
                tokens.append(token + '<STOPWORD>')
            else:
                tokens.append(token)

        # return tokens
        return tokens
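For a quick sense of what the stop-word marking produces (a hypothetical call, using the defaults minimumTokenLength=3 and maximumTokenLength=25): tokens that are too short, too long, or found in stopWordList come back with a '<STOPWORD>' suffix, and everything else passes through unchanged.

# Hypothetical example (caseSensitive=0, so tokens are lowercased):
# processWhitespaces('The cat sat on the mat', ['the', 'on'], 0)
# -> ['the<STOPWORD>', 'cat', 'sat', 'on<STOPWORD>', 'the<STOPWORD>', 'mat']
# ('the' is in the stop-word list; 'on' is shorter than minimumTokenLength)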
Example #4
    def processWhitespacesWithoutStopWords(self, corpus, caseSensitive):
        from nltk import tokenize
        import re

        # initialise token list
        tokens = []

        # get tokens separated by whitespaces
        tokenizedCorpus = tokenize.whitespace(corpus)

        # compile regular expression for matching whitespaces and &nbsp; entities
        whitespaces = re.compile(r"\s|\&nbsp\;")

        # go through each token in corpus
        for token in tokenizedCorpus:
            # if case-sensitive handling of tokens
            if caseSensitive == 1:
                pass
            else:
                token = token.lower()

            # remove white spaces at beginning
            token = whitespaces.sub("", token)

            # append token to list
            tokens.append(token)

        # return tokens
        return tokens
Example #5
File: fwiktr_web.py Project: qdot/fwiktr
    def GetTagList(self, text):
        self._before = text
        tags = list(tokenize.whitespace(text))
        self._output = ""
        self._after = tags
        return tags
Example #6
    def processWhitespaces(self, corpus, stopWordList, caseSensitive, minimumTokenLength=3, maximumTokenLength=25):
        from nltk import tokenize
        import re

        # initialise token list
        tokens = []

        # initialise token buffer
        tokenBuffer = ""

        # get tokens separated by whitespaces
        tokenizedCorpus = tokenize.whitespace(corpus)

        # compile regular expression for matching special characters
        specialCharacters = re.compile(r"\&.+\;")

        # compile regular expression for matching whitespaces
        whitespaces = re.compile(r"\s|\&nbsp\;")

        # compile regular expression for sentence-boundary matching
        sentenceBoundary = re.compile(r"[\.\:\!\?\,]")

        # go through each token in corpus
        for token in tokenizedCorpus:
            # get token length
            tokenLength = len(token)

            # see, if token contains special character
            specialCharacterMatches = specialCharacters.findall(token)

            # reduce special characters to size one
            if specialCharacterMatches:
                for match in specialCharacterMatches:
                    tokenLength -= len(match) - 1

            # if case-sensitive handling of tokens
            if caseSensitive == 1:
                pass
            else:
                token = token.lower()

            # remove white spaces at beginning and end
            token = whitespaces.sub("", token)

            # write token to buffer and remove punctuation
            tokenBuffer = sentenceBoundary.sub("", token)

            # mark stop words
            if (
                tokenLength < minimumTokenLength
                or tokenLength > maximumTokenLength
                or tokenBuffer in stopWordList
                or tokenBuffer.lower() in stopWordList
            ):
                tokens.append(token + "<STOPWORD>")
            else:
                tokens.append(token)

        # return tokens
        return tokens
Example #7
def demo():
    import sys, time

    S = GrammarCategory.parse('S')
    VP = GrammarCategory.parse('VP')
    NP = GrammarCategory.parse('NP')
    PP = GrammarCategory.parse('PP')
    V = GrammarCategory.parse('V')
    N = GrammarCategory.parse('N')
    P = GrammarCategory.parse('P')
    Name = GrammarCategory.parse('Name')
    Det = GrammarCategory.parse('Det')
    DetSg = GrammarCategory.parse('Det[-pl]')
    DetPl = GrammarCategory.parse('Det[+pl]')
    NSg = GrammarCategory.parse('N[-pl]')
    NPl = GrammarCategory.parse('N[+pl]')

    # Define some grammatical productions.
    grammatical_productions = [
        cfg.Production(S, (NP, VP)),  cfg.Production(PP, (P, NP)),
        cfg.Production(NP, (NP, PP)),
        cfg.Production(VP, (VP, PP)), cfg.Production(VP, (V, NP)),
        cfg.Production(VP, (V,)), cfg.Production(NP, (DetPl, NPl)),
        cfg.Production(NP, (DetSg, NSg))]

    # Define some lexical productions.
    lexical_productions = [
        cfg.Production(NP, ('John',)), cfg.Production(NP, ('I',)),
        cfg.Production(Det, ('the',)), cfg.Production(Det, ('my',)),
        cfg.Production(Det, ('a',)),
        cfg.Production(NSg, ('dog',)),   cfg.Production(NSg, ('cookie',)),
        cfg.Production(V, ('ate',)),  cfg.Production(V, ('saw',)),
        cfg.Production(P, ('with',)), cfg.Production(P, ('under',)),
        ]
    
    earley_grammar = cfg.Grammar(S, grammatical_productions)
    earley_lexicon = {}
    for prod in lexical_productions:
        earley_lexicon.setdefault(prod.rhs()[0].upper(), []).append(prod.lhs())
    def lexicon(word):
        return earley_lexicon.get(word.upper(), [])

    sent = 'I saw John with a dog with my cookie'
    print "Sentence:\n", sent
    from nltk import tokenize
    tokens = list(tokenize.whitespace(sent))
    t = time.time()
    cp = FeatureEarleyChartParse(earley_grammar, lexicon, trace=1)
    trees = cp.get_parse_list(tokens)
    print "Time: %s" % (time.time() - t)
    for tree in trees: print tree
Example #8
File: fwiktr_web.py Project: qdot/fwiktr
    def _Run(self, text):
        self._output = ""
        cmd = "echo \"%s\" | treetagger/cmd/tree-tagger-english > ./twitter_message_output.txt" % text
        os.system(cmd)
        pos_file = open('twitter_message_output.txt', 'r')
        tokens = []
        self.parse_string = ""
        for line in pos_file:
            current_line = []
            self._output += line
            for value in tokenize.whitespace(line):
                current_line.append(value)
            tokens.append(current_line)

        self._output = self._output.replace("<unknown>", "[unknown]")
        filtered_tags = filter(self._ComparisonFunction, tokens)
        final_tags = [i[0] for i in filtered_tags]
        return final_tags
Example #9
def getNGramStructure(sourceFile):
    # initialise n-gram dictionary
    ngrams = {}

    # read file
    corpus = sourceFile.read()

    # get tokens separated by whitespaces
    tokenizedCorpus = tokenize.whitespace(corpus)

    # go through each token
    for token in tokenizedCorpus:
        # split token in single characters
        characters = list(token)

        # copy character list
        charactersBuffer = list(characters)

        # initialise buffer
        buffer1 = ''

        # go through character list
        for char1 in characters:
            # write each n-gram to list
            buffer1 += char1
            ngrams[buffer1] = ngrams.get(buffer1, 0) + 1

            # shift from character list copy
            charactersBuffer.pop(0)

            # initialise buffer
            buffer2 = ''

            # go through copy of character list
            for char2 in charactersBuffer:
                buffer2 += char2
                ngrams[buffer2] = ngrams.get(buffer2, 0) + 1

    # return n-grams
    return ngrams
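The function above ends up counting every contiguous character substring of every whitespace-separated token (the outer loop grows prefixes, the inner loop grows the substrings that start just after the current prefix). A minimal equivalent sketch, using str.split() as a stand-in for the old tokenize.whitespace() and a plain string instead of a file:

def ngram_counts(text):
    # Count every contiguous character substring of every token,
    # mirroring what getNGramStructure() accumulates in its ngrams dict.
    ngrams = {}
    for token in text.split():
        for start in range(len(token)):
            for end in range(start + 1, len(token) + 1):
                sub = token[start:end]
                ngrams[sub] = ngrams.get(sub, 0) + 1
    return ngrams

print(sorted(ngram_counts('to to').items()))
# [('o', 2), ('t', 2), ('to', 2)]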
Example #10
def getNGramStructure(sourceFile):
    # initialise n-gram dictionary
    ngrams = {}

    # read file
    corpus = sourceFile.read()

    # get tokens separated by whitespaces
    tokenizedCorpus = tokenize.whitespace(corpus)

    # go through each token
    for token in tokenizedCorpus:
        # split token in single characters
        characters = list(token)

        # copy character list
        charactersBuffer = list(characters)

        # initialise buffer
        buffer1 = ''

        # go through character list
        for char1 in characters:
            # write each n-gram to list
            buffer1 += char1
            ngrams[buffer1] = ngrams.get(buffer1, 0) + 1

            # shift from character list copy
            charactersBuffer.pop(0)

            # initialise buffer
            buffer2 = ''

            # go through copy of character list
            for char2 in charactersBuffer:
                buffer2 += char2
                ngrams[buffer2] = ngrams.get(buffer2, 0) + 1

    # return n-grams
    return ngrams
Example #11
def tabtagged(files='chunked', basedir=None):
    """
    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @return: iterator over lines in Malt-TAB input format
    """
    if type(files) is str: files = (files, )

    if not basedir: basedir = get_basedir()

    for file in files:
        path = os.path.join(basedir, "treebank", file)
        f = open(path).read()

        for sent in tokenize.blankline(f):
            l = []
            for t in tokenize.whitespace(sent):
                if (t != '[' and t != ']'):
                    l.append(tag2tab(t))
            #add a blank line as sentence separator
            l.append('\n')
            yield l
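A minimal usage sketch (an assumption, not from the source: it presumes the old NLTK treebank layout that get_basedir() points at, and that tag2tab() returns newline-terminated Malt-TAB lines, which the explicit '\n' sentence separator suggests). Each yielded item is one sentence as a list of lines, so dumping a training file is a nested loop:

out = open('treebank-malt.tab', 'w')
for sentence in tabtagged('chunked'):
    for line in sentence:
        out.write(line)
out.close()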
Example #12
File: tag2tab.py Project: DrDub/icsisumm
def tabtagged(files='chunked', basedir=None):
    """
    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @return: iterator over lines in Malt-TAB input format
    """       
    if type(files) is str: files = (files,)

    if not basedir: basedir = get_basedir()

    for file in files:
        path = os.path.join(basedir, "treebank", file)
        f = open(path).read()

        for sent in tokenize.blankline(f):
            l = []
            for t in tokenize.whitespace(sent):
                if (t != '[' and t != ']'):
                    l.append(tag2tab(t))
            #add a blank line as sentence separator
            l.append('\n')
            yield l
Example #13
    def pos_tag(self, infile):
        train_sents = list(islice(brown.tagged(), 1000000))
        trigram_tagger = tag.Trigram()
        trigram_tagger.train(train_sents)
        file = open(infile + ".txt", "r")
        out = open(infile + "-tag.txt", "w")
        try:
            text = file.read()
            lines = text.split('\n')
            for line in lines:
                tokens = list(tokenize.whitespace(line))
                tagged = list(trigram_tagger.tag(tokens))
                for tags in tagged:
                    print tags
                    if tags[1] is None:
                        out.write(tags[0] + "/" + "NA")
                    else:
                        out.write(tags[0] + "/" + tags[1])
                    out.write(" ")
                out.write("\n")
        except IOError:
            raise
        file.close()
        out.close()
Example #14
    # initialise co-occurrence matrix
    coOccurrences = {}

    # open file
    file = open(path + category, 'r')

    # add each line to corpus
    for line in file:
        corpus += line

    # close file pointer
    file.close()

    # get tokens from corpus
    tokenizedCorpus = tokenize.whitespace(corpus)

    # go through tokens
    for token in tokenizedCorpus:
        # add token to sentence
        words.append(tag.sub('', token))

        # if sentence-boundary has been found in this token
        if sentenceBoundary.findall(token):
            # recompose sentence
            for word in words:
                sentenceString += word + ' '

            # add to sentence string list
            sentences.append(sentenceString)
Example #15
def demo():
    import sys, time

    S = GrammarCategory.parse('S')
    VP = GrammarCategory.parse('VP')
    NP = GrammarCategory.parse('NP')
    PP = GrammarCategory.parse('PP')
    V = GrammarCategory.parse('V')
    N = GrammarCategory.parse('N')
    P = GrammarCategory.parse('P')
    Name = GrammarCategory.parse('Name')
    Det = GrammarCategory.parse('Det')
    DetSg = GrammarCategory.parse('Det[-pl]')
    DetPl = GrammarCategory.parse('Det[+pl]')
    NSg = GrammarCategory.parse('N[-pl]')
    NPl = GrammarCategory.parse('N[+pl]')

    # Define some grammatical productions.
    grammatical_productions = [
        cfg.Production(S, (NP, VP)),
        cfg.Production(PP, (P, NP)),
        cfg.Production(NP, (NP, PP)),
        cfg.Production(VP, (VP, PP)),
        cfg.Production(VP, (V, NP)),
        cfg.Production(VP, (V, )),
        cfg.Production(NP, (DetPl, NPl)),
        cfg.Production(NP, (DetSg, NSg))
    ]

    # Define some lexical productions.
    lexical_productions = [
        cfg.Production(NP, ('John', )),
        cfg.Production(NP, ('I', )),
        cfg.Production(Det, ('the', )),
        cfg.Production(Det, ('my', )),
        cfg.Production(Det, ('a', )),
        cfg.Production(NSg, ('dog', )),
        cfg.Production(NSg, ('cookie', )),
        cfg.Production(V, ('ate', )),
        cfg.Production(V, ('saw', )),
        cfg.Production(P, ('with', )),
        cfg.Production(P, ('under', )),
    ]

    earley_grammar = cfg.Grammar(S, grammatical_productions)
    earley_lexicon = {}
    for prod in lexical_productions:
        earley_lexicon.setdefault(prod.rhs()[0].upper(), []).append(prod.lhs())

    def lexicon(word):
        return earley_lexicon.get(word.upper(), [])

    sent = 'I saw John with a dog with my cookie'
    print "Sentence:\n", sent
    from nltk import tokenize
    tokens = list(tokenize.whitespace(sent))
    t = time.time()
    cp = FeatureEarleyChartParse(earley_grammar, lexicon, trace=1)
    trees = cp.get_parse_list(tokens)
    print "Time: %s" % (time.time() - t)
    for tree in trees:
        print tree
Example #16
    # initialise co-occurrence matrix
    coOccurrences = {}

    # open file
    file = open(path + category, 'r')

    # add each line to corpus
    for line in file:
        corpus += line

    # close file pointer
    file.close()

    # get tokens from corpus
    tokenizedCorpus = tokenize.whitespace(corpus)

    # go through tokens
    for token in tokenizedCorpus:
        # add token to sentence
        words.append(tag.sub('', token))

        # if sentence-boundary has been found in this token
        if sentenceBoundary.findall(token):
            # recompose sentence
            for word in words:
                sentenceString += word + ' '

            # add to sentence string list
            sentences.append(sentenceString)
Example #17
def demo():
    import sys, time

    S = GrammarCategory.parse("S")
    VP = GrammarCategory.parse("VP")
    NP = GrammarCategory.parse("NP")
    PP = GrammarCategory.parse("PP")
    V = GrammarCategory.parse("V")
    N = GrammarCategory.parse("N")
    P = GrammarCategory.parse("P")
    Name = GrammarCategory.parse("Name")
    Det = GrammarCategory.parse("Det")
    DetSg = GrammarCategory.parse("Det[-pl]")
    DetPl = GrammarCategory.parse("Det[+pl]")
    NSg = GrammarCategory.parse("N[-pl]")
    NPl = GrammarCategory.parse("N[+pl]")

    # Define some grammatical productions.
    grammatical_productions = [
        cfg.Production(S, (NP, VP)),
        cfg.Production(PP, (P, NP)),
        cfg.Production(NP, (NP, PP)),
        cfg.Production(VP, (VP, PP)),
        cfg.Production(VP, (V, NP)),
        cfg.Production(VP, (V,)),
        cfg.Production(NP, (DetPl, NPl)),
        cfg.Production(NP, (DetSg, NSg)),
    ]

    # Define some lexical productions.
    lexical_productions = [
        cfg.Production(NP, ("John",)),
        cfg.Production(NP, ("I",)),
        cfg.Production(Det, ("the",)),
        cfg.Production(Det, ("my",)),
        cfg.Production(Det, ("a",)),
        cfg.Production(NSg, ("dog",)),
        cfg.Production(NSg, ("cookie",)),
        cfg.Production(V, ("ate",)),
        cfg.Production(V, ("saw",)),
        cfg.Production(P, ("with",)),
        cfg.Production(P, ("under",)),
    ]

    earley_grammar = cfg.Grammar(S, grammatical_productions)
    earley_lexicon = {}
    for prod in lexical_productions:
        earley_lexicon.setdefault(prod.rhs()[0].upper(), []).append(prod.lhs())

    def lexicon(word):
        return earley_lexicon.get(word.upper(), [])

    sent = "I saw John with a dog with my cookie"
    print "Sentence:\n", sent
    from nltk import tokenize

    tokens = list(tokenize.whitespace(sent))
    t = time.time()
    cp = FeatureEarleyChartParse(earley_grammar, lexicon, trace=1)
    trees = cp.get_parse_list(tokens)
    print "Time: %s" % (time.time() - t)
    for tree in trees:
        print tree