def parse_word(word):
    """Morphologically analyze a (possibly multi-word) phrase.

    Splits `word` on whitespace and, for each token, keeps the most
    probable TurkishMorphology analysis (smallest negative log
    probability).

    Returns:
        (word, [best_parse, ...]) on success, or (word, " ") as soon as
        any token has no analysis at all (legacy sentinel kept for
        backward compatibility with callers).
    """
    TurkishMorphology.load_lexicon('turkish.fst')

    selectedparse = []
    for w in re.split(r'\s+', word):
        # parse() may return several (analysis, neg_log_prob) pairs per token.
        parses = TurkishMorphology.parse(w)
        if not parses:
            # Unknown token: give up on the whole phrase.
            return (word, " ")
        # Smallest negative log probability == most probable analysis;
        # min() avoids the full sort the original code performed.
        bestparse = min(parses, key=lambda pair: pair[1])[0]
        selectedparse.append(bestparse)
    return (word, selectedparse)
def parse_wordlist(words):
    TurkishMorphology.load_lexicon('turkish.fst');
 
    wordparselist = []  # contains (word/phrase, best parse) list
    for word in words:
        print "<S>"
        ws = re.split('\s+', word)
        
        selectedparse = []
        
        for w in ws:
            parses = TurkishMorphology.parse(w)
            if not parses:
                print w, w+"[Unknown]"
                continue
            print w,
            
            parsechoices = []
            for p in parses: #There may be more than one possible morphological analyses for a word
                (parse, neglogprob) = p #An estimated negative log probability for a morphological analysis is also returned
                #print parse, ':', neglogprob,
                parsechoices.append((parse, neglogprob))
            parsechoices.sort(key=lambda tup: tup[1])
            for key,value in parsechoices:
                print key, ":",value
                
            # find the least, append in parselist
            bestparse = parsechoices[0][0]   # take the most probable (least negative log prob valued) parse as the selection
            selectedparse.append(bestparse)
        wordparselist.append((word, selectedparse))        
        print "</S"
    return wordparselist
def parse_wordlist(words):
    """Return a (word, best-parse) pair for every entry in `words`.

    Delegates the per-phrase work to parse_word(); the lexicon is loaded
    once up front, exactly as the per-call load inside parse_word expects.
    """
    TurkishMorphology.load_lexicon('turkish.fst')
    return [parse_word(word) for word in words]
# 示例 (Example) #4 — score: 0
def parse_corpus():
	TurkishMorphology.load_lexicon('turkish.fst');
	n = 0
	e = 0
	f = open(sys.argv[1], 'r')
	for line in f:
		print '<S> <S>+BSTag'
		line = line.rstrip()
		words = re.split('\s+', line)
		for w in words:
			parses = TurkishMorphology.parse(w)
			if not parses:
				print w, w+"[Unknown]"
				continue
			print w,
			for p in parses: #There may be more than one possible morphological analyses for a word
				(parse, neglogprob) = p #An estimated negative log probability for a morphological analysis is also returned
				print parse, ':', neglogprob,
			print
		print '</S> </S>+ESTag'
	f.close()
# 示例 (Example) #5 — score: 0
# 文件 (file): stats.py  项目 (project): cgl/sna-tools
 def select_best(self, edits):
     """Return the single most probable analysis across all candidates.

     Scans every parse of every candidate string in `edits` and keeps the
     analysis with the smallest negative log probability.  Returns None
     when nothing parses at all.
     """
     best_score = float('inf')
     best_parse = None
     for candidate in edits:
         analyses = TurkishMorphology.parse(candidate.encode("utf-8"))
         if not analyses:
             continue
         for analysis, score in analyses:
             if score < best_score:
                 best_score = score
                 best_parse = analysis
     return best_parse
# 示例 (Example) #6 — score: 0
# 文件 (file): word.py  项目 (project): haldun/ncfinder
    def _parse(self):
        """Analyze self.original morphologically and cache the result.

        Sets self.root to the stem of the most probable parse and fills
        self.attrs / self.attrs_set with the bracketed morphological tags.
        Leaves the object untouched when no analysis exists.
        """
        attributes = TurkishMorphology.parse(self.original)
        # BUGFIX: test for "no analyses" BEFORE sorting.  Sibling code in
        # this project treats a falsy parse() result (presumably None) as
        # "unknown word", and sorted(None) would raise TypeError.
        if not attributes:
            return

        # Sort ascending by negative log probability: index 0 is the most
        # probable analysis.
        parses = sorted(attributes, key=operator.itemgetter(1))
        best_parse = parses[0][0]
        self.root = best_parse[:best_parse.find('[')]

        for match in re.findall(r"(?P<attr>\[\w+\])", best_parse):
            self.attrs.append(match[1:-1])  # strip the surrounding brackets

        self.attrs_set = set(self.attrs)
        self.is_parsed = True
# 示例 (Example) #7 — score: 0
# 文件 (file): word.py  项目 (project): haldun/ncfinder
# coding: utf-8

import operator
import re

import TurkishMorphology
TurkishMorphology.load_lexicon('turkish.fst')

import lookups
import syllables
import utils

class Word(object):
    def __init__(self, original):
        """Store the surface form as UTF-8 bytes and parse it immediately."""
        # Python 2: normalize unicode input to a UTF-8 encoded byte string.
        is_text = original.__class__ == unicode
        self.original = original.encode('utf-8') if is_text else original
        self.root = None
        self.attrs = []
        self.is_parsed = False
        self.attrs_set = set()
        # Populate root/attrs/attrs_set right away.
        self._parse()

    def __repr__(self):
        # The raw surface form doubles as the debug representation.
        return self.original

    def __str__(self):
        # Delegate to __repr__ so str() and repr() always agree.
        return self.__repr__()

    def __cmp__(self, other):
# 示例 (Example) #8 — score: 0
# 文件 (file): stats.py  项目 (project): cgl/sna-tools
    def generateCSVByUsername(self, label=""):
        """Read tweets (expected: one JSON object per line) from
        self.ifilename and write one CSV row per tweet to self.ofilename.

        Each row is the whitespace-split of: screen_name, user_id,
        tweet_id, the optional `label`, then the stem of the best
        morphological parse of each token.  Mentions, hashtags, emoticons
        and "RT" get fixed tags; tokens with no parse are normalized,
        deasciified and edit-distance-corrected before being tagged
        [Unknown].
        """
        # Tokens whose parse ends in one of these POS tags are dropped
        # from the output row.
        exclude_list = ["Det", "Adv", "Pron", "Conj", "Postp", "Punc"]
        exclude_regs = []
        for e in exclude_list:
            exclude_regs.append(re.compile("^.*?\[%s\]$" % e))
        # print len(exclude_regs)
	TurkishMorphology.load_lexicon('turkish.fst');
        self.inputfile = open(self.ifilename, "r")
        line_count = getLineCount(self.inputfile)
        self.ofile = open(self.ofilename, "w")
        csvWriter = csv.writer(self.ofile, delimiter=',', quotechar="'",quoting=csv.QUOTE_NONNUMERIC)
        line = self.inputfile.readline()
        i = 0
        while len(line) > 0:
            i = i + 1
            print "Processing Tweet " + str(i) + " of " + str(line_count) + " tweets"
            try:
                tweet = jsonpickle.decode(line)
            except ValueError, e:
                # Malformed JSON line: report it and move on to the next.
                print repr(e)
                line = self.inputfile.readline()
                continue
            # Streaming-API control messages carry no tweet text.
            if tweet.has_key("delete") or tweet.has_key("scrub_geo") or tweet.has_key("limit"):
                print "unimplemented data item"
            else:
                text = unicode(tweet["text"])
                print text
                screen_name = tweet["user"]["screen_name"]
                # Use the string ids when present, otherwise stringify the
                # numeric ids.
                if tweet["user"].has_key("id_str"):
                    user_id = tweet["user"]["id_str"]
                    tweet_id = tweet["id_str"]
                else:
                    user_id = str(tweet["user"]["id"])
                    tweet_id = str(tweet["id"])
                tweet_w = time.strptime(tweet["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
                tokens = tokenizer.tokenize(text)
#                token_display = screen_name + " " + user_id + " " + tweet_id
                parsed_display = screen_name + " " + user_id + " " + tweet_id
                if label:
                    parsed_display = parsed_display + " " + label
                # parsing token by token for now. might think about parsing the whole sequence at once.
                for token in tokens:
                    # print token
#                    token_display += " "+token
                    # Twitter-specific tokens get fixed tags instead of a
                    # morphological parse.
                    if token[0] == "@":
                        parsed_display += " "+token+"[Mention]"
                        continue
                    elif token[0] == "#":
                        parsed_display += " "+token+"[Hashtag]"
                        continue
                    elif emoticon_reg.match(token):
                        parsed_display += " "+token+"[Emoticon]"
                        continue
                    elif token == "RT":
                        parsed_display += " "+token+"[RT]"
                        continue
                    parses = TurkishMorphology.parse(token.encode("utf-8"))
                    best_edit = None
                    if not parses:
                        # Unparseable token: try progressively more
                        # aggressive normalizations before giving up.
                        # do not include in the results. just for one time testing.
                        # we assume that no more than two consequent letters happen in words..
                        norm_token = removeRepetitions(token)
                        best_edit = self.select_best([norm_token])
                        if not best_edit:
                            dea = Deasciifier(norm_token)
                            norm_token_deasciified = dea.convert_to_turkish()
                            best_edit = self.select_best([norm_token_deasciified])
                            if not best_edit:
                                # Last resort: single-edit-distance
                                # candidates of both normalized forms.
                                edits = edits1(norm_token_deasciified)
                                edits = edits.union(edits1(norm_token))
                                best_edit = self.select_best(edits)
                                if not best_edit:
                                    # leave this work to the latter parts of the code.
                                    # parsed_display += " "+best_edit
                                    # else:
                                    parsed_display += " "+token+"[Unknown]"
                                    continue
                    if best_edit == None:
                        # Token parsed directly: pick the analysis with the
                        # smallest negative log probability.
                        min_neglogprob = float('inf')
                        min_parse = None
                        for p in parses:
                            (parse, neglogprob) = p
                            if neglogprob < min_neglogprob:
                                min_neglogprob = neglogprob
                                min_parse = parse.decode("utf-8")
                    else:
                        min_parse = best_edit.decode("utf-8")
                    # Keep only the text before the first '+' and then
                    # before the first '-' of the chosen parse.
                    first_layer = min_parse.split('+')
                    second_layer = first_layer[0].split('-')
                    include_token = True
                    for exclude_reg in exclude_regs:
                        result = exclude_reg.match(second_layer[0])
                        if result:
                            include_token = False
                            break
                    if include_token:
                        parsed_display += " "+second_layer[0]
                #print token_display
                #print parsed_display
                ##csvWriter.writerow(token_display.split())
                print parsed_display
                csvWriter.writerow([p.encode("utf-8") for p in parsed_display.split()])
            line = self.inputfile.readline()