def parse_word(word):
    """Morphologically analyse every whitespace-separated token of *word*.

    Returns (word, [best_parse, ...]) where each best parse is the analysis
    with the smallest negative log probability.  If any token cannot be
    parsed at all, returns (word, " ") instead (note: a string, not a list).
    """
    TurkishMorphology.load_lexicon('turkish.fst')
    best_parses = []
    for token in re.split('\s+', word):
        analyses = TurkishMorphology.parse(token)
        if not analyses:
            # unknown token: bail out for the whole phrase
            return (word, " ")
        # Each analysis is a (parse, neglogprob) pair; there may be more
        # than one possible morphological analysis per word.  Take the most
        # probable one, i.e. the smallest negative log probability.
        best = min(analyses, key=lambda pair: pair[1])
        best_parses.append(best[0])
    return (word, best_parses)
def parse_wordlist(words): TurkishMorphology.load_lexicon('turkish.fst'); wordparselist = [] # contains (word/phrase, best parse) list for word in words: print "<S>" ws = re.split('\s+', word) selectedparse = [] for w in ws: parses = TurkishMorphology.parse(w) if not parses: print w, w+"[Unknown]" continue print w, parsechoices = [] for p in parses: #There may be more than one possible morphological analyses for a word (parse, neglogprob) = p #An estimated negative log probability for a morphological analysis is also returned #print parse, ':', neglogprob, parsechoices.append((parse, neglogprob)) parsechoices.sort(key=lambda tup: tup[1]) for key,value in parsechoices: print key, ":",value # find the least, append in parselist bestparse = parsechoices[0][0] # take the most probable (least negative log prob valued) parse as the selection selectedparse.append(bestparse) wordparselist.append((word, selectedparse)) print "</S" return wordparselist
def parse_wordlist(words):
    """Return a list of (word, best-parse) pairs, one per entry of *words*.

    NOTE(review): this re-definition shadows the earlier parse_wordlist in
    the same module; only this version is visible after the module loads.
    """
    TurkishMorphology.load_lexicon('turkish.fst')
    # delegate the per-word work to parse_word and collect the pairs
    return [parse_word(phrase) for phrase in words]
def parse_corpus(): TurkishMorphology.load_lexicon('turkish.fst'); n = 0 e = 0 f = open(sys.argv[1], 'r') for line in f: print '<S> <S>+BSTag' line = line.rstrip() words = re.split('\s+', line) for w in words: parses = TurkishMorphology.parse(w) if not parses: print w, w+"[Unknown]" continue print w, for p in parses: #There may be more than one possible morphological analyses for a word (parse, neglogprob) = p #An estimated negative log probability for a morphological analysis is also returned print parse, ':', neglogprob, print print '</S> </S>+ESTag' f.close()
def select_best(self, edits):
    """Return the single most probable parse across all candidate *edits*.

    Every candidate string is analysed; the parse with the smallest
    negative log probability over all candidates wins.  Returns None when
    no candidate can be parsed at all.
    """
    best_parse = None
    best_score = float('inf')
    for candidate in edits:
        # print candidate
        analyses = TurkishMorphology.parse(candidate.encode("utf-8"))
        for analysis, score in (analyses or []):
            # keep the analysis with the lowest neg-log-probability so far
            if score < best_score:
                best_score = score
                best_parse = analysis
    # print best_parse
    return best_parse
def _parse(self):
    """Analyse self.original and populate root/attrs/attrs_set/is_parsed.

    Picks the most probable analysis (smallest negative log probability)
    and splits it into the root form and its bracketed attribute tags.
    Leaves the instance untouched when the word cannot be parsed.
    """
    attributes = TurkishMorphology.parse(self.original)
    # BUG FIX: every other call site in this module checks the result of
    # TurkishMorphology.parse() for falsiness before using it, implying it
    # can return None for unknown words — sorted(None) would raise
    # TypeError, so guard before sorting.
    if not attributes:
        return
    parses = sorted(attributes, key=operator.itemgetter(1))
    best_parse = parses[0][0]
    # everything before the first '[' is the root form
    self.root = best_parse[:best_parse.find('[')]
    # collect each bracketed tag, e.g. "[Noun]" -> "Noun"
    for match in re.findall("(?P<attr>\[\w+\])", best_parse):
        self.attrs.append(match[1:-1])
    self.attrs_set = set(self.attrs)
    self.is_parsed = True
# coding: utf-8 import operator import re import TurkishMorphology TurkishMorphology.load_lexicon('turkish.fst') import lookups import syllables import utils class Word(object): def __init__(self, original): if original.__class__ == unicode: self.original = original.encode('utf-8') else: self.original = original self.root = None self.attrs = [] self.is_parsed = False self.attrs_set = set() self._parse() def __repr__(self): return self.original def __str__(self): return repr(self) def __cmp__(self, other):
def generateCSVByUsername(self, label=""):
    """Read tweets (one JSON object per line) from self.ifilename, run each
    token through Turkish morphological analysis, and write one CSV row per
    tweet to self.ofilename.

    Each output row is: screen_name, user_id, tweet_id, [label,] followed
    by the analysed root of every kept token.  Mentions, hashtags,
    emoticons and "RT" get fixed tags instead of parses; tokens whose root
    matches an excluded POS category are dropped from the row.

    NOTE(review): whitespace in the original file was collapsed; the
    indentation below is a best-effort reconstruction of the control flow.
    The method may continue past this view (e.g. closing the files).
    """
    # POS categories whose tokens are excluded from the CSV output
    exclude_list = ["Det", "Adv", "Pron", "Conj", "Postp", "Punc"]
    exclude_regs = []
    for e in exclude_list:
        # matches an analysis that ends in e.g. "[Det]"
        exclude_regs.append(re.compile("^.*?\[%s\]$" % e))
    # print len(exclude_regs)
    TurkishMorphology.load_lexicon('turkish.fst');
    self.inputfile = open(self.ifilename, "r")
    line_count = getLineCount(self.inputfile)
    self.ofile = open(self.ofilename, "w")
    csvWriter = csv.writer(self.ofile, delimiter=',', quotechar="'", quoting=csv.QUOTE_NONNUMERIC)
    line = self.inputfile.readline()
    i = 0
    while len(line) > 0:
        i = i + 1
        print "Processing Tweet " + str(i) + " of " + str(line_count) + " tweets"
        try:
            tweet = jsonpickle.decode(line)
        except ValueError, e:
            # malformed JSON line: report it and move on to the next line
            print repr(e)
            line = self.inputfile.readline()
            continue
        if tweet.has_key("delete") or tweet.has_key("scrub_geo") or tweet.has_key("limit"):
            # status-deletion / geo-scrub / rate-limit notices have no text
            print "unimplemented data item"
        else:
            text = unicode(tweet["text"])
            print text
            screen_name = tweet["user"]["screen_name"]
            # prefer the string ids when present; fall back to stringifying
            # the numeric ids for older-format tweets
            if tweet["user"].has_key("id_str"):
                user_id = tweet["user"]["id_str"]
                tweet_id = tweet["id_str"]
            else:
                user_id = str(tweet["user"]["id"])
                tweet_id = str(tweet["id"])
            tweet_w = time.strptime(tweet["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
            tokens = tokenizer.tokenize(text)
            # token_display = screen_name + " " + user_id + " " + tweet_id
            parsed_display = screen_name + " " + user_id + " " + tweet_id
            if label:
                parsed_display = parsed_display + " " + label
            # parsing token by token for now. might think about parsing the
            # whole sequence at once.
            for token in tokens:
                # print token
                # token_display += " "+token
                # Twitter-specific tokens get a fixed tag instead of a parse
                if token[0] == "@":
                    parsed_display += " "+token+"[Mention]"
                    continue
                elif token[0] == "#":
                    parsed_display += " "+token+"[Hashtag]"
                    continue
                elif emoticon_reg.match(token):
                    parsed_display += " "+token+"[Emoticon]"
                    continue
                elif token == "RT":
                    parsed_display += " "+token+"[RT]"
                    continue
                parses = TurkishMorphology.parse(token.encode("utf-8"))
                best_edit = None
                if not parses:
                    # unparseable token: try a cascade of normalisations.
                    # do not include in the results. just for one time testing.
                    # we assume that no more than two consequent letters
                    # happen in words..
                    norm_token = removeRepetitions(token)
                    best_edit = self.select_best([norm_token])
                    if not best_edit:
                        # try deasciifying (restoring Turkish characters)
                        dea = Deasciifier(norm_token)
                        norm_token_deasciified = dea.convert_to_turkish()
                        best_edit = self.select_best([norm_token_deasciified])
                        if not best_edit:
                            # last resort: edit-distance-1 candidates of both
                            # the normalised and the deasciified forms
                            edits = edits1(norm_token_deasciified)
                            edits = edits.union(edits1(norm_token))
                            best_edit = self.select_best(edits)
                            if not best_edit:
                                # leave this work to the latter parts of the code.
                                # parsed_display += " "+best_edit
                                # else:
                                parsed_display += " "+token+"[Unknown]"
                                continue
                if best_edit == None:
                    # token parsed directly: pick the analysis with the
                    # smallest negative log probability
                    min_neglogprob = float('inf')
                    min_parse = None
                    for p in parses:
                        (parse, neglogprob) = p
                        if neglogprob < min_neglogprob:
                            min_neglogprob = neglogprob
                            min_parse = parse.decode("utf-8")
                else:
                    min_parse = best_edit.decode("utf-8")
                # analysis format: root comes before the first '+' / '-'
                first_layer = min_parse.split('+')
                second_layer = first_layer[0].split('-')
                include_token = True
                for exclude_reg in exclude_regs:
                    result = exclude_reg.match(second_layer[0])
                    if result:
                        include_token = False
                        break
                if include_token:
                    parsed_display += " "+second_layer[0]
            #print token_display
            #print parsed_display
            ##csvWriter.writerow(token_display.split())
            print parsed_display
            csvWriter.writerow([p.encode("utf-8") for p in parsed_display.split()])
        line = self.inputfile.readline()