import pandas as pd
from nltk.tag import StanfordPOSTagger


def __add_basic_pos_tag(df):
    # Tag every token in df.word with the Stanford POS tagger and store the tags in a new 'pos' column.
    pos_path_jar = "./stanford-postagger-full-2017-06-09/stanford-postagger.jar"
    pos_path_model = "./stanford-postagger-full-2017-06-09/models/english-left3words-distsim.tagger"
    pos_tagger = StanfordPOSTagger(pos_path_model, pos_path_jar)
    pos = [pos_tagger.tag(s) for s in [df.word]]  # one tagger call over the whole word column
    pos = [i[1] for i in pos[0]]                  # keep only the tag of each (word, tag) pair
    pos = pd.DataFrame(pos)
    df['pos'] = pos
    return df
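# A minimal usage sketch (assumption: a DataFrame whose 'word' column holds the tokens of one
# sentence; the sample tokens below are made up purely to illustrate the expected input shape).
sample = pd.DataFrame({"word": ["Crazy", "monkey", "jumping", "on", "the", "tree"]})
print(__add_basic_pos_tag(sample))  # the DataFrame gains a 'pos' column with one tag per token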
import os
from pathlib import Path

from nltk.tag import StanfordPOSTagger
from sklearn.base import BaseEstimator, TransformerMixin


class POSTagger(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that replaces each token sequence with its Stanford POS tags."""

    def __init__(self, models_path=None):
        models_path = models_path or os.environ["MODELS_PATH"]
        jar_file = Path(models_path, "stanford-postagger.jar")
        tagger_file = Path(models_path, "spanish.tagger")
        self.tagger = StanfordPOSTagger(str(tagger_file), str(jar_file))

    def fit(self, x, y=None):
        # Nothing to learn; fit only exists so the transformer can sit inside a Pipeline.
        return self

    def tag(self, token_list):
        tags = self.tagger.tag(token_list)
        _, tags = zip(*tags)
        return list(tags)

    def transform(self, x, y=None):
        return [self.tag(sequence) for sequence in x]
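# Minimal usage sketch (assumptions: the Stanford tagger jar and the Spanish model live in
# ./models, and the token sequence below is invented only to show the expected input format).
pos_tagger = POSTagger(models_path="./models")
print(pos_tagger.transform([["El", "gato", "duerme"]]))  # -> one list of POS tags per sequence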
guessedAnswerText = ''.join(ch for ch in guessedAnswerText if ch not in PunctuationExclude)
if guessedAnswerText != "" and guessedAnswerText[0] == ' ':
    guessedAnswerText = guessedAnswerText[1:]  # remove the leading space
# print(guessedAnswerText)
if guessedAnswerText == question['answer']:
    correct += 1
elif questionType == 'NUMBER':
    wrongNumber += 1
    # print(question['question'])
    # print(taggedBestAnswerSent)
    # print(questionType)
    # print(guessedAnswerText)
    # print("-----" + question['answer'])

print(i / float(8460), ":", correct)
print("wrong in selected cat", wrongNumber)
print("total", i)
print("correct", correct)
print("correct in multi ans", possCorrect)
print("avg multi ans len", totalans / float(multiAnswer))
print(multiAnswer)
# StanfordPOSTagger.tag() expects a list of tokens rather than a raw string
print(stanford_tagger.tag("Crazy monkey jumping on the tree".split()))
import xlrd
from nltk import ne_chunk, sent_tokenize, word_tokenize
from nltk.tag import StanfordPOSTagger


def contentToList(page_content):
    list = sent_tokenize(page_content)
    # list = page_content.split(' ')
    print(list)
    cleanList = []
    list_with_startelement_numbers = []  # holds the start item (index) of every speech
    list_with_startEnd_numbers = []      # holds the start and end items of every speech
    for i in range(len(list)):
        list_element = list[i]
        list_element = list_element.replace("\n", "")
        list_element = list_element.replace("-", "")
        cleanList.append(list_element)  # list without '-' and newlines
        # print("item at index", i, ":", list_element)  # all list elements
        start_Element_Rede = 0
        # analyse the structure of list_element:
        # a speech begins after President Lammert hands over the floor ("das Wort")
        matchers = ['Das Wort', 'das Wort']
        if any(m in list_element for m in matchers):
            print("item at index", i, ":", list_element)  # list elements containing a matcher
            start_Element_Rede = i + 1
            list_with_startelement_numbers.append(start_Element_Rede)
            print("Start_Index_Redetext: ", start_Element_Rede)
            # POS -> part of speech (verbs, nouns, ...) in the list element containing the matcher
            words = word_tokenize(list_element)
            # extracting named entities - person, organization, ...
            jar = 'jars/stanford-postagger.jar'
            model = 'jars/german-hgc.tagger'
            pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')
            tagged = pos_tagger.tag(words)
            print(tagged)
            namedEnt = ne_chunk(tagged)
            print(namedEnt)
            # namedEnt.draw()

            def extract_entity_names(namedEnt):
                entityPers_names = []
                if hasattr(namedEnt, 'label') and namedEnt.label:
                    if namedEnt.label() == 'PERSON':  # or namedEnt.label() == 'ORGANIZATION':
                        entityPers_names.append(' '.join([child[0] for child in namedEnt]))
                    else:
                        for child in namedEnt:
                            entityPers_names.extend(extract_entity_names(child))
                return entityPers_names

            entityPerson_names = []
            entityPerson_names.extend(extract_entity_names(namedEnt))
            # Print all entity names
            print("Person: " + str(entityPerson_names))

            # Excel sheet with all politicians
            workbook = xlrd.open_workbook('mdb.xls')
            worksheet = workbook.sheet_by_name('Tabelle1')
            # Value of 1st row and 1st column
            value_of_first_col_Names = []
            value_of_second_col_Party = []
            first_col_Names = worksheet.col_values(0)
            second_col_Party = worksheet.col_values(1)
            print(first_col_Names)
            print(second_col_Party)
            name_matchers = first_col_Names
            politican_name = ""
            party_name = ""
            for k in range(len(entityPerson_names)):
                list_element = entityPerson_names[k]
                for m in range(len(name_matchers)):
                    matcher_element = name_matchers[m]
                    if matcher_element in list_element:
                        print("listen_eintrag", k, ": ", list_element)
                        print("excel_eintrag_name", m, ": ", matcher_element)
                        print("excel_eintrag_partei", m, ": ", second_col_Party[m])
                        politican_name = matcher_element
                        party_name = second_col_Party[m]
                        # store name + party in the DB
                        # connection to the Abgeordnetenwatch API - JSON data extract
                        # import urllib.request, json
                        # politican_name = politican_name.lower()
                        # print(politican_name)
                        # politican_name = politican_name.replace(' ','-')
                        # print(politican_name)
                        # with urllib.request.urlopen("https://www.abgeordnetenwatch.de/api/profile/"+politican_name+"/profile.json") as url:
                        #     data = json.loads(url.read().decode())
                        #     print(data)
                        #     print(data['profile']['personal']['first_name']+ " " +data['profile']['personal']['last_name'])
                        #     print(data['profile']['party'])
                        # store name + party in the DB

    print("Liste mit Startnummern: ", list_with_startelement_numbers)
    # decrement every second start number (= the end) by 1 to get the end of a speech
    # [start:end:stop]
    # print(list_with_startelement_numbers[1::2])
    for value in range(1, len(list_with_startelement_numbers), 2):
        list_with_startelement_numbers[value] = list_with_startelement_numbers[value] - 1
    # print(list_with_startelement_numbers)
    list_with_startEnd_numbers = list_with_startelement_numbers
    # list_with_startEnd_numbers holds the start and end numbers of every speech
    print("Liste mit Start + Endnummern: ", list_with_startEnd_numbers)

    for item in range(len(cleanList)):
        element = cleanList[item]
        # print("item at index", item, ":", element)

    alle_Reden = []
    x = 0
    y = 1
    start = 1
    print(len(list_with_startEnd_numbers))
    end = len(list_with_startEnd_numbers) - 1
    active = True
    while active:
        print("x: ", x)
        print("y: ", y)
        print("start: ", start)
        if start > end:
            active = False
            print("false")
        else:
            # everything between start and end index belongs to one speech
            alle_Reden.append(cleanList[list_with_startEnd_numbers[x]:list_with_startEnd_numbers[y]])
            # print("weiter")
            # print("start: ", start)
            x += 2
            y += 2
            start += 2

    # print all speeches
    for rede in alle_Reden:
        print(rede)
        print("\n")
# Stanford tagger setup (defined up front so st is available for the st.tag() calls below).
home = expanduser("~")
_path_to_model = home + '/stanford-postagger/models/english-bidirectional-distsim.tagger'
_path_to_jar = home + '/stanford-postagger/stanford-postagger.jar'
st = StanfordPOSTagger(_path_to_model, _path_to_jar)

nltk.ne_chunk(tags_tofu)
qq_15 = qqExample.text[39579]
not_1 = sent_tokenize(qq_15)[2]
sentenceClean(qq_15)
nltk.pos_tag(word_tokenize('it is a clean and beautiful restaurant otherwise with average service and very well kept restrooms'))
nltk.pos_tag(word_tokenize('i would definitely not recommend this restaurant'))
nltk.pos_tag(word_tokenize('i won\'t like this restaurant'))
nltk.pos_tag(word_tokenize('this was really great!'))
nltk.pos_tag(word_tokenize('boring,flavorless and spicy'))
nltk.pos_tag(word_tokenize('it is boring,flavorless and spicy'))
st.tag(word_tokenize('boring,flavorless and spicy'))
nltk.pos_tag(word_tokenize('the tofu is pretty cold, flavorless, and a bit slimy.'))
st.tag(word_tokenize('the tofu is pretty cold, flavorless, and a bit slimy.'))
nltk.pos_tag(word_tokenize('i would definitely not recommend this restaurant'))
st.tag(word_tokenize('it\'s over-cooked!'))
nltk.pos_tag(word_tokenize(qq7))
import copy

import jieba
from nltk.tag import StanfordPOSTagger

# Action and ActionList are assumed to be defined elsewhere in the project.


class ActionListGenerator:
    def __init__(self, sentence, graph):
        self.Construct_Pattern_House()
        self.sentence = sentence
        self.rdfgraph = graph
        self.st = StanfordPOSTagger('chinese-distsim.tagger')
        self.nodecount = dict()

    def Construct_Pattern_House(self):
        self.patterns = []
        self.patterns.append([u'当 (N) (V) (N) 时', 'event'])
        self.patterns.append([u'{哪} () [的]{0,1} (N) [的]{0,1} 股价 {涨幅} [会]{0,1} [最大|最多]', 'stock_increase'])
        self.patterns.append([u'{哪} (N) 股 [的|将]{0,1} {涨} [会]{0,1} [得]{0,1} [最大|最多]', 'specific_type_stock_increase'])

    def Generate(self):
        # Segment the Chinese sentence, POS-tag it, then try every pattern against it.
        self.words = jieba.cut(self.sentence)
        self.sentence2 = ' '.join(list(self.words))
        self.pos = self.st.tag(self.sentence2.split())
        self.senpos = [(sp.split('#')[0], sp.split('#')[1]) for _, sp in self.pos]
        print(self.sentence2)
        print(self.pos)
        self.actions = ActionList(self.rdfgraph)
        for pat in self.patterns:
            self.match(self.senpos, pat[0], pat[1])
        print(self.actions)

    def GetCount(self, pattype):
        if pattype in self.nodecount:
            ID = self.nodecount[pattype]
            self.nodecount[pattype] += 1
            return ID
        else:
            self.nodecount[pattype] = 1
            return 0

    def match(self, senpos, pattern, pattype):
        # Scan senpos for a window that matches the space-separated pattern:
        # (X) captures a token whose POS tag contains X, {X} requires a token containing X,
        # [a|b] requires one of the alternatives, [a|b]{m,n} makes them optional/repeatable.
        patarr = pattern.split()
        paralist = []
        i = 0
        canmatch = True
        while i < len(senpos):
            canmatch = True
            regextra = 0
            j = 0
            while j < len(patarr):
                if patarr[j][0] == '(':
                    if patarr[j][1:-1] in senpos[i + j + regextra][1]:
                        paralist.append(senpos[i + j + regextra][0])
                    else:
                        canmatch = False
                        break
                elif patarr[j][0] == '[':
                    contentstr = patarr[j].split(']')[0][1:]
                    contents = contentstr.split('|')
                    if patarr[j][-1] == '}':
                        times = patarr[j].split('{')[1][:-1].split(',')
                        minimum_allowed_occurance = int(times[0])
                        maximum_allowed_occurance = int(times[1])
                        repeat = 0
                        for repeatednum in range(minimum_allowed_occurance, maximum_allowed_occurance + 1):
                            if senpos[i + j + regextra + repeatednum][0] in contents:
                                repeat = repeatednum
                            else:
                                if repeatednum == 0:
                                    regextra -= 1
                                else:
                                    regextra += repeat
                                break
                    else:
                        if senpos[i + j + regextra][0] in contents:
                            pass
                        else:
                            canmatch = False
                            break
                elif patarr[j][0] == '{':
                    content = patarr[j][1:-1]
                    if content in senpos[i + j + regextra][0]:
                        pass
                    else:
                        canmatch = False
                        break
                elif patarr[j] == senpos[i + j + regextra][0]:
                    pass
                else:
                    canmatch = False
                    break
                j += 1
            if canmatch:
                break
            else:
                paralist = []
                i += 1

        ID = lambda x: str(self.GetCount(x))
        if pattype == 'event':
            if len(paralist) != 3 or not canmatch:
                return []
            tid = ID('t')
            res = ['SELECT ?t' + tid, " WHERE ", "{ "]
            NodeID = ID(pattype)
            res.append('?event' + NodeID + ' <http://www.example.org/subject> "' + paralist[0] + '" .')
            res.append('?event' + NodeID + ' <http://www.example.org/trigger> "' + paralist[1] + '" .')
            res.append('?event' + NodeID + ' <http://www.example.org/object> "' + paralist[2] + '" .')
            res.append('?event' + NodeID + ' <http://www.example.org/time> ?t' + tid + ' .')
            res.append('}')
            command = '\n'.join(res)
            act = Action('sparql')
            act.setCommand(command)
            act.inputtype = 'None'
            act.keydict['subject'] = paralist[0]
            act.returntype = 'value'
            self.actions.add(act)
        elif pattype == 'stock_increase':
            if not canmatch:
                return []
            if len(paralist) == 1:
                companyname = self.actions[-1].keydict['subject']
            elif len(paralist) == 2:
                companyname = paralist[0]
            res = ['SELECT ?support ?p ', "WHERE ", "{ "]
            NodeID = ID('company')
            res.append('?company' + NodeID + ' <http://www.example.org/support> ?support .')
            res.append('?company' + NodeID + ' <http://www.example.org/name> "' + companyname + '" .')
            supportNodeID = ID('supportnode')
            stockNodeID = ID('stocknode')
            res.append('?supportnode' + supportNodeID + ' <http://www.example.org/name> ?support .')
            res.append('?supportnode' + supportNodeID + ' <http://www.example.org/stock> ?stock' + stockNodeID + ' .')
            res.append('?stock' + stockNodeID + ' <http://www.example.org/stocktime> "%s" .')
            res.append('?stock' + stockNodeID + ' <http://www.example.org/price> ?p .')
            res.append('}')
            command = '\n'.join(res)
            act = Action('sparql')
            act.inputtype = 'timestamp'
            act.setCommand(command)
            self.actions.add(act)
            act1 = copy.deepcopy(act)
            act1.inputtype = 'latertimestamp'
            self.actions.add(act1)
            actminus = Action('minus')
            actminus.inputtype = 'table'
            self.actions.add(actminus)
            actmax = Action('max')
            actmax.inputtype = 'table'
            self.actions.add(actmax)
        elif pattype == 'specific_type_stock_increase':
            if not canmatch:
                return []
            stocktype = paralist[0]
            res = ['SELECT ?company ?p ', "WHERE ", "{ "]
            companyNodeID = ID('company')
            stockNodeID = ID('stocknode')
            res.append('?companynode' + companyNodeID + ' <http://www.example.org/name> ?company .')
            res.append('?companynode' + companyNodeID + ' <http://www.example.org/stock> ?stock' + stockNodeID + ' .')
            res.append('?companynode' + companyNodeID + ' <http://www.example.org/type> "' + stocktype + '" .')
            res.append('?stock' + stockNodeID + ' <http://www.example.org/stocktime> "%s" .')
            res.append('?stock' + stockNodeID + ' <http://www.example.org/price> ?p .')
            res.append('}')
            command = '\n'.join(res)
            act = Action('sparql')
            act.inputtype = 'timestamp'
            act.setCommand(command)
            self.actions.add(act)
            act1 = copy.deepcopy(act)
            act1.inputtype = 'latertimestamp'
            self.actions.add(act1)
            actminus = Action('minus')
            actminus.inputtype = 'table'
            self.actions.add(actminus)
            actmax = Action('max')
            actmax.inputtype = 'table'
            self.actions.add(actmax)
# Print the Viterbi matrix: a header row of tokens, then one row per state.
sys.stdout.write("\t")
for tok in token2:
    sys.stdout.write("\t")
    sys.stdout.write(tok.rjust(8))
print()
for j in range(0, len(v.state)):
    sys.stdout.write(v.state[j])
    sys.stdout.write("\t")
    for i in range(0, len(token2)):
        sys.stdout.write("\t")
        sys.stdout.write(str(round(Viterbi_matrix2[i][j], 5)))
        sys.stdout.write("\t")
    print()
print("--------------------------------------------------------------------------------")

# Stanford POS tagging
stanford_dir = "C:/stanford-postagger/"  # change this to your own installation path
model_file = stanford_dir + 'models/english-left3words-distsim.tagger'
jarfile = stanford_dir + "stanford-postagger.jar"  # jar file
st = StanfordPOSTagger(model_filename=model_file, path_to_jar=jarfile)

print("\nSentence 1: " + seq1)
tokens1 = word_tokenize(seq1)  # tokenize into words
print("Using Stanford POS Tagging, Sentence 1 is tagged as: ")
print(st.tag(seq1.split()))

print("\nSentence 2: " + seq2)
tokens2 = word_tokenize(seq2)  # tokenize into words
print("Using Stanford POS Tagging, Sentence 2 is tagged as: ")
print(st.tag(seq2.split()))
import spacy
import nltk
import pandas as pd
from os.path import expanduser
from nltk import sent_tokenize, word_tokenize
from nltk.tag import StanfordPOSTagger

home = expanduser("~")
_path_to_model = home + '/stanford-postagger/models/english-bidirectional-distsim.tagger'
_path_to_jar = home + '/stanford-postagger/stanford-postagger.jar'
st = StanfordPOSTagger(_path_to_model, _path_to_jar)

qqExample = pd.read_csv('/Applications/Study/UWM/628/module2/qq.csv', index_col=0)
qqExample.index = range(0, len(qqExample))

# Compare NLTK's default tagger with the Stanford tagger on a single review.
i = 3
qqExample.text[i]
nltk.pos_tag(word_tokenize(qqExample.text[i]))
st.tag(word_tokenize(qqExample.text[i]))
# tag_sents expects one token list per sentence
st.tag_sents([word_tokenize(s) for s in sent_tokenize(qqExample.text[i])])

# Tag all reviews joined into one string.
qqAll = '. '.join(qqExample.text)
len(qqAll)
nltk.pos_tag(word_tokenize(qqAll))
st.tag(word_tokenize(qqAll))
st.tag_sents([word_tokenize(s) for s in sent_tokenize(qqAll)])

test = pd.read_csv('/Applications/Study/UWM/628/module2/textUsing/chineseAllReview.csv')
test.head(5)
testAll = '. '.join(test.text)
len(testAll)
mangyiba = st.tag(word_tokenize(testAll))
qqExample['text'] = qqExample['text'].apply(sentenceClean2)
qqExample.to_csv('/Applications/Study/UWM/628/module2/textUsing/transqq2.csv')
# nltk.pos_tag(word_tokenize('jason is a really nice guy.'))

tagList = [
    'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'
]
nounList = ['NN', 'NNS']


def detact_noun(text):
    # POS-tag the text and keep only the words tagged as nouns (NN/NNS).
    # Note: dict(testBag) keeps a single tag per distinct word, so repeated words collapse.
    testBag = nltk.pos_tag(word_tokenize(text))
    selectedBag = list(
        pd.Series(list(dict(testBag).keys())).iloc[np.where(
            [x in nounList for x in list(dict(testBag).values())])])
    return selectedBag


qqExample = pd.read_csv(
    '/Applications/Study/UWM/628/module2/textUsing/transqq2.csv', index_col=0)
qqExample.index = range(0, len(qqExample))
detact_noun(qqExample.text[41])
nltk.pos_tag(word_tokenize(qqExample.text[41]))
st.tag(word_tokenize(qqExample.text[41]))
qqExample['text'] = qqExample['text'].apply(detact_noun)
qqExample.head(5)
qqExample.to_csv('/Applications/Study/UWM/628/module2/textUsing/nounsqq.csv')
from nltk import StanfordPOSTagger

# Sample Arabic text (about vitamins as essential nutrients for the human body).
text = ''' الفيتامينات هي عناصر غذائيّة أساسية لجسم الإنسان، وهي عبارة عن مركبات عضويّة توجد طبيعيّاً في الأغذية ويحتاجها الجسم بكميّات بسيطة للقيام بوظائفه الطبيعية، ولا يستطيع الجسم تصنيعها أو تصنيع كميّات كافية منها لتلبي احتياجاته'''

Tagger = StanfordPOSTagger(
    './stanfor arabic modeal and tagger/arabic.tagger',
    './stanfor arabic modeal and tagger/stanford-postagger.jar')

output = Tagger.tag(text.split())
output = [tuple(filter(None, tp)) for tp in output]  # remove empty tuples
for data in output:
    # each remaining tuple holds a single 'word/TAG' string; split it for printing
    print(data[0].split("/")[0] + " > " + data[0].split("/")[1] + "\n")

# References:
# 1. Stanford Arabic part-of-speech tagset
#    https://www.sketchengine.co.uk/stanford-arabic-part-of-speech-tagset/
# 2. Stanford POS tagger
#    https://nlp.stanford.edu/software/pos-tagger-faq.html#tagset