def __add_basic_pos_tag(df):
    pos_path_jar = "./stanford-postagger-full-2017-06-09/stanford-postagger.jar"
    pos_path_model = "./stanford-postagger-full-2017-06-09/models/english-left3words-distsim.tagger"
    pos_tagger = StanfordPOSTagger(pos_path_model, pos_path_jar)
    pos = [pos_tagger.tag(s) for s in [df.word]]
    pos = [i[1] for i in pos[0]]
    pos = pd.DataFrame(pos)
    df['pos'] = pos
    return df
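# --- Hedged usage sketch (not from the original snippet) ---
# Assumes the jar/model paths above exist locally and that the DataFrame holds one
# token per row in its `word` column; `example_df` is a hypothetical name used only here.
example_df = pd.DataFrame({"word": ["This", "is", "a", "small", "test"]})
example_df = __add_basic_pos_tag(example_df)
print(example_df[["word", "pos"]])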
def workflow_resources(self):
    corpus_encoding = self.task_config["CORPUS_ENCODING"]
    stanford_postagger_path = self.task_config["STANFORD_POSTAGGER_PATH"]
    stanford_models_path = self.task_config["STANFORD_MODELS_PATH"]
    stanford_pos_model_path = self.task_config["STANFORD_POS_MODEL_PATH"]

    tokenizer = StanfordTokenizer(stanford_models_path, encoding=corpus_encoding)
    pos_tagger = StanfordPOSTagger(stanford_pos_model_path,
                                   path_to_jar=stanford_postagger_path,
                                   encoding=corpus_encoding)

    workflow_resources = {"tokenizer": tokenizer, "pos_tagger": pos_tagger}

    return workflow_resources
class POSTagger(BaseEstimator, TransformerMixin):
    def __init__(self, models_path=None):
        models_path = models_path or os.environ["MODELS_PATH"]
        jar_file = Path(models_path, "stanford-postagger.jar")
        tagger_file = Path(models_path, "spanish.tagger")
        self.tagger = StanfordPOSTagger(str(tagger_file), str(jar_file))

    def tag(self, token_list):
        tags = self.tagger.tag(token_list)
        _, tags = zip(*tags)
        return list(tags)

    def transform(self, x, y=None):
        return [self.tag(sequence) for sequence in x]
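# --- Hedged usage sketch (not from the original snippet) ---
# Assumes MODELS_PATH points at a directory containing stanford-postagger.jar and
# spanish.tagger; the Spanish sentences below are illustrative only.
tagger = POSTagger()
tokenized_sentences = [["Hola", "mundo"], ["Esto", "es", "una", "prueba"]]
print(tagger.transform(tokenized_sentences))  # one list of POS tags per sentence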
os.environ["STANFORD_MODELS"] = "/Users/umeraltaf/Desktop/QA_Project/StanfordNER"

from nltk.tag.stanford import StanfordNERTagger
stanford_NER_tagger = StanfordNERTagger(
    '/Users/umeraltaf/Desktop/QA_Project/StanfordNER/english.all.3class.distsim.crf.ser.gz',
    '/Users/umeraltaf/Desktop/QA_Project/StanfordNER/stanford-ner.jar')

from nltk import StanfordPOSTagger
os.environ["STANFORD_MODELS"] = "/Users/umeraltaf/Desktop/QA_Project/StanfordNER"
stanford_POS_tagger = StanfordPOSTagger(
    '/Users/umeraltaf/Desktop/QA_Project/StanfordNER/english-bidirectional-distsim.tagger',
    '/Users/umeraltaf/Desktop/QA_Project/StanfordNER/stanford-postagger.jar')

with open('QA_train.json') as data_file:
    data = json.load(data_file)[:100]

stopwords = set(nltk.corpus.stopwords.words('english'))  # wrap in a set() (see below)
stopwords.remove('the')
stopwords.remove('of')

stemmer = nltk.stem.PorterStemmer()

PunctuationExclude = set(string.punctuation)
PunctuationExclude.remove(',')
PunctuationExclude.remove('-')
import pandas as pd
import numpy as np
import nltk
import re
from os.path import expanduser
from nltk import StanfordPOSTagger
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import brown
import spacy

home = expanduser("~")
_path_to_model = home + '/stanford-postagger/models/english-bidirectional-distsim.tagger'
_path_to_jar = home + '/stanford-postagger/stanford-postagger.jar'
st = StanfordPOSTagger(_path_to_model, _path_to_jar)

shitThing = ['.', ',', '-', '(', ')', ':']

test = pd.read_csv(
    '/Applications/Study/UWM/628/module2/textUsing/chineseAllReview.csv')
test.head(5)

tagList = [
    'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS', 'VB', 'VBD', 'VBG', 'VBN',
    'VBP', 'VBZ'
]


def wordTrans(self):
    testBag = nltk.pos_tag(word_tokenize(self))
def contentToList(page_content):
    list = sent_tokenize(page_content)
    # list = page_content.split(' ')
    print(list)
    cleanList = []
    list_with_startelement_numbers = []  # contains the start item of every speech text
    list_with_startEnd_numbers = []  # contains the start and end items of every speech text

    for i in range(len(list)):
        list_element = list[i]
        list_element = list_element.replace("\n", "")
        list_element = list_element.replace("-", "")
        cleanList.append(list_element)  # list without -, \n
        # print("item at index", i, ":", list_element)  # all list elements

        start_Element_Rede = 0

        '''analyze the structure of list_element'''
        '''a speech starts after President Lammert hands over the floor'''
        matchers = ['Das Wort', 'das Wort']
        if any(m in list_element for m in matchers):
            print("item at index", i, ":", list_element)  # list elements that contain matchers
            start_Element_Rede = i + 1
            list_with_startelement_numbers.append(start_Element_Rede)
            print("Start_Index_Redetext: ", start_Element_Rede)

            '''POS -> part of speech (verbs, nouns, ...) in the list element that matched'''
            words = word_tokenize(list_element)

            '''extract named entities - person, organization, ...'''
            jar = 'jars/stanford-postagger.jar'
            model = 'jars/german-hgc.tagger'
            pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')
            tagged = pos_tagger.tag(words)  # tag the tokens (was: pos_tagger.tag(pos_tagger))
            print(tagged)
            namedEnt = ne_chunk(tagged)
            print(namedEnt)
            # namedEnt.draw()

            def extract_entity_names(namedEnt):
                entityPers_names = []
                if hasattr(namedEnt, 'label') and namedEnt.label:
                    if namedEnt.label() == 'PERSON':  # or namedEnt.label() == 'ORGANIZATION':
                        entityPers_names.append(' '.join(
                            [child[0] for child in namedEnt]))
                    else:
                        for child in namedEnt:
                            entityPers_names.extend(
                                extract_entity_names(child))
                return entityPers_names

            entityPerson_names = []
            entityPerson_names.extend(extract_entity_names(namedEnt))

            # print all entity names
            print("Person: " + str(entityPerson_names))

            '''Excel sheet with all politicians'''
            workbook = xlrd.open_workbook('mdb.xls')
            worksheet = workbook.sheet_by_name('Tabelle1')

            # values of the first column (names) and second column (party)
            value_of_first_col_Names = []
            value_of_second_col_Party = []
            first_col_Names = worksheet.col_values(0)
            second_col_Party = worksheet.col_values(1)
            print(first_col_Names)
            print(second_col_Party)

            matchers = first_col_Names
            politican_name = ""
            party_name = ""
            for i in range(len(entityPerson_names)):
                list_element = entityPerson_names[i]
                for m in range(len(matchers)):
                    matcher_element = matchers[m]
                    if matcher_element in list_element:
                        print("listen_eintrag", i, ": ", list_element)
                        print("excel_eintrag_name", m, ": ", matcher_element)
                        print("excel_eintrag_partei", m, ": ", second_col_Party[m])
                        politican_name = matcher_element
                        party_name = second_col_Party[m]

            '''database entry: name + party'''
            '''connection to the Abgeordnetenwatch API - JSON data extract'''
            # import urllib.request, json
            # politican_name = politican_name.lower()
            # print(politican_name)
            # politican_name = politican_name.replace(' ', '-')
            # print(politican_name)
            # with urllib.request.urlopen("https://www.abgeordnetenwatch.de/api/profile/" + politican_name + "/profile.json") as url:
            #     data = json.loads(url.read().decode())
            #     print(data)
            #     print(data['profile']['personal']['first_name'] + " " + data['profile']['personal']['last_name'])
            #     print(data['profile']['party'])
            '''database entry: name + party'''

    print("Liste mit Startnummern: ", list_with_startelement_numbers)

    # decrement every second start number (= end) by 1 to mark the end of a speech
    # [start:end:step]
    # print(list_with_startelement_numbers[1::2])
    for value in range(1, len(list_with_startelement_numbers), 2):
        list_with_startelement_numbers[value] = list_with_startelement_numbers[value] - 1
    # print(list_with_startelement_numbers)

    # list_with_startEnd_numbers contains the start and end items (numbers) of all speech texts
    list_with_startEnd_numbers = list_with_startelement_numbers
    print("Liste mit Start + Endnummern: ", list_with_startEnd_numbers)

    for item in range(len(cleanList)):
        element = cleanList[item]
        # print("item at index", item, ":", element)

    alle_Reden = []
    x = 0
    y = 1
    start = 1
    print(len(list_with_startEnd_numbers))
    end = len(list_with_startEnd_numbers) - 1
    active = True
    while active:
        print("x: ", x)
        print("y: ", y)
        print("start: ", start)
        if start > end:
            active = False
            print("false")
        else:
            # everything between start and end
            alle_Reden.append(
                cleanList[list_with_startEnd_numbers[x]:list_with_startEnd_numbers[y]])
            # print("weiter")
            # print("start: ", start)
            x += 2
            y += 2
            start += 2

    # print all speeches
    for rede in alle_Reden:
        print(rede)
        print("\n")
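# --- Hedged usage sketch (not from the original snippet) ---
# A tiny made-up plenary-protocol string; real input would be a full Bundestag transcript,
# and running this also needs the Stanford jars and the mdb.xls sheet referenced above.
# sample_page = "Das Wort hat Frau Dr. Merkel. Meine Damen und Herren, ich danke Ihnen."
# contentToList(sample_page)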
# Or you can give "DEV" here, provided that that dataset is available in the same directory
runOn = "Test"

# This switch enables a relaxed evaluation metric which rewards partial matches and a match
# among several possible answers (function defined later on; see the report).
# If False, a score of 1 is only given for an exact match with the correct answer (default in the project).
relaxedEvaluationMetric = False

# printing the start time of the script
# This script should not take more than 4 or 5 minutes
print("Start Time:", ctime())

# initializing taggers and models from NLTK
stanford_NER_tagger = StanfordNERTagger(
    'english.all.3class.distsim.crf.ser.gz')
stanford_POS_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')
stemmer = nltk.stem.PorterStemmer()

# os.environ["STANFORD_MODELS"] = "/Users/umeraltaf/Desktop/QA_Project/StanfordNER"
# stanford_NER_tagger = StanfordNERTagger('/Users/umeraltaf/Desktop/QA_Project/StanfordNER/english.all.3class.distsim.crf.ser.gz', '/Users/umeraltaf/Desktop/QA_Project/StanfordNER/stanford-ner.jar')
# stanford_POS_tagger = StanfordPOSTagger('/Users/umeraltaf/Desktop/QA_Project/StanfordNER/english-bidirectional-distsim.tagger', '/Users/umeraltaf/Desktop/QA_Project/StanfordNER/stanford-postagger.jar')
# stemmer = nltk.stem.PorterStemmer()

# Some path declarations for the precomputed models.
# This is the cache file that will store the precomputed best sentences and tags
# so that we don't have to tag each time we run this script.
if runOn == "DEV":
    fname = "bestSentencesTaggedEnhancedDev.bin"
else:
    fname = 'bestSentencesTaggedEnhancedTest.bin'

QuestionModelPATH = "QuestionClassificationModelStanford.pickle"
t0 = time.time()
datas = 'data/QA_dev.json'
print(datas)

from nltk import StanfordNERTagger, StanfordPOSTagger

dataset = json.loads(open(path.join(parent_path, datas)).readline())
ner_tagger = StanfordNERTagger(
    path.join(parent_path, 'data/english.all.3class.distsim.crf.ser.gz'),
    path.join(parent_path, 'data/stanford-ner.jar'),
    encoding='utf-8')
pos_tagger = StanfordPOSTagger(
    path.join(parent_path, 'data/wsj-0-18-left3words-distsim.tagger'),
    path.join(parent_path, 'data/stanford-postagger.jar'),
    encoding='utf-8')
prog_total = len(dataset)


def dmerge(ner, pos):
    # prefer the POS tag for numbers, otherwise fall back to the NER tag
    if pos and pos[1] == 'CD':
        return ner[0], 'NUMBER'
    elif ner[1] == 'O':
        return pos
    else:
        return ner


def _merge_tag(ners, poss):
import pandas as pd
import numpy as np
import nltk
import re
from os.path import expanduser
from nltk import StanfordPOSTagger
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import brown
import spacy

home = expanduser("~")
_path_to_model = home + '/stanford-postagger/models/english-bidirectional-distsim.tagger'
_path_to_jar = home + '/stanford-postagger/stanford-postagger.jar'
st = StanfordPOSTagger(_path_to_model, _path_to_jar)

qqExample = pd.read_csv('/Applications/Study/UWM/628/module2/qq.csv',
                        index_col=0)
qqExample.index = range(0, len(qqExample))

i = 3
qqExample.text[i]
nltk.pos_tag(word_tokenize(qqExample.text[i]))
st.tag(word_tokenize(qqExample.text[i]))
st.tag_sents([sent_tokenize(qqExample.text[i])])

qqAll = '. '.join(qqExample.text)
len(qqAll)
nltk.pos_tag(word_tokenize(qqAll))
st.tag(word_tokenize(qqAll))
return "LOCATION" elif "who" in question.lower(): return "PERSON" elif "how many" in question.lower() or "number" in question.lower() or "count" in question.lower(): return "NUMBER" elif "when" in question.lower() or "date" in question.lower(): return "NUMBER" else: return "OTHER" from nltk import StanfordPOSTagger os.environ["STANFORD_MODELS"] = "/Users/umeraltaf/Desktop/QA_Project/StanfordNER" stanford_tagger = StanfordPOSTagger('/Users/umeraltaf/Desktop/QA_Project/StanfordNER/english-bidirectional-distsim.tagger','/Users/umeraltaf/Desktop/QA_Project/StanfordNER/stanford-postagger.jar') correct = 0 possCorrect = 0 wrongNumber = 0 totalans = 0 multiAnswer = 0 i = -1 #index of our NER_TAGGED list (i.e. questions) for article in data: for question in article['qa']: i+=1 taggedBestAnswerSent = NER_tagged[i]
import sys

# print(len(sys.argv))
assert (len(sys.argv) == 4)
afile = sys.argv[1]
qfile = sys.argv[2]
numq = int(sys.argv[3])
# print(afile, qfile, numq)

nlpspacy = spacy.load("en_core_web_md")

stanforddir = 'stanford-postagger-2018-10-16/'
modelfile = stanforddir + 'models/english-bidirectional-distsim.tagger'
jarfile = stanforddir + 'stanford-postagger.jar'
postagger = StanfordPOSTagger(model_filename=modelfile, path_to_jar=jarfile)

# nltk.download("punkt")
# nltk.download("wordnet")
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')


def posToWordnet(pos):
    # map a Penn Treebank tag prefix to a WordNet POS letter
    first = pos[0]
    if first == 'J':
        return 'a'
    elif first == 'V':
        return 'v'
        postings = index[term]
        for docid, weight in postings:
            accumulator[docid] += weight
    return accumulator.most_common(k)


############ End of copied code

# printing the start time of the script
print("Start Time:", ctime())

# initializing taggers and models from NLTK
# os.environ["STANFORD_MODELS"] = "/chechi/Documents/StanfordNER"
stanford_NER_tagger = StanfordNERTagger(
    'english.all.3class.distsim.crf.ser.gz')
stanford_POS_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')
stemmer = nltk.stem.PorterStemmer()

# This is the cache file that will store the precomputed best sentences and tags
# so that we don't have to tag each time we run this script
if (runOn == "DEV"):
    fname = 'bestSentencesTaggedDev.bin'
else:
    fname = 'bestSentencesTaggedTrain.bin'

# This variable will store all tagged most relevant sentences
NER_tagged = None

# Load the dataset; note that as the train set is large I only load the first 50 articles
if (runOn == "DEV"):
sys.stdout.write("\t") for tok in token2: sys.stdout.write("\t") sys.stdout.write(tok.rjust(8)) print() for j in range(0,len(v.state)): sys.stdout.write(v.state[j]) sys.stdout.write("\t") for i in range(0,len(token2)): sys.stdout.write("\t") sys.stdout.write(str(round((Viterbi_matrix2[i][j]),5))) sys.stdout.write("\t") print() print("--------------------------------------------------------------------------------") #Stanford POS Tagging stanford_dir = "C:/stanford-postagger/" # change it into your own path model_file= stanford_dir + 'models/english-left3words-distsim.tagger' jarfile = stanford_dir +"stanford-postagger.jar"# jar file st = StanfordPOSTagger(model_filename=model_file, path_to_jar=jarfile) print("\nSentence 1: "+seq1) tokens1 = word_tokenize(seq1) # tokenize into words print("Using Stanford POS Tagging, Sentence 1 is tagged as: ") print(st.tag(seq1.split())) print("\nSentence 2: "+seq2) tokens2 = word_tokenize(seq2) # tokenize into words print("Using Stanford POS Tagging, Sentence 2 is tagged as: ") print(st.tag(seq2.split()))
import pandas as pd
import numpy as np
import nltk
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import StanfordPOSTagger
from os.path import expanduser

home = expanduser("~")
_path_to_model = home + '/stanford-postagger/models/english-bidirectional-distsim.tagger'
_path_to_jar = home + '/stanford-postagger/stanford-postagger.jar'
st = StanfordPOSTagger(_path_to_model, _path_to_jar)


def sentenceClean2(self):
    result = self.lower()
    result = re.sub('high\squality', 'great', result)
    result = re.sub('low\squality', 'bad', result)
    result = re.sub('serve', 'service', result)
    result = re.sub('fast food', 'fastfood', result)
    result = re.sub('n\'t\s', ' not', result)
    result = re.sub('\sstars|\sstar', 'stars', result)
    result = re.sub('0stars', 'onestars', result)
    result = re.sub('1stars', 'onestars', result)
    result = re.sub('2stars', 'twostars', result)
    result = re.sub('3stars', 'threestars', result)
    result = re.sub('4stars', 'fourstars', result)
    result = re.sub('5stars', 'fivestars', result)
    result = re.sub('\snot\s', ' not', result)
    result = re.sub('\snever\s', ' never', result)
import pandas as pd
import numpy as np
import nltk
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import StanfordPOSTagger
from os.path import expanduser

home = expanduser("~")
_path_to_model = home + '/stanford-postagger/models/english-bidirectional-distsim.tagger'
_path_to_jar = home + '/stanford-postagger/stanford-postagger.jar'
st = StanfordPOSTagger(_path_to_model, _path_to_jar)


def sentenceClean2(self):
    result = self.lower()
    result = re.sub('high\squality', 'great', result)
    result = re.sub('low\squality', 'bad', result)
    result = re.sub('serve', 'service', result)
    result = re.sub('fast food', 'fastfood', result)
    result = re.sub('\s\w+?n\'t[^\w]+?', ' not', result)
    result = re.sub('\sstars|\sstar', 'stars', result)
    result = re.sub('1stars', 'onestars', result)
    result = re.sub('2stars', 'twostars', result)
    result = re.sub('3stars', 'threestars', result)
    result = re.sub('4stars', 'fourstars', result)
    result = re.sub('5stars', 'fivestars', result)
    result = re.sub('\snot\s', ' not', result)
    result = re.sub('\snever\s', ' never', result)
    return result
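# --- Hedged usage sketch (not from the original snippet) ---
# The review text below is made up; it just exercises the substitutions above.
sample_review = "The serve was high quality but I would never give it 5 stars"
print(sentenceClean2(sample_review))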
class AnnotationCompiler:
    def __init__(self, filename):
        self.filename = filename
        self.tokenizer = TreebankWordTokenizer()
        self.sent_tokenizer = load(
            'tokenizers/punkt/{0}.pickle'.format('english'))
        self.st = StanfordPOSTagger(
            '../stanfordPOStagger/english-bidirectional-distsim.tagger',
            '../stanfordPOStagger/stanford-postagger.jar',
            java_options='-mx2048m')
        # self.w2v_model = KeyedVectors.load_word2vec_format(
        #     "C:/Users/PC1/Desktop/python/деплом/deplom/constructions/GoogleNews-vectors-negative300.bin.gz",
        #     binary=True)
        self.w2v_model = None
        self.text = self.get_text()
        self.anns = []
        self.idx_list = IdxList()
        self.punct = punctuation + '‘’— \t\n'

    def get_text(self):
        with open(self.filename, encoding='utf-8-sig', newline='') as f:
            text = f.read()
        return text

    def correct_text(self, corrs, abs_idx=False):
        # given corrections (start idx, initial text, correction), correct the text
        slices = []
        last_idx = 0
        for item in corrs:
            if abs_idx:
                idx, initial, corr = item
                sent_idx = 0
            else:
                idx, sent_idx, initial, corr = item
            slices.append(self.text[last_idx:idx + sent_idx])
            slices.append(corr)
            last_idx = sent_idx + idx + len(initial)
        slices.append(self.text[last_idx:])
        return ''.join(slices)

    def ann_from_spelling(self, corrs):
        # create annotations and correct text from aspell output
        matches = [(m.group(0), m.start())
                   for m in re.finditer(r'[^\s\-]+', self.text)]
        matches = [
            x for x in matches
            if re.search('[0-9\\W]+', x[0]) is None
            or re.search('[0-9\\W]+', x[0]).group() != x[0]
        ]
        tokens, idx = zip(*matches)
        final_corrs = []
        anns = []
        for i, corr in enumerate(corrs):
            if corr is not None:
                tag = 'Spelling'
                start_idx = idx[i]
                end_idx = start_idx + len(corr[0])
                self.idx_list.add(end_idx, len(corr[0]) - len(corr[1]))
                anns.append(
                    ('%s %d %d\t%s' % (tag, start_idx, end_idx, corr[0]),
                     'AnnotatorNotes <ERROR>\t%s' % (corr[1])))
                final_corrs.append((start_idx, corr[0], corr[1]))
        self.text = self.correct_text(
            final_corrs, abs_idx=True)  # SHOULD BE SELF.TEXT WHEN IDXS ARE TACKLED
        return anns

    def ann_from_correction(self, corrs, tag):
        # start idx, sent start idx, initial np, predicted np
        anns = []
        for corr in corrs:
            start_idx = corr[0] + corr[1]
            end_idx = start_idx + len(corr[2])
            anns.append(('%s %d %d\t%s' %
                         (tag, self.idx_list.find_old_idx(start_idx),
                          self.idx_list.find_old_idx(end_idx), corr[2]),
                         'AnnotatorNotes <ERROR>\t%s' % (corr[3])))
        self.text = self.correct_text(corrs)
        return anns

    def tokenize(self):
        sents = self.sent_tokenizer.tokenize(self.text)
        sent_spans = self.sent_tokenizer.span_tokenize(self.text)
        tokens = [self.tokenizer.tokenize(sent) for sent in sents]
        idxs = [
            align_tokens(['"' if x in ['``', "''"] else x for x in toks], sent)
            for sent, toks in zip(sents, tokens)
        ]
        return sents, tokens, idxs, sent_spans

    def compile_annotation(self, path='.'):
        # collect all corrections
        sents, tokens, idxs, sent_spans = self.tokenize()
        with open(path + '/initial_sents.txt', 'w', encoding='utf-8') as f:
            f.write('\n'.join(sents))
        spelling = very_dummy_spellchecker(path)
        print('Spelling')
        spell_anns = self.ann_from_spelling(spelling)
        self.anns.extend(spell_anns)
        # print([self.text])
        # print(self.idx_list)
        with open(path + '/corrected_spelling.txt', 'w',
                  encoding='utf-8', newline='') as f:
            f.write(self.text)
        print('Tokenizing')
        sents, tokens, idxs, sent_spans = self.tokenize()
        # with open('init_sents_for_prepositions_test_parsed.txt', 'r', encoding='utf-8') as f:
        #     trees = parse_tree(f.read())
        # agr_corrs = check_agreement(trees, sent_spans)
        print('Tagging')
        tsents = self.st.tag_sents(tokens)
        print('Prepositions')
        prep_corrector = PrepositionCorrector()
        prep_corrs, prep_idxs = prep_corrector.detect_errors(
            self.w2v_model, tokens, tsents, idxs, sents, sent_spans)
        prep_anns = self.ann_from_correction(prep_corrs, 'Prepositions')
        for idx in prep_idxs:
            self.idx_list.add(idx[0], idx[1])
        # print(self.idx_list)
        with open(path + '/corrected_prepositions.txt', 'w',
                  encoding='utf-8') as f:
            f.write(self.text)
        self.anns.extend(prep_anns)
        print('Articles')
        sents, tokens, idxs, sent_spans = self.tokenize()
        tsents = self.st.tag_sents(tokens)
        art_corrector = ArticleCorrector()
        art_corrs, art_idxs = art_corrector.detect_errors(
            self.w2v_model, tokens, tsents, idxs, sents, sent_spans)
        art_anns = self.ann_from_correction(art_corrs, 'Articles')
        for idx in art_idxs:
            self.idx_list.add(idx[0], idx[1])
        with open(path + '/corrected_articles.txt', 'w',
                  encoding='utf-8') as f:
            f.write(self.text)
        self.anns.extend(art_anns)
        print('Writing annotation')
        self.write_annotation()

    def write_annotation(self):
        with open(self.filename[:-4] + '.ann', 'w', encoding='utf-8') as f:
            for i, ann in enumerate(self.anns):
                f.write('T%d\t' % (i + 1) + ann[0] + '\n' +
                        '#%d\t' % (i + 1) + ann[1].replace('<ERROR>', 'T%d' % (i + 1)) + '\n')
from nltk import StanfordPOSTagger

text = '''
الفيتامينات هي عناصر غذائيّة أساسية لجسم الإنسان، وهي عبارة عن مركبات عضويّة توجد طبيعيّاً في الأغذية ويحتاجها الجسم بكميّات بسيطة للقيام بوظائفه الطبيعية، ولا يستطيع الجسم تصنيعها أو تصنيع كميّات كافية منها لتلبي احتياجاته'''

Tagger = StanfordPOSTagger(
    './stanfor arabic modeal and tagger/arabic.tagger',
    './stanfor arabic modeal and tagger/stanford-postagger.jar')

output = Tagger.tag(text.split())
output = [tuple(filter(None, tp)) for tp in output]  # remove empty tuples

for data in output:
    print(data[0].split("/")[0] + " > " + data[0].split("/")[1] + "\n")

# References:
# 1. Stanford Arabic part-of-speech tagset
#    https://www.sketchengine.co.uk/stanford-arabic-part-of-speech-tagset/
# 2. Stanford POS tagger
#    https://nlp.stanford.edu/software/pos-tagger-faq.html#tagset
class ActionListGenerator:
    def __init__(self, sentence, graph):
        self.Construct_Pattern_House()
        self.sentence = sentence
        self.rdfgraph = graph
        # self.sentence = sentence
        self.st = StanfordPOSTagger('chinese-distsim.tagger')
        self.nodecount = dict()

    def Construct_Pattern_House(self):
        self.patterns = []
        self.patterns.append([u'当 (N) (V) (N) 时', 'event'])
        self.patterns.append([u'{哪} () [的]{0,1} (N) [的]{0,1} 股价 {涨幅} [会]{0,1} [最大|最多]', 'stock_increase'])
        self.patterns.append([u'{哪} (N) 股 [的|将]{0,1} {涨} [会]{0,1} [得]{0,1} [最大|最多]', 'specific_type_stock_increase'])

    def Generate(self):
        self.words = jieba.cut(self.sentence)
        self.sentence2 = ' '.join(list(self.words))
        self.pos = self.st.tag(self.sentence2.split())
        self.senpos = [(sp.split('#')[0], sp.split('#')[1]) for _, sp in self.pos]
        print self.sentence2
        print self.pos
        self.actions = ActionList(self.rdfgraph)
        for pat in self.patterns:
            self.match(self.senpos, pat[0], pat[1])
        print self.actions

    def GetCount(self, pattype):
        if pattype in self.nodecount:
            ID = self.nodecount[pattype]
            self.nodecount[pattype] += 1
            return ID
        else:
            self.nodecount[pattype] = 1
            return 0

    def match(self, senpos, pattern, pattype):
        patarr = pattern.split()
        paralist = []
        i = 0
        canmatch = True
        while i < len(senpos):
            canmatch = True
            regextra = 0
            j = 0
            while j < len(patarr):
                if patarr[j][0] == '(':
                    if patarr[j][1:-1] in senpos[i + j + regextra][1]:
                        paralist.append(senpos[i + j + regextra][0])
                    else:
                        canmatch = False
                        break
                elif patarr[j][0] == '[':
                    contentstr = patarr[j].split(']')[0][1:]
                    contents = contentstr.split('|')
                    if patarr[j][-1] == '}':
                        times = patarr[j].split('{')[1][:-1].split(',')
                        minimum_allowed_occurance = int(times[0])
                        maximum_allowed_occurance = int(times[1])
                        repeat = 0
                        for repeatednum in range(minimum_allowed_occurance, maximum_allowed_occurance + 1):
                            if senpos[i + j + regextra + repeatednum][0] in contents:
                                repeat = repeatednum
                            else:
                                if repeatednum == 0:
                                    regextra -= 1
                                else:
                                    regextra += repeat
                                break
                    else:
                        if senpos[i + j + regextra][0] in contents:
                            pass
                        else:
                            canmatch = False
                            break
                elif patarr[j][0] == '{':
                    content = patarr[j][1:-1]
                    if content in senpos[i + j + regextra][0]:
                        pass
                    else:
                        canmatch = False
                        break
                elif patarr[j] == senpos[i + j + regextra][0]:
                    pass
                else:
                    canmatch = False
                    break
                j += 1
            if canmatch:
                break
            else:
                paralist = []
                i += 1

        ID = lambda x: str(self.GetCount(x))

        if pattype == 'event':
            if len(paralist) != 3 or not canmatch:
                return []
            tid = ID('t')
            res = ['SELECT ?t' + tid, " WHERE ", "{ "]
            NodeID = ID(pattype)
            res.append('?event' + NodeID + ' <http://www.example.org/subject> \"' + paralist[0] + '\" .')
            res.append('?event' + NodeID + ' <http://www.example.org/trigger> \"' + paralist[1] + '\" .')
            res.append('?event' + NodeID + ' <http://www.example.org/object> \"' + paralist[2] + '\" .')
            res.append('?event' + NodeID + ' <http://www.example.org/time> ?t' + tid + ' .')
            res.append('}')
            command = '\n'.join(res)
            act = Action('sparql')
            act.setCommand(command)
            act.inputtype = 'None'
            act.keydict['subject'] = paralist[0]
            act.returntype = 'value'
            self.actions.add(act)
        elif pattype == 'stock_increase':
            if not canmatch:
                return []
            if len(paralist) == 1:
                companyname = self.actions[-1].keydict['subject']
            elif len(paralist) == 2:
                companyname = paralist[0]
            res = ['SELECT ?support ?p ', "WHERE ", "{ "]
            NodeID = ID('company')
            res.append('?company' + NodeID + ' <http://www.example.org/support> ?support .')
            res.append('?company' + NodeID + ' <http://www.example.org/name> \"' + companyname + '\" .')
            supportNodeID = ID('supportnode')
            stockNodeID = ID('stocknode')
            res.append('?supportnode' + supportNodeID + ' <http://www.example.org/name> ?support .')
            res.append('?supportnode' + supportNodeID + ' <http://www.example.org/stock> ?stock' + stockNodeID + ' .')
            res.append('?stock' + stockNodeID + ' <http://www.example.org/stocktime> \"%s\" .')
            res.append('?stock' + stockNodeID + ' <http://www.example.org/price> ?p .')
            res.append('}')
            command = '\n'.join(res)
            act = Action('sparql')
            act.inputtype = 'timestamp'
            act.setCommand(command)
            self.actions.add(act)
            act1 = copy.deepcopy(act)
            act1.inputtype = 'latertimestamp'
            self.actions.add(act1)
            actminus = Action('minus')
            actminus.inputtype = 'table'
            self.actions.add(actminus)
            actmax = Action('max')
            actmax.inputtype = 'table'
            self.actions.add(actmax)
        elif pattype == 'specific_type_stock_increase':
            if not canmatch:
                return []
            stocktype = paralist[0]
            res = ['SELECT ?company ?p ', "WHERE ", "{ "]
            companyNodeID = ID('company')
            stockNodeID = ID('stocknode')
            res.append('?companynode' + companyNodeID + ' <http://www.example.org/name> ?company .')
            res.append('?companynode' + companyNodeID + ' <http://www.example.org/stock> ?stock' + stockNodeID + ' .')
            res.append('?companynode' + companyNodeID + ' <http://www.example.org/type> \"' + stocktype + '\" .')
            res.append('?stock' + stockNodeID + ' <http://www.example.org/stocktime> \"%s\" .')
            res.append('?stock' + stockNodeID + ' <http://www.example.org/price> ?p .')
            res.append('}')
            command = '\n'.join(res)
            act = Action('sparql')
            act.inputtype = 'timestamp'
            act.setCommand(command)
            self.actions.add(act)
            act1 = copy.deepcopy(act)
            act1.inputtype = 'latertimestamp'
            self.actions.add(act1)
            actminus = Action('minus')
            actminus.inputtype = 'table'
            self.actions.add(actminus)
            actmax = Action('max')
            actmax.inputtype = 'table'
            self.actions.add(actmax)
    for docid, weight in postings:
        accumulator[docid] += weight
    return accumulator.most_common(k)


############ End of copied code

# printing the start time of the script
print("Start Time:", ctime())

# initializing taggers and models from NLTK
os.environ["STANFORD_MODELS"] = "/chechi/Documents/StanfordNER"
stanford_NER_tagger = StanfordNERTagger(
    '/Users/chechi/Documents/StanfordNER/english.all.3class.distsim.crf.ser.gz',
    '/Users/chechi/Documents/StanfordNER/stanford-ner.jar')
stanford_POS_tagger = StanfordPOSTagger(
    '/Users/chechi/Documents/StanfordNER/english-bidirectional-distsim.tagger',
    '/Users/chechi/Documents/StanfordNER/stanford-postagger.jar')
stemmer = nltk.stem.PorterStemmer()

# This is the cache file that will store the precomputed best sentences and tags
# so that we don't have to tag each time we run this script
if (runOn == "DEV"):
    fname = 'bestSentencesTaggedDev.bin'
else:
    fname = 'bestSentencesTaggedTrain.bin'

# This variable will store all tagged most relevant sentences
NER_tagged = None
from nltk.corpus import stopwords
import nltk
from nltk import ne_chunk, pos_tag, Tree
from nltk.stem import PorterStemmer
import re
import html
from nltk import StanfordPOSTagger, StanfordNERTagger
from feature_extraction.resources import cList

model_pos_tag = '../stanford-postagger-2018-10-16/models/english-bidirectional-distsim.tagger'
jar_pos_tag = '../stanford-postagger-2018-10-16/stanford-postagger.jar'
model_en_tag = '../stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz'
jar_en_tag = '../stanford-ner-2018-10-16/stanford-ner-3.9.2.jar'

tagger_pos = StanfordPOSTagger(model_pos_tag, path_to_jar=jar_pos_tag, encoding='UTF-8')
tagger_en = StanfordNERTagger(model_en_tag, path_to_jar=jar_en_tag, encoding='UTF-8')


# preprocessing helper function to obtain a string without HTML tags
def html_and_remove(entry):
    return re.sub(r'<.*?>', '', html.unescape(entry))


# aggregate function removing all HTML tags from the data
def remove_html_tags(data):
    for count, entry in enumerate(data):
        print(count)
        entry['postText'][0] = html_and_remove(entry['postText'][0])
        entry['targetTitle'] = html_and_remove(entry['targetTitle'])
        entry['targetDescription'] = html_and_remove(entry['targetDescription'])
        entry['targetKeywords'] = html_and_remove(entry['targetKeywords'])
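# --- Hedged usage sketch (not from the original snippet) ---
# `sample_data` is a made-up list of dicts shaped like the corpus entries used above.
sample_data = [{
    'postText': ['<b>You won&#39;t believe this</b>'],
    'targetTitle': '<h1>Headline</h1>',
    'targetDescription': '<p>Description</p>',
    'targetKeywords': 'news, <i>viral</i>',
}]
remove_html_tags(sample_data)
print(sample_data[0]['postText'][0])  # -> "You won't believe this"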