def part_of_speeach(self):
    """Tag the first lyrics entry with the Stanford Arabic POS tagger.

    Returns:
        str: each token rendered as "word TAG / " concatenated together
        (note the trailing " / ", preserved for caller compatibility).

    NOTE(review): jar/model paths are hard-coded to one machine — consider
    making them module-level constants or config. Method name typo
    ("speeach") is kept because callers depend on it.
    """
    os.environ["JAVA_HOME"] = "/usr/bin/java"
    jar = '/home/ahmad/PycharmProjects/untitled1/stanford-postagger-full-2015-12-09/stanford-postagger.jar'
    model = '/home/ahmad/PycharmProjects/untitled1/stanford-postagger-full-2015-12-09/models/arabic.tagger'
    tagger = StanfordPOSTagger(model, jar)
    tagger.java_options = '-mx4096m'  # raise JVM heap for long sentences
    tagged = tagger.tag(word_tokenize(self.lyrics[0]))
    # Single join instead of quadratic += concatenation in a loop.
    return ''.join(word + ' ' + tag + ' / ' for word, tag in tagged)
def postagger_nltk(word_lists):
    """POS-tag sentences with the Stanford tagger via NLTK.

    Args:
        word_lists: iterable of sentence strings; each sentence is split on
            whitespace before tagging.

    Returns:
        list[str]: one string per sentence of space-separated "word/tag"
        pairs. The tagger returns tokens of the form "word#tag" in the
        second tuple element, which is re-joined here with "/".
    """
    os.environ['JAVAHOME'] = JAVA_PATH
    os.environ["STANFORD_PARSER"] = STANFORD_POSTAGGER_PATH
    os.environ["STANFORD_MODELS"] = STANFORD_POSTAGGER_MODELS
    chinese_tagger = StanfordPOSTagger(model_filename=nltk_pos_model_filename,
                                       path_to_jar=STANFORD_POSTAGGER_PATH)
    chinese_tagger.java_options = '-mx12000m'  # large JVM heap for long input
    nltk_all_tag = []
    # enumerate replaces the hand-rolled `flag` counter.
    for count, sentence in enumerate(word_lists, start=1):
        analyse = chinese_tagger.tag(sentence.split())
        # Build each "word/tag" pair once, then join — avoids quadratic +=.
        pairs = []
        for tag_tuple in analyse:
            word, tag = tag_tuple[1].split("#")[0], tag_tuple[1].split("#")[1]
            pairs.append(word + "/" + tag)
        nltk_all_tag.append(" ".join(pairs))
        # Message previously said "LTP POSTagger", but this is the Stanford
        # tagger driven through NLTK — corrected for accurate logs.
        print("######Stanford POSTagger finished " + str(count) + " sentences")
    print("NLTK POSTagger is finished!!")
    return nltk_all_tag
def __init__(self, file_path, tagged_words_path=None):
    '''Creates a Collocations instance with a text

    file_path - string path to .txt input file; used to generate full
        description of results in output file, whether or not
        tagged_words is given
    tagged_words_path - string path to .txt file containing string
        representation of list of tagged words in input file; saves time
        and resources on computation
    '''
    self.file_path = file_path
    if tagged_words_path is None:
        # Open input file, extract text, and close file deterministically
        # (with-block closes even if read() raises).
        with open(file_path, 'r', encoding='utf-8') as document:
            raw = document.read().lower()
        # Tokenize text into words and tag parts of speech using the
        # Stanford part-of-speech tagger.
        sentences = nltk.sent_tokenize(raw)
        tokenized_sentences = [nltk.word_tokenize(w) for w in sentences]
        java_path = 'C:/Program Files/Java/jdk-9.0.1/bin/java.exe'
        os.environ['JAVAHOME'] = java_path
        path_to_model = ('stanford-postagger-2017-06-09/models/'
                         'english-left3words-distsim.tagger')
        path_to_jar = ('stanford-postagger-2017-06-09/'
                       'stanford-postagger.jar')
        tagger = StanfordPOSTagger(path_to_model, path_to_jar)
        tagger.java_options = '-mx4096m'  # raise JVM heap for long sentences
        tagged_sentences = tagger.tag_sents(tokenized_sentences)
        # Flatten per-sentence lists in linear time; sum(lists, []) is
        # quadratic in the number of tagged words.
        self.tagged_words = [pair for sent in tagged_sentences for pair in sent]
    else:
        # Load pre-tagged words from their repr; literal_eval is safe on
        # untrusted text (no code execution, unlike eval).
        import ast
        with open(tagged_words_path, 'r', encoding='utf-8') as document:
            self.tagged_words = ast.literal_eval(document.read())
from firebase_admin import credentials, firestore

# --- Firebase initialization (module-level side effects) ---
# NOTE(review): `firebase_admin.initialize_app` is called below but only
# `credentials`/`firestore` are imported on this line — presumably
# `import firebase_admin` appears elsewhere in the file; verify.
cred = credentials.Certificate(
    "../DjangoBackendOne/news/newsapp-54f7c-firebase-adminsdk-wzja4-dc085fad0b.json"
)
firebase_admin.initialize_app(cred)
db = firestore.client()

# --- Stanford POS tagger setup ---
jar = '../DjangoBackendOne/stanford-postagger-2018-10-16/stanford-postagger.jar'
model = '../DjangoBackendOne/stanford-postagger-2018-10-16/models/english-left3words-distsim.tagger'
# Point both NLTK's env lookup and its internal config at the same JVM.
java_path = "C:/Program Files/Java/jdk1.8.0_101/bin/java.exe"
os.environ['JAVAHOME'] = java_path
nltk.internals.config_java('C:/Program Files/Java/jdk1.8.0_101/bin/java.exe')
pos_tagger = StanfordPOSTagger(model, jar)
pos_tagger.java_options = '-mx4096m'  # raise JVM heap for long sentences

# Firebase web-app config.
# NOTE(review): API key and project identifiers are hard-coded in source —
# consider moving to environment variables / a secrets store.
config = {
    "apiKey": "AIzaSyBJumddViT3Y70F6vmEdP_1VMGXqEFaqgg",
    "authDomain": "newsapp-54f7c.firebaseapp.com",
    "databaseURL": "https://newsapp-54f7c.firebaseio.com",
    "projectId": "newsapp-54f7c",
    "storageBucket": "newsapp-54f7c.appspot.com",
    "messagingSenderId": "841850292385"
}
# firebase = pyrebase.initialize_app(config)
# Accumulators filled elsewhere in the file (not visible here).
newsObjects = []
entityCount = 1
from nltk import word_tokenize
import os

# Demo script: tag one Arabic sentence with the Stanford POS tagger and
# print "word TAG / " pairs.
os.environ["JAVA_HOME"] = "/usr/bin/java"

# Instead of setting CLASSPATH, pass the jar and model via their paths.
jar = '/home/ahmad/PycharmProjects/untitled1/stanford-postagger-2011-04-20/stanford-postagger.jar'
model = '/home/ahmad/PycharmProjects/untitled1/stanford-postagger-2011-04-20/models/left3words-wsj-0-18.tagger'

tagger = StanfordPOSTagger(model, jar)
tagger.java_options = '-mx4096m'  # raise JVM heap for long sentences

text = tagger.tag(word_tokenize(u'أنا تسلقت شجرة'))
# Single join instead of quadratic += concatenation in a loop.
s = ''.join(word + ' ' + tag + ' / ' for word, tag in text)
# Was Python 2 `print s` — a SyntaxError under the Python 3 the rest of
# this file requires (e.g. open(..., encoding='utf-8')).
print(s)