def pos_tag(mots,
            jar=os.path.join(".", "models", "stanford-postagger", "stanford-postagger-3.8.0.jar"),
            mdl=os.path.join(".", "models", "stanford-postagger", "french-ud.tagger")):
    try:
        pos_tagger = StanfordPOSTagger(mdl, jar, encoding='utf8')
    except LookupError:
        # Java was not found on the PATH: point NLTK at a local JRE and retry.
        java_path = r"C:\Program Files (x86)\Java\jre1.8.0_261\bin\java.exe"
        os.environ['JAVAHOME'] = java_path
        pos_tagger = StanfordPOSTagger(mdl, jar, encoding='utf8')
    tagged = pos_tagger.tag(mots)
    tags = [g for m, g in tagged]
    # Remap tags the downstream table cannot handle: "au"/"aux" are forced to
    # DET, PART becomes ADV, SCONJ becomes CONJ.
    forced_det = ["au", "aux"]
    absent_of_table = ["PART", "SCONJ"]
    if any(item in mots for item in forced_det) or any(item in tags for item in absent_of_table):
        for i, couple in enumerate(tagged):
            mot = couple[0]
            gram = couple[1]
            if mot in forced_det:
                tagged[i] = (mot, "DET")
            if gram == "PART":
                tagged[i] = (mot, "ADV")
            if gram == "SCONJ":
                tagged[i] = (mot, "CONJ")
    return tagged
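# Usage sketch for pos_tag above — an assumption-laden example, not part of the
# original code: it presumes the default jar and french-ud.tagger paths exist
# locally, and the French sentence is purely illustrative.
exemple = pos_tag("Le chat dort aux pieds du lit".split())
print(exemple)  # "aux" comes back as DET; any PART becomes ADV, SCONJ becomes CONJ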
def new_load_data(f_name):
    data = {}
    import os
    # Point NLTK at a local JDK so the Stanford tagger can spawn Java.
    java_path = "C:/Program Files/Java/jdk1.8.0_121/bin/java.exe"
    os.environ['JAVAHOME'] = java_path
    st = StanfordPOSTagger('english-bidirectional-distsim.tagger',
                           'stanford-postagger.jar', encoding='utf-8')
    with open(f_name, 'r') as file:
        for line in file:
            fields = line.split('\t')
            sent_id = fields[0]
            data[sent_id] = {}
            data[sent_id][SENTENCE] = fields[1].strip('\n').split()
            data[sent_id][ENTITIES] = {}
            tokenized_sent = nltk.sent_tokenize(fields[1])
            for sent in tokenized_sent:
                chunk_id = 0
                for chunk in nltk.ne_chunk(st.tag(nltk.word_tokenize(sent))):
                    if hasattr(chunk, 'label'):
                        # Record the entity at its token offset within the sentence.
                        data[sent_id][ENTITIES][chunk_id] = (
                            chunk.label(), ' '.join(c[0] for c in chunk))
                        chunk_id += len([c[0] for c in chunk])
                        print(chunk.label(), ' '.join(c[0] for c in chunk))
                    else:
                        chunk_id += 1
    return data
def tag_tweet_twit(tweet, tagger=StanfordPOSTagger('gate-EN-twitter-fast.model')):
    # Get indices for words
    tokens = twit_token.tokenize(tweet['text'])
    text = " ".join(tokens)
    tags = tagger.tag(tokens)
    try:
        # Attach character indices for each tagged token.
        index_tags = [{
            'indices': [text.index(word), text.index(word) + len(word)],
            'category': tag
        } for word, tag in tags]
    except ValueError:
        try:
            # Fall back to category-only entries when a token cannot be located.
            index_tags = [{'category': tag} for word, tag in tags]
        except Exception:
            error_id = tweet["_id"]
            print(error_id)
            return error_id
    tweet['entities'].update({'Token': index_tags})
    tagged_tweet = json.dumps(tweet) + "\n"
    with open(POS_dir + tok + DBname + "_twitIE_POS", 'a') as tagFile:
        tagFile.writelines(tagged_tweet)
    return ""
def get_postag_with_index(sources, idx2word, word2idx):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path) - 10) + 1] + 'stanford-postagger/'
    print(path)
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # Collect every jar under the tagger directory onto the classpath.
    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx in range(len(sources)):
        test_s_o = sources[idx]
        source_text = keyphrase_utils.cut_zero(test_s_o, idx2word)
        text = pos_tagger.tag(source_text)
        print('[%d/%d] : %s' % (idx, len(sources), str(text)))
        tagged_source.append(text)
    return tagged_source
def __init__(self):
    self.feat_index = {}
    self.implication_words = ["demonstrate", "suggest", "indicate"]
    self.hyp_words = ["possible"]
    self.method_words = ["probe", "detect"]
    self.pos_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')
def newone(inpt):
    data = pd.read_csv('combinedproject.csv')
    jar = 'C:/Users/Deepanshu.pal/Documents/deepanshu/stanford-postagger-2018-10-16/stanford-postagger.jar'
    model = 'C:/Users/Deepanshu.pal/Documents/deepanshu/stanford-postagger-2018-10-16/models/english-left3words-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')
    data['bag'] = data.apply(lambda row: nltk.word_tokenize(row["Description"]), axis=1)
    stop_words = set(stopwords.words('english'))
    data['bag_remove_stopwords'] = data.apply(
        lambda ro: [w for w in ro['bag'] if w not in stop_words], axis=1)
    ps = PorterStemmer()

    def text_process(mess):
        # Strip punctuation, stem, then drop stopwords.
        nopunc = ''.join(char for char in mess if char not in string.punctuation)
        stem_data = [ps.stem(w) for w in nopunc.split()]
        return [word for word in stem_data if word.lower() not in stopwords.words('english')]

    data['bag_stemming'] = data.apply(
        lambda ro: [ps.stem(w) for w in ro['bag_remove_stopwords']], axis=1)
    data['Description'].head(5).apply(text_process)  # sanity check on a few rows
    # TF-IDF over the processed descriptions, then rank rows by cosine
    # similarity to the new project description.
    bow_transformer = CountVectorizer(analyzer=text_process).fit(data['Description'])
    messages_bow = bow_transformer.transform(data['Description'])
    tfidf_transformer = TfidfTransformer().fit(messages_bow)
    messages_tfidf = tfidf_transformer.transform(messages_bow)
    new_project_bow = bow_transformer.transform([inpt])
    new_project_vector = tfidf_transformer.transform(new_project_bow)
    data['similar_score'] = cosine_similarity(new_project_vector, messages_tfidf).flatten()
    return data.sort_values(["similar_score"], axis=0, ascending=False)
def main():
    with open('/home/abhinav/PycharmProjects/video_enrichment/text.txt', 'r') as myfile:
        text = myfile.read().replace('\n', '')
    text = p.sub('', text)
    sentences = nltk.sent_tokenize(text)
    for sentence in sentences:
        sentence = sentence.lower()  # Lower-case the whole sentence
        sentence = p.sub('', sentence)  # Remove anything enclosed within brackets
        print(sentence)

        ## TAGGING
        st_tag = StanfordPOSTagger(model_filename=eng_model_filename_pos,
                                   path_to_jar=my_path_to_pos_jar)
        tagged_sentence = st_tag.tag(word_tokenize(sentence))

        ## ENTITY RECOGNITION (optional)
        # st_ner = StanfordNERTagger(model_filename=eng_model_filename_ner, path_to_jar=my_path_to_ner_jar)
        # print(st_ner.tag('Rami Eid is studying at Stony Brook University in NY'.split()))

        ## PARSING (optional)
        # print(parsing(sentence))

        ## Chunking using regex grammars over the POS tags
        regex_exps = [
            "NP: {<JJ|NN.?>*<NN.?>}",
            "NP: {<JJ|NN.?>*<NN.?><IN>?<JJ|NN.?>*<NN.?>}",
            "NP: {<JJ>*<NN.?>+}"
        ]
        # Include the pattern "NP: {<JJ|NN.?>*<NN.?><CC>?<JJ|NN.?>*<NN.?>}" to count conjunctions
        for grammar in regex_exps:
            IOB_tagged = chunking(tagged_sentence, grammar)
            remove_IOBtags(IOB_tagged)

    ## Prune concepts on word level using word-frequency counts on the BBC corpus
    prune_concepts_WordLevel()
    print("Pruned concepts are:", pruned_concepts)

    ## Match extracted concepts to Wikipedia titles: keep a title if its Jaccard
    ## similarity with a concept is one, or if the title is part of the concept
    Wikipedia_aritcle_matching()
    print("\n", concept_wiki_article)
    print("\nFinal List Of Concepts:", final_wiki_concepts)

    wiki_based_similarity()
    Connected_components = nx.connected_components(un_prereq_graph)
    print("\n Pre-req Graph successfully created")
    nx.draw(prereq_graph, with_labels=True)
    plt.axis('off')
    plt.savefig("graph_prereq.png")
def __init__(
        self,
        java_path="C:/Program Files (x86)/Java/jre1.8.0_251/bin/java.exe",
        stopwords=default_stopwords,
        symbols="""#§_-@+=*<>()[]{}/\\"'""",
        punct="""!;:,.?-..."""):
    # Path of the directory containing the current file
    dir_path = os.path.dirname(os.path.abspath(__file__))
    # Load the lexicon
    f = open(dir_path + '/data/lexique.txt', 'r', encoding="utf8")
    self.lexique = dict(eval(f.read()))
    # Define the stopwords
    self.stopwords = stopwords
    self.symbols = symbols
    self.punct = punct

    ## TAGGER INITIALISATION
    # Paths for the StanfordPOSTagger
    jar = dir_path + '/stanford-postagger-full-2018-10-16/stanford-postagger-3.9.2.jar'
    model = dir_path + '/stanford-postagger-full-2018-10-16/models/french.tagger'
    self.java_path = java_path
    os.environ['JAVAHOME'] = self.java_path
    # Initialise the StanfordPOSTagger
    self.pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')
def __init__(self):
    classifier_path1 = "stanford/english.muc.7class.distsim.crf.ser.gz"
    # scenario 1
    # classifier_path2 = "stanford/id-ner-model-half.ser.gz"
    # scenario 2
    # classifier_path2 = "stanford/id-ner-model-id.ser.gz"
    # scenario 3
    # classifier_path2 = "stanford/id-ner-model-2.ser.gz"
    ner_jar_path = "stanford/stanford-ner.jar"
    # Raise the Java heap size to avoid errors in nltk.internals
    nltk.internals.config_java(options='-xmx5g')
    self.pre = Preprocess()
    self.scp = StanfordParser(
        './stanford/stanford-parser.jar',
        './stanford/stanford-parser-3.9.1-models.jar',
        encoding='utf8')
    self.ner_tagger = StanfordNERTagger(classifier_path1, ner_jar_path, encoding='utf8')
    # for scenario 3
    self.pos_tagger = StanfordPOSTagger(
        './stanford/english-bidirectional-distsim.tagger',
        './stanford/stanford-postagger.jar',
        encoding='utf8')
    # Combining the Stanford classifier with a custom classifier (scenarios 1 and 2):
    # self.com_tagger = NERComboTagger(classifier_path1, ner_jar_path,
    #                                  stanford_ner_models=classifier_path1 + "," + classifier_path2)
    self.core_nlp = StanfordCoreNLP('http://localhost', port=9000)
def update_training_data(usr_input, label, command):
    format_input = ""
    st = StanfordPOSTagger(config['tagger']['model'], path_to_jar=config['tagger']['path'])
    tags = st.tag(usr_input.split())
    print(tags)
    with open(MAPPING_PATH, 'r') as data_file:
        data = json.load(data_file)
    # Keep everything except proper nouns in the template string.
    for pos, tag in enumerate(tags):
        if tag[1] != "NNP":
            format_input += tag[0]
            format_input += " "
    data[label].append(format_input)
    with open(MAPPING_PATH, "w") as jsonFile:
        jsonFile.write(json.dumps(data, sort_keys=False, indent=4))
    with open(TRAINDATA_PATH, 'r') as data_file:
        data = json.load(data_file)
    add_dict = {"text": format_input, "label": label}
    data.append(add_dict)
    with open(TRAINDATA_PATH, "w") as jsonFile:
        jsonFile.write(json.dumps(data, sort_keys=False, indent=4))
    with open(COMMAND_PATH, 'r') as data_file:
        data = json.load(data_file)
    add_dict = {format_input: command}
    data[label].update(add_dict)
    with open(COMMAND_PATH, "w") as jsonFile:
        jsonFile.write(json.dumps(data, sort_keys=False, indent=4))
    print('Added')
def handleMessage(sid, txt):
    tagger = StanfordPOSTagger(_path_to_model,
                               path_to_jar=_path_to_jar,
                               java_options='-mx4096m')
    tagged = tagger.tag(nltk.word_tokenize(txt))
    responseMessage = str(tagged)
    sendResponse(sid, responseMessage)
def french_processing_text(df, text_field, POS_JAR, POS_MODEL):
    tokenizer = RegexpTokenizer(r'\w+')
    pos_tagger = StanfordPOSTagger(POS_MODEL, POS_JAR, encoding='utf8')
    lemmatizer = FrenchLefffLemmatizer()
    stop = stopwords.words('french')
    # Store the processed original dataset
    new_text = []
    # Store normalized words (1-grams) and their POS
    words = []
    pos = []
    for text in df[text_field]:
        # A text may have multiple sentences; treat one sentence at a time,
        # because POS tagging operates on sentence boundaries.
        new_sentences = ''
        for sentence in text.split('.'):
            processed_sentence = french_processing_sentence(
                sentence, tokenizer, pos_tagger, lemmatizer, stop)
            new_sentences += ' '.join(processed_sentence) + ' '
        new_text.append(new_sentences)
    df['processed_text'] = new_text
    return df
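# Usage sketch for french_processing_text — assumptions: pandas is imported as
# pd, and POS_JAR/POS_MODEL point at a local French Stanford tagger; the
# dataframe content is illustrative.
df_demo = pd.DataFrame({'avis': ["Le produit est arrivé cassé. Service client réactif."]})
df_demo = french_processing_text(df_demo, 'avis', POS_JAR, POS_MODEL)
print(df_demo['processed_text'][0])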
def get_word_dependencies(text):
    dependencies = {}
    dep_parser = StanfordDependencyParser(
        model_path=osp.join(
            datadir,
            "stanford_data/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"),
        java_options="-mx4g -XX:-UseGCOverheadLimit")
    st = StanfordPOSTagger(
        osp.join(datadir, 'stanford_pos/models/english-bidirectional-distsim.tagger'),
        osp.join(datadir, "stanford_pos/stanford-postagger-3.9.1.jar"),
        java_options='-mx4g -XX:-UseGCOverheadLimit')
    # Put every jar under the tagger directory on the classpath.
    stanford_dir = st._stanford_jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    st._stanford_jar = ':'.join(stanford_jars)
    result = dep_parser.raw_parse(text)
    dep = result.__next__()
    # Build a head -> [(dependent, relation), ...] mapping from the triples.
    for i in list(dep.triples()):
        w1 = i[0][0]
        w2 = i[2][0]
        if w1 in dependencies:
            dependencies[w1].append((w2, i[1]))
        else:
            dependencies[w1] = [(w2, i[1])]
    return dependencies
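# Usage sketch for get_word_dependencies — assumes datadir points at a local
# Stanford install; the sentence is illustrative.
deps = get_word_dependencies("The quick brown fox jumps over the lazy dog")
print(deps)  # head word -> [(dependent, relation), ...], e.g. 'jumps': [('fox', 'nsubj'), ...]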
def call_reia():
    max_score = 0.1
    map_val = ""
    print('-----------------------')
    user_input = raw_input("enter the string: ")
    #user_name = get_username(first_line.split(' ', 1)[0])
    suggest_list = []
    suggest_message = ""
    print("\nINPUT = ")
    print(user_input)
    label = classify(user_input)
    if label == "":
        post_message("Sorry, I could not understand. Please rephrase and try again.")
        consume_message()
    print("Classified as : " + str(label))
    tokens = nltk.word_tokenize(user_input)
    print(tokens)
    st = StanfordPOSTagger(config['tagger']['model'],
                           path_to_jar=config['tagger']['path'])
    stanford_tag = st.tag(user_input.split())
    print("Tags")
    print(stanford_tag)
    """with open(MAPPING_PATH,'r') as data_file:
def __init__(self,
             use_stanford=False,
             NER_model=None,
             NER_tagger=None,
             POS_model=None,
             POS_tagger=None):
    """The initializer of the class

    :param NER_model: NER model path
    :param NER_tagger: NER tagger path
    :param POS_model: POS model path
    :param POS_tagger: POS tagger path
    :param use_stanford: boolean, if using Stanford NER and POS tagging
    """
    self.NER_model = NER_model
    self.NER_tagger = NER_tagger
    self.POS_model = POS_model
    self.POS_tagger = POS_tagger
    self.use_stanford = use_stanford
    if use_stanford:
        if NER_model is None or NER_tagger is None or POS_model is None \
                or POS_tagger is None:
            sys.exit('tagging initialization: Stanford models and taggers'
                     ' have to be provided!')
        else:
            self.post = StanfordPOSTagger(self.POS_model, self.POS_tagger).tag
            self.nert = StanfordNERTagger(self.NER_model, self.NER_tagger).tag
    else:
        # Fall back to NLTK's built-in taggers.
        self.post = nltk.pos_tag
        self.nert = nltk.ne_chunk
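# Usage sketch — the class name Tagging is hypothetical here (the snippet only
# shows __init__); with use_stanford=False the NLTK fallbacks need no paths.
t = Tagging(use_stanford=False)
print(t.post(['Stanford', 'taggers', 'are', 'optional']))  # nltk.pos_tag output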
def segment_pos(dir='rawdata', datetype='all', outdir='nohref_seg'):
    jieba.set_dictionary('dict/dict.txt.big')
    for tag in loadTag():
        jieba.add_word(tag)
    chinese_postagger = StanfordPOSTagger('tagger/chinese-distsim.tagger',
                                          'tagger/stanford-postagger.jar',
                                          encoding='utf-8')
    for file in parseDateType(dir, datetype):
        dirname, filename = os.path.split(file)
        head = filename.split('.')[0]
        outfile = outdir + '/' + head + '.txt'
        if os.path.isfile(outfile):
            print('pass %s...' % head)
            continue
        print('segment %s ...' % head)
        f = open(outfile, 'w', encoding='utf-8')
        dataList = readJson(file)
        p = re.compile("http[s]?://.*\n")
        for data in dataList:
            # Strip URLs, segment with jieba, then POS-tag the segments.
            content = data['content']
            content = re.sub(p, '', content)
            segList = jieba.cut(content)
            wordList, tagList = postagging(chinese_postagger, segList)
            for w, t in zip(wordList, tagList):
                f.write(w)
                f.write(' ')
                f.write(t)
                f.write(' ')
            f.write('\n')
        f.close()
def call_reia():
    while True:
        max_score = 0.1
        map_val = ""
        with open('/media/ubuntu/Local Disk/MAJOR_PROJECT/REIA/mqueue.txt', 'r') as f:
            first_line = f.readline()
        while first_line == "":
            time.sleep(1)
            call_reia()
        print('-----------------------')
        user_input = first_line.split(' ', 1)[1]
        user_name = get_username(first_line.split(' ', 1)[0])
        suggest_list = []
        suggest_message = ""
        print("\nINPUT = ")
        print(user_input)
        label = classify(user_input)
        if label == "":
            post_message("Sorry, I could not understand. Please rephrase and try again.")
            consume_message()
            continue
        print("Classified as : " + str(label))
        tokens = nltk.word_tokenize(user_input)
        print(tokens)
        st = StanfordPOSTagger(config['tagger']['model'],
                               path_to_jar=config['tagger']['path'])
        stanford_tag = st.tag(user_input.split())
        print("Tags")
        print(stanford_tag)
        with open(MAPPING_PATH, 'r') as data_file:
            data = json.load(data_file)
        # Pick the known phrase with the highest Jaro similarity to the input.
        for i in data[label]:
            dist = jf.jaro_distance(str(user_input), str(i))
            suggest_list.append((dist, i))
            print(dist)
            if dist > max_score:
                max_score = dist
                map_val = i
        if max_score < config['preferences']['similarity_threshold']:
            post_message("Sorry, I could not understand. Please rephrase and try again.")
            consume_message()
            if config['preferences']['suggestions'] == True:
                suggest = suggestions(suggest_list)
                post_message("Did you mean :")
                for i in suggest:
                    suggest_message += (str(i[1]) + "\n")
                post_message(suggest_message)
            continue
        print("\nMapped to : " + map_val)
        construct_command(user_input, label, tokens, map_val, stanford_tag,
                          exec_command, user_name)
        consume_message()
def emotion_pos_tagging():
    tag_target = ['V', 'N', 'J', 'R']
    tag_list = []
    # Load the emotion word dictionary from the Excel file
    df_emotion = open_emotion_dataframe()
    # POS tagging (build the tagger once, outside the loop)
    STANFORD_POS_MODEL_PATH = "path/english-bidirectional-distsim.tagger"
    STANFORD_POS_JAR_PATH = "path/stanford-postagger-3.9.2.jar"
    pos_tagger = StanfordPOSTagger(STANFORD_POS_MODEL_PATH, STANFORD_POS_JAR_PATH)
    for word in df_emotion['영어']:
        pos = pos_tagger.tag([word])
        tag_first = pos[0][1][0]
        if tag_first in tag_target:
            # Map the first letter of the Penn tag to the Korean POS label.
            if tag_first == 'V':
                tag_list.append('동사')    # verb
            if tag_first == 'N':
                tag_list.append('명사')    # noun
            if tag_first == 'J':
                tag_list.append('형용사')  # adjective
            if tag_first == 'R':
                tag_list.append('부사')    # adverb
        else:
            tag_list.append('')
    df_emotion['품사'] = tag_list
    # Write out the extended word dictionary with POS tags
    df_emotion.to_excel("../res/dic/감정 단어.xlsx")
def __init__(self):
    self.root_path = '../Models/stanfordNLP/'
    # word segmenter
    self.segmenter = StanfordSegmenter(
        path_to_jar=self.root_path + "stanford-segmenter.jar",
        path_to_slf4j=self.root_path + "log4j-over-slf4j.jar",
        path_to_sihan_corpora_dict=self.root_path + "segmenter/",
        path_to_model=self.root_path + "segmenter/pku.gz",
        path_to_dict=self.root_path + "segmenter/dict-chris6.ser.gz")
    # pos tagger
    self.posTagger = StanfordPOSTagger(
        self.root_path + 'pos-tagger/chinese-distsim.tagger',
        path_to_jar=self.root_path + "stanford-postagger.jar")
    # named entity recognizer
    self.nerTagger = StanfordNERTagger(
        self.root_path + 'ner/chinese.misc.distsim.crf.ser.gz',
        path_to_jar=self.root_path + 'stanford-ner.jar')
    # dependency parser
    self.parser = StanfordDependencyParser(
        model_path=self.root_path + 'lexparser/chinesePCFG.ser.gz',
        path_to_jar=self.root_path + 'stanford-parser.jar',
        path_to_models_jar=self.root_path + 'stanford-parser-3.7.0-models.jar',
        encoding='gbk')
def POSTagger(postag_folder_name='', postag_folder='', postag_model='', postag_jarpath=''):
    # Every argument falls back to a default rooted at ~/Stanford NLP.
    # default_postag_folder_name = 'stanford-postagger-full-2015-12-09'
    default_postag_folder_name = 'stanford-postagger-full-2017-06-09'
    if len(postag_folder_name) == 0:
        postag_folder_name = default_postag_folder_name
    default_postag_folder = os.path.join(os.path.expanduser('~'),
                                         'Stanford NLP', postag_folder_name)
    if len(postag_folder) == 0:
        postag_folder = default_postag_folder
    default_postag_model = os.path.join(postag_folder, 'models', 'chinese-distsim.tagger')
    if len(postag_model) == 0:
        postag_model = default_postag_model
    default_postag_jarpath = os.path.join(postag_folder, 'stanford-postagger.jar')
    if len(postag_jarpath) == 0:
        postag_jarpath = default_postag_jarpath
    tagger = StanfordPOSTagger(postag_model, postag_jarpath)
    return tagger
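# Usage sketch for the POSTagger factory — assumes
# stanford-postagger-full-2017-06-09 is unpacked under ~/Stanford NLP; the
# pre-segmented sentence is illustrative.
tagger = POSTagger()
print(tagger.tag(['我', '爱', '北京']))  # (token, tag) pairs from the chinese-distsim model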
def pos_tag(review):
    eng_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')
    tmp = eng_tagger.tag(review)
    result = []
    for element in tmp:
        result.append(element[1])
    return result
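# Usage sketch for the one-argument pos_tag above — assumes the tagger model is
# discoverable via the STANFORD_MODELS/CLASSPATH environment variables; the
# token list is illustrative.
print(pos_tag(['great', 'battery', 'life']))  # tags only, e.g. ['JJ', 'NN', 'NN']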
class KGQAPOSTagger:
    """
    Parts-of-Speech and Named Entity Recognition taggers, based on Stanford
    taggers (https://www.nltk.org/_modules/nltk/tag/stanford.html)
    """
    _POSTagger = StanfordPOSTagger(
        # model_filename='stanford-postagger-2018-10-16/models/english-bidirectional-distsim.tagger',
        model_filename='stanford-postagger-2018-10-16/models/english-left3words-distsim.tagger',
        path_to_jar="stanford-postagger-2018-10-16/stanford-postagger.jar")
    _NERTagger = StanfordNERTagger(
        model_filename='stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
        path_to_jar='stanford-ner-2018-10-16/stanford-ner.jar',
        encoding='utf-8')

    def __init__(self):
        pass

    def tag(self, sentence, ner=True):
        """
        POS and optional NER tagging

        :param sentence: sentence to tag
        :param ner: if True, also perform NER tagging
        :return: list of POS-word tuples and list of NER-word tuples
                 (if ner was set to True)
        """
        if isinstance(sentence, list):
            pos_tags = KGQAPOSTagger._POSTagger.tag(sentence)
        else:
            pos_tags = KGQAPOSTagger._POSTagger.tag(sentence.split())
        if ner:
            # Reuse the token list when the input is already tokenized.
            tokens = sentence if isinstance(sentence, list) \
                else nltk.tokenize.word_tokenize(sentence)
            ner_tags = KGQAPOSTagger._NERTagger.tag(tokens)
        else:
            ner_tags = None
        return pos_tags, ner_tags
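# Usage sketch for KGQAPOSTagger — the question is illustrative; the paths in
# the class body must exist relative to the working directory.
kgqa = KGQAPOSTagger()
pos_tags, ner_tags = kgqa.tag("Who founded Stanford University?")
print(pos_tags)  # [('Who', 'WP'), ('founded', 'VBD'), ...]
print(ner_tags)  # [..., ('Stanford', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ...]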
def _preprpcessing_eng(id_list):
    stop_w = set(stopwords.words('english'))
    eng_tagger = StanfordPOSTagger(
        model_filename='/home/zhouh/Downloads/stanford-postagger-full-2018-02-27/models/english-bidirectional-distsim.tagger',
        path_to_jar='/home/zhouh/Downloads/stanford-postagger-full-2018-02-27/stanford-postagger.jar')
    for i in id_list:
        try:
            text = read_file('/home/zhouh/Thesis/code/Transcripts/english/' + i + '.txt')
            words = nt.word_tokenize(text, language='english')
            # Drop punctuation, stopwords and digits before tagging.
            word = [x for x in words if x not in string.punctuation]
            word = [x for x in word if x not in stop_w]
            word = [x for x in word if not x.isdigit()]
            word = eng_tagger.tag(word)
            # Serialize as "token/TAG token/TAG ..."
            tt = ''
            for w in word:
                tt += '/'.join(w) + ' '
            new_path = ('/home/zhouh/Thesis/code/Transcripts/eng_preprocessed/'
                        + i + 'pre.txt')
            if os.path.exists(new_path):
                os.remove(new_path)
            with open(new_path, 'w') as f:
                f.write(tt)
        except Exception:
            continue
def load_pos(tagger_path, model_path, tagset):
    # Detect model type from the file extension.
    if model_path.endswith('RDR'):
        return POSModelWrapper(RDRPOSTagger(model_path), tagset)
    else:
        return POSModelWrapper(StanfordPOSTagger(model_path, tagger_path, 'utf8'), tagset)
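# Usage sketch for load_pos — both model paths and the tagset name are
# illustrative; the 'RDR' extension routes to RDRPOSTagger, anything else to
# StanfordPOSTagger (which also needs the jar path).
rdr = load_pos(None, 'models/english.RDR', 'ptb')
stanford = load_pos('stanford-postagger.jar', 'models/english-left3words-distsim.tagger', 'ptb')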
def get_postag_with_record(records, pairs):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path) - 10) + 1] + 'stanford-postagger/'
    print(path)
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # Collect every jar under the tagger directory onto the classpath.
    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx, (record, pair) in enumerate(zip(records, pairs)):
        print('*' * 100)
        print('File: ' + record['name'])
        print('Input: ' + str(pair[0]))
        text = pos_tagger.tag(pair[0])
        print('[%d/%d][%d] : %s' % (idx, len(records), len(pair[0]), str(text)))
        tagged_source.append(text)
    return tagged_source
def __init__(self, ): super(Lemmatizer, self).__init__() self.basename = 'lemmatized' self.pos_tagger = StanfordPOSTagger( 'english-left3words-distsim.tagger', java_options='-mx1024m') self.lemmatizer = WordNetLemmatizer() self.max_length = 500
def tag_tweet_zub_full(tweet, tagger=StanfordPOSTagger('gate-EN-twitter-fast.model')):
    # Get indices for words: prefer full_text when present, strip non-word characters.
    if tweet.get('full_text', None):
        tokens = nltk.word_tokenize(re.sub(r'([^\s\w]|_)+', '', tweet['full_text']))
    else:
        tokens = nltk.word_tokenize(re.sub(r'([^\s\w]|_)+', '', tweet['text']))
    text = " ".join(tokens)
    tags = tagger.tag(tokens)
    try:
        # Attach character indices for each tagged token.
        index_tags = [{
            'indices': [text.index(word), text.index(word) + len(word)],
            'category': tag
        } for word, tag in tags]
    except ValueError:
        try:
            # Fall back to category-only entries when a token cannot be located.
            index_tags = [{'category': tag} for word, tag in tags]
        except Exception:
            error_id = tweet["_id"]
            print(error_id)
            return error_id
    tweet['entities'].update({'Token': index_tags})
    tagged_tweet = json.dumps(tweet) + "\n"
    with open(POS_dir + tok + DBname + "_twitIE_POS_FULL_TEXT", 'a') as tagFile:
        tagFile.writelines(tagged_tweet)
    return ""
def fr_words(DATA_PATH, candidats):
    import pandas as pd
    import operator
    import nltk
    from nltk.corpus import stopwords
    # NLTK Stanford French tagger
    from nltk.tag import StanfordPOSTagger
    jar = 'C:/Users/user/Downloads/stanford-postagger-full-2018-02-27/stanford-postagger-full-2018-02-27/stanford-postagger-3.9.1.jar'
    model = 'C:/Users/user/Downloads/stanford-postagger-full-2018-02-27/stanford-postagger-full-2018-02-27/models/french.tagger'
    import os
    java_path = "C:/ProgramData/Oracle/Java/javapath/java.exe"
    os.environ['JAVAHOME'] = java_path
    pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')
    # Tokenizer (drops #, @ and punctuation)
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    # Read the tweets
    df = pd.read_csv(DATA_PATH)
    df = df[df['text'].notnull()]
    a = len(df)
    fr_words = [[] for i in range(len(candidats))]
    indesirable = ["RT", "https", "http", "c", "les", "et", "ça", "coach",
                   "ils", "thevoice", "quand", "donc", "thevoice_tf1"]
    for j in range(len(candidats)):
        count = dict()
        candidat = candidats[j]
        for i in range(0, a):
            # Skip a handful of known-bad rows.
            if i in [7224, 16457, 16458, 22348, 22349, 22350, 22351, 22352,
                     22353, 22354, 22355]:
                continue
            line = df.at[i, 'text']
            tokenized = tokenizer.tokenize(line)
            # Keep only lowercased words that are neither French stopwords
            # (de, que, dans...) nor in the undesirable list.
            words = [w.lower() for w in tokenized
                     if w not in stopwords.words('french') and w not in indesirable]
            if set(candidat) & set(words):
                for word in words:
                    if word in count.keys():
                        count[word] += 1
                    else:
                        count[word] = 1
        count = sorted(count.items(), key=operator.itemgetter(1), reverse=True)
        fr_words1 = count[0:50]
        # Remove all verbs among the top words.
        for element in fr_words1:
            if pos_tagger.tag(element[0].split())[0][1] not in ['VINF', 'V']:
                fr_words[j].append(element)
    return fr_words
def features(text):
    # POS tagging
    tagger = StanfordPOSTagger(model_filename=model_filename,
                               path_to_jar=path_to_jar,
                               encoding='utf8',
                               verbose=False,
                               java_options='-mx3000m')
    classified_word = tagger.tag(nltk.word_tokenize(text))
    text_postags = [index_classified[1] for index_classified in classified_word]
    freq_pos = nltk.FreqDist(text_postags)
    adverb, adjective, noun, pronoun, verb = 0, 0, 0, 0, 0
    for index_freq in freq_pos.most_common(len(freq_pos)):
        if index_freq[0] in ["RB", "RBR", "RBS"]:
            adverb += index_freq[1]
        elif index_freq[0] in ["JJ", "JJR", "JJS"]:
            adjective += index_freq[1]
        elif index_freq[0] in ["NN", "NNS", "NNP", "NNPS"]:
            noun += index_freq[1]
        elif index_freq[0] in ["PRP", "PRP$"]:
            pronoun += index_freq[1]
        elif index_freq[0] in ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]:
            verb += index_freq[1]
    # Note: the ratios below assume every POS class occurs at least once;
    # a text with, say, no pronouns raises ZeroDivisionError.
    X_test = []
    X_test.extend((adverb / adjective, adverb / noun, adverb / pronoun,
                   adjective / verb, adjective / pronoun, noun / verb,
                   noun / pronoun, verb / pronoun))
    return X_test
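# Usage sketch for features — assumes the module-level model_filename and
# path_to_jar are configured; the sample sentence deliberately contains every
# POS class the ratios divide by.
sample = "She quickly read the very long report and he carefully signed it."
print(features(sample))  # eight ratio features: adverb/adjective, adverb/noun, ...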
def TagProb(Readfile, file):
    # Start from a clean output file.
    if path.exists(sys.path[0] + '/Preparation/save/data/' + file):
        remove(sys.path[0] + '/Preparation/save/data/' + file)
    tagger = StanfordPOSTagger(model_filename, path_to_jar)
    WordDict = {}
    # Count tag frequencies per word.
    for line in open(Readfile):
        sentence = tagger.tag(line.split())
        for WordTag in sentence:
            if WordTag[0] not in WordDict.keys():
                WordDict[WordTag[0]] = {}
                WordDict[WordTag[0]][WordTag[1]] = 1
            else:
                if WordTag[1] not in WordDict[WordTag[0]].keys():
                    WordDict[WordTag[0]][WordTag[1]] = 1
                else:
                    WordDict[WordTag[0]][WordTag[1]] += 1
    # Normalize counts into per-word tag probabilities.
    for word in WordDict.keys():
        sum_freq = 0
        for tag in WordDict[word].keys():
            sum_freq += WordDict[word][tag]
        for tag in WordDict[word].keys():
            WordDict[word][tag] = WordDict[word][tag] / sum_freq
    with open(file, 'a', encoding='utf-8') as Writer:
        for word in WordDict.keys():
            Writer.write(str(word) + ':' + str(WordDict[word]) + '\n')
    return WordDict
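# Usage sketch for TagProb — assumes the module-level model_filename and
# path_to_jar are set; both file names are illustrative.
probs = TagProb('corpus.txt', 'tag_probs.txt')
print(probs.get('run'))  # e.g. {'VB': 0.6, 'NN': 0.4}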