Example #1
class Lemmatizer(AbstractStemmer):
    def __init__(self, ):
        super(Lemmatizer, self).__init__()
        self.basename = 'lemmatized'
        self.pos_tagger = StanfordPOSTagger(
            'english-left3words-distsim.tagger', java_options='-mx1024m')
        self.lemmatizer = WordNetLemmatizer()
        self.max_length = 500

    def process(self, words):
        current_sentence = []
        pos_words = []
        for word in words:
            current_sentence.append(word)
            if word in '.!?' and len(current_sentence) > self.max_length:
                try:
                    pos_words += self.pos_tagger.tag(current_sentence)
                except Exception:
                    print('Broke on', current_sentence)
                    raise
                current_sentence = []
        for i in range(len(current_sentence) // self.max_length):
            try:
                pos_words += self.pos_tagger.tag(
                    current_sentence[:self.max_length])
            except Exception:
                print('Broke on', current_sentence[:self.max_length])
                raise
            current_sentence = current_sentence[self.max_length:]
        try:
            pos_words += self.pos_tagger.tag(current_sentence)
        except Exception:
            print('Broke on', current_sentence)
            raise
        processed_words = [
            self.lemmatizer.lemmatize(wd, pos=self.get_wn_pos(ps))
            for wd, ps in pos_words
        ]
        return processed_words

    # from http://stackoverflow.com/questions/15586721
    def get_wn_pos(self, treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN
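A minimal usage sketch for the class above (not part of the original project): it assumes AbstractStemmer is importable and needs no constructor arguments, and that the tagger jar and model are discoverable through the CLASSPATH and STANFORD_MODELS environment variables, as in the later examples; the paths are placeholders.

import os

# Hypothetical install location; point these at your own unpacked tagger.
os.environ['CLASSPATH'] = '/opt/stanford-postagger/stanford-postagger.jar'
os.environ['STANFORD_MODELS'] = '/opt/stanford-postagger/models'

lemmatizer = Lemmatizer()
tokens = ['The', 'cats', 'were', 'running', 'quickly', '.']
print(lemmatizer.process(tokens))
# One lemma per input token, e.g. ['The', 'cat', 'be', 'run', 'quickly', '.']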
def emotion_pos_tagging():
    tag_target = ['V', 'N', 'J', 'R']
    tag_list = []

    # Load the word dictionary from the Excel file
    df_emotion = open_emotion_dataframe()

    # POS tagging
    for word in df_emotion['영어']:  # '영어' = English word column
        STANFORD_POS_MODEL_PATH = "path/english-bidirectional-distsim.tagger"
        STANFORD_POS_JAR_PATH = "path/stanford-postagger-3.9.2.jar"

        pos_tagger = StanfordPOSTagger(STANFORD_POS_MODEL_PATH,
                                       STANFORD_POS_JAR_PATH)

        pos = pos_tagger.tag([word])
        tag_first = pos[0][1][0]
        if tag_first in tag_target:
            if tag_first == 'V':
                tag_list.append('동사')  # '동사' = verb
            if tag_first == 'N':
                tag_list.append('명사')  # '명사' = noun
            if tag_first == 'J':
                tag_list.append('형용사')  # '형용사' = adjective
            if tag_first == 'R':
                tag_list.append('부사')  # '부사' = adverb
        else:
            tag_list.append('')
    df_emotion['품사'] = tag_list  # '품사' = part of speech

    # Write the POS-tagged, extended word dictionary DataFrame to Excel
    df_emotion.to_excel(f"../res/dic/감정 단어.xlsx")
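Every call to StanfordPOSTagger.tag() launches a separate Java process, so building the tagger and tagging one word at a time inside the loop above is slow for a long dictionary. A hedged variation (same placeholder paths as the original) builds the tagger once and tags the whole column in a single batch call:

pos_tagger = StanfordPOSTagger("path/english-bidirectional-distsim.tagger",
                               "path/stanford-postagger-3.9.2.jar")

# One JVM invocation for the whole column instead of one per word.
tagged = pos_tagger.tag_sents([[word] for word in df_emotion['영어']])

tag_to_korean = {'V': '동사', 'N': '명사', 'J': '형용사', 'R': '부사'}
df_emotion['품사'] = [tag_to_korean.get(pos[0][1][0], '') for pos in tagged]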
Example #4
def update_training_data(usr_input,label,command):
	format_input = ""
	st = StanfordPOSTagger(config['tagger']['model'],path_to_jar=config['tagger']['path'])
	tags = st.tag(usr_input.split())
	print(tags)
	with open(MAPPING_PATH,'r') as data_file:    
		data = json.load(data_file)
		for pos,tag in enumerate(tags):
			if(tag[1] != "NNP"):
				format_input += tag[0]
				format_input += " "
		data[label].append(format_input)
		with open(MAPPING_PATH, "w") as jsonFile:
			jsonFile.write(json.dumps(data, sort_keys=False, indent=4))
	with open(TRAINDATA_PATH,'r') as data_file:
		data = json.load(data_file)
		add_dict = {
			"text" : format_input,
			"label" : label
		}
		data.append(add_dict)
		with open(TRAINDATA_PATH, "w") as jsonFile:
			jsonFile.write(json.dumps(data, sort_keys=False, indent=4))
	with open(COMMAND_PATH,'r') as data_file:
		data = json.load(data_file)
		add_dict = {
			format_input : command
		}
		data[label].update(add_dict)
		with open(COMMAND_PATH,"w") as jsonFile:
			jsonFile.write(json.dumps(data, sort_keys=False, indent=4))
	print('Added')
Example #5
def handleMessage(sid, txt):
    tagger = StanfordPOSTagger(_path_to_model,
                               path_to_jar=_path_to_jar,
                               java_options='-mx4096m')
    tagged = tagger.tag(nltk.word_tokenize(txt))
    responseMessage = str(tagged)
    sendResponse(sid, responseMessage)
Example #6
def get_postag_with_index(sources, idx2word, word2idx):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0,
                            len(path) - 10) + 1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx in range(len(sources)):  # len(test_data_plain)
        test_s_o = sources[idx]
        source_text = keyphrase_utils.cut_zero(test_s_o, idx2word)
        text = pos_tagger.tag(source_text)
        print('[%d/%d] : %s' % (idx, len(sources), str(text)))

        tagged_source.append(text)

    return tagged_source
Example #7
def call_reia():

    max_score = 0.1
    map_val = ""

    print('-----------------------')
    user_input = raw_input("enter the string: ")
    #user_name = get_username(first_line.split(' ', 1)[0])
    suggest_list = []
    suggest_message = ""
    #prev_ts = ts
    print("\nINPUT = ")
    print(user_input)
    label = classify(user_input)
    if label == "":
        post_message(
            "Sorry, I could not understand. Please rephrase and try again.")
        consume_message()

    print("Classified as : " + str(label))
    tokens = nltk.word_tokenize(user_input)
    print(tokens)
    st = StanfordPOSTagger(config['tagger']['model'],
                           path_to_jar=config['tagger']['path'])
    stanford_tag = st.tag(user_input.split())
    print("Tags")
    print(stanford_tag)
    """with open(MAPPING_PATH,'r') as data_file:    
Example #8
def new_load_data(f_name):
    data = {}
    import os
    java_path = "C:/Program Files/Java/jdk1.8.0_121/bin/java.exe"
    os.environ['JAVAHOME'] = java_path
    st = StanfordPOSTagger('english-bidirectional-distsim.tagger',
                           'stanford-postagger.jar',
                           encoding='utf-8')

    with open(f_name, 'r') as file:
        for line in file:
            fields = line.split('\t')
            sent_id = fields[0]
            """if sent_id == 'sent1656':
                print('yay')"""
            data[sent_id] = {}
            data[sent_id][SENTENCE] = fields[1].strip('\n').split()
            data[sent_id][ENTITIES] = {}
            tokenized_sent = nltk.sent_tokenize(fields[1])
            for sent in tokenized_sent:
                chunk_id = 0
                for chunk in nltk.ne_chunk(st.tag(nltk.word_tokenize(sent))):

                    if hasattr(chunk, 'label'):
                        data[sent_id][ENTITIES][chunk_id] = (
                            chunk.label(), ' '.join(c[0] for c in chunk))
                        chunk_id += len([c[0] for c in chunk])
                        print(chunk.label(), ' '.join(c[0] for c in chunk))
                    else:
                        chunk_id += 1

                    # assert chunk_id < len(fields[1].split())
            #sent = st.tag(fields[1].split())
            #print(sent)
    return data
Example #9
def TagProb(Readfile, file):
    if path.exists(sys.path[0] + '/Preparation/save/data/' + file):
        remove(sys.path[0] + '/Preparation/save/data/' + file)
    tagger = StanfordPOSTagger(model_filename, path_to_jar)
    WordDict = {}
    for line in open(Readfile):
        sentence = tagger.tag(line.split())
        for WordTag in sentence:
            if WordTag[0] not in WordDict.keys():
                WordDict[WordTag[0]] = {}
                WordDict[WordTag[0]][WordTag[1]] = 1
            else:
                if WordTag[1] not in WordDict[WordTag[0]].keys():
                    WordDict[WordTag[0]][WordTag[1]] = 1
                else:
                    WordDict[WordTag[0]][
                        WordTag[1]] = 1 + WordDict[WordTag[0]][WordTag[1]]
    for word in WordDict.keys():
        sum_freq = 0
        for tag in WordDict[word].keys():
            sum_freq = WordDict[word][tag] + sum_freq
        for tag in WordDict[word].keys():
            WordDict[word][tag] = WordDict[word][tag] / sum_freq
    with open(file, 'a', encoding='utf-8') as Writer:
        for word in WordDict.keys():
            Writer.write(str(word) + ':' + str(WordDict[word]) + '\n')
    return WordDict
Example #10
def features(text):
    # POS-Tagging
    tagged = StanfordPOSTagger(model_filename=model_filename,
                               path_to_jar=path_to_jar,
                               encoding='utf8',
                               verbose=False,
                               java_options='-mx3000m')
    classified_word = tagged.tag(nltk.word_tokenize(text))
    text_postags = []
    for index_classified in classified_word:
        text_postags.append(index_classified[1])
    freq_pos = nltk.FreqDist(text_postags)
    adverb, adjective, noun, pronoun, verb = 0, 0, 0, 0, 0
    for index_freq in freq_pos.most_common(len(freq_pos)):
        if index_freq[0] in ["RB", "RBR", "RBS"]:
            adverb += index_freq[1]
        elif index_freq[0] in ["JJ", "JJR", "JJS"]:
            adjective += index_freq[1]
        elif index_freq[0] in ["NN", "NNS", "NNP", "NNPS"]:
            noun += index_freq[1]
        elif index_freq[0] in ["PRP", "PRP$"]:
            pronoun += index_freq[1]
        elif index_freq[0] in ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]:
            verb += index_freq[1]
    X_test = []
    X_test.extend(
        (adverb / adjective, adverb / noun, adverb / pronoun, adjective / verb,
         adjective / pronoun, noun / verb, noun / pronoun, verb / pronoun))
    return X_test
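One caveat about the function above: the final ratios divide by raw counts, so any text with no pronouns, verbs, nouns, or adjectives raises ZeroDivisionError. A small defensive variant of the tail of features() (the 0.0 fallback is an arbitrary choice, not part of the original):

def safe_ratio(numerator, denominator):
    # Fall back to 0.0 when a POS class is absent from the text.
    return numerator / denominator if denominator else 0.0

X_test = [
    safe_ratio(adverb, adjective), safe_ratio(adverb, noun),
    safe_ratio(adverb, pronoun), safe_ratio(adjective, verb),
    safe_ratio(adjective, pronoun), safe_ratio(noun, verb),
    safe_ratio(noun, pronoun), safe_ratio(verb, pronoun),
]
# ...then return X_test as before.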
Example #11
def get_postag_with_record(records, pairs):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0,
                            len(path) - 10) + 1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx, (record, pair) in enumerate(zip(records,
                                             pairs)):  # len(test_data_plain)
        print('*' * 100)
        print('File: ' + record['name'])
        print('Input: ' + str(pair[0]))
        text = pos_tagger.tag(pair[0])
        print('[%d/%d][%d] : %s' %
              (idx, len(records), len(pair[0]), str(text)))
        tagged_source.append(text)

    return tagged_source
Example #12
def _preprpcessing_eng(id_list):
    stop_w = set(stopwords.words('english'))
    eng_tagger = StanfordPOSTagger(
        model_filename=
        '/home/zhouh/Downloads/stanford-postagger-full-2018-02-27/models/english-bidirectional-distsim.tagger',
        path_to_jar=
        '/home/zhouh/Downloads/stanford-postagger-full-2018-02-27/stanford-postagger.jar'
    )

    for i in id_list:
        try:
            text = read_file('/home/zhouh/Thesis/code/Transcripts/english/' +
                             i + '.txt')
            words = nt.word_tokenize(text, language='english')
            word = [x for x in words if x not in string.punctuation]
            word = [x for x in word if x not in stop_w]
            word = [x for x in word if not x.isdigit()]
            word = eng_tagger.tag(word)
            tt = ''
            for w in word:
                tt += '/'.join(w) + ' '

            new_path = '/home/zhouh/Thesis/code/Transcripts/eng_preprocessed/' + i + 'pre.txt'
            if os.path.exists(new_path):
                os.remove(new_path)
            with open(new_path, 'w') as f:
                f.write(tt)
        except:
            continue
Example #13
def genere_liste_natures(l_auteurs, STANFORD_PARSER='../stanford', STANFORD_MODELS='../stanford', JAVAHOME='/import/lhauseux/jre1.8.0_45/bin', bid = '/import/lhauseux/Pyzo/pyzo2015a/stanford-postagger-2015-04-20/models/english-bidirectional-distsim.tagger', pt = '/import/lhauseux/Pyzo/pyzo2015a/stanford-postagger-2015-04-20/stanford-postagger.jar'):
    # Configure the Stanford tagger and Java settings:
    os.environ['STANFORD_PARSER'] = STANFORD_PARSER
    os.environ['STANFORD_MODELS'] = STANFORD_MODELS
    os.environ['JAVAHOME'] = JAVAHOME
    
    st = StanfordPOSTagger(bid,path_to_jar=pt,java_options='-mx15000m') 
    nltk.internals.config_java(options='-xmx2G')

    
    # Create the listes_natures directory if it does not exist yet
    if not os.path.isdir('./listes_natures'):
        os.mkdir('./listes_natures')
    for auteur in l_auteurs:
        # Create the author's own directory if needed
        if not os.path.isdir('./listes_natures/'+auteur):
            os.mkdir('./listes_natures/'+auteur)
        articles = os.listdir('./auteurs/'+auteur)
        for article in articles:
            if article != 'liens.txt':
                # Read the article as plain text
                f = open('./auteurs/'+auteur+'/'+article,'r')
                contenu = f.read()
                f.close()
                # Convert it into its list of grammatical natures (POS tags)
                contenu = nltk.word_tokenize(contenu)
                contenu = st.tag(contenu)
                contenu = [c[1] for c in contenu]
                # Save the result
                f = open('./listes_natures/'+auteur+'/'+article,'wb')
                pickle.dump(contenu,f)
                f.close()
                print(auteur,article)
Example #14
def fr_words(DATA_PATH, candidats) :

    import pandas as pd
    import operator
    import nltk
    from nltk.corpus import stopwords
    
    #nltk stanford french tagger
    from nltk.tag import StanfordPOSTagger
    jar = 'C:/Users/user/Downloads/stanford-postagger-full-2018-02-27/stanford-postagger-full-2018-02-27/stanford-postagger-3.9.1.jar'
    model = 'C:/Users/user/Downloads/stanford-postagger-full-2018-02-27/stanford-postagger-full-2018-02-27/models/french.tagger'
    import os
    java_path = "C:/ProgramData/Oracle/Java/javapath/java.exe"
    os.environ['JAVAHOME'] = java_path
    pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8' )
    
    # tokenizer (strips #, @ and punctuation...)
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    
    # read the tweets
    df = pd.read_csv(DATA_PATH)
    df = df[df['text'].notnull()]
    a = len(df)
    fr_words = [[] for i in range (len(candidats))]
    indesirable = ["RT","https","http","c","les", "et", "ça","coach", "ils","thevoice", "quand", "donc","thevoice_tf1" ]
    
    for j in range (len(candidats)):   
        count = dict() 
        candidat = candidats[j]
        for i in range (0,a) :  
            if i in [ 7224, 16457,16458,22348,22349,22350,22351,22352, 22353,22354,22355] : 
                continue 
            else : 
                line = df.at[i,'text']
                tokenized = tokenizer.tokenize(line)
                # keep only the lower-cased words that are not stop words (de, que, dans...)
                words = [ w.lower() for w in tokenized if (w not in stopwords.words('french') and w not in indesirable)]
                if set(candidat) & set(words):
                    for word in words :
                        
                        if word in count.keys() :
                            count[word] += 1
                        else :
                            count[word] = 1
                else:
                    continue
                
    
        count = sorted(count.items(), key=operator.itemgetter(1), reverse = True)
        
        fr_words1 = count [0:50]
        
        # drop all the verbs
        for element in fr_words1 : 
            if pos_tagger.tag(element[0].split())[0][1] not in ['VINF','V'] :
                fr_words[j].append(element)
            else :
                continue
    return fr_words
def main():

    with open('/home/abhinav/PycharmProjects/video_enrichment/text.txt',
              'r') as myfile:
        text = myfile.read().replace('\n', '')

        # text = """Natural language processing (NLP) is a field of computer science, artificial intelligence and computational linguistics concerned with the interactions between computers and human (natural) languages, and, in particular, concerned with programming computers to fruitfully process large natural language corpora."""
    # text  = "Concepts present in text are outline of machine learning, data mining, statistics, cluster analysis, algorithms like logic, pseudo code."
    text = p.sub('', text)
    sentences = nltk.sent_tokenize(text)

    for sentence in sentences:
        sentence = sentence.lower()  # Lower Case the whole sentence
        sentence = p.sub(
            '', sentence)  # Removing anything enclosed within brackets
        print(sentence)

        ## TAGGING
        st_tag = StanfordPOSTagger(model_filename=eng_model_filename_pos,
                                   path_to_jar=my_path_to_pos_jar)
        tagged_sentence = st_tag.tag(word_tokenize(sentence))
        # print(tagged_sentence)

        ## ENTITY RECOGNITION
        # st_ner = StanfordNERTagger(model_filename=eng_model_filename_ner, path_to_jar=my_path_to_ner_jar)
        # print(st_ner.tag('Rami Eid is studying at Stony Brook University in NY'.split()))

        ## PARSING
        # print(parsing(sentence))

        ## Chunking Using Regex
        regex_exps = [
            "NP: {<JJ|NN.?>*<NN.?>}",
            "NP: {<JJ|NN.?>*<NN.?><IN>?<JJ|NN.?>*<NN.?>}", "NP: {<JJ>*<NN.?>+}"
        ]  # Include the following pattern to count conjuctions "NP: {<JJ|NN.?>*<NN.?><CC>?<JJ|NN.?>*<NN.?>}"
        for grammar in regex_exps:
            IOB_tagged = chunking(tagged_sentence, grammar)
            remove_IOBtags(IOB_tagged)

    # print(concept_count)

    ## Prune concepts on word level using word frequency count on BBC corpus
    prune_concepts_WordLevel()
    print("Pruned concepts are:", pruned_concepts)

    ## Identify Wikipedia articles(titles) that match concepts extracted from the text if Jaccard Similarity is one or if wikipedia title is a part of concept extracted
    Wikipedia_aritcle_matching()
    print("\n", concept_wiki_article)
    print("\nFinal List Of Concepts:", final_wiki_concepts)
    # prereq_graph.add_nodes_from(final_wiki_concepts)

    wiki_based_similarity()

    Connected_components = nx.connected_components(un_prereq_graph)
    print("\n Pre-req Graph successfully created")
    # print("\nConnected Components: ")
    # print(Connected_components)
    nx.draw(prereq_graph, with_labels=True)
    plt.axis('off')
    plt.savefig("graph_prereq.png")
Example #16
def get_pos_tags(content, stopwords, is_stemming, is_math):
  # Content should be tokenized
  pos_tagger_dir = '/usr/users/swli/program/nlp_util/stanford-postagger'
  model = pos_tagger_dir + '/models/wsj-0-18-bidirectional-distsim.tagger'
  classpath = pos_tagger_dir + '/stanford-postagger_with_slf4j.jar'
  tagger = StanfordPOSTagger(model, classpath, java_options='-mx4000m')
  try:
    tag_results = tagger.tag(re.split('\s+', content))
  except OSError:
    sentences = re.split('\s+\.\s+', content)
    tag_results = []
    for index in range(len(sentences)):
      sentence = sentences[index]
      if index < len(sentences)-1:
        sentence += ' .'
      tag_results += get_contaminated_tag_results(sentence, tagger)
 
  pos_tags = []
  for pair in tag_results:
    word = pair[0]
    # map simple equation to tokens
    if is_math:
      word = simple_eq_to_text(word)
    # remove punctuation
    word = "".join(l for l in word if l not in string.punctuation)
    word = word.lower()
    word = process_word(word, stopwords, is_stemming, is_math)
    if word:
      pos_tags.append(pair[1])
  return pos_tags
Example #17
def pos_tag(review):
    eng_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')
    tmp = eng_tagger.tag(review)
    result = []
    for element in tmp:
        result.append(element[1])
    return result
Example #18
def pos_tag(mots,
            jar=os.path.join(".", "models", "stanford-postagger",
                             "stanford-postagger-3.8.0.jar"),
            mdl=os.path.join(".", "models", "stanford-postagger",
                             "french-ud.tagger")):
    try:
        pos_tagger = StanfordPOSTagger(mdl, jar, encoding='utf8')
    except LookupError:
        java_path = r"C:\Program Files (x86)\Java\jre1.8.0_261\bin\java.exe"
        os.environ['JAVAHOME'] = java_path
        pos_tagger = StanfordPOSTagger(mdl, jar, encoding='utf8')
    tagged = pos_tagger.tag(mots)
    tags = [g for m, g in tagged]
    forced_det = ["au", "aux"]
    absent_of_table = ["PART", "SCONJ"]
    if any(item in mots
           for item in forced_det) or any(item in tags
                                          for item in absent_of_table):
        for i, couple in enumerate(tagged):
            mot = couple[0]
            gram = couple[1]
            if mot in forced_det:
                tagged[i] = (mot, "DET")
            if gram == "PART":
                tagged[i] = (mot, "ADV")
            if gram == "SCONJ":
                tagged[i] = (mot, "CONJ")
    return tagged
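A short usage sketch for the French tagger above; the exact tags depend on the french-ud model, so the output shown is only indicative:

mots = ['le', 'chat', 'dort', 'au', 'soleil']
print(pos_tag(mots))
# e.g. [('le', 'DET'), ('chat', 'NOUN'), ('dort', 'VERB'), ('au', 'DET'), ('soleil', 'NOUN')]
# 'au' is forced to DET by the forced_det rule regardless of what the model returns.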
Example #19
File: reia.py Project: aditi2205/reia2
def call_reia():
    while (True):
        max_score = 0.1
        map_val = ""
        with open('/media/ubuntu/Local Disk/MAJOR_PROJECT/REIA/mqueue.txt',
                  'r') as f:
            first_line = f.readline()
            while first_line == "":
                time.sleep(1)
                call_reia()
        print('-----------------------')
        user_input = first_line.split(' ', 1)[1]
        user_name = get_username(first_line.split(' ', 1)[0])
        suggest_list = []
        suggest_message = ""
        #prev_ts = ts
        print("\nINPUT = ")
        print(user_input)
        label = classify(user_input)
        if label == "":
            post_message(
                "Sorry, I could not understand. Please rephrase and try again."
            )
            consume_message()
            continue
        print("Classified as : " + str(label))
        tokens = nltk.word_tokenize(user_input)
        print(tokens)
        st = StanfordPOSTagger(config['tagger']['model'],
                               path_to_jar=config['tagger']['path'])
        stanford_tag = st.tag(user_input.split())
        print("Tags")
        print(stanford_tag)
        with open(MAPPING_PATH, 'r') as data_file:
            data = json.load(data_file)
        for i in data[label]:
            dist = jf.jaro_distance(str(user_input), str(i))
            suggest_list.append(tuple((dist, i)))
            print(dist)
            if (dist > max_score):
                max_score = dist
                map_val = i
        if max_score < config['preferences']['similarity_threshold']:
            post_message(
                "Sorry, I could not understand. Please rephrase and try again."
            )
            consume_message()
            if config['preferences']['suggestions'] == True:
                suggest = suggestions(suggest_list)
                post_message("Did you mean :")
                for i in suggest:
                    suggest_message += (str(i[1]) + "\n")
                post_message(suggest_message)
            continue
        print("\nMapped to : " + map_val)
        #post_message(map_val)
        construct_command(user_input, label, tokens, map_val, stanford_tag,
                          exec_command, user_name)
        #call('sed -i -e "1d	" REIA/mqueue.txt')
        consume_message()
Example #20
def get_postagger_for_criterion(criterion):
    ini_path = "/stanford/postagger"
    os.environ['STANFORD_PARSER'] = ini_path
    os.environ['STANFORD_MODELS'] = ini_path
    os.environ['CLASSPATH'] = ini_path
    st = StanfordPOSTagger('models/english-bidirectional-distsim.tagger')
    postagger_list = st.tag(criterion)
    return postagger_list
Example #21
def standford_pos(text):
    eng_tagger = StanfordPOSTagger(
        model_filename=
        r'D:\Program Files\stanford-corenlp-full\stanford-postagger\models\english-bidirectional-distsim.tagger',
        path_to_jar=
        r'D:\Program Files\stanford-corenlp-full\stanford-postagger\stanford-postagger.jar'
    )
    return eng_tagger.tag(text.split())
Example #22
    def getTagged(self, text):
        from nltk.tag import StanfordPOSTagger

        if self.lang == 1:
            jar = 'stanford-pos-tagger/stanford-postagger-3.8.0.jar'
            model = 'stanford-pos-tagger/french.tagger'
            pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')
            tokenizedText = nltk.word_tokenize(text.lower())
            taggedText = pos_tagger.tag(tokenizedText)
        else:
            jar = 'stanford-pos-tagger/stanford-postagger-3.8.0.jar'
            model = 'stanford-pos-tagger/arabic.tagger'
            pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')
            tokenizedText = nltk.word_tokenize(text.lower())
            taggedText = pos_tagger.tag(tokenizedText)
            print(taggedText)
        return taggedText
Example #23
class FeatureProcessing(object):
  def __init__(self):
    self.feat_index = {}
    self.implication_words = ["demonstrate", "suggest", "indicate"]
    self.hyp_words = ["possible"]
    self.method_words = ["probe", "detect"]
    self.pos_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')

  def get_features(self, phrase, filter_feature='0'):
    words = word_tokenize(phrase)
    pos_tags = self.pos_tagger.tag(words)
    features = []
    for word, tag in pos_tags:
      wl = word.lower()
      # Feat 1: POS features
      if filter_feature != '1':
        if tag != ',' and tag != '.':
          features.append(tag)
      # Feat 2: Verb and adverb identity
      if filter_feature != '2':
        if tag == 'RB' or tag.startswith('VB'):
          features.append(wl)
      # Feat 3: Presence of figure references and citations
      if filter_feature != '3':
        if word.startswith("Fig"):
          features.append("figure")
        if re.search("[A-Z][^\s]+ et al.", phrase) is not None:
          features.append("reference")
    # Feat 4: Presence of specific words or phrases
    if filter_feature != '4':
      if re.search("[Dd]ata not shown", phrase) is not None:
        features.append("data_not_shown")
      for word in self.implication_words:
        if word in phrase:
          features.append("implication_word")
      for word in self.hyp_words:
        if word in phrase:
          features.append("hyp_word")
      for word in self.method_words:
        if word in phrase:
          features.append("method_word")
    return features

  def index_data(self, data, filter_feature='0'):
    all_features = [self.get_features(datum, filter_feature) for datum in data]
    for features in all_features:
      for feat in features:
        if feat not in self.feat_index:
          self.feat_index[feat] = len(self.feat_index)

  def featurize(self, phrase, filter_feature='0'):
    indexed_features = [0] * len(self.feat_index)
    features = self.get_features(phrase, filter_feature)
    for feat in features:
      if feat not in self.feat_index:
        continue
      indexed_features[self.feat_index[feat]] += 1
    return indexed_features
def gen_keyphrases(text):
    # Used when tokenizing words
    sentence_re = r'''(?x)        # set flag to allow verbose regexps
        (?:[A-Z])(?:\.[A-Z])+\.?    # abbreviations, e.g. U.S.A.
        | \w+(?:-\w+)*            # words with optional internal hyphens
        | \$?\d+(?:\.\d+)?%?        # currency and percentages, e.g. $12.40, 82%
        | \.\.\.                # ellipsis
        | [][.,;"'?():-_`]        # these are separate tokens
    '''

    lemmatizer = nltk.WordNetLemmatizer()

    #Taken from Su Nam Kim Paper...
    grammar = r"""
        NBAR:
            {<NN.*|JJ>*<NN.*>}    # Nouns and Adjectives, terminated with Nouns

        NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>}    # Above, connected with in/of/etc...
    """
    chunker = nltk.RegexpParser(grammar)

    tokenizer = nltk.RegexpTokenizer(sentence_re)
    toks = tokenizer.tokenize(text)
    span_toks = tokenizer.span_tokenize(text)
    logger.debug(toks)
    logger.debug("tokens: %(1)d" % {"1": len(toks)})

    # old way of tokenization
    #toks = nltk.regexp_tokenize(text, sentence_re)
    st = StanfordPOSTagger(config.stanford_bidirectional_tagger_path,
                           config.stanford_postagger_jar_path,
                           encoding="utf8",
                           java_options="-mx8g")
    _postoks = st.tag(toks)
    # examine the postags, if "[", then change the tag to "X", create a new list
    postoks = []
    for pt in _postoks:
        if pt[0] == "[":
            postoks.append(('[', 'X'))
        elif pt[0] == "]":
            postoks.append((']', 'X'))
        else:
            postoks.append(pt)
    logger.info(postoks)

    # NLTK POS Tagger
    #postoks = nltk.tag.pos_tag(toks)
    logger.debug("postoks: %(1)d" % {"1": len(postoks)})
    tree = chunker.parse(postoks)
    # cast a Tree into a ParentedTree
    ptree = nltk.ParentedTree.convert(tree)
    # for each token, record its tree position
    pos_map = generate_pos_map(ptree, span_toks)

    stopwords = nltk.corpus.stopwords.words('english')
    return get_terms(ptree, lemmatizer, stopwords, pos_map)
Example #25
def postagger():
	os.environ['STANFORD_POSTAGGER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-postagger-full-2014-08-27'
	os.environ['CLASSPATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-postagger-full-2014-08-27/stanford-postagger.jar'
	os.environ['STANFORD_POSTAGGER'] = os.environ['CLASSPATH']

	eng_tagger = StanfordPOSTagger('/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-postagger-full-2014-08-27/models/english-bidirectional-distsim.tagger')

	for x in content:
		print(eng_tagger.tag(x.split()))
Example #26
def number(sentence):
    pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')
    tagged_sentence = pos_tagger.tag(sentence.split())
    int_list = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    numbers = [
        word for word, tag in tagged_sentence
        if ((tag == 'DET' and det_or_nb(word) == 'nb') or (word[0] in int_list)
            )
    ]
    return (' '.join(numbers))
Example #27
def _tagging(data):
    df = pd.read_csv("/var/www/pyapi/scripts/Stanford_POS_Tags.csv")
    os.environ[
        "STANFORD_MODELS"] = "/var/www/pyapi/scpDocs/stanford-postagger-full-2017-06-09/models"
    spanish_postagger = StanfordPOSTagger(
        'spanish.tagger',
        '/var/www/pyapi/scpDocs/stanford-postagger-full-2017-06-09/stanford-postagger.jar'
    )
    tagged = spanish_postagger.tag(data.split())
    return _describe_stanford_pos_tag(tagged, df)
Example #28
def stanford_pos_tag(text, java_path=None):
    _setup_java_home(java_path)
    model_name = "english-caseless-left3words-distsim.tagger"

    stanfort_dir = get_from_resource("stanford-postagger-full-2018-10-16")
    jar = str(stanfort_dir.joinpath("stanford-postagger-3.9.2.jar"))
    model = str(stanfort_dir.joinpath("models/{}".format(model_name)))

    st = StanfordPOSTagger(model, jar, encoding="utf8")
    text_tokenized = word_tokenize(text)
    return st.tag(text_tokenized)
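A possible call for the helper above, assuming get_from_resource can locate the 2018-10-16 tagger release on your machine; the java_path value is hypothetical:

tags = stanford_pos_tag("Rami Eid is studying at Stony Brook University in NY",
                        java_path="/usr/bin/java")  # hypothetical Java location
print(tags)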
Example #29
    def _tag_words(self, jar, model, cleaned_sentence):
        """
            Arguments:
            cleaned_sentence

            Returns:
            tagged_words: a list containing tuples i.e (word, syntactic value).
            """
        pos_tagger = StanfordPOSTagger(model, jar, encoding="utf-8")
        tagged_words = pos_tagger.tag(word_tokenize(cleaned_sentence))
        return tagged_words
 def getSolutionInTag(self):
     english_postagger = StanfordPOSTagger(
         "/home/geethu/Documents/project anlp/c/models/english-bidirectional-distsim.tagger",
         "/home/geethu/Documents/project anlp/c/stanford-postagger.jar")
     print(self.synt)
     #retag the sentence again
     new_tag_values = english_postagger.tag(self.words)
     for index in range(len(self.words)):
         print(index)
         word, tag = new_tag_values[index]
         self.synt[index] = tag
Example #31
def Text_to_tag(Readfile, file):
    if path.exists(sys.path[0] + '/Preparation/save/data/' + file):
        remove(sys.path[0] + '/Preparation/save/data/' + file)
    tagger = StanfordPOSTagger(model_filename, path_to_jar)
    for line in open(Readfile):
        TagList = []
        sentence = tagger.tag(line.split())
        for WordTag in sentence:
            TagList.append(WordTag[1])
        with open(file, 'a', encoding='utf-8') as Writer:
            Writer.write(" ".join(TagList) + '\n')
Example #32
def impp(input_question):
	try:
		import numpy as np
		import os 
		os.getcwd()
		import pandas as pd
		import spacy
		from . import formula
		nlp = spacy.load('en_core_web_sm')
		from difflib import SequenceMatcher
		import re
		import nltk
		import pprint
		pp = pprint.PrettyPrinter(indent=4)
		from nltk import word_tokenize
		from nltk.corpus import stopwords
		path_to_jar = '/usr/local/lib/python2.7/dist-packages/nltk/tag/stanford-parser-3.8.0.jar'
		path_to_models_jar = '/usr/local/lib/python2.7/dist-packages/nltk/tag/stanford-parser-3.8.0-models.jar'

		jar = '/usr/local/lib/python2.7/dist-packages/nltk/tag/stanford-postagger-3.8.0.jar'
		model = '/usr/local/lib/python2.7/dist-packages/nltk/tag/models/english-left3words-distsim.tagger'
		from nltk.parse.corenlp import CoreNLPParser
		from nltk.tag import StanfordNERTagger
		from nltk.parse.stanford import StanfordParser
		from nltk.parse.stanford import StanfordDependencyParser
		from nltk.stem import PorterStemmer
		from nltk.tokenize import sent_tokenize
		from nltk.tag import StanfordPOSTagger
		pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')
	
		dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
		#print ("1")
		#print (os.path.exists('/home/piut/django-apps/wps/wps/patterns.csv'))
		#print ("2")	
		pattern=read('patterns.csv')
		#print ("1")	
		#print pattern
		question=input_question
		tagged_question=pos_tagger.tag(nltk.word_tokenize(question))
		doc = nlp(question)
		#print "###################################################################"
		#print doc
		#print ("2")
		result = dependency_parser.raw_parse(question)
		#pp.pprint(tagged_question)
		#print ("3")
		#return str(moreMoney(dependency,doc,pattern,unknown))
		unknown=find(tagged_question,question,doc,input_question)
		if unknown==0:
			return 0
		return unknown
  # 		fe
	except:
		return 0
Example #33
def token_after(token, sentence):
    k = 0
    pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')
    tagged_sentence = pos_tagger.tag(sentence.split())
    for i in range(len(tagged_sentence)):
        if tagged_sentence[i][0] == token:
            k = i
    if tagged_sentence[k + 1][1] == 'NC':
        return (' '.join([tagged_sentence[k + 1][0]]))
    else:
        return None
Example #34
class POSTagger:
    def __init__(
        self,
        path_to_model="/home/james/Downloads/stanford-postagger-full-2016-10-31/models/english-bidirectional-distsim.tagger",
        path_to_jar="/home/james/Downloads/stanford-postagger-full-2016-10-31/stanford-postagger.jar"
    ):
        self.tagger = StanfordPOSTagger(path_to_model, path_to_jar)

    def parse(self, line):
        line = nltk.word_tokenize(line)
        return self.tagger.tag(line)
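A usage sketch for the wrapper above; the constructor defaults point at one user's home directory, so in practice you would pass your own (here hypothetical) model and jar paths:

tagger = POSTagger(
    path_to_model='/opt/stanford-postagger/models/english-bidirectional-distsim.tagger',
    path_to_jar='/opt/stanford-postagger/stanford-postagger.jar')
print(tagger.parse('What is the airspeed of an unladen swallow ?'))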
Example #35
    def create_pos(self, tweet):
        self.pos_tweet = None

        tweet = word_tokenize(tweet)

        english_pos = StanfordPOSTagger(
            'postagger/models/english-bidirectional-distsim.tagger',
            'postagger/stanford-postagger.jar')

        self.pos_tweet = english_pos.tag(tweet)

        return self.pos_tweet
def pos_tagging(sentence):

    english_postagger=StanfordPOSTagger('stanford-postagger-2014-08-27/models/english-bidirectional-distsim.tagger','stanford-postagger-2014-08-27/stanford-postagger.jar')

    VP_list=[]

    POS_list=english_postagger.tag(sentence.split())

    '''for i in range(0, len(POS_list)):
        if POS_list[i][1] in ['NNS','NNP','NNPS']:
            NP_list.append(POS_list[i][0])'''


    return POS_list
def get_pos_tag(sen):#pass sentence dataframe
    st = StanfordPOSTagger('/home/sadhana/stanford-postagger-full-2015-12-09/models/english-left3words-distsim.tagger',path_to_jar=
                           '/home/sadhana/stanford-postagger-full-2015-12-09/stanford-postagger.jar')#,path_to_models_jar='/home/sadhana/stanford-postagger-full-2015-12-09/models')

    stanford_dir = st._stanford_jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    st._stanford_jar = ':'.join(stanford_jars)
    for i in list(sen.index.get_values()):
        t=st.tag(sen.loc[i,'Arg'].split())
        tags=[]
        for j in range(0,len(t)):
            tags.append(t[j][1])
        #print i
        sen.set_value(i,'POStag',tags)
    return sen
def pos_person_tagging(sentence):

    #Setting the path and jar files for the POS Tagger

    english_postagger=StanfordPOSTagger('stanford-postagger-2014-08-27/models/english-bidirectional-distsim.tagger','stanford-postagger-2014-08-27/stanford-postagger.jar')

    NP_list=[]

    POS_list=english_postagger.tag(sentence.split())

    for i in range(0, len(POS_list)):
        if POS_list[i][1] in ['NNS','NNP','NNPS']:
            NP_list.append(POS_list[i][0])


    return NP_list
Example #39
File: tagQTS.py Project: scylense/wenji
def main():

    initialize()
    # create tagger
    model = '../stanford-postagger/models/chinese-distsim.tagger'
    jar = '../stanford-postagger/stanford-postagger.jar'
    zhPOS = StanfordPOSTagger(model, jar)

    # streaming model: process each line in turn
    with io.open(INFILE, 'r', encoding='utf8') as qts, io.open(OUTFILE, 'w', encoding='utf8') as pos:

        for line in qts:
            qtsPOS = zhPOS.tag(line)
            s = " ".join("%s" % tup[1] for tup in qtsPOS) + "\n"
            pos.write(s)

    return()
Example #40
class POSTagger:

    def __init__(self, tagger_path, model_path, output_filename):
        self.st = StanfordPOSTagger(tagger_path, model_path)
        self.output_filename = output_filename
        try:
            os.remove(self.output_filename)
        except OSError:
            pass

    def output_knowledge(self, sentence):
        sentence += " ."
        s = ""
        with open(self.output_filename, "a") as file:
            for word, pos_tag in self.st.tag(sentence.split()):
                file.write(("%s\t%s\n" % (word, pos_tag)).encode("utf-8"))
            file.write("\n")
def get_pos_tag(sen):
    os.environ['CLASSPATH']='STANFORDTOOLSDIR/stanford-postagger-full-2015-12-09/stanford-postagger.jar' #set classpath to pos tagger
    os.environ['STANFORD_MODELS']='STANFORDTOOLSDIR/stanford-postagger-full-2015-12-09/models'
    st = StanfordPOSTagger('/home/sadhana/stanford-postagger-full-2015-12-09/models/english-left3words-distsim.tagger',path_to_jar=
                           '/home/sadhana/stanford-postagger-full-2015-12-09/stanford-postagger.jar')#,path_to_models_jar='/home/sadhana/stanford-postagger-full-2015-12-09/models')

    stanford_dir = st._stanford_jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    st._stanford_jar = ':'.join(stanford_jars)
    for i in list(sen.index.get_values()):
        t=st.tag(sen.loc[i,'Arg'].split())
        tags=[]
        for j in range(0,len(t)):
            tags.append(t[j][1])
        #print i
        sen.set_value(i,'POStag',tags)
    return sen
from nltk.tag import StanfordPOSTagger
from nltk.tokenize import word_tokenize

STANFORD_POS_MODEL_PATH = '압축을 푼 장소/models/english-bidirectional-distsim.tagger'  # '압축을 푼 장소' = directory where the tagger archive was unpacked
STANFORD_POS_JAR_PATH = '압축을 푼 장소/stanford-postagger-3.6.0.jar'

pos_tagger = StanfordPOSTagger(STANFORD_POS_MODEL_PATH, STANFORD_POS_JAR_PATH)

# A made-up example sentence; replace it with any sentence you want to practice with.
text = 'One day in November 2016, the two authors of this book, Seungyeon and Youngjoo, had a coffee at Red Rock cafe, which is a very popular place in Mountain View.'

tokens = word_tokenize(text)
print(tokens)  # Print the split tokens.
print()
print(pos_tagger.tag(tokens))  # Run POS tagging and print the result.

# Now pick out only the verbs and nouns.
noun_and_verbs = []
for token in pos_tagger.tag(tokens):
    if token[1].startswith('V') or token[1].startswith('N'):
        noun_and_verbs.append(token[0])
print(', '.join(noun_and_verbs))
    buf = 0
    for k in range(len(synonymSet_h)):
        for n in range(len(synonymSet_t)):
            ##############################modifying function######################
            #if synonymSet_h[k].wup_similarity(synonymSet_t[n]!=None):
            #    x.append(synonymSet_h[k].wup_similarity(SynonymSet_t[n]))
            if synonymSet_h[k].wup_similarity(synonymSet_t[n])!=None:
                if buf<synonymSet_h[k].wup_similarity(synonymSet_t[n]):
                    buf=synonymSet_h[k].wup_similarity(synonymSet_t[n])
    return buf

for m in root.findall("pair"):
    hypothesis=m.findtext("h").casefold()
    tokenized_hypothesis=nltk.word_tokenize(hypothesis)
#    tagged_tokenized_hypothesis=nltk.pos_tag(tokenized_hypothesis)    #nltk tagger
    tagged_tokenized_hypothesis=st.tag(tokenized_hypothesis)    #stanfordnlp tagger
    text=m.findtext("t").casefold()
    tokenized_text=nltk.word_tokenize(text)
#    tagged_tokenized_text=nltk.pos_tag(tokenized_text)    #nltk tagger
    tagged_tokenized_text=st.tag(tokenized_text)    #stanfordnlp tagger
    output.write("newhypo:\n")
    for i in range(len(tokenized_hypothesis)):
        output.write(tagged_tokenized_hypothesis[i][0])
        output.write(tagged_tokenized_hypothesis[i][1])
    output.write("newtext:\n")
    for j in range(len(tokenized_text)):
        output.write(tagged_tokenized_text[j][0])
        output.write(tagged_tokenized_text[j][1])
    output.write("value:\n")
    output.write(m.get("entailment"))
    output.write("\n")
__author__ = 'Anirudh'

import codecs
import nltk
from nltk.tag import StanfordPOSTagger
nltk.internals.config_java("C:\Program Files\Java\jdk1.8.0_60\\bin\java.exe")

import os
java_path = "C:\Program Files\Java\jdk1.8.0_60\\bin\java.exe"
os.environ['JAVAHOME'] = java_path


# st = StanfordPOSTagger('english-bidirectional-distsim.tagger')

st = StanfordPOSTagger('D:\Curriculum\Natural-Language-Processing\stanford-postagger-full-2015-12-09\stanford-postagger-full-2015-12-09\models\\arabic.tagger','D:\Curriculum\Natural-Language-Processing\stanford-postagger-full-2015-12-09\stanford-postagger-full-2015-12-09\stanford-postagger.jar')
#st = StanfordPOSTagger('D:\Curriculum\Natural-Language-Processing\stanford-postagger-full-2015-12-09\stanford-postagger-full-2015-12-09\models\\english-bidirectional-distsim.tagger','D:\Curriculum\Natural-Language-Processing\stanford-postagger-full-2015-12-09\stanford-postagger-full-2015-12-09\stanford-postagger.jar')

file="arabic_in.txt"
source = codecs.open(file,"r","utf-16-be")
destination = codecs.open("utf8encoder_out.txt","wb","utf-8")
contents=source.read()
destination.write(contents)

destination = codecs.open("utf8encoder_out.txt","r","utf-8")
contents2=destination.read()

print contents2.split()

print st.tag(contents2.split())
Example #45
readFile = (open(filename)).read()
paras = readFile.split('\n')
parasCopy = []
paraIndex = 0
for paragraph in paras:
	paraIndex += 1
	logging.info('Processing paragraph %d' %paraIndex)
	if not paragraph == '':
		name = ''
		paragraphCopy = ""
		sentenceList = getSentences(paragraph)
		sentenceIndex = 0
		for sentence in sentenceList:
			sentenceIndex += 1
			logging.info('Processing sentence %d' %sentenceIndex)
			tokens = POSTagger.tag(sentence.split())
			logging.info('POS Tagging of a sentence')
			nameAnalysis = getName (sentence, tokens)
			sentenceCopy = sentence
			if nameAnalysis[0] == '' and nameAnalysis[1] > 0 and not name == '':
				sentenceCopy = replacePRP(nameAnalysis[2], name, sentence)
			elif not nameAnalysis[0] == '' and nameAnalysis[3] == 1:
				name = nameAnalysis[0]
			if sentenceCopy.count('(') > 0 and not name == '':
				dateBucket = bracketProcess(sentenceCopy, tokens)
				sentenceCopy = bracketRemove(sentenceCopy)
				if not dateBucket == []:
					date_1 = dateBucket[0]
					sentence_1 = name + " was born in"
					for i in date_1:
						sentence_1 += " " + i
Example #46
        home_path + '/stanford-postagger.jar')

url_noun = []
url_not_noun = []
pos = ['NN', 'NNS', 'IN', 'JJ', 'JJS', 'RB', 'TO', 'PRP', 'PRP$', 'NNP', 'NNPS', 'DT', 'VBG', 'VBN', 'VBD']
count = 1

for path in paths:
    print(str(count) + '/' + str(len(paths)))
    count += 1
    isNoun = True
    print(path)
    # remove parameters in path, such as {id}, [id], :id, and split url by level, namely by '/'
    urls = re.sub('/?[\[{].*?[\]}]|/:\w+', '', path).replace('.json', '').lstrip('/').split('/')
    for url in urls:
        for word_pos in st.tag(get_divided_url(url)):
            # print st.tag(get_divided_url(url))
            if word_pos[1] not in pos:
                url_not_noun.append(path)
                isNoun = False
                break
        if not isNoun:
            break
    if not isNoun:
        continue
    url_noun.append(path)

# save result to swagger_statistic.json
swagger_statistic = OrderedDict()
swagger_statistic['host'] = host
swagger_statistic['basePath'] = basePath
Example #47
"""

######################################################################################

from nltk.tag import StanfordPOSTagger
jar = 'C:/Users/Etudiant/Documents/Tableau de bord/stanford-postagger-full-2018-02-27/stanford-postagger-3.9.1.jar'
model = 'C:/Users/Etudiant/Documents/Tableau de bord/stanford-postagger-full-2018-02-27/models/french.tagger'
import os
java_path = "C:/Program Files/Java/jdk1.8.0_151/bin/java.exe"
os.environ['JAVAHOME'] = java_path
pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')
words={}
tab2 = {}
for i in range(5):
   select=[]
   n=pos_tagger.tag(tab[i])
   stops_verb=['NC','N','NPP']
   for x in n:
       if x[1] in stops_verb:
           select.append(x[0])
           #sel = max(set(select), key=select.count)
   #tab2[i]=sel
   words={}
   for word in set(select):
       
       count = 0  
       for j in range(len(select)):
           if word == select[j]:
               count += 1
       words[word]= count
   tab2[i] = (sorted( words.items(), key = lambda x : -x[1]))[:2]
def Process_ZH(File):
    # Read file
    with open(File, 'r') as File:
        # print 'Opened'
        Input = File.readlines()


    try:
        for line in Input:
            # Checks if identifier is in the line
            if 'segment' in line:
                #Sets counter to be on and starts count at 0
                Annotations = []
                Annotation_Next = False
                Line = []
                Word_Count = 0

                # Switch to UTF-8 to ensure accurate counting
                Line_UTF8_Decode = line.decode('utf-8')
                Line_Split = Line_UTF8_Decode.split()

                for Split in Line_Split:
                    if 'feature' in Split:
                        Annotations.append([Split[17:-1], Word_Count, 0])
                        # print Annotations

                    elif 'state=' in Split:
                        Line_Temp = re.findall('>([^>]*)</', Split)
                        # print 'State_1'
                        if Line_Temp != []:
                            # Ensures that Line_Temp is a string
                            Line_Temp = Line_Temp[0]
                            Word_Count += 1
                            # print 'State_2'

                            # To ensure nested entities are parsed correctly
                            if Annotations[-1][2] != 0:
                                Annotation_Next = True
                                Length = range(len(Annotations))
                                for x in Length[::-1]:
                                    if Annotations[x][2] == 0 and Annotation_Next == True:
                                        Annotations[x][2] = Word_Count
                                        Annotation_Next = False
                            else:
                                Annotations[-1][2] = Word_Count
                            # print Annotations
                        elif Line_Temp == [] and '<segment' in Split[15:]:
                            pass
                        else:
                            Word_Count += 1
                            Line_Temp = Split[15:]
                            # print 'State 3'
                        if Line_Temp != []:
                            Line.append(Line_Temp)

                    elif '</segment>' in Split:
                        Seg_Split = Split.split('</segment>')
                        for x in Seg_Split:
                            if x != '':
                                Word_Count += 1
                                Line.append(x)
                            elif x == '':
                                if Annotations[-1][2] != 0:
                                # print 'Seg 2'
                                    Annotation_Next = True
                                    Length = range(len(Annotations))
                                    for x in Length[::-1]:
                                    # print Annotations[x][2]
                                        if Annotations[x][2] == 0 and Annotation_Next == True:
                                        # print 'Seg 3'
                                            Annotations[x][2] = Word_Count
                                            Annotation_Next = False
                                else:
                                    Annotations[-1][2] = Word_Count
                            # print Annotations

                        # if '<' not in Split[0]:
                        #     Word_Count += 1
                        #     print Split
                        #     Line_Temp = Split[:-10]
                        #     print Line_Temp
                        #     Line.append(Line_Temp)
                        #     # print 'Seg_1'
                        #     if Annotations[-1][2] != 0:
                        #         # print 'Seg 2'
                        #         Annotation_Next = True
                        #         Length = range(len(Annotations))
                        #         for x in Length[::-1]:
                        #             # print Annotations[x][2]
                        #             if Annotations[x][2] == 0 and Annotation_Next == True:
                        #                 # print 'Seg 3'
                        #                 Annotations[x][2] = Word_Count
                        #                 Annotation_Next = False
                        #     else:
                        #         Annotations[-1][2] = Word_Count
                        #     # print Annotations

                    elif '<segment' not in Split:
                        # print Split
                        Line.append(Split)
                        # Checks if Split is a punctuation character
                        if re.findall('[%s]' % zhon.hanzi.punctuation, Split) == [] and Split != ':':
                            Word_Count += 1
                Line_Done = ' '.join(Line)

                # Tags using StanfordPOSTagger

                ST = StanfordPOSTagger('~/Annotations/models/chinese-distsim.tagger', '~/Annotations/stanford-postagger.jar', encoding='utf-8')
                Tags = ST.tag(Line)
                Tags_Done = ''
                for x in Tags:
                    # print x
                    Tags_Done += x[1][-2:] + ' '

                # print Line_Done
                # print Tags_Done

                Annotations_Done = ''
                for x in Annotations:
                    Annotations_Done += str(x[1]) + ',' + str(x[2]) + ',' + str(x[1]) + ',' + str(x[2]) + ' ' + x[0].upper() + '|'
                # print Annotations_Done

                with open('Processed_Annotations.txt', 'a') as P_A:
                        P_A.write(Line_Done.encode('utf-8') + '\n')
                        P_A.write(Tags_Done + '\n')
                        P_A.write(Annotations_Done[:-1] + '\n' + '\n')
    except IndexError:
        pass
Example #49
#
# # Chinese named-entity recognition
# chi_tagger = StanfordNERTagger('chinese.misc.distsim.crf.ser.gz')
# sent = u'北海 已 成为 中国 对外开放 中 升起 的 一 颗 明星'
# for word, tag in chi_tagger.tag(sent.split()):
#     print word.encode('utf-8'), tag
#
# # English POS tagging
from nltk.tag import StanfordPOSTagger
# eng_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')
# print eng_tagger.tag('What is the airspeed of an unladen swallow ?'.split())
# # Chinese POS tagging
chi_tagger = StanfordPOSTagger('chinese-distsim.tagger')
# sent = u'北海 已 成为 中国 对外开放 中 升起 的 一 颗 明星'
sent = u'宫体 子宫 呈 垂直位 宫内膜 高 T2 信号 连续'
for _, word_and_tag in chi_tagger.tag(sent.split()):
    word, tag = word_and_tag.split('#')
    print(word, tag)


# Chinese/English constituency parsing; the only difference is the model used
from nltk.parse.stanford import StanfordParser
eng_parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
sent = list(u'子宫 呈 垂直位 , 宫内膜 高 T2 信号 连续'.split())
for tree in eng_parser.parse(sent):
    tree.pprint()


# Dependency parsing
from nltk.parse.stanford import StanfordDependencyParser
eng_parser = StanfordDependencyParser(model_path='edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
Example #50
            resultList.append(3)
        elif util.is_ending_withComma(token):
            resultList.append(4)
        else:
            resultList.append(5)
    return resultList



if __name__=="__main__":

    string = open(properties.test_raw).read()
    str_ = re.sub('[^a-zA-Z0-9\n\.\,\x7f-\xff]', ' ', string)
    resultList = classLabel(str_.split())
    cleaned_test_str = re.sub('[^a-zA-Z0-9\n\x7f-\xff]', ' ', string).lower()
    postag_t = st.tag(cleaned_test_str.split())
    text_file = open(properties.test_tagged_output_file, "w")
    invokeChunker(cleaned_test_str)
    chunkTags = extractChunkTags()

    postag = []
    l = [',','...','.','\'','!']
    for i in range(len(postag_t)):
        if postag_t[i][0] not in l:
            postag.append(postag_t[i])

    for i in range(len(postag)):
        tup = postag[i]
        token = tup[0]
        tag = tup[1]
        chunkTag = chunkTags[i][3]