Example #1
class Lemmatizer(AbstractStemmer):
    def __init__(self, ):
        super(Lemmatizer, self).__init__()
        self.basename = 'lemmatized'
        self.pos_tagger = StanfordPOSTagger(
            'english-left3words-distsim.tagger', java_options='-mx1024m')
        self.lemmatizer = WordNetLemmatizer()
        self.max_length = 500

    def process(self, words):
        current_sentence = []
        pos_words = []
        for word in words:
            current_sentence.append(word)
            if word in '.!?' and len(current_sentence) > self.max_length:
                try:
                    pos_words += self.pos_tagger.tag(current_sentence)
                except Exception:
                    print('Broke on', current_sentence)
                    raise
                current_sentence = []
        for i in range(len(current_sentence) // self.max_length):
            try:
                pos_words += self.pos_tagger.tag(
                    current_sentence[:self.max_length])
            except Exception:
                print('Broke on', current_sentence[:self.max_length])
                raise
            current_sentence = current_sentence[self.max_length:]
        try:
            pos_words += self.pos_tagger.tag(current_sentence)
        except Exception:
            print('Broke on', current_sentence)
            raise
        processed_words = [
            self.lemmatizer.lemmatize(wd, pos=self.get_wn_pos(ps))
            for wd, ps in pos_words
        ]
        return processed_words

    # from http://stackoverflow.com/questions/15586721
    def get_wn_pos(self, treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN
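A minimal usage sketch for the class above (not part of the original project): it assumes AbstractStemmer is importable and needs no constructor arguments, and that the tagger jar and model are discoverable through the CLASSPATH and STANFORD_MODELS environment variables, as in the later examples; the paths are placeholders.

import os

# Hypothetical install location; point these at your own unpacked tagger.
os.environ['CLASSPATH'] = '/opt/stanford-postagger/stanford-postagger.jar'
os.environ['STANFORD_MODELS'] = '/opt/stanford-postagger/models'

lemmatizer = Lemmatizer()
tokens = ['The', 'cats', 'were', 'running', 'quickly', '.']
print(lemmatizer.process(tokens))
# One lemma per input token, e.g. ['The', 'cat', 'be', 'run', 'quickly', '.']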
def emotion_pos_tagging():
    tag_target = ['V', 'N', 'J', 'R']
    tag_list = []

    # Load the word dictionary from the Excel file
    df_emotion = open_emotion_dataframe()

    # POS tagging
    for word in df_emotion['영어']:  # '영어' = English word column
        STANFORD_POS_MODEL_PATH = "path/english-bidirectional-distsim.tagger"
        STANFORD_POS_JAR_PATH = "path/stanford-postagger-3.9.2.jar"

        pos_tagger = StanfordPOSTagger(STANFORD_POS_MODEL_PATH,
                                       STANFORD_POS_JAR_PATH)

        pos = pos_tagger.tag([word])
        tag_first = pos[0][1][0]
        if tag_first in tag_target:
            if tag_first == 'V':
                tag_list.append('동사')  # '동사' = verb
            if tag_first == 'N':
                tag_list.append('명사')  # '명사' = noun
            if tag_first == 'J':
                tag_list.append('형용사')  # '형용사' = adjective
            if tag_first == 'R':
                tag_list.append('부사')  # '부사' = adverb
        else:
            tag_list.append('')
    df_emotion['품사'] = tag_list  # '품사' = part of speech

    # Write the POS-tagged, extended word dictionary DataFrame to Excel
    df_emotion.to_excel(f"../res/dic/감정 단어.xlsx")
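Every call to StanfordPOSTagger.tag() launches a separate Java process, so building the tagger and tagging one word at a time inside the loop above is slow for a long dictionary. A hedged variation (same placeholder paths as the original) builds the tagger once and tags the whole column in a single batch call:

pos_tagger = StanfordPOSTagger("path/english-bidirectional-distsim.tagger",
                               "path/stanford-postagger-3.9.2.jar")

# One JVM invocation for the whole column instead of one per word.
tagged = pos_tagger.tag_sents([[word] for word in df_emotion['영어']])

tag_to_korean = {'V': '동사', 'N': '명사', 'J': '형용사', 'R': '부사'}
df_emotion['품사'] = [tag_to_korean.get(pos[0][1][0], '') for pos in tagged]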
Example #4
def update_training_data(usr_input,label,command):
	format_input = ""
	st = StanfordPOSTagger(config['tagger']['model'],path_to_jar=config['tagger']['path'])
	tags = st.tag(usr_input.split())
	print(tags)
	with open(MAPPING_PATH,'r') as data_file:    
		data = json.load(data_file)
		for pos,tag in enumerate(tags):
			if(tag[1] != "NNP"):
				format_input += tag[0]
				format_input += " "
		data[label].append(format_input)
		with open(MAPPING_PATH, "w") as jsonFile:
			jsonFile.write(json.dumps(data, sort_keys=False, indent=4))
	with open(TRAINDATA_PATH,'r') as data_file:
		data = json.load(data_file)
		add_dict = {
			"text" : format_input,
			"label" : label
		}
		data.append(add_dict)
		with open(TRAINDATA_PATH, "w") as jsonFile:
			jsonFile.write(json.dumps(data, sort_keys=False, indent=4))
	with open(COMMAND_PATH,'r') as data_file:
		data = json.load(data_file)
		add_dict = {
			format_input : command
		}
		data[label].update(add_dict)
		with open(COMMAND_PATH,"w") as jsonFile:
			jsonFile.write(json.dumps(data, sort_keys=False, indent=4))
	print('Added')
Example #5
def handleMessage(sid, txt):
    tagger = StanfordPOSTagger(_path_to_model,
                               path_to_jar=_path_to_jar,
                               java_options='-mx4096m')
    tagged = tagger.tag(nltk.word_tokenize(txt))
    responseMessage = str(tagged)
    sendResponse(sid, responseMessage)
Example #6
def get_postag_with_index(sources, idx2word, word2idx):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0,
                            len(path) - 10) + 1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx in range(len(sources)):  # len(test_data_plain)
        test_s_o = sources[idx]
        source_text = keyphrase_utils.cut_zero(test_s_o, idx2word)
        text = pos_tagger.tag(source_text)
        print('[%d/%d] : %s' % (idx, len(sources), str(text)))

        tagged_source.append(text)

    return tagged_source
Example #7
def call_reia():

    max_score = 0.1
    map_val = ""

    print('-----------------------')
    user_input = raw_input("enter the string: ")
    #user_name = get_username(first_line.split(' ', 1)[0])
    suggest_list = []
    suggest_message = ""
    #prev_ts = ts
    print("\nINPUT = ")
    print(user_input)
    label = classify(user_input)
    if label == "":
        post_message(
            "Sorry, I could not understand. Please rephrase and try again.")
        consume_message()

    print("Classified as : " + str(label))
    tokens = nltk.word_tokenize(user_input)
    print(tokens)
    st = StanfordPOSTagger(config['tagger']['model'],
                           path_to_jar=config['tagger']['path'])
    stanford_tag = st.tag(user_input.split())
    print("Tags")
    print(stanford_tag)
    """with open(MAPPING_PATH,'r') as data_file:    
Example #8
def new_load_data(f_name):
    data = {}
    import os
    java_path = "C:/Program Files/Java/jdk1.8.0_121/bin/java.exe"
    os.environ['JAVAHOME'] = java_path
    st = StanfordPOSTagger('english-bidirectional-distsim.tagger',
                           'stanford-postagger.jar',
                           encoding='utf-8')

    with open(f_name, 'r') as file:
        for line in file:
            fields = line.split('\t')
            sent_id = fields[0]
            """if sent_id == 'sent1656':
                print('yay')"""
            data[sent_id] = {}
            data[sent_id][SENTENCE] = fields[1].strip('\n').split()
            data[sent_id][ENTITIES] = {}
            tokenized_sent = nltk.sent_tokenize(fields[1])
            for sent in tokenized_sent:
                chunk_id = 0
                for chunk in nltk.ne_chunk(st.tag(nltk.word_tokenize(sent))):

                    if hasattr(chunk, 'label'):
                        data[sent_id][ENTITIES][chunk_id] = (
                            chunk.label(), ' '.join(c[0] for c in chunk))
                        chunk_id += len([c[0] for c in chunk])
                        print(chunk.label(), ' '.join(c[0] for c in chunk))
                    else:
                        chunk_id += 1

                    # assert chunk_id < len(fields[1].split())
            #sent = st.tag(fields[1].split())
            #print(sent)
    return data
Example #9
def TagProb(Readfile, file):
    if path.exists(sys.path[0] + '/Preparation/save/data/' + file):
        remove(sys.path[0] + '/Preparation/save/data/' + file)
    tagger = StanfordPOSTagger(model_filename, path_to_jar)
    WordDict = {}
    for line in open(Readfile):
        sentence = tagger.tag(line.split())
        for WordTag in sentence:
            if WordTag[0] not in WordDict.keys():
                WordDict[WordTag[0]] = {}
                WordDict[WordTag[0]][WordTag[1]] = 1
            else:
                if WordTag[1] not in WordDict[WordTag[0]].keys():
                    WordDict[WordTag[0]][WordTag[1]] = 1
                else:
                    WordDict[WordTag[0]][
                        WordTag[1]] = 1 + WordDict[WordTag[0]][WordTag[1]]
    for word in WordDict.keys():
        sum_freq = 0
        for tag in WordDict[word].keys():
            sum_freq = WordDict[word][tag] + sum_freq
        for tag in WordDict[word].keys():
            WordDict[word][tag] = WordDict[word][tag] / sum_freq
    with open(file, 'a', encoding='utf-8') as Writer:
        for word in WordDict.keys():
            Writer.write(str(word) + ':' + str(WordDict[word]) + '\n')
    return WordDict
Example #10
def features(text):
    # POS-Tagging
    tagged = StanfordPOSTagger(model_filename=model_filename,
                               path_to_jar=path_to_jar,
                               encoding='utf8',
                               verbose=False,
                               java_options='-mx3000m')
    classified_word = tagged.tag(nltk.word_tokenize(text))
    text_postags = []
    for index_classified in classified_word:
        text_postags.append(index_classified[1])
    freq_pos = nltk.FreqDist(text_postags)
    adverb, adjective, noun, pronoun, verb = 0, 0, 0, 0, 0
    for index_freq in freq_pos.most_common(len(freq_pos)):
        if index_freq[0] in ["RB", "RBR", "RBS"]:
            adverb += index_freq[1]
        elif index_freq[0] in ["JJ", "JJR", "JJS"]:
            adjective += index_freq[1]
        elif index_freq[0] in ["NN", "NNS", "NNP", "NNPS"]:
            noun += index_freq[1]
        elif index_freq[0] in ["PRP", "PRP$"]:
            pronoun += index_freq[1]
        elif index_freq[0] in ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]:
            verb += index_freq[1]
    X_test = []
    X_test.extend(
        (adverb / adjective, adverb / noun, adverb / pronoun, adjective / verb,
         adjective / pronoun, noun / verb, noun / pronoun, verb / pronoun))
    return X_test
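One caveat about the function above: the final ratios divide by raw counts, so any text with no pronouns, verbs, nouns, or adjectives raises ZeroDivisionError. A small defensive variant of the tail of features() (the 0.0 fallback is an arbitrary choice, not part of the original):

def safe_ratio(numerator, denominator):
    # Fall back to 0.0 when a POS class is absent from the text.
    return numerator / denominator if denominator else 0.0

X_test = [
    safe_ratio(adverb, adjective), safe_ratio(adverb, noun),
    safe_ratio(adverb, pronoun), safe_ratio(adjective, verb),
    safe_ratio(adjective, pronoun), safe_ratio(noun, verb),
    safe_ratio(noun, pronoun), safe_ratio(verb, pronoun),
]
# ...then return X_test as before.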
Example #11
def get_postag_with_record(records, pairs):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0,
                            len(path) - 10) + 1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx, (record, pair) in enumerate(zip(records,
                                             pairs)):  # len(test_data_plain)
        print('*' * 100)
        print('File: ' + record['name'])
        print('Input: ' + str(pair[0]))
        text = pos_tagger.tag(pair[0])
        print('[%d/%d][%d] : %s' %
              (idx, len(records), len(pair[0]), str(text)))
        tagged_source.append(text)

    return tagged_source
Example #12
def _preprpcessing_eng(id_list):
    stop_w = set(stopwords.words('english'))
    eng_tagger = StanfordPOSTagger(
        model_filename=
        '/home/zhouh/Downloads/stanford-postagger-full-2018-02-27/models/english-bidirectional-distsim.tagger',
        path_to_jar=
        '/home/zhouh/Downloads/stanford-postagger-full-2018-02-27/stanford-postagger.jar'
    )

    for i in id_list:
        try:
            text = read_file('/home/zhouh/Thesis/code/Transcripts/english/' +
                             i + '.txt')
            words = nt.word_tokenize(text, language='english')
            word = [x for x in words if x not in string.punctuation]
            word = [x for x in word if x not in stop_w]
            word = [x for x in word if not x.isdigit()]
            word = eng_tagger.tag(word)
            tt = ''
            for w in word:
                tt += '/'.join(w) + ' '

            new_path = '/home/zhouh/Thesis/code/Transcripts/eng_preprocessed/' + i + 'pre.txt'
            if os.path.exists(new_path):
                os.remove(new_path)
            with open(new_path, 'w') as f:
                f.write(tt)
        except:
            continue
Example #13
def genere_liste_natures(l_auteurs, STANFORD_PARSER='../stanford', STANFORD_MODELS='../stanford', JAVAHOME='/import/lhauseux/jre1.8.0_45/bin', bid = '/import/lhauseux/Pyzo/pyzo2015a/stanford-postagger-2015-04-20/models/english-bidirectional-distsim.tagger', pt = '/import/lhauseux/Pyzo/pyzo2015a/stanford-postagger-2015-04-20/stanford-postagger.jar'):
    # Configure the Stanford tagger and Java settings:
    os.environ['STANFORD_PARSER'] = STANFORD_PARSER
    os.environ['STANFORD_MODELS'] = STANFORD_MODELS
    os.environ['JAVAHOME'] = JAVAHOME
    
    st = StanfordPOSTagger(bid,path_to_jar=pt,java_options='-mx15000m') 
    nltk.internals.config_java(options='-xmx2G')

    
    # Create the listes_natures directory if it does not exist yet
    if not os.path.isdir('./listes_natures'):
        os.mkdir('./listes_natures')
    for auteur in l_auteurs:
        # Create the author's own directory if needed
        if not os.path.isdir('./listes_natures/'+auteur):
            os.mkdir('./listes_natures/'+auteur)
        articles = os.listdir('./auteurs/'+auteur)
        for article in articles:
            if article != 'liens.txt':
                # Read the article as plain text
                f = open('./auteurs/'+auteur+'/'+article,'r')
                contenu = f.read()
                f.close()
                # Convert it into its list of grammatical natures (POS tags)
                contenu = nltk.word_tokenize(contenu)
                contenu = st.tag(contenu)
                contenu = [c[1] for c in contenu]
                # Save the result
                f = open('./listes_natures/'+auteur+'/'+article,'wb')
                pickle.dump(contenu,f)
                f.close()
                print(auteur,article)
Example #14
def fr_words(DATA_PATH, candidats) :

    import pandas as pd
    import operator
    import nltk
    from nltk.corpus import stopwords
    
    #nltk stanford french tagger
    from nltk.tag import StanfordPOSTagger
    jar = 'C:/Users/user/Downloads/stanford-postagger-full-2018-02-27/stanford-postagger-full-2018-02-27/stanford-postagger-3.9.1.jar'
    model = 'C:/Users/user/Downloads/stanford-postagger-full-2018-02-27/stanford-postagger-full-2018-02-27/models/french.tagger'
    import os
    java_path = "C:/ProgramData/Oracle/Java/javapath/java.exe"
    os.environ['JAVAHOME'] = java_path
    pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8' )
    
    # tokenizer (strips #, @ and punctuation...)
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    
    # read the tweets
    df = pd.read_csv(DATA_PATH)
    df = df[df['text'].notnull()]
    a = len(df)
    fr_words = [[] for i in range (len(candidats))]
    indesirable = ["RT","https","http","c","les", "et", "ça","coach", "ils","thevoice", "quand", "donc","thevoice_tf1" ]
    
    for j in range (len(candidats)):   
        count = dict() 
        candidat = candidats[j]
        for i in range (0,a) :  
            if i in [ 7224, 16457,16458,22348,22349,22350,22351,22352, 22353,22354,22355] : 
                continue 
            else : 
                line = df.at[i,'text']
                tokenized = tokenizer.tokenize(line)
                # keep only the lower-cased words that are not stop words (de, que, dans...)
                words = [ w.lower() for w in tokenized if (w not in stopwords.words('french') and w not in indesirable)]
                if set(candidat) & set(words):
                    for word in words :
                        
                        if word in count.keys() :
                            count[word] += 1
                        else :
                            count[word] = 1
                else:
                    continue
                
    
        count = sorted(count.items(), key=operator.itemgetter(1), reverse = True)
        
        fr_words1 = count [0:50]
        
        # drop all the verbs
        for element in fr_words1 : 
            if pos_tagger.tag(element[0].split())[0][1] not in ['VINF','V'] :
                fr_words[j].append(element)
            else :
                continue
    return fr_words
def main():

    with open('/home/abhinav/PycharmProjects/video_enrichment/text.txt',
              'r') as myfile:
        text = myfile.read().replace('\n', '')

        # text = """Natural language processing (NLP) is a field of computer science, artificial intelligence and computational linguistics concerned with the interactions between computers and human (natural) languages, and, in particular, concerned with programming computers to fruitfully process large natural language corpora."""
    # text  = "Concepts present in text are outline of machine learning, data mining, statistics, cluster analysis, algorithms like logic, pseudo code."
    text = p.sub('', text)
    sentences = nltk.sent_tokenize(text)

    for sentence in sentences:
        sentence = sentence.lower()  # Lower Case the whole sentence
        sentence = p.sub(
            '', sentence)  # Removing anything enclosed within brackets
        print(sentence)

        ## TAGGING
        st_tag = StanfordPOSTagger(model_filename=eng_model_filename_pos,
                                   path_to_jar=my_path_to_pos_jar)
        tagged_sentence = st_tag.tag(word_tokenize(sentence))
        # print(tagged_sentence)

        ## ENTITY RECOGNITION
        # st_ner = StanfordNERTagger(model_filename=eng_model_filename_ner, path_to_jar=my_path_to_ner_jar)
        # print(st_ner.tag('Rami Eid is studying at Stony Brook University in NY'.split()))

        ## PARSING
        # print(parsing(sentence))

        ## Chunking Using Regex
        regex_exps = [
            "NP: {<JJ|NN.?>*<NN.?>}",
            "NP: {<JJ|NN.?>*<NN.?><IN>?<JJ|NN.?>*<NN.?>}", "NP: {<JJ>*<NN.?>+}"
        ]  # Include the following pattern to count conjuctions "NP: {<JJ|NN.?>*<NN.?><CC>?<JJ|NN.?>*<NN.?>}"
        for grammar in regex_exps:
            IOB_tagged = chunking(tagged_sentence, grammar)
            remove_IOBtags(IOB_tagged)

    # print(concept_count)

    ## Prune concepts on word level using word frequency count on BBC corpus
    prune_concepts_WordLevel()
    print("Pruned concepts are:", pruned_concepts)

    ## Identify Wikipedia articles(titles) that match concepts extracted from the text if Jaccard Similarity is one or if wikipedia title is a part of concept extracted
    Wikipedia_aritcle_matching()
    print("\n", concept_wiki_article)
    print("\nFinal List Of Concepts:", final_wiki_concepts)
    # prereq_graph.add_nodes_from(final_wiki_concepts)

    wiki_based_similarity()

    Connected_components = nx.connected_components(un_prereq_graph)
    print("\n Pre-req Graph successfully created")
    # print("\nConnected Components: ")
    # print(Connected_components)
    nx.draw(prereq_graph, with_labels=True)
    plt.axis('off')
    plt.savefig("graph_prereq.png")
Example #16
def get_pos_tags(content, stopwords, is_stemming, is_math):
  # Content should be tokenized
  pos_tagger_dir = '/usr/users/swli/program/nlp_util/stanford-postagger'
  model = pos_tagger_dir + '/models/wsj-0-18-bidirectional-distsim.tagger'
  classpath = pos_tagger_dir + '/stanford-postagger_with_slf4j.jar'
  tagger = StanfordPOSTagger(model, classpath, java_options='-mx4000m')
  try:
    tag_results = tagger.tag(re.split('\s+', content))
  except OSError:
    sentences = re.split('\s+\.\s+', content)
    tag_results = []
    for index in range(len(sentences)):
      sentence = sentences[index]
      if index < len(sentences)-1:
        sentence += ' .'
      tag_results += get_contaminated_tag_results(sentence, tagger)
 
  pos_tags = []
  for pair in tag_results:
    word = pair[0]
    # map simple equation to tokens
    if is_math:
      word = simple_eq_to_text(word)
    # remove punctuation
    word = "".join(l for l in word if l not in string.punctuation)
    word = word.lower()
    word = process_word(word, stopwords, is_stemming, is_math)
    if word:
      pos_tags.append(pair[1])
  return pos_tags
Example #17
def pos_tag(review):
    eng_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')
    tmp = eng_tagger.tag(review)
    result = []
    for element in tmp:
        result.append(element[1])
    return result
Example #18
def pos_tag(mots,
            jar=os.path.join(".", "models", "stanford-postagger",
                             "stanford-postagger-3.8.0.jar"),
            mdl=os.path.join(".", "models", "stanford-postagger",
                             "french-ud.tagger")):
    try:
        pos_tagger = StanfordPOSTagger(mdl, jar, encoding='utf8')
    except LookupError:
        java_path = r"C:\Program Files (x86)\Java\jre1.8.0_261\bin\java.exe"
        os.environ['JAVAHOME'] = java_path
        pos_tagger = StanfordPOSTagger(mdl, jar, encoding='utf8')
    tagged = pos_tagger.tag(mots)
    tags = [g for m, g in tagged]
    forced_det = ["au", "aux"]
    absent_of_table = ["PART", "SCONJ"]
    if any(item in mots
           for item in forced_det) or any(item in tags
                                          for item in absent_of_table):
        for i, couple in enumerate(tagged):
            mot = couple[0]
            gram = couple[1]
            if mot in forced_det:
                tagged[i] = (mot, "DET")
            if gram == "PART":
                tagged[i] = (mot, "ADV")
            if gram == "SCONJ":
                tagged[i] = (mot, "CONJ")
    return tagged
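A short usage sketch for the French tagger above; the exact tags depend on the french-ud model, so the output shown is only indicative:

mots = ['le', 'chat', 'dort', 'au', 'soleil']
print(pos_tag(mots))
# e.g. [('le', 'DET'), ('chat', 'NOUN'), ('dort', 'VERB'), ('au', 'DET'), ('soleil', 'NOUN')]
# 'au' is forced to DET by the forced_det rule regardless of what the model returns.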
Example #19
File: reia.py Project: aditi2205/reia2
def call_reia():
    while (True):
        max_score = 0.1
        map_val = ""
        with open('/media/ubuntu/Local Disk/MAJOR_PROJECT/REIA/mqueue.txt',
                  'r') as f:
            first_line = f.readline()
            while first_line == "":
                time.sleep(1)
                call_reia()
        print('-----------------------')
        user_input = first_line.split(' ', 1)[1]
        user_name = get_username(first_line.split(' ', 1)[0])
        suggest_list = []
        suggest_message = ""
        #prev_ts = ts
        print("\nINPUT = ")
        print(user_input)
        label = classify(user_input)
        if label == "":
            post_message(
                "Sorry, I could not understand. Please rephrase and try again."
            )
            consume_message()
            continue
        print("Classified as : " + str(label))
        tokens = nltk.word_tokenize(user_input)
        print(tokens)
        st = StanfordPOSTagger(config['tagger']['model'],
                               path_to_jar=config['tagger']['path'])
        stanford_tag = st.tag(user_input.split())
        print("Tags")
        print(stanford_tag)
        with open(MAPPING_PATH, 'r') as data_file:
            data = json.load(data_file)
        for i in data[label]:
            dist = jf.jaro_distance(str(user_input), str(i))
            suggest_list.append(tuple((dist, i)))
            print(dist)
            if (dist > max_score):
                max_score = dist
                map_val = i
        if max_score < config['preferences']['similarity_threshold']:
            post_message(
                "Sorry, I could not understand. Please rephrase and try again."
            )
            consume_message()
            if config['preferences']['suggestions'] == True:
                suggest = suggestions(suggest_list)
                post_message("Did you mean :")
                for i in suggest:
                    suggest_message += (str(i[1]) + "\n")
                post_message(suggest_message)
            continue
        print("\nMapped to : " + map_val)
        #post_message(map_val)
        construct_command(user_input, label, tokens, map_val, stanford_tag,
                          exec_command, user_name)
        #call('sed -i -e "1d	" REIA/mqueue.txt')
        consume_message()
Example #20
def get_postagger_for_criterion(criterion):
    ini_path = "/stanford/postagger"
    os.environ['STANFORD_PARSER'] = ini_path
    os.environ['STANFORD_MODELS'] = ini_path
    os.environ['CLASSPATH'] = ini_path
    st = StanfordPOSTagger('models/english-bidirectional-distsim.tagger')
    postagger_list = st.tag(criterion)
    return postagger_list
Example #21
def standford_pos(text):
    eng_tagger = StanfordPOSTagger(
        model_filename=
        r'D:\Program Files\stanford-corenlp-full\stanford-postagger\models\english-bidirectional-distsim.tagger',
        path_to_jar=
        r'D:\Program Files\stanford-corenlp-full\stanford-postagger\stanford-postagger.jar'
    )
    return eng_tagger.tag(text.split())
Example #22
    def getTagged(self, text):
        from nltk.tag import StanfordPOSTagger

        if self.lang == 1:
            jar = 'stanford-pos-tagger/stanford-postagger-3.8.0.jar'
            model = 'stanford-pos-tagger/french.tagger'
            pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')
            tokenizedText = nltk.word_tokenize(text.lower())
            taggedText = pos_tagger.tag(tokenizedText)
        else:
            jar = 'stanford-pos-tagger/stanford-postagger-3.8.0.jar'
            model = 'stanford-pos-tagger/arabic.tagger'
            pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')
            tokenizedText = nltk.word_tokenize(text.lower())
            taggedText = pos_tagger.tag(tokenizedText)
            print(taggedText)
        return taggedText
Example #23
class FeatureProcessing(object):
  def __init__(self):
    self.feat_index = {}
    self.implication_words = ["demonstrate", "suggest", "indicate"]
    self.hyp_words = ["possible"]
    self.method_words = ["probe", "detect"]
    self.pos_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')

  def get_features(self, phrase, filter_feature='0'):
    words = word_tokenize(phrase)
    pos_tags = self.pos_tagger.tag(words)
    features = []
    for word, tag in pos_tags:
      wl = word.lower()
      # Feat 1: POS features
      if filter_feature != '1':
        if tag != ',' and tag != '.':
          features.append(tag)
      # Feat 2: Verb and adverb identity
      if filter_feature != '2':
        if tag == 'RB' or tag.startswith('VB'):
          features.append(wl)
      # Feat 3: Presence of figure references and citations
      if filter_feature != '3':
        if word.startswith("Fig"):
          features.append("figure")
        if re.search("[A-Z][^\s]+ et al.", phrase) is not None:
          features.append("reference")
    # Feat 4: Presence of specific words or phrases
    if filter_feature != '4':
      if re.search("[Dd]ata not shown", phrase) is not None:
        features.append("data_not_shown")
      for word in self.implication_words:
        if word in phrase:
          features.append("implication_word")
      for word in self.hyp_words:
        if word in phrase:
          features.append("hyp_word")
      for word in self.method_words:
        if word in phrase:
          features.append("method_word")
    return features

  def index_data(self, data, filter_feature='0'):
    all_features = [self.get_features(datum, filter_feature) for datum in data]
    for features in all_features:
      for feat in features:
        if feat not in self.feat_index:
          self.feat_index[feat] = len(self.feat_index)

  def featurize(self, phrase, filter_feature='0'):
    indexed_features = [0] * len(self.feat_index)
    features = self.get_features(phrase, filter_feature)
    for feat in features:
      if feat not in self.feat_index:
        continue
      indexed_features[self.feat_index[feat]] += 1
    return indexed_features
def gen_keyphrases(text):
    # Used when tokenizing words
    sentence_re = r'''(?x)        # set flag to allow verbose regexps
        (?:[A-Z])(?:\.[A-Z])+\.?    # abbreviations, e.g. U.S.A.
        | \w+(?:-\w+)*            # words with optional internal hyphens
        | \$?\d+(?:\.\d+)?%?        # currency and percentages, e.g. $12.40, 82%
        | \.\.\.                # ellipsis
        | [][.,;"'?():-_`]        # these are separate tokens
    '''

    lemmatizer = nltk.WordNetLemmatizer()

    #Taken from Su Nam Kim Paper...
    grammar = r"""
        NBAR:
            {<NN.*|JJ>*<NN.*>}    # Nouns and Adjectives, terminated with Nouns

        NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>}    # Above, connected with in/of/etc...
    """
    chunker = nltk.RegexpParser(grammar)

    tokenizer = nltk.RegexpTokenizer(sentence_re)
    toks = tokenizer.tokenize(text)
    span_toks = tokenizer.span_tokenize(text)
    logger.debug(toks)
    logger.debug("tokens: %(1)d" % {"1": len(toks)})

    # old way of tokenization
    #toks = nltk.regexp_tokenize(text, sentence_re)
    st = StanfordPOSTagger(config.stanford_bidirectional_tagger_path,
                           config.stanford_postagger_jar_path,
                           encoding="utf8",
                           java_options="-mx8g")
    _postoks = st.tag(toks)
    # examine the postags, if "[", then change the tag to "X", create a new list
    postoks = []
    for pt in _postoks:
        if pt[0] == "[":
            postoks.append(('[', 'X'))
        elif pt[0] == "]":
            postoks.append((']', 'X'))
        else:
            postoks.append(pt)
    logger.info(postoks)

    # NLTK POS Tagger
    #postoks = nltk.tag.pos_tag(toks)
    logger.debug("postoks: %(1)d" % {"1": len(postoks)})
    tree = chunker.parse(postoks)
    # cast a Tree into a ParentedTree
    ptree = nltk.ParentedTree.convert(tree)
    # for each token, record its tree position
    pos_map = generate_pos_map(ptree, span_toks)

    stopwords = nltk.corpus.stopwords.words('english')
    return get_terms(ptree, lemmatizer, stopwords, pos_map)
Example #25
def postagger():
	os.environ['STANFORD_POSTAGGER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-postagger-full-2014-08-27'
	os.environ['CLASSPATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-postagger-full-2014-08-27/stanford-postagger.jar'
	os.environ['STANFORD_POSTAGGER'] = os.environ['CLASSPATH']

	eng_tagger = StanfordPOSTagger('/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-postagger-full-2014-08-27/models/english-bidirectional-distsim.tagger')

	for x in content:
		print(eng_tagger.tag(x.split()))
Example #26
def number(sentence):
    pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')
    tagged_sentence = pos_tagger.tag(sentence.split())
    int_list = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    numbers = [
        word for word, tag in tagged_sentence
        if ((tag == 'DET' and det_or_nb(word) == 'nb') or (word[0] in int_list)
            )
    ]
    return (' '.join(numbers))
Example #27
def _tagging(data):
    df = pd.read_csv("/var/www/pyapi/scripts/Stanford_POS_Tags.csv")
    os.environ[
        "STANFORD_MODELS"] = "/var/www/pyapi/scpDocs/stanford-postagger-full-2017-06-09/models"
    spanish_postagger = StanfordPOSTagger(
        'spanish.tagger',
        '/var/www/pyapi/scpDocs/stanford-postagger-full-2017-06-09/stanford-postagger.jar'
    )
    tagged = spanish_postagger.tag(data.split())
    return _describe_stanford_pos_tag(tagged, df)
Example #28
def stanford_pos_tag(text, java_path=None):
    _setup_java_home(java_path)
    model_name = "english-caseless-left3words-distsim.tagger"

    stanfort_dir = get_from_resource("stanford-postagger-full-2018-10-16")
    jar = str(stanfort_dir.joinpath("stanford-postagger-3.9.2.jar"))
    model = str(stanfort_dir.joinpath("models/{}".format(model_name)))

    st = StanfordPOSTagger(model, jar, encoding="utf8")
    text_tokenized = word_tokenize(text)
    return st.tag(text_tokenized)
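A possible call for the helper above, assuming get_from_resource can locate the 2018-10-16 tagger release on your machine; the java_path value is hypothetical:

tags = stanford_pos_tag("Rami Eid is studying at Stony Brook University in NY",
                        java_path="/usr/bin/java")  # hypothetical Java location
print(tags)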
Example #29
    def _tag_words(self, jar, model, cleaned_sentence):
        """
            Arguments:
            cleaned_sentence

            Returns:
            tagged_words: a list containing tuples i.e (word, syntactic value).
            """
        pos_tagger = StanfordPOSTagger(model, jar, encoding="utf-8")
        tagged_words = pos_tagger.tag(word_tokenize(cleaned_sentence))
        return tagged_words
 def getSolutionInTag(self):
     english_postagger = StanfordPOSTagger(
         "/home/geethu/Documents/project anlp/c/models/english-bidirectional-distsim.tagger",
         "/home/geethu/Documents/project anlp/c/stanford-postagger.jar")
     print(self.synt)
     #retag the sentence again
     new_tag_values = english_postagger.tag(self.words)
     for index in range(len(self.words)):
         print(index)
         word, tag = new_tag_values[index]
         self.synt[index] = tag
Example #31
def Text_to_tag(Readfile, file):
    if path.exists(sys.path[0] + '/Preparation/save/data/' + file):
        remove(sys.path[0] + '/Preparation/save/data/' + file)
    tagger = StanfordPOSTagger(model_filename, path_to_jar)
    for line in open(Readfile):
        TagList = []
        sentence = tagger.tag(line.split())
        for WordTag in sentence:
            TagList.append(WordTag[1])
        with open(file, 'a', encoding='utf-8') as Writer:
            Writer.write(" ".join(TagList) + '\n')
Example #32
def impp(input_question):
	try:
		import numpy as np
		import os 
		os.getcwd()
		import pandas as pd
		import spacy
		from . import formula
		nlp = spacy.load('en_core_web_sm')
		from difflib import SequenceMatcher
		import re
		import nltk
		import pprint
		pp = pprint.PrettyPrinter(indent=4)
		from nltk import word_tokenize
		from nltk.corpus import stopwords
		path_to_jar = '/usr/local/lib/python2.7/dist-packages/nltk/tag/stanford-parser-3.8.0.jar'
		path_to_models_jar = '/usr/local/lib/python2.7/dist-packages/nltk/tag/stanford-parser-3.8.0-models.jar'

		jar = '/usr/local/lib/python2.7/dist-packages/nltk/tag/stanford-postagger-3.8.0.jar'
		model = '/usr/local/lib/python2.7/dist-packages/nltk/tag/models/english-left3words-distsim.tagger'
		from nltk.parse.corenlp import CoreNLPParser
		from nltk.tag import StanfordNERTagger
		from nltk.parse.stanford import StanfordParser
		from nltk.parse.stanford import StanfordDependencyParser
		from nltk.stem import PorterStemmer
		from nltk.tokenize import sent_tokenize
		from nltk.tag import StanfordPOSTagger
		pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')
	
		dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
		#print ("1")
		#print (os.path.exists('/home/piut/django-apps/wps/wps/patterns.csv'))
		#print ("2")	
		pattern=read('patterns.csv')
		#print ("1")	
		#print pattern
		question=input_question
		tagged_question=pos_tagger.tag(nltk.word_tokenize(question))
		doc = nlp(question)
		#print "###################################################################"
		#print doc
		#print ("2")
		result = dependency_parser.raw_parse(question)
		#pp.pprint(tagged_question)
		#print ("3")
		#return str(moreMoney(dependency,doc,pattern,unknown))
		unknown=find(tagged_question,question,doc,input_question)
		if unknown==0:
			return 0
		return unknown
  # 		fe
	except:
		return 0
Example #33
def token_after(token, sentence):
    k = 0
    pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')
    tagged_sentence = pos_tagger.tag(sentence.split())
    for i in range(len(tagged_sentence)):
        if tagged_sentence[i][0] == token:
            k = i
    if tagged_sentence[k + 1][1] == 'NC':
        return (' '.join([tagged_sentence[k + 1][0]]))
    else:
        return None
Example #34
class POSTagger:
    def __init__(
        self,
        path_to_model="/home/james/Downloads/stanford-postagger-full-2016-10-31/models/english-bidirectional-distsim.tagger",
        path_to_jar="/home/james/Downloads/stanford-postagger-full-2016-10-31/stanford-postagger.jar"
    ):
        self.tagger = StanfordPOSTagger(path_to_model, path_to_jar)

    def parse(self, line):
        line = nltk.word_tokenize(line)
        return self.tagger.tag(line)
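A usage sketch for the wrapper above; the constructor defaults point at one user's home directory, so in practice you would pass your own (here hypothetical) model and jar paths:

tagger = POSTagger(
    path_to_model='/opt/stanford-postagger/models/english-bidirectional-distsim.tagger',
    path_to_jar='/opt/stanford-postagger/stanford-postagger.jar')
print(tagger.parse('What is the airspeed of an unladen swallow ?'))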
Example #35
    def create_pos(self, tweet):
        self.pos_tweet = None

        tweet = word_tokenize(tweet)

        english_pos = StanfordPOSTagger(
            'postagger/models/english-bidirectional-distsim.tagger',
            'postagger/stanford-postagger.jar')

        self.pos_tweet = english_pos.tag(tweet)

        return self.pos_tweet
def pos_tagging(sentence):

    english_postagger=StanfordPOSTagger('stanford-postagger-2014-08-27/models/english-bidirectional-distsim.tagger','stanford-postagger-2014-08-27/stanford-postagger.jar')

    VP_list=[]

    POS_list=english_postagger.tag(sentence.split())

    '''for i in range(0, len(POS_list)):
        if POS_list[i][1] in ['NNS','NNP','NNPS']:
            NP_list.append(POS_list[i][0])'''


    return POS_list
def get_pos_tag(sen):#pass sentence dataframe
    st = StanfordPOSTagger('/home/sadhana/stanford-postagger-full-2015-12-09/models/english-left3words-distsim.tagger',path_to_jar=
                           '/home/sadhana/stanford-postagger-full-2015-12-09/stanford-postagger.jar')#,path_to_models_jar='/home/sadhana/stanford-postagger-full-2015-12-09/models')

    stanford_dir = st._stanford_jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    st._stanford_jar = ':'.join(stanford_jars)
    for i in list(sen.index.get_values()):
        t=st.tag(sen.loc[i,'Arg'].split())
        tags=[]
        for j in range(0,len(t)):
            tags.append(t[j][1])
        #print i
        sen.set_value(i,'POStag',tags)
    return sen
def pos_person_tagging(sentence):

    #Setting the path and jar files for the POS Tagger

    english_postagger=StanfordPOSTagger('stanford-postagger-2014-08-27/models/english-bidirectional-distsim.tagger','stanford-postagger-2014-08-27/stanford-postagger.jar')

    NP_list=[]

    POS_list=english_postagger.tag(sentence.split())

    for i in range(0, len(POS_list)):
        if POS_list[i][1] in ['NNS','NNP','NNPS']:
            NP_list.append(POS_list[i][0])


    return NP_list
Example #39
File: tagQTS.py Project: scylense/wenji
def main():

    initialize()
    # create tagger
    model = '../stanford-postagger/models/chinese-distsim.tagger'
    jar = '../stanford-postagger/stanford-postagger.jar'
    zhPOS = StanfordPOSTagger(model, jar)

    # streaming model: process each line in turn
    with io.open(INFILE, 'r', encoding='utf8') as qts, io.open(OUTFILE, 'w', encoding='utf8') as pos:

        for line in qts:
            qtsPOS = zhPOS.tag(line)
            s = " ".join("%s" % tup[1] for tup in qtsPOS) + "\n"
            pos.write(s)

    return()
Example #40
class POSTagger:

    def __init__(self, tagger_path, model_path, output_filename):
        self.st = StanfordPOSTagger(tagger_path, model_path)
        self.output_filename = output_filename
        try:
            os.remove(self.output_filename)
        except OSError:
            pass

    def output_knowledge(self, sentence):
        sentence += " ."
        s = ""
        with open(self.output_filename, "a") as file:
            for word, pos_tag in self.st.tag(sentence.split()):
                file.write(("%s\t%s\n" % (word, pos_tag)).encode("utf-8"))
            file.write("\n")
def get_pos_tag(sen):
    os.environ['CLASSPATH']='STANFORDTOOLSDIR/stanford-postagger-full-2015-12-09/stanford-postagger.jar' #set classpath to pos tagger
    os.environ['STANFORD_MODELS']='STANFORDTOOLSDIR/stanford-postagger-full-2015-12-09/models'
    st = StanfordPOSTagger('/home/sadhana/stanford-postagger-full-2015-12-09/models/english-left3words-distsim.tagger',path_to_jar=
                           '/home/sadhana/stanford-postagger-full-2015-12-09/stanford-postagger.jar')#,path_to_models_jar='/home/sadhana/stanford-postagger-full-2015-12-09/models')

    stanford_dir = st._stanford_jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    st._stanford_jar = ':'.join(stanford_jars)
    for i in list(sen.index.get_values()):
        t=st.tag(sen.loc[i,'Arg'].split())
        tags=[]
        for j in range(0,len(t)):
            tags.append(t[j][1])
        #print i
        sen.set_value(i,'POStag',tags)
    return sen
from nltk.tag import StanfordPOSTagger
from nltk.tokenize import word_tokenize

STANFORD_POS_MODEL_PATH = '압축을 푼 장소/models/english-bidirectional-distsim.tagger'  # '압축을 푼 장소' = directory where the tagger archive was unpacked
STANFORD_POS_JAR_PATH = '압축을 푼 장소/stanford-postagger-3.6.0.jar'

pos_tagger = StanfordPOSTagger(STANFORD_POS_MODEL_PATH, STANFORD_POS_JAR_PATH)

# A made-up example sentence; replace it with any sentence you want to practice with.
text = 'One day in November 2016, the two authors of this book, Seungyeon and Youngjoo, had a coffee at Red Rock cafe, which is a very popular place in Mountain View.'

tokens = word_tokenize(text)
print(tokens)  # Print the split tokens.
print()
print(pos_tagger.tag(tokens))  # Run POS tagging and print the result.

# Now pick out only the verbs and nouns.
noun_and_verbs = []
for token in pos_tagger.tag(tokens):
    if token[1].startswith('V') or token[1].startswith('N'):
        noun_and_verbs.append(token[0])
print(', '.join(noun_and_verbs))
    buf = 0
    for k in range(len(synonymSet_h)):
        for n in range(len(synonymSet_t)):
            ##############################modifying function######################
            #if synonymSet_h[k].wup_similarity(synonymSet_t[n]!=None):
            #    x.append(synonymSet_h[k].wup_similarity(SynonymSet_t[n]))
            if synonymSet_h[k].wup_similarity(synonymSet_t[n])!=None:
                if buf<synonymSet_h[k].wup_similarity(synonymSet_t[n]):
                    buf=synonymSet_h[k].wup_similarity(synonymSet_t[n])
    return buf

for m in root.findall("pair"):
    hypothesis=m.findtext("h").casefold()
    tokenized_hypothesis=nltk.word_tokenize(hypothesis)
#    tagged_tokenized_hypothesis=nltk.pos_tag(tokenized_hypothesis)    #nltk tagger
    tagged_tokenized_hypothesis=st.tag(tokenized_hypothesis)    #stanfordnlp tagger
    text=m.findtext("t").casefold()
    tokenized_text=nltk.word_tokenize(text)
#    tagged_tokenized_text=nltk.pos_tag(tokenized_text)    #nltk tagger
    tagged_tokenized_text=st.tag(tokenized_text)    #stanfordnlp tagger
    output.write("newhypo:\n")
    for i in range(len(tokenized_hypothesis)):
        output.write(tagged_tokenized_hypothesis[i][0])
        output.write(tagged_tokenized_hypothesis[i][1])
    output.write("newtext:\n")
    for j in range(len(tokenized_text)):
        output.write(tagged_tokenized_text[j][0])
        output.write(tagged_tokenized_text[j][1])
    output.write("value:\n")
    output.write(m.get("entailment"))
    output.write("\n")
__author__ = 'Anirudh'

import codecs
import nltk
from nltk.tag import StanfordPOSTagger
nltk.internals.config_java("C:\Program Files\Java\jdk1.8.0_60\\bin\java.exe")

import os
java_path = "C:\Program Files\Java\jdk1.8.0_60\\bin\java.exe"
os.environ['JAVAHOME'] = java_path


# st = StanfordPOSTagger('english-bidirectional-distsim.tagger')

st = StanfordPOSTagger('D:\Curriculum\Natural-Language-Processing\stanford-postagger-full-2015-12-09\stanford-postagger-full-2015-12-09\models\\arabic.tagger','D:\Curriculum\Natural-Language-Processing\stanford-postagger-full-2015-12-09\stanford-postagger-full-2015-12-09\stanford-postagger.jar')
#st = StanfordPOSTagger('D:\Curriculum\Natural-Language-Processing\stanford-postagger-full-2015-12-09\stanford-postagger-full-2015-12-09\models\\english-bidirectional-distsim.tagger','D:\Curriculum\Natural-Language-Processing\stanford-postagger-full-2015-12-09\stanford-postagger-full-2015-12-09\stanford-postagger.jar')

file="arabic_in.txt"
source = codecs.open(file,"r","utf-16-be")
destination = codecs.open("utf8encoder_out.txt","wb","utf-8")
contents=source.read()
destination.write(contents)

destination = codecs.open("utf8encoder_out.txt","r","utf-8")
contents2=destination.read()

print contents2.split()

print st.tag(contents2.split())
Example #45
readFile = (open(filename)).read()
paras = readFile.split('\n')
parasCopy = []
paraIndex = 0
for paragraph in paras:
	paraIndex += 1
	logging.info('Processing paragraph %d' %paraIndex)
	if not paragraph == '':
		name = ''
		paragraphCopy = ""
		sentenceList = getSentences(paragraph)
		sentenceIndex = 0
		for sentence in sentenceList:
			sentenceIndex += 1
			logging.info('Processing sentence %d' %sentenceIndex)
			tokens = POSTagger.tag(sentence.split())
			logging.info('POS Tagging of a sentence')
			nameAnalysis = getName (sentence, tokens)
			sentenceCopy = sentence
			if nameAnalysis[0] == '' and nameAnalysis[1] > 0 and not name == '':
				sentenceCopy = replacePRP(nameAnalysis[2], name, sentence)
			elif not nameAnalysis[0] == '' and nameAnalysis[3] == 1:
				name = nameAnalysis[0]
			if sentenceCopy.count('(') > 0 and not name == '':
				dateBucket = bracketProcess(sentenceCopy, tokens)
				sentenceCopy = bracketRemove(sentenceCopy)
				if not dateBucket == []:
					date_1 = dateBucket[0]
					sentence_1 = name + " was born in"
					for i in date_1:
						sentence_1 += " " + i
Example #46
        home_path + '/stanford-postagger.jar')

url_noun = []
url_not_noun = []
pos = ['NN', 'NNS', 'IN', 'JJ', 'JJS', 'RB', 'TO', 'PRP', 'PRP$', 'NNP', 'NNPS', 'DT', 'VBG', 'VBN', 'VBD']
count = 1

for path in paths:
    print(str(count) + '/' + str(len(paths)))
    count += 1
    isNoun = True
    print(path)
    # remove parameters in path, such as {id}, [id], :id, and split url by level, namely by '/'
    urls = re.sub('/?[\[{].*?[\]}]|/:\w+', '', path).replace('.json', '').lstrip('/').split('/')
    for url in urls:
        for word_pos in st.tag(get_divided_url(url)):
            # print st.tag(get_divided_url(url))
            if word_pos[1] not in pos:
                url_not_noun.append(path)
                isNoun = False
                break
        if not isNoun:
            break
    if not isNoun:
        continue
    url_noun.append(path)

# save result to swagger_statistic.json
swagger_statistic = OrderedDict()
swagger_statistic['host'] = host
swagger_statistic['basePath'] = basePath
Example #47
"""

######################################################################################

from nltk.tag import StanfordPOSTagger
jar = 'C:/Users/Etudiant/Documents/Tableau de bord/stanford-postagger-full-2018-02-27/stanford-postagger-3.9.1.jar'
model = 'C:/Users/Etudiant/Documents/Tableau de bord/stanford-postagger-full-2018-02-27/models/french.tagger'
import os
java_path = "C:/Program Files/Java/jdk1.8.0_151/bin/java.exe"
os.environ['JAVAHOME'] = java_path
pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')
words={}
tab2 = {}
for i in range(5):
   select=[]
   n=pos_tagger.tag(tab[i])
   stops_verb=['NC','N','NPP']
   for x in n:
       if x[1] in stops_verb:
           select.append(x[0])
           #sel = max(set(select), key=select.count)
   #tab2[i]=sel
   words={}
   for word in set(select):
       
       count = 0  
       for j in range(len(select)):
           if word == select[j]:
               count += 1
       words[word]= count
   tab2[i] = (sorted( words.items(), key = lambda x : -x[1]))[:2]
def Process_ZH(File):
    # Read file
    with open(File, 'r') as File:
        # print 'Opened'
        Input = File.readlines()


    try:
        for line in Input:
            # Checks if identifier is in the line
            if 'segment' in line:
                #Sets counter to be on and starts count at 0
                Annotations = []
                Annotation_Next = False
                Line = []
                Word_Count = 0

                # Switch to UTF-8 to ensure accurate counting
                Line_UTF8_Decode = line.decode('utf-8')
                Line_Split = Line_UTF8_Decode.split()

                for Split in Line_Split:
                    if 'feature' in Split:
                        Annotations.append([Split[17:-1], Word_Count, 0])
                        # print Annotations

                    elif 'state=' in Split:
                        Line_Temp = re.findall('>([^>]*)</', Split)
                        # print 'State_1'
                        if Line_Temp != []:
                            # Ensures that Line_Temp is a string
                            Line_Temp = Line_Temp[0]
                            Word_Count += 1
                            # print 'State_2'

                            # To ensure nested entities are parsed correctly
                            if Annotations[-1][2] != 0:
                                Annotation_Next = True
                                Length = range(len(Annotations))
                                for x in Length[::-1]:
                                    if Annotations[x][2] == 0 and Annotation_Next == True:
                                        Annotations[x][2] = Word_Count
                                        Annotation_Next = False
                            else:
                                Annotations[-1][2] = Word_Count
                            # print Annotations
                        elif Line_Temp == [] and '<segment' in Split[15:]:
                            pass
                        else:
                            Word_Count += 1
                            Line_Temp = Split[15:]
                            # print 'State 3'
                        if Line_Temp != []:
                            Line.append(Line_Temp)

                    elif '</segment>' in Split:
                        Seg_Split = Split.split('</segment>')
                        for x in Seg_Split:
                            if x != '':
                                Word_Count += 1
                                Line.append(x)
                            elif x == '':
                                if Annotations[-1][2] != 0:
                                # print 'Seg 2'
                                    Annotation_Next = True
                                    Length = range(len(Annotations))
                                    for x in Length[::-1]:
                                    # print Annotations[x][2]
                                        if Annotations[x][2] == 0 and Annotation_Next == True:
                                        # print 'Seg 3'
                                            Annotations[x][2] = Word_Count
                                            Annotation_Next = False
                                else:
                                    Annotations[-1][2] = Word_Count
                            # print Annotations

                        # if '<' not in Split[0]:
                        #     Word_Count += 1
                        #     print Split
                        #     Line_Temp = Split[:-10]
                        #     print Line_Temp
                        #     Line.append(Line_Temp)
                        #     # print 'Seg_1'
                        #     if Annotations[-1][2] != 0:
                        #         # print 'Seg 2'
                        #         Annotation_Next = True
                        #         Length = range(len(Annotations))
                        #         for x in Length[::-1]:
                        #             # print Annotations[x][2]
                        #             if Annotations[x][2] == 0 and Annotation_Next == True:
                        #                 # print 'Seg 3'
                        #                 Annotations[x][2] = Word_Count
                        #                 Annotation_Next = False
                        #     else:
                        #         Annotations[-1][2] = Word_Count
                        #     # print Annotations

                    elif '<segment' not in Split:
                        # print Split
                        Line.append(Split)
                        # Checks if Split is a punctuation character
                        if re.findall('[%s]' % zhon.hanzi.punctuation, Split) == [] and Split != ':':
                            Word_Count += 1
                Line_Done = ' '.join(Line)

                # Tags using StanfordPOSTagger

                ST = StanfordPOSTagger('~/Annotations/models/chinese-distsim.tagger', '~/Annotations/stanford-postagger.jar', encoding='utf-8')
                Tags = ST.tag(Line)
                Tags_Done = ''
                for x in Tags:
                    # print x
                    Tags_Done += x[1][-2:] + ' '

                # print Line_Done
                # print Tags_Done

                Annotations_Done = ''
                for x in Annotations:
                    Annotations_Done += str(x[1]) + ',' + str(x[2]) + ',' + str(x[1]) + ',' + str(x[2]) + ' ' + x[0].upper() + '|'
                # print Annotations_Done

                with open('Processed_Annotations.txt', 'a') as P_A:
                        P_A.write(Line_Done.encode('utf-8') + '\n')
                        P_A.write(Tags_Done + '\n')
                        P_A.write(Annotations_Done[:-1] + '\n' + '\n')
    except IndexError:
        pass
Example #49
#
# # Chinese named-entity recognition
# chi_tagger = StanfordNERTagger('chinese.misc.distsim.crf.ser.gz')
# sent = u'北海 已 成为 中国 对外开放 中 升起 的 一 颗 明星'
# for word, tag in chi_tagger.tag(sent.split()):
#     print word.encode('utf-8'), tag
#
# # English POS tagging
from nltk.tag import StanfordPOSTagger
# eng_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')
# print eng_tagger.tag('What is the airspeed of an unladen swallow ?'.split())
# # Chinese POS tagging
chi_tagger = StanfordPOSTagger('chinese-distsim.tagger')
# sent = u'北海 已 成为 中国 对外开放 中 升起 的 一 颗 明星'
sent = u'宫体 子宫 呈 垂直位 宫内膜 高 T2 信号 连续'
for _, word_and_tag in chi_tagger.tag(sent.split()):
    word, tag = word_and_tag.split('#')
    print(word, tag)


# Chinese/English constituency parsing; the only difference is the model used
from nltk.parse.stanford import StanfordParser
eng_parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
sent = list(u'子宫 呈 垂直位 , 宫内膜 高 T2 信号 连续'.split())
for tree in eng_parser.parse(sent):
    tree.pprint()


# Dependency parsing
from nltk.parse.stanford import StanfordDependencyParser
eng_parser = StanfordDependencyParser(model_path='edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
Example #50
            resultList.append(3)
        elif util.is_ending_withComma(token):
            resultList.append(4)
        else:
            resultList.append(5)
    return resultList



if __name__=="__main__":

    string = open(properties.test_raw).read()
    str_ = re.sub('[^a-zA-Z0-9\n\.\,\x7f-\xff]', ' ', string)
    resultList = classLabel(str_.split())
    cleaned_test_str = re.sub('[^a-zA-Z0-9\n\x7f-\xff]', ' ', string).lower()
    postag_t = st.tag(cleaned_test_str.split())
    text_file = open(properties.test_tagged_output_file, "w")
    invokeChunker(cleaned_test_str)
    chunkTags = extractChunkTags()

    postag = []
    l = [',','...','.','\'','!']
    for i in range(len(postag_t)):
        if postag_t[i][0] not in l:
            postag.append(postag_t[i])

    for i in range(len(postag)):
        tup = postag[i]
        token = tup[0]
        tag = tup[1]
        chunkTag = chunkTags[i][3]