Example #1
def pos_tag(mots,
            jar=os.path.join(".", "models", "stanford-postagger",
                             "stanford-postagger-3.8.0.jar"),
            mdl=os.path.join(".", "models", "stanford-postagger",
                             "french-ud.tagger")):
    try:
        pos_tagger = StanfordPOSTagger(mdl, jar, encoding='utf8')
    except LookupError:
        java_path = r"C:\Program Files (x86)\Java\jre1.8.0_261\bin\java.exe"
        os.environ['JAVAHOME'] = java_path
        pos_tagger = StanfordPOSTagger(mdl, jar, encoding='utf8')
    tagged = pos_tagger.tag(mots)
    tags = [g for m, g in tagged]
    forced_det = ["au", "aux"]
    absent_of_table = ["PART", "SCONJ"]
    if any(item in mots
           for item in forced_det) or any(item in tags
                                          for item in absent_of_table):
        for i, couple in enumerate(tagged):
            mot = couple[0]
            gram = couple[1]
            if mot in forced_det:
                tagged[i] = (mot, "DET")
            if gram == "PART":
                tagged[i] = (mot, "ADV")
            if gram == "SCONJ":
                tagged[i] = (mot, "CONJ")
    return tagged
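A minimal usage sketch for the pos_tag function above (hedged: it assumes the jar and model really exist under ./models/stanford-postagger/, that a Java runtime is installed, and that the sample tokens and module-level imports shown here are illustrative additions, not part of the original snippet):

import os
from nltk.tag import StanfordPOSTagger  # required by pos_tag above

mots = ["je", "vais", "aux", "marchés"]
# Returns a list of (word, tag) tuples; "aux" is forced to DET by the post-processing rules.
print(pos_tag(mots))  # e.g. [('je', 'PRON'), ('vais', 'VERB'), ('aux', 'DET'), ('marchés', 'NOUN')]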
Example #2
def new_load_data(f_name):
    data = {}
    import os
    java_path = "C:/Program Files/Java/jdk1.8.0_121/bin/java.exe"
    os.environ['JAVAHOME'] = java_path
    st = StanfordPOSTagger('english-bidirectional-distsim.tagger',
                           'stanford-postagger.jar',
                           encoding='utf-8')

    with open(f_name, 'r') as file:
        for line in file:
            fields = line.split('\t')
            sent_id = fields[0]
            """if sent_id == 'sent1656':
                print('yay')"""
            data[sent_id] = {}
            data[sent_id][SENTENCE] = fields[1].strip('\n').split()
            data[sent_id][ENTITIES] = {}
            tokenized_sent = nltk.sent_tokenize(fields[1])
            for sent in tokenized_sent:
                chunk_id = 0
                for chunk in nltk.ne_chunk(st.tag(nltk.word_tokenize(sent))):

                    if hasattr(chunk, 'label'):
                        data[sent_id][ENTITIES][chunk_id] = (
                            chunk.label(), ' '.join(c[0] for c in chunk))
                        chunk_id += len([c[0] for c in chunk])
                        print(chunk.label(), ' '.join(c[0] for c in chunk))
                    else:
                        chunk_id += 1

                    # assert chunk_id < len(fields[1].split())
            #sent = st.tag(fields[1].split())
            #print(sent)
    return data
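A hedged call sketch for new_load_data: the function expects a tab-separated file whose first column is a sentence id and whose second column is the sentence text, and it relies on module-level SENTENCE and ENTITIES keys plus a resolvable Stanford model and jar; the constants and file name below are assumptions for illustration only.

SENTENCE, ENTITIES = 'sentence', 'entities'  # assumed module-level constants (not shown in the snippet)

# Hypothetical input line in sentences.tsv:
#   sent1<TAB>Barack Obama visited Paris .
data = new_load_data('sentences.tsv')
print(data['sent1'][SENTENCE])   # the whitespace-split sentence tokens
print(data['sent1'][ENTITIES])   # {chunk_id: (entity_label, entity_text), ...}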
def tag_tweet_twit(tweet,
                   tagger=StanfordPOSTagger('gate-EN-twitter-fast.model')):
    # get indices for words
    tokens = twit_token.tokenize(tweet['text'])  # twit_token is assumed to be a tokenizer instance
    text = " ".join(tokens)
    tags = tagger.tag(tokens)
    index_tags = []
    for word, tag in tags:
        try:
            index_tags = [{
                'indices': [text.index(word),
                            text.index(word) + len(word)],
                'category':
                tag
            } for word, tag in tags]
        except:
            try:
                index_tags = [{'category': tag} for word, tag in tags]
            except:
                error_id = tweet["_id"]
                print(error_id)
                return error_id

    tweet['entities'].update({'Token': index_tags})
    tagged_tweet = json.dumps(tweet) + "\n"
    with open(POS_dir + tok + DBname + "_twitIE_POS", 'a') as tagFile:
        tagFile.writelines(tagged_tweet)
    return ""
Example #4
def get_postag_with_index(sources, idx2word, word2idx):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0,
                            len(path) - 10) + 1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx in xrange(len(sources)):  # len(test_data_plain)
        test_s_o = sources[idx]
        source_text = keyphrase_utils.cut_zero(test_s_o, idx2word)
        text = pos_tagger.tag(source_text)
        print('[%d/%d] : %s' % (idx, len(sources), str(text)))

        tagged_source.append(text)

    return tagged_source
Example #5
 def __init__(self):
     self.feat_index = {}
     self.implication_words = ["demonstrate", "suggest", "indicate"]
     self.hyp_words = ["possible"]
     self.method_words = ["probe", "detect"]
     self.pos_tagger = StanfordPOSTagger(
         'english-bidirectional-distsim.tagger')
def newone(inpt):
    data = pd.read_csv('combinedproject.csv')
    jar = 'C:/Users/Deepanshu.pal/Documents/deepanshu/stanford-postagger-2018-10-16/stanford-postagger.jar'
    model = 'C:/Users/Deepanshu.pal/Documents/deepanshu/stanford-postagger-2018-10-16/models/english-left3words-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')
    data['bag'] = data.apply(lambda row: nltk.word_tokenize(row["Description"]), axis=1)
    stop_words = set(stopwords.words('english'))
    data['bag_remove_stopwords'] = data.apply(
        lambda ro: [w for w in ro['bag'] if w not in stop_words], axis=1)

    def text_process(mess):
        nopunc = [char for char in mess if char not in string.punctuation]
        nopunc = ''.join(nopunc)
        stem_data = [ps.stem(w) for w in nopunc.split()]
        return [word for word in stem_data
                if word.lower() not in stopwords.words('english')]

    ps = PorterStemmer()
    data['bag_stemming'] = data.apply(
        lambda ro: [ps.stem(w) for w in ro['bag_remove_stopwords']], axis=1)

    data['Description'].head(5).apply(text_process)
    bow_transformer = CountVectorizer(analyzer=text_process).fit(data['Description'])
    messages_bow = bow_transformer.transform(data['Description'])
    tfidf_transformer = TfidfTransformer().fit(messages_bow)
    messages_tfidf = tfidf_transformer.transform(messages_bow)

    new_project_description = inpt
    new_project_bow = bow_transformer.transform([new_project_description])
    new_project_vector = tfidf_transformer.transform(new_project_bow)
    result_array = cosine_similarity(new_project_vector, messages_tfidf)
    similar_score_array = result_array.flatten()
    data['similar_score'] = similar_score_array
    return data.sort_values(["similar_score"], axis=0, ascending=False)
def main():

    with open('/home/abhinav/PycharmProjects/video_enrichment/text.txt',
              'r') as myfile:
        text = myfile.read().replace('\n', '')

        # text = """Natural language processing (NLP) is a field of computer science, artificial intelligence and computational linguistics concerned with the interactions between computers and human (natural) languages, and, in particular, concerned with programming computers to fruitfully process large natural language corpora."""
    # text  = "Concepts present in text are outline of machine learning, data mining, statistics, cluster analysis, algorithms like logic, pseudo code."
    text = p.sub('', text)
    sentences = nltk.sent_tokenize(text)

    for sentence in sentences:
        sentence = sentence.lower()  # Lower Case the whole sentence
        sentence = p.sub(
            '', sentence)  # Removing anything enclosed within brackets
        print(sentence)

        ## TAGGING
        st_tag = StanfordPOSTagger(model_filename=eng_model_filename_pos,
                                   path_to_jar=my_path_to_pos_jar)
        tagged_sentence = st_tag.tag(word_tokenize(sentence))
        # print(tagged_sentence)

        ## ENTITY RECOGNITION
        # st_ner = StanfordNERTagger(model_filename=eng_model_filename_ner, path_to_jar=my_path_to_ner_jar)
        # print(st_ner.tag('Rami Eid is studying at Stony Brook University in NY'.split()))

        ## PARSING
        # print(parsing(sentence))

        ## Chunking Using Regex
        regex_exps = [
            "NP: {<JJ|NN.?>*<NN.?>}",
            "NP: {<JJ|NN.?>*<NN.?><IN>?<JJ|NN.?>*<NN.?>}", "NP: {<JJ>*<NN.?>+}"
        ]  # Include the following pattern to count conjunctions: "NP: {<JJ|NN.?>*<NN.?><CC>?<JJ|NN.?>*<NN.?>}"
        for grammar in regex_exps:
            IOB_tagged = chunking(tagged_sentence, grammar)
            remove_IOBtags(IOB_tagged)

    # print(concept_count)

    ## Prune concepts on word level using word frequency count on BBC corpus
    prune_concepts_WordLevel()
    print("Pruned concepts are:", pruned_concepts)

    ## Identify Wikipedia articles(titles) that match concepts extracted from the text if Jaccard Similarity is one or if wikipedia title is a part of concept extracted
    Wikipedia_aritcle_matching()
    print("\n", concept_wiki_article)
    print("\nFinal List Of Concepts:", final_wiki_concepts)
    # prereq_graph.add_nodes_from(final_wiki_concepts)

    wiki_based_similarity()

    Connected_components = nx.connected_components(un_prereq_graph)
    print("\n Pre-req Graph successfully created")
    # print("\nConnected Components: ")
    # print(Connected_components)
    nx.draw(prereq_graph, with_labels=True)
    plt.axis('off')
    plt.savefig("graph_prereq.png")
Example #8
    def __init__(
            self,
            java_path="C:/Program Files (x86)/Java/jre1.8.0_251/bin/java.exe",
            stopwords=default_stopwords,
            symbols="""#§_-@+=*<>()[]{}/\\"'""",
            punct="""!;:,.?-..."""):

        # Load the path of the directory containing the current file
        dir_path = os.path.dirname(os.path.abspath(__file__))

        # Load the lexicon
        f = open(dir_path + '/data/lexique.txt', 'r', encoding="utf8")
        self.lexique = dict(eval(f.read()))

        # Define the stopwords
        self.stopwords = stopwords
        self.symbols = symbols
        self.punct = punct

        ## TAGGER INITIALIZATION

        # Set up the paths for the StanfordPOSTagger
        jar = dir_path + '/stanford-postagger-full-2018-10-16/stanford-postagger-3.9.2.jar'
        model = dir_path + '/stanford-postagger-full-2018-10-16/models/french.tagger'
        self.java_path = java_path
        os.environ['JAVAHOME'] = self.java_path

        # Initialize the StanfordPOSTagger
        self.pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')
Example #9
    def __init__(self):

        classifier_path1 = "stanford/english.muc.7class.distsim.crf.ser.gz"

        # scenario 1
        # classifier_path2 = "stanford/id-ner-model-half.ser.gz"
        # scenario 2
        # classifier_path2 = "stanford/id-ner-model-id.ser.gz"
        # scenario 3
        # classifier_path2 = "stanford/id-ner-model-2.ser.gz"
        ner_jar_path = "stanford/stanford-ner.jar"

        # raise the JVM heap limit to avoid memory errors inside nltk internals
        nltk.internals.config_java(options='-Xmx5g')  # JVM flags are case-sensitive ('-Xmx', not '-xmx')

        self.pre = Preprocess()
        self.scp = StanfordParser(
            './stanford/stanford-parser.jar',
            './stanford/stanford-parser-3.9.1-models.jar',
            encoding='utf8')
        self.ner_tagger = StanfordNERTagger(classifier_path1,
                                            ner_jar_path,
                                            encoding='utf8')  # for scenario 3
        self.pos_tagger = StanfordPOSTagger(
            './stanford/english-bidirectional-distsim.tagger',
            './stanford/stanford-postagger.jar',
            encoding='utf8')
        # combining classifier from Stanford with custom classifier
        # self.com_tagger = NERComboTagger(classifier_path1,ner_jar_path,stanford_ner_models=classifier_path1+","+classifier_path2) #for scenario 1 and 2
        self.core_nlp = StanfordCoreNLP('http://localhost', port=9000)
Example #10
def update_training_data(usr_input,label,command):
	format_input = ""
	st = StanfordPOSTagger(config['tagger']['model'],path_to_jar=config['tagger']['path'])
	tags = st.tag(usr_input.split())
	print(tags)
	with open(MAPPING_PATH,'r') as data_file:    
		data = json.load(data_file)
		for pos,tag in enumerate(tags):
			if(tag[1] != "NNP"):
				format_input += tag[0]
				format_input += " "
		data[label].append(format_input)
		with open(MAPPING_PATH, "w") as jsonFile:
			jsonFile.write(json.dumps(data, sort_keys=False, indent=4))
	with open(TRAINDATA_PATH,'r') as data_file:
		data = json.load(data_file)
		add_dict = {
			"text" : format_input,
			"label" : label
		}
		data.append(add_dict)
		with open(TRAINDATA_PATH, "w") as jsonFile:
			jsonFile.write(json.dumps(data, sort_keys=False, indent=4))
	with open(COMMAND_PATH,'r') as data_file:
		data = json.load(data_file)
		add_dict = {
			format_input : command
		}
		data[label].update(add_dict)
		with open(COMMAND_PATH,"w") as jsonFile:
			jsonFile.write(json.dumps(data, sort_keys=False, indent=4))
	print('Added')
Example #11
def handleMessage(sid, txt):
    tagger = StanfordPOSTagger(_path_to_model,
                               path_to_jar=_path_to_jar,
                               java_options='-mx4096m')
    tagged = tagger.tag(nltk.word_tokenize(txt))
    responseMessage = str(tagged)
    sendResponse(sid, responseMessage)
def french_processing_text(df, text_field, POS_JAR, POS_MODEL):
    tokenizer = RegexpTokenizer(r'\w+')
    pos_tagger = StanfordPOSTagger(POS_MODEL, POS_JAR, encoding='utf8')
    lemmatizer = FrenchLefffLemmatizer()
    stop = stopwords.words('french')

    # Store processed original dataset
    new_text = []

    # Store normalized words (1-gram) and its POS
    words = []
    pos = []

    for text in df[text_field]:  # a text may have multiple sentences
        new_sentences = ''
        # Treat one sentence at a time, since the POS tagging works sentence by sentence
        for sentence in text.split('.'):
            processed_sentence = french_processing_sentence(
                sentence, tokenizer, pos_tagger, lemmatizer, stop)
            new_sentences += ' '.join(processed_sentence) + ' '
        new_text.append(new_sentences)

    df['processed_text'] = new_text

    return df
Example #13
def get_word_dependencies(text):
    dependencies = {}
    dep_parser = StanfordDependencyParser(
        model_path=osp.join(
            datadir,
            "stanford_data/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
        ),
        java_options="-mx4g -XX:-UseGCOverheadLimit")
    # StanfordPOSTagger expects the model first, then the jar
    st = StanfordPOSTagger(
        osp.join(datadir, 'stanford_pos/models/english-bidirectional-distsim.tagger'),
        osp.join(datadir, 'stanford_pos/stanford-postagger-3.9.1.jar'),
        java_options='-mx4g -XX:-UseGCOverheadLimit')
    stanford_dir = st._stanford_jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    st._stanford_jar = ':'.join(stanford_jars)
    result = dep_parser.raw_parse(text)
    dep = result.__next__()
    #print(list(dep.triples()))
    for i in list(dep.triples()):
        w1 = i[0][0]
        w2 = i[2][0]
        if w1 in dependencies:
            dependencies[w1].append((w2, i[1]))
        else:
            dependencies[w1] = [(w2, i[1])]
    #print(dependencies)
    return dependencies
Example #14
def call_reia():

    max_score = 0.1
    map_val = ""

    print('-----------------------')
    user_input = raw_input("enter the string: ")
    #user_name = get_username(first_line.split(' ', 1)[0])
    suggest_list = []
    suggest_message = ""
    #prev_ts = ts
    print("\nINPUT = ")
    print(user_input)
    label = classify(user_input)
    if label == "":
        post_message(
            "Sorry, I could not understand. Please rephrase and try again.")
        consume_message()

    print("Classified as : " + str(label))
    tokens = nltk.word_tokenize(user_input)
    print(tokens)
    st = StanfordPOSTagger(config['tagger']['model'],
                           path_to_jar=config['tagger']['path'])
    stanford_tag = st.tag(user_input.split())
    print("Tags")
    print(stanford_tag)
    """with open(MAPPING_PATH,'r') as data_file:    
Example #15
    def __init__(self,
                 use_stanford=False,
                 NER_model=None,
                 NER_tagger=None,
                 POS_model=None,
                 POS_tagger=None):
        """The initializer of the class

        :param NER_model: NER model path
        :param NER_tagger: NER tagger path
        :param POS_model: POS model path
        :param POS_tagger: POS tagger path
        :param use_stanford: boolean, if using stanford NER and POS tagging

        """
        self.NER_model = NER_model
        self.NER_tagger = NER_tagger
        self.POS_model = POS_model
        self.POS_tagger = POS_tagger
        self.use_stanford = use_stanford

        if use_stanford:
            if NER_model is None or NER_tagger is None or POS_model is None \
               or POS_tagger is None:
                sys.exit('tagging initialization: Stanford models and taggers'
                         ' have to be provided!')
            else:
                self.post = StanfordPOSTagger(self.POS_model,
                                              self.POS_tagger).tag
                self.nert = StanfordNERTagger(self.NER_model,
                                              self.NER_tagger).tag
        else:
            self.post = nltk.pos_tag
            self.nert = nltk.ne_chunk
Example #16
def segment_pos(dir='rawdata', datetype='all', outdir='nohref_seg'):
    jieba.set_dictionary('dict/dict.txt.big')
    for tag in loadTag():
        jieba.add_word(tag)

    chinese_postagger = StanfordPOSTagger('tagger/chinese-distsim.tagger',
                                          'tagger/stanford-postagger.jar',
                                          encoding='utf-8')

    for file in parseDateType(dir, datetype):
        dirname, filename = os.path.split(file)
        head = filename.split('.')[0]
        outfile = outdir + '/' + head + '.txt'
        if os.path.isfile(outfile):
            print 'pass %s...' % head
            continue

        print 'segment %s ...' % head
        f = open(outfile, 'w')
        dataList = readJson(file)
        p = re.compile("http[s]?://.*\n")
        for data in dataList:
            content = data['content']
            content = re.sub(p, '', content)
            segList = jieba.cut(content)
            wordList, tagList = postagging(chinese_postagger, segList)
            for w, t in zip(wordList, tagList):
                f.write(w.encode('utf-8'))
                f.write(' ')
                f.write(t)
                f.write(' ')
            f.write('\n')
        f.close()
Example #17
File: reia.py Project: aditi2205/reia2
def call_reia():
    while (True):
        max_score = 0.1
        map_val = ""
        with open('/media/ubuntu/Local Disk/MAJOR_PROJECT/REIA/mqueue.txt',
                  'r') as f:
            first_line = f.readline()
            while first_line == "":
                time.sleep(1)
                call_reia()
        print('-----------------------')
        user_input = first_line.split(' ', 1)[1]
        user_name = get_username(first_line.split(' ', 1)[0])
        suggest_list = []
        suggest_message = ""
        #prev_ts = ts
        print("\nINPUT = ")
        print(user_input)
        label = classify(user_input)
        if label == "":
            post_message(
                "Sorry, I could not understand. Please rephrase and try again."
            )
            consume_message()
            continue
        print("Classified as : " + str(label))
        tokens = nltk.word_tokenize(user_input)
        print(tokens)
        st = StanfordPOSTagger(config['tagger']['model'],
                               path_to_jar=config['tagger']['path'])
        stanford_tag = st.tag(user_input.split())
        print("Tags")
        print(stanford_tag)
        with open(MAPPING_PATH, 'r') as data_file:
            data = json.load(data_file)
        for i in data[label]:
            dist = jf.jaro_distance(str(user_input), str(i))
            suggest_list.append(tuple((dist, i)))
            print(dist)
            if (dist > max_score):
                max_score = dist
                map_val = i
        if max_score < config['preferences']['similarity_threshold']:
            post_message(
                "Sorry, I could not understand. Please rephrase and try again."
            )
            consume_message()
            if config['preferences']['suggestions'] == True:
                suggest = suggestions(suggest_list)
                post_message("Did you mean :")
                for i in suggest:
                    suggest_message += (str(i[1]) + "\n")
                post_message(suggest_message)
            continue
        print("\nMapped to : " + map_val)
        #post_message(map_val)
        construct_command(user_input, label, tokens, map_val, stanford_tag,
                          exec_command, user_name)
        #call('sed -i -e "1d	" REIA/mqueue.txt')
        consume_message()
def emotion_pos_tagging():
    tag_target = ['V', 'N', 'J', 'R']
    tag_list = []

    # Read in the word dictionary Excel file
    df_emotion = open_emotion_dataframe()

    # 품사 태깅
    for word in df_emotion['영어']:
        STANFORD_POS_MODEL_PATH = "path/english-bidirectional-distsim.tagger"
        STANFORD_POS_JAR_PATH = "path/stanford-postagger-3.9.2.jar"

        pos_tagger = StanfordPOSTagger(STANFORD_POS_MODEL_PATH,
                                       STANFORD_POS_JAR_PATH)

        pos = pos_tagger.tag([word])
        tag_first = pos[0][1][0]
        if tag_first in tag_target:
            if tag_first == 'V':
                tag_list.append('동사')  # verb
            if tag_first == 'N':
                tag_list.append('명사')  # noun
            if tag_first == 'J':
                tag_list.append('형용사')  # adjective
            if tag_first == 'R':
                tag_list.append('부사')  # adverb
        else:
            tag_list.append('')
    df_emotion['품사'] = tag_list  # '품사' = part of speech

    # Write out the POS-tagged, expanded word dictionary DataFrame ('감정 단어' = emotion words)
    df_emotion.to_excel(f"../res/dic/감정 단어.xlsx")
Example #19
    def __init__(self):
        self.root_path = '../Models/stanfordNLP/'

        # word segmenter
        self.segmenter = StanfordSegmenter(
            path_to_jar=self.root_path + "stanford-segmenter.jar",
            path_to_slf4j=self.root_path + "log4j-over-slf4j.jar",
            path_to_sihan_corpora_dict=self.root_path + "segmenter/",
            path_to_model=self.root_path + "segmenter/pku.gz",
            path_to_dict=self.root_path + "segmenter/dict-chris6.ser.gz")

        # pos tagger
        self.posTagger = StanfordPOSTagger(
            self.root_path + 'pos-tagger/chinese-distsim.tagger',
            path_to_jar=self.root_path + "stanford-postagger.jar")

        # named entity recognizer
        self.nerTagger = StanfordNERTagger(
            self.root_path + 'ner/chinese.misc.distsim.crf.ser.gz',
            path_to_jar=self.root_path + 'stanford-ner.jar')

        self.parser = StanfordDependencyParser(
            model_path=self.root_path + 'lexparser/chinesePCFG.ser.gz',
            path_to_jar=self.root_path + 'stanford-parser.jar',
            path_to_models_jar=self.root_path +
            'stanford-parser-3.7.0-models.jar',
            encoding='gbk')
Example #20
def POSTagger(postag_folder_name='', 
		postag_folder='', 
		postag_model='', 
		postag_jarpath=''):
	###
	# default_postag_folder_name = 'stanford-postagger-full-2015-12-09'
	default_postag_folder_name = 'stanford-postagger-full-2017-06-09'
	if len(postag_folder_name)==0:
		postag_folder_name = default_postag_folder_name
	###
	default_postag_folder = os.path.join(os.path.expanduser('~'), 'Stanford NLP', postag_folder_name)
	if len(postag_folder)==0:
		postag_folder = default_postag_folder
	###
	default_postag_model = os.path.join(postag_folder,'models', 'chinese-distsim.tagger')
	if len(postag_model)==0:
		postag_model = default_postag_model
	###
	default_postag_jarpath = os.path.join(postag_folder,'stanford-postagger.jar')
	if len(postag_jarpath)==0:
		postag_jarpath = default_postag_jarpath
	###
	tagger = StanfordPOSTagger(postag_model, postag_jarpath)
	###
	return tagger
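A short usage sketch for the POSTagger factory above (hedged: it assumes the default layout ~/Stanford NLP/stanford-postagger-full-2017-06-09/ with its models/ folder exists, and the sample tokens are illustrative; the chinese-distsim model expects pre-segmented input):

tagger = POSTagger()                        # fall back to all the default paths
tokens = ['我', '爱', '自然', '语言', '处理']  # already segmented, e.g. by jieba
print(tagger.tag(tokens))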
Example #21
def pos_tag(review):
    eng_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')
    tmp = eng_tagger.tag(review)
    result = []
    for element in tmp:
        result.append(element[1])
    return result
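The one-argument constructor above only works if NLTK can locate both the model and stanford-postagger.jar on its own; a common setup (hedged, and version-dependent) is to export STANFORD_MODELS for the model directory and CLASSPATH for the jar before building the tagger. The paths and sample sentence below are placeholders:

import os

# Assumed install locations; adjust to the local Stanford POS tagger download.
os.environ['STANFORD_MODELS'] = '/opt/stanford-postagger/models'
os.environ['CLASSPATH'] = '/opt/stanford-postagger/stanford-postagger.jar'

print(pos_tag('The quick brown fox jumps over the lazy dog'.split()))
# e.g. ['DT', 'JJ', 'JJ', 'NN', 'VBZ', 'IN', 'DT', 'JJ', 'NN'] (tags only, per the function above)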
Example #22
class KGQAPOSTagger:
    """
    Parts-of-Speech and Named Entity Recognition taggers, based on Stanford Taggers (https://www.nltk.org/_modules/nltk/tag/stanford.html)
    """
    _POSTagger = StanfordPOSTagger(
        #model_filename='stanford-postagger-2018-10-16/models/english-bidirectional-distsim.tagger',
        model_filename='stanford-postagger-2018-10-16/models/english-left3words-distsim.tagger',
        path_to_jar="stanford-postagger-2018-10-16/stanford-postagger.jar")

    _NERTagger = StanfordNERTagger(
        model_filename='stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
        path_to_jar='stanford-ner-2018-10-16/stanford-ner.jar',
        encoding='utf-8')

    def __init__(self):
        _empty = 0

    def tag(self, sentence, ner=True):
        """
            POS and optional NER tagging
        :param sentence: sentence to tag
        :param ner: if True, also perform NER tagging
        :return: list of POS-word tuples and list of NER-word tuples (if ner was set to True)
        """
        if isinstance(sentence, list):
            pos_tags = KGQAPOSTagger._POSTagger.tag(sentence)
        else:
            pos_tags = KGQAPOSTagger._POSTagger.tag(sentence.split())
        if ner:
            tokens = nltk.tokenize.word_tokenize(sentence)
            ner_tags = KGQAPOSTagger._NERTagger.tag(tokens)
        else:
            ner_tags = None
        return pos_tags, ner_tags
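A brief usage sketch for KGQAPOSTagger (hedged: the hard-coded relative paths require the stanford-postagger-2018-10-16 and stanford-ner-2018-10-16 directories to sit next to the script, and the sample outputs are only indicative):

tagger = KGQAPOSTagger()
pos_tags, ner_tags = tagger.tag("Barack Obama was born in Hawaii")
print(pos_tags)   # e.g. [('Barack', 'NNP'), ('Obama', 'NNP'), ('was', 'VBD'), ...]
print(ner_tags)   # e.g. [('Barack', 'PERSON'), ('Obama', 'PERSON'), ..., ('Hawaii', 'LOCATION')]

# Pre-tokenized input and POS tags only, skipping the NER pass:
pos_only, _ = tagger.tag(['Barack', 'Obama'], ner=False)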
Example #23
def _preprpcessing_eng(id_list):
    stop_w = set(stopwords.words('english'))
    eng_tagger = StanfordPOSTagger(
        model_filename='/home/zhouh/Downloads/stanford-postagger-full-2018-02-27/models/english-bidirectional-distsim.tagger',
        path_to_jar='/home/zhouh/Downloads/stanford-postagger-full-2018-02-27/stanford-postagger.jar')

    for i in id_list:
        try:
            text = read_file('/home/zhouh/Thesis/code/Transcripts/english/' +
                             i + '.txt')
            words = nt.word_tokenize(text, language='english')
            word = [x for x in words if x not in string.punctuation]
            word = [x for x in word if x not in stop_w]
            word = [x for x in word if not x.isdigit()]
            word = eng_tagger.tag(word)
            tt = ''
            for w in word:
                tt += '/'.join(w) + ' '

            new_path = '/home/zhouh/Thesis/code/Transcripts/eng_preprocessed/' + i + 'pre.txt'
            if os.path.exists(new_path):
                os.remove(new_path)
            with open(new_path, 'w') as f:
                f.write(tt)
        except:
            continue
Example #24
def load_pos(tagger_path, model_path, tagset):
    # detect model type
    if model_path.endswith('RDR'):
        return POSModelWrapper(RDRPOSTagger(model_path), tagset)
    else:
        return POSModelWrapper(StanfordPOSTagger(model_path, tagger_path, 
                                                 'utf8'), tagset)
Example #25
def get_postag_with_record(records, pairs):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0,
                            len(path) - 10) + 1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx, (record, pair) in enumerate(zip(records,
                                             pairs)):  # len(test_data_plain)
        print('*' * 100)
        print('File: ' + record['name'])
        print('Input: ' + str(pair[0]))
        text = pos_tagger.tag(pair[0])
        print('[%d/%d][%d] : %s' %
              (idx, len(records), len(pair[0]), str(text)))
        tagged_source.append(text)

    return tagged_source
Example #26
 def __init__(self, ):
     super(Lemmatizer, self).__init__()
     self.basename = 'lemmatized'
     self.pos_tagger = StanfordPOSTagger(
         'english-left3words-distsim.tagger', java_options='-mx1024m')
     self.lemmatizer = WordNetLemmatizer()
     self.max_length = 500
def tag_tweet_zub_full(tweet,
                       tagger=StanfordPOSTagger('gate-EN-twitter-fast.model')):
    # get indices for words
    if tweet.get('full_text', None):
        tokens = nltk.word_tokenize(
            re.sub(r'([^\s\w]|_)+', '', tweet['full_text']))
    else:
        #        print(tweet['full_text'])
        tokens = nltk.word_tokenize(re.sub(r'([^\s\w]|_)+', '', tweet['text']))
    text = " ".join(tokens)
    tags = tagger.tag(tokens)
    index_tags = []
    for word, tag in tags:
        try:
            index_tags = [{
                'indices': [text.index(word),
                            text.index(word) + len(word)],
                'category':
                tag
            } for word, tag in tags]
        except:
            try:
                index_tags = [{'category': tag} for word, tag in tags]
            except:
                error_id = tweet["_id"]
                print(error_id)
                return error_id

    tweet['entities'].update({'Token': index_tags})
    tagged_tweet = json.dumps(tweet) + "\n"
    with open(POS_dir + tok + DBname + "_twitIE_POS_FULL_TEXT",
              'a') as tagFile:
        tagFile.writelines(tagged_tweet)
    return ""
Example #28
def fr_words(DATA_PATH, candidats) :

    import pandas as pd
    import operator
    import nltk
    from nltk.corpus import stopwords
    
    #nltk stanford french tagger
    from nltk.tag import StanfordPOSTagger
    jar = 'C:/Users/user/Downloads/stanford-postagger-full-2018-02-27/stanford-postagger-full-2018-02-27/stanford-postagger-3.9.1.jar'
    model = 'C:/Users/user/Downloads/stanford-postagger-full-2018-02-27/stanford-postagger-full-2018-02-27/models/french.tagger'
    import os
    java_path = "C:/ProgramData/Oracle/Java/javapath/java.exe"
    os.environ['JAVAHOME'] = java_path
    pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8' )
    
    # tokenizer (strips the #, @ and punctuation...)
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    
    # read the tweets
    df = pd.read_csv(DATA_PATH)
    df = df[df['text'].notnull()]
    a = len(df)
    fr_words = [[] for i in range (len(candidats))]
    indesirable = ["RT","https","http","c","les", "et", "ça","coach", "ils","thevoice", "quand", "donc","thevoice_tf1" ]
    
    for j in range (len(candidats)):   
        count = dict() 
        candidat = candidats[j]
        for i in range (0,a) :  
            if i in [ 7224, 16457,16458,22348,22349,22350,22351,22352, 22353,22354,22355] : 
                continue 
            else : 
                line = df.at[i,'text']
                tokenized = tokenizer.tokenize(line)
                # keep only the words that are not stop words (de, que, dans...)
                # and lowercase them
                words = [ w.lower() for w in tokenized if (w not in stopwords.words('french') and w not in indesirable)]
                if set(candidat) & set(words):
                    for word in words :
                        
                        if word in count.keys() :
                            count[word] += 1
                        else :
                            count[word] = 1
                else:
                    continue
                
    
        count = sorted(count.items(), key=operator.itemgetter(1), reverse = True)
        
        fr_words1 = count [0:50]
        
        # remove all the verbs
        for element in fr_words1 : 
            if pos_tagger.tag(element[0].split())[0][1] not in ['VINF','V'] :
                fr_words[j].append(element)
            else :
                continue
    return fr_words
Example #29
def features(text):
    # POS-Tagging
    tagged = StanfordPOSTagger(model_filename=model_filename,
                               path_to_jar=path_to_jar,
                               encoding='utf8',
                               verbose=False,
                               java_options='-mx3000m')
    classified_word = tagged.tag(nltk.word_tokenize(text))
    text_postags = []
    for index_classified in classified_word:
        text_postags.append(index_classified[1])
    freq_pos = nltk.FreqDist(text_postags)
    adverb, adjective, noun, pronoun, verb = 0, 0, 0, 0, 0
    for index_freq in freq_pos.most_common(len(freq_pos)):
        if index_freq[0] in ["RB", "RBR", "RBS"]:
            adverb += index_freq[1]
        elif index_freq[0] in ["JJ", "JJR", "JJS"]:
            adjective += index_freq[1]
        elif index_freq[0] in ["NN", "NNS", "NNP", "NNPS"]:
            noun += index_freq[1]
        elif index_freq[0] in ["PRP", "PRP$"]:
            pronoun += index_freq[1]
        elif index_freq[0] in ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]:
            verb += index_freq[1]
    X_test = []
    X_test.extend(
        (adverb / adjective, adverb / noun, adverb / pronoun, adjective / verb,
         adjective / pronoun, noun / verb, noun / pronoun, verb / pronoun))
    return X_test
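A hedged call sketch for features above: model_filename and path_to_jar are module-level globals in the original project (the values below are placeholders), and the input text needs at least one adjective, noun, pronoun and verb, otherwise the ratio computation divides by zero:

model_filename = 'models/english-bidirectional-distsim.tagger'  # assumed path
path_to_jar = 'stanford-postagger.jar'                          # assumed path

x = features("She quickly read the long book because it was interesting.")
print(x)  # eight POS-ratio features, starting with adverb/adjective and adverb/noun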
Example #30
def TagProb(Readfile, file):
    if path.exists(sys.path[0] + '/Preparation/save/data/' + file):
        remove(sys.path[0] + '/Preparation/save/data/' + file)
    tagger = StanfordPOSTagger(model_filename, path_to_jar)
    WordDict = {}
    for line in open(Readfile):
        sentence = tagger.tag(line.split())
        for WordTag in sentence:
            if WordTag[0] not in WordDict.keys():
                WordDict[WordTag[0]] = {}
                WordDict[WordTag[0]][WordTag[1]] = 1
            else:
                if WordTag[1] not in WordDict[WordTag[0]].keys():
                    WordDict[WordTag[0]][WordTag[1]] = 1
                else:
                    WordDict[WordTag[0]][
                        WordTag[1]] = 1 + WordDict[WordTag[0]][WordTag[1]]
    for word in WordDict.keys():
        sum_freq = 0
        for tag in WordDict[word].keys():
            sum_freq = WordDict[word][tag] + sum_freq
        for tag in WordDict[word].keys():
            WordDict[word][tag] = WordDict[word][tag] / sum_freq
    with open(file, 'a', encoding='utf-8') as Writer:
        for word in WordDict.keys():
            Writer.write(str(word) + ':' + str(WordDict[word]) + '\n')
    return WordDict
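A usage sketch for TagProb (hedged: model_filename and path_to_jar are module-level globals in the original project and are assumed to resolve to a valid Stanford model and jar; the file names below are illustrative). Each word in the input file is mapped to the relative frequency of every tag the Stanford tagger assigned it, and the same mapping is appended line by line to the output file:

word_dict = TagProb('corpus.txt', 'tag_probabilities.txt')
# Illustrative shape of the result (and of each output line, written as word:{tag: probability}):
#   {'book': {'NN': 0.75, 'VB': 0.25}, ...}
print(word_dict.get('book'))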