Example #1
def get_postag_with_record(records, pairs):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0,
                            len(path) - 10) + 1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + 'stanford-postagger.jar'
    model = path + 'models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx, (record, pair) in enumerate(zip(records, pairs)):
        print('*' * 100)
        print('File: ' + record['name'])
        print('Input: ' + str(pair[0]))
        text = pos_tagger.tag(pair[0])
        print('[%d/%d][%d] : %s' %
              (idx, len(records), len(pair[0]), str(text)))
        tagged_source.append(text)

    return tagged_source
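A minimal usage sketch for get_postag_with_record, with the input shapes inferred from the function body: each record is a dict with at least a 'name' key, and the first element of each pair is a pre-tokenized source text (the second element is unused here). The values below are illustrative only, and the stanford-postagger directory the function expects must be present:

records = [{'name': 'doc1.txt'}]
pairs = [(['information', 'retrieval', 'is', 'fun'], None)]
tagged = get_postag_with_record(records, pairs)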
Example #2
def get_postag_with_index(sources, idx2word, word2idx):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0,
                            len(path) - 10) + 1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + 'stanford-postagger.jar'
    model = path + 'models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx in xrange(len(sources)):
        test_s_o = sources[idx]
        source_text = keyphrase_utils.cut_zero(test_s_o, idx2word)
        text = pos_tagger.tag(source_text)
        print('[%d/%d] : %s' % (idx, len(sources), str(text)))

        tagged_source.append(text)

    return tagged_source
Example #3
def load_pos_tagger(stanford_base_dir):
    jar = stanford_base_dir + '/stanford-postagger.jar'
    model = stanford_base_dir + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model_filename=model, path_to_jar=jar)

    stanford_base_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_base_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    return pos_tagger
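A hedged usage sketch for load_pos_tagger, assuming the Stanford POS Tagger distribution is unpacked at the hypothetical path below and Java is available; tag() expects an already tokenized sentence:

pos_tagger = load_pos_tagger('/path/to/stanford-postagger-full-2015-12-09')
print(pos_tagger.tag(['What', 'is', 'the', 'airspeed', 'of', 'an', 'unladen', 'swallow', '?']))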
Example #4
def load_pos_tagger():
    file_dir = os.path.dirname(__file__)
    path = os.path.join(file_dir[: file_dir.rfind('pykp') + 4], 'stanford-postagger')
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    return pos_tagger
Example #5
    def xxtest_StanfordPOSTagger(self):

        jar = '\\usr\\stanford-postagger-full-2015-12-09\\stanford-postagger.jar'
        model = '\\usr\\stanford-postagger-full-2015-12-09\\models\\english-left3words-distsim.tagger'

        tagger = StanfordPOSTagger(model, jar)

        stanford_dir = tagger._stanford_jar.rpartition('\\')[0]
        stanford_jars = find_jars_within_path(stanford_dir)
        # the classpath separator is ';' on Windows (':' elsewhere)
        tagger._stanford_jar = ';'.join(stanford_jars)

        text = tagger.tag(word_tokenize("What's the airspeed of an unladen swallow ?"))

        self.assertTrue(text is not None)
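Example #6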
def get_pos_tag(sen):  # pass in a sentence DataFrame
    st = StanfordPOSTagger('/home/sadhana/stanford-postagger-full-2015-12-09/models/english-left3words-distsim.tagger',
                           path_to_jar='/home/sadhana/stanford-postagger-full-2015-12-09/stanford-postagger.jar')
    # path_to_models_jar='/home/sadhana/stanford-postagger-full-2015-12-09/models' can also be passed if needed

    stanford_dir = st._stanford_jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    st._stanford_jar = ':'.join(stanford_jars)

    sen['POStag'] = None  # object column so each cell can hold a list of tags
    for i in list(sen.index):
        t = st.tag(sen.loc[i, 'Arg'].split())
        tags = [tag for _, tag in t]
        sen.at[i, 'POStag'] = tags  # .at replaces the deprecated set_value
    return sen
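Example #7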
def get_pos_tag(sen):
    # set the classpath to the POS tagger jar and models directory
    os.environ['CLASSPATH'] = 'STANFORDTOOLSDIR/stanford-postagger-full-2015-12-09/stanford-postagger.jar'
    os.environ['STANFORD_MODELS'] = 'STANFORDTOOLSDIR/stanford-postagger-full-2015-12-09/models'
    st = StanfordPOSTagger('/home/sadhana/stanford-postagger-full-2015-12-09/models/english-left3words-distsim.tagger',
                           path_to_jar='/home/sadhana/stanford-postagger-full-2015-12-09/stanford-postagger.jar')
    # path_to_models_jar='/home/sadhana/stanford-postagger-full-2015-12-09/models' can also be passed if needed

    stanford_dir = st._stanford_jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    st._stanford_jar = ':'.join(stanford_jars)

    sen['POStag'] = None  # object column so each cell can hold a list of tags
    for i in list(sen.index):
        t = st.tag(sen.loc[i, 'Arg'].split())
        tags = [tag for _, tag in t]
        sen.at[i, 'POStag'] = tags  # .at replaces the deprecated set_value
    return sen
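A minimal sketch of how these get_pos_tag variants might be called, assuming the hard-coded Stanford paths in the functions exist on the machine and the DataFrame has an 'Arg' column of raw sentences (the column names come from the function bodies; the sentences are illustrative):

import pandas as pd

sentences = pd.DataFrame({'Arg': ['the cat sat on the mat', 'dogs bark loudly']})
tagged = get_pos_tag(sentences)
print(tagged['POStag'].tolist())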
Example #8
def check_postag(config):
    train_set, validation_set, test_set, idx2word, word2idx = deserialize_from_file(
        config['dataset'])

    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0,
                            len(path) - 10) + 1] + 'stanford-postagger/'
    jar = path + 'stanford-postagger.jar'
    model = path + 'models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)

    for dataset_name in config['testing_datasets']:
        # override the original test_set
        # test_set = load_testing_data(dataset_name, kwargs=dict(basedir=config['path']))(idx2word, word2idx, config['preprocess_type'])

        test_sets = load_additional_testing_data(config['testing_datasets'],
                                                 idx2word, word2idx, config)
        test_set = test_sets[dataset_name]

        # print(dataset_name)
        # print('Avg length=%d, Max length=%d' % (np.average([len(s) for s in test_set['source']]), np.max([len(s) for s in test_set['source']])))
        test_data_plain = zip(*(test_set['source'], test_set['target']))

        test_size = len(test_data_plain)

        # Alternatively to setting the CLASSPATH add the jar and model via their path:
        jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
        # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
        model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'
        pos_tagger = StanfordPOSTagger(model, jar)

        for idx in xrange(len(test_data_plain)):  # len(test_data_plain)
            test_s_o, test_t_o = test_data_plain[idx]

            source = keyphrase_utils.cut_zero(test_s_o, idx2word)

            print(source)

            # Add other jars from Stanford directory
            stanford_dir = jar.rpartition('/')[0]
            stanford_jars = find_jars_within_path(stanford_dir)
            pos_tagger._stanford_jar = ':'.join(stanford_jars)

            text = pos_tagger.tag(source)
            print(text)
Example #9
from nltk.corpus import wordnet as wn
import pandas as pd

df = pd.DataFrame([])

###############################################################################
#               Import Stanford Tagger
###############################################################################
_stanford_url = 'https://nlp.stanford.edu/software/lex-parser.shtml'
jar = '/Users/clairekelleher/Desktop/Thesis/Fromdesktop/stanford-postagger-2015-12-09/stanford-postagger.jar'
model = '/Users/clairekelleher/Desktop/Thesis/Fromdesktop/stanford-postagger-2015-12-09/models/english-left3words-distsim.tagger'

pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')
stanford_dir = pos_tagger._stanford_jar.rpartition('/')[0]
stanford_jars = find_jars_within_path(stanford_dir)
pos_tagger._stanford_jar = ':'.join(stanford_jars)

###############################################################################
#               Import Stanford Parser
###############################################################################
_MAIN_CLASS = 'edu.stanford.nlp.parser.lexparser.LexicalizedParser'
os.environ[
    'STANFORD_PARSER'] = '/Users/clairekelleher/Desktop/Thesis/Fromdesktop/stanford-parser-full-2017-06-09/stanford-parser.jar'
os.environ[
    'STANFORD_MODELS'] = '/Users/clairekelleher/Desktop/Thesis/Fromdesktop/stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models.jar'
os.putenv(
    "CLASSPATH",
    "/Users/clairekelleher/Desktop/Thesis/Fromdesktop/stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models.jar"
)

path_to_jar_p = "/Users/clairekelleher/Desktop/Thesis/Fromdesktop/stanford-parser-full-2017-06-09/stanford-parser.jar"
from nltk.internals import find_jars_within_path
from nltk.tag import StanfordPOSTagger

# ---- 1. SETUP ENVIRONMENT VARIABLES ----

sjar = '/Users/nischikata/PycharmProjects/stanford-postagger-full-2015-12-09/stanford-postagger.jar'
model = '/Users/nischikata/PycharmProjects/stanford-postagger-full-2015-12-09/models/english-left3words-distsim.tagger'

# ---- 2. CREATE POS TAGGER ----
POS_TAGGER = StanfordPOSTagger(model, sjar)

#  ---- 3. ADD OTHER JARS FROM STANFORD DIRECTORY ----
# This should happen automatically when the CLASSPATH is set, but it does not, so these three lines do it explicitly:
stanford_dir = POS_TAGGER._stanford_jar.rpartition('/')[0]
stanford_jars = find_jars_within_path(stanford_dir)
POS_TAGGER._stanford_jar = ':'.join(stanford_jars)
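A short usage sketch for the POS_TAGGER configured above; the sentence is illustrative, and word_tokenize requires NLTK's punkt tokenizer data:

from nltk import word_tokenize

print(POS_TAGGER.tag(word_tokenize("What's the airspeed of an unladen swallow ?")))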
# configure Stanford POS Tagger
from nltk.tag import StanfordPOSTagger
from nltk.internals import find_jars_within_path
import platform

stanford_pos_dir = 'resources/libs/stanford-postagger-2015-12-09/'
eng_model_filename = stanford_pos_dir + 'models/english-left3words-distsim.tagger'
my_path_to_jar = stanford_pos_dir + 'stanford-postagger.jar'

tagger = StanfordPOSTagger(model_filename=eng_model_filename, path_to_jar=my_path_to_jar)
# https://gist.github.com/alvations/e1df0ba227e542955a8a
# http://stackoverflow.com/questions/34361725/nltk-stanfordnertagger-noclassdeffounderror-org-slf4j-loggerfactory-in-windo
stanford_jars = find_jars_within_path(stanford_pos_dir)
separator = ';' if 'Windows' in platform.platform() else ':'
tagger._stanford_jar = separator.join(stanford_jars)
# End configuration

class WLLR(object):
    def __init__(self, documents):
        super(WLLR, self).__init__()
        self.__documents = documents
        self.__set_contrast_indicator = set_fore_contrast_indicator | set_post_contrast_indicator

        print 'Initialize document info...'
        document_info = self.__init_document_info(documents)

        print 'Initialize dictionary...'
        self.__dictionary = {}
        self.__init_dictionary(document_info)