Example No. 1
from nltk.corpus import stopwords
from nltk.internals import find_jars_within_path
from nltk.parse import stanford


def Init():
    parser = stanford.StanfordDependencyParser(model_path="./stanford_libs/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    # Add the sibling jars next to the parser jar to the classpath.
    stanford_dir = parser._classpath[0].rpartition('/')[0]
    parser._classpath = tuple(find_jars_within_path(stanford_dir))

    '''st = StanfordNERTagger('./stanford_libs/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz',
                              './stanford_libs/stanford-ner-2015-12-09/stanford-ner.jar')
    stanford_dir = st._stanford_jar.rpartition('/')[0]
    st._stanford_jar = ':'.join(find_jars_within_path(stanford_dir))
    '''
    stop = stopwords.words('english')
    return parser, None, stop
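A minimal usage sketch for Init() (not part of the original example): it assumes the same ./stanford_libs layout and simply dependency-parses one illustrative sentence.

# Hedged usage sketch; assumes the ./stanford_libs layout that Init() expects.
parser, _, stop = Init()
graphs = parser.raw_parse("The quick brown fox jumps over the lazy dog.")
dep = next(graphs)          # raw_parse yields DependencyGraph objects
print(list(dep.triples()))  # ((head, tag), relation, (dependent, tag)) triples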
Example No. 2
    def __init__(self,
                 model_filename,
                 path_to_jar=None,
                 encoding='utf8',
                 verbose=False,
                 java_options='-mx1000m'):

        if not self._JAR:
            warnings.warn(
                'The StanfordTagger class is not meant to be '
                'instantiated directly. Did you mean StanfordPOSTagger or StanfordNERTagger?'
            )
        self._stanford_jar = find_jar(self._JAR,
                                      path_to_jar,
                                      searchpath=(),
                                      url=_stanford_url,
                                      verbose=verbose)

        self._stanford_model = find_file(model_filename,
                                         env_vars=('STANFORD_MODELS', ),
                                         verbose=verbose)

        # Adding logging jar files to classpath
        stanford_dir = os.path.split(self._stanford_jar)[0]
        self._stanford_jar = tuple(find_jars_within_path(stanford_dir))

        self._encoding = encoding
        self.java_options = java_options
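Example No. 3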
    def nonlocal_ner_tag_tokens(self):
        home = expanduser("~")
        os.environ['CLASSPATH'] = home + '/stanford-ner-2015-12-09'
        os.environ['STANFORD_MODELS'] = home + '/stanford-ner-2015-12-09/classifiers'

        st = StanfordNERTagger("english.all.3class.distsim.crf.ser.gz",
                               java_options='-mx4000m')

        stanford_dir = st._stanford_jar[0].rpartition('/')[0]
        stanford_jars = find_jars_within_path(stanford_dir)

        st._stanford_jar = ':'.join(stanford_jars)

        # do not tokenise text
        nltk.internals.config_java(
            options='-tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer '
                    '-tokenizerOptions "tokenizeNLs=true"')

        self.nonlocal_ner_doc_tokens = []
        length_of_docs = [len(doc) for doc in self.tokenized_docs_by_lines]
        temp_nonlocal_bulk_process = []
        for doc in self.tokenized_docs_by_lines:
            temp_nonlocal_bulk_process.extend(doc)

        # Tag every line in one batch, then split the results back per document.
        temp_nonlocal_bulk_process = st.tag_sents(temp_nonlocal_bulk_process)

        current_idx = 0
        for doc_len in length_of_docs:
            self.nonlocal_ner_doc_tokens.append(
                temp_nonlocal_bulk_process[current_idx:current_idx + doc_len])
            current_idx += doc_len
        print("NER nonlocal tagged tokens")
Example No. 4
def get_postag_with_index(sources, idx2word, word2idx):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path) - 10) + 1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + 'stanford-postagger.jar'
    model = path + 'models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    # Add the other jars from the Stanford directory to the classpath.
    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx, test_s_o in enumerate(sources):
        source_text = keyphrase_utils.cut_zero(test_s_o, idx2word)
        text = pos_tagger.tag(source_text)
        print('[%d/%d] : %s' % (idx, len(sources), str(text)))

        tagged_source.append(text)

    return tagged_source
Example No. 5
def get_postag_with_record(records, pairs):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path) - 10) + 1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + 'stanford-postagger.jar'
    model = path + 'models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    # Add the other jars from the Stanford directory to the classpath.
    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx, (record, pair) in enumerate(zip(records, pairs)):
        print('*' * 100)
        print('File: ' + record['name'])
        print('Input: ' + str(pair[0]))
        text = pos_tagger.tag(pair[0])
        print('[%d/%d][%d] : %s' % (idx, len(records), len(pair[0]), str(text)))
        tagged_source.append(text)

    return tagged_source
Example No. 6
def get_word_dependencies(text):
    dependencies = {}
    dep_parser = StanfordDependencyParser(
        model_path=osp.join(
            datadir,
            "stanford_data/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
        ),
        java_options="-mx4g -XX:-UseGCOverheadLimit")
    # StanfordPOSTagger takes the model first, then the jar.
    st = StanfordPOSTagger(
        osp.join(datadir, 'stanford_pos/models/english-bidirectional-distsim.tagger'),
        path_to_jar=osp.join(datadir, 'stanford_pos/stanford-postagger-3.9.1.jar'),
        java_options='-mx4g -XX:-UseGCOverheadLimit')
    stanford_dir = st._stanford_jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    st._stanford_jar = ':'.join(stanford_jars)
    result = dep_parser.raw_parse(text)
    dep = next(result)
    #print(list(dep.triples()))
    for (w1, _), rel, (w2, _) in dep.triples():
        if w1 in dependencies:
            dependencies[w1].append((w2, rel))
        else:
            dependencies[w1] = [(w2, rel)]
    #print(dependencies)
    return dependencies
Example No. 7
def dependency_parser_nltk(word_lists, filename):
    os.environ['JAVAHOME'] = JAVA_PATH
    os.environ["STANFORD_PARSER"] = STANFORD_PARSER_PATH
    os.environ["STANFORD_MODELS"] = STANFORD_PARSER_MODELS
    chinese_parser = StanfordDependencyParser(model_path=nltk_parse_model_path)
    STANFORD_DIR = chinese_parser._classpath[0].rpartition('/')[0]
    chinese_parser._classpath = tuple(find_jars_within_path(STANFORD_DIR))
    chinese_parser.java_options = '-mx15000m'

    node_file = shelve.open(filename)
    all_dependency_list = []
    for index, sentence in enumerate(word_lists):
        # Store the parse in all_dependency_list; each entry is a list of triples.
        res = list(chinese_parser.parse(sentence.strip().split()))
        print("Finished sentence", index + 1)

        list_file = [triple for triple in res[0].triples()]
        all_dependency_list.append(list_file)

        # Also store the nodes per sentence in node_file (a dict), kept as a backup.
        node_dict = {}
        node = res[0].nodes
        for inner_index in sorted(node.keys()):
            if node[inner_index]['word'] is not None or node[inner_index]['ctag'] is not None:
                node_dict[node[inner_index]['address']] = node[inner_index]
        node_file[str(index)] = node_dict

    node_file.close()
    return all_dependency_list
Example No. 8
    def tokenizer(self, tagger):
        stanford_dir = tagger._stanford_jar.rpartition('/')[0]
        stanford_jars = find_jars_within_path(stanford_dir)
        tagger._stanford_jar = ':'.join(stanford_jars)
        #tags = tagger.tag(self.stop_wrds())
        tags = tagger.tag(word_tokenize(self.sentence))
        return tags
Example No. 9
    def __init__(self, model='stanford/models/english-bidirectional-distsim.tagger', libpath='stanford/', verbose=False):
        self._model = model
        self._verbose = verbose
        self._libs = find_jars_within_path(libpath)
        self._xml_regex = re.compile(
            r'  <word wid="[0-9]*" pos="([^"]*)" lemma="([^"]*)">(.*?)</word>')

        config_java(verbose=verbose)
Example No. 10
def load_pos_tagger(stanford_base_dir):
    jar = stanford_base_dir + '/stanford-postagger.jar'
    model = stanford_base_dir + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model_filename=model, path_to_jar=jar)

    stanford_jars = find_jars_within_path(stanford_base_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    return pos_tagger
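A short usage sketch (not from the source); '/opt/stanford-postagger' is a hypothetical unpack location of the Stanford POS tagger distribution.

# Hedged usage sketch; the path below is a placeholder install location.
pos_tagger = load_pos_tagger('/opt/stanford-postagger')
print(pos_tagger.tag('The quick brown fox jumps over the lazy dog'.split()))
# e.g. [('The', 'DT'), ('quick', 'JJ'), ('brown', 'JJ'), ...]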
Example No. 11
    def __init__(self,
                 model='stanford/models/english-bidirectional-distsim.tagger',
                 libpath='stanford/',
                 verbose=False):
        self._model = model
        self._verbose = verbose
        self._libs = find_jars_within_path(libpath)
        self._xml_regex = re.compile(
            r'  <word wid="[0-9]*" pos="([^"]*)" lemma="([^"]*)">(.*?)</word>')

        config_java(verbose=verbose)
Example No. 12
def load_pos_tagger():
    file_dir = os.path.dirname(__file__)
    path = os.path.join(file_dir[:file_dir.rfind('pykp') + 4], 'stanford-postagger')
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = os.path.join(path, 'stanford-postagger.jar')
    model = os.path.join(path, 'models/english-bidirectional-distsim.tagger')
    pos_tagger = StanfordPOSTagger(model, jar)

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    return pos_tagger
Example No. 13
    def xxtest_StanfordPOSTagger(self):

        jar = '\\usr\\stanford-postagger-full-2015-12-09\\stanford-postagger.jar'
        model = '\\usr\\stanford-postagger-full-2015-12-09\\models\\english-left3words-distsim.tagger'

        tagger = StanfordPOSTagger(model, jar)

        stanford_dir = tagger._stanford_jar[0].rpartition('\\')[0]
        stanford_jars = find_jars_within_path(stanford_dir)
        # os.pathsep yields ';' on Windows, which these backslash paths imply.
        tagger._stanford_jar = os.pathsep.join(stanford_jars)

        text = tagger.tag(word_tokenize("What's the airspeed of an unladen swallow ?"))

        self.assertTrue(text is not None)
Example No. 14
    def __init__(
        self,
        path_to_jar=None,
        path_to_models_jar=None,
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
        encoding='utf8',
        verbose=False,
        java_options='-mx4g',
        corenlp_options='',
    ):

        # find the most recent code and model jar
        stanford_jar = max(
            find_jar_iter(
                self._JAR,
                path_to_jar,
                env_vars=('STANFORD_PARSER', 'STANFORD_CORENLP'),
                searchpath=(),
                url=_stanford_url,
                verbose=verbose,
                is_regex=True,
            ),
            key=lambda model_path: os.path.dirname(model_path),
        )

        model_jar = max(
            find_jar_iter(
                self._MODEL_JAR_PATTERN,
                path_to_models_jar,
                env_vars=('STANFORD_MODELS', 'STANFORD_CORENLP'),
                searchpath=(),
                url=_stanford_url,
                verbose=verbose,
                is_regex=True,
            ),
            key=lambda model_path: os.path.dirname(model_path),
        )

        # self._classpath = (stanford_jar, model_jar)

        # Adding logging jar files to classpath
        stanford_dir = os.path.split(stanford_jar)[0]
        self._classpath = tuple([model_jar] +
                                find_jars_within_path(stanford_dir))

        self.model_path = model_path
        self._encoding = encoding
        self.corenlp_options = corenlp_options
        self.java_options = java_options
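Example No. 15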
def get_pos_tag(sen):  # pass a sentence dataframe
    st = StanfordPOSTagger(
        '/home/sadhana/stanford-postagger-full-2015-12-09/models/english-left3words-distsim.tagger',
        path_to_jar='/home/sadhana/stanford-postagger-full-2015-12-09/stanford-postagger.jar')

    stanford_dir = st._stanford_jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    st._stanford_jar = ':'.join(stanford_jars)
    for i in list(sen.index):
        t = st.tag(sen.loc[i, 'Arg'].split())
        tags = [tag for _, tag in t]
        sen.at[i, 'POStag'] = tags
    return sen
Example No. 16
def stanford_ne_tagger(tokens):
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
    stanford_dir = st._stanford_jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)

    st._stanford_jar = ':'.join(stanford_jars)
    tags = st.tag(tokens)
    continuous_chunks = get_continuous_chunks(tags)
    named_entities_str_tag = set()
    for ne in continuous_chunks:
        if ne[0][1] == u'LOCATION':
            named_entities_str_tag.add(
                u' '.join(token for token, tag in ne).lower())

    return named_entities_str_tag
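A hedged usage sketch: because the tagger is built from the model name alone, it assumes CLASSPATH and STANFORD_MODELS already point at the NER distribution, and that get_continuous_chunks() is defined elsewhere in the module.

# Hedged usage sketch; env vars and get_continuous_chunks() are assumed to be set up.
tokens = 'She flew from New York to Paris last spring'.split()
print(stanford_ne_tagger(tokens))  # e.g. {'new york', 'paris'}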
Example No. 17
    def __init__(
        self,
        path_to_jar=None,
        path_to_models_jar=None,
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
        encoding='utf8',
        verbose=False,
        java_options='-mx4g',
        corenlp_options='',
    ):

        # find the most recent code and model jar
        stanford_jar = max(
            find_jar_iter(
                self._JAR,
                path_to_jar,
                env_vars=('STANFORD_PARSER', 'STANFORD_CORENLP'),
                searchpath=(),
                url=_stanford_url,
                verbose=verbose,
                is_regex=True,
            ),
            key=lambda model_path: os.path.dirname(model_path),
        )

        model_jar = max(
            find_jar_iter(
                self._MODEL_JAR_PATTERN,
                path_to_models_jar,
                env_vars=('STANFORD_MODELS', 'STANFORD_CORENLP'),
                searchpath=(),
                url=_stanford_url,
                verbose=verbose,
                is_regex=True,
            ),
            key=lambda model_path: os.path.dirname(model_path),
        )

        # self._classpath = (stanford_jar, model_jar)

        # Adding logging jar files to classpath
        stanford_dir = os.path.split(stanford_jar)[0]
        self._classpath = tuple([model_jar] + find_jars_within_path(stanford_dir))

        self.model_path = model_path
        self._encoding = encoding
        self.corenlp_options = corenlp_options
        self.java_options = java_options
Example No. 18
    def __init__(self, path_to_jar=None, encoding='utf8', options=None, verbose=False, java_options='-mx1000m'):
        self._stanford_jar = find_jar(
            self._JAR, path_to_jar,
            env_vars=('STANFORD_POSTAGGER',),
            searchpath=(), url=_stanford_url,
            verbose=verbose
        )
        
        # Adding logging jar files to classpath 
        stanford_dir = os.path.split(self._stanford_jar)[0]
        self._stanford_jar = tuple(find_jars_within_path(stanford_dir))
        
        self._encoding = encoding
        self.java_options = java_options

        options = {} if options is None else options
        self._options_cmd = ','.join('{0}={1}'.format(key, val) for key, val in options.items())
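Example No. 19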
def get_pos_tag(sen):
    # Set the classpath to the POS tagger; 'STANFORDTOOLSDIR' is a placeholder
    # for the directory the Stanford tools were unpacked into.
    os.environ['CLASSPATH'] = 'STANFORDTOOLSDIR/stanford-postagger-full-2015-12-09/stanford-postagger.jar'
    os.environ['STANFORD_MODELS'] = 'STANFORDTOOLSDIR/stanford-postagger-full-2015-12-09/models'
    st = StanfordPOSTagger(
        '/home/sadhana/stanford-postagger-full-2015-12-09/models/english-left3words-distsim.tagger',
        path_to_jar='/home/sadhana/stanford-postagger-full-2015-12-09/stanford-postagger.jar')

    stanford_dir = st._stanford_jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    st._stanford_jar = ':'.join(stanford_jars)
    for i in list(sen.index):
        t = st.tag(sen.loc[i, 'Arg'].split())
        tags = [tag for _, tag in t]
        sen.at[i, 'POStag'] = tags
    return sen
Example No. 20
    def __init__(self, path_to_jar=None, encoding='utf8', options=None, verbose=False, java_options='-mx1000m'):
        self._stanford_jar = find_jar(
            self._JAR, path_to_jar,
            env_vars=('STANFORD_POSTAGGER',),
            searchpath=(), url=_stanford_url,
            verbose=verbose
        )
        
        # Adding logging jar files to classpath 
        stanford_dir = os.path.split(self._stanford_jar)[0]
        self._stanford_jar = tuple(find_jars_within_path(stanford_dir))
        
        self._encoding = encoding
        self.java_options = java_options

        options = {} if options is None else options
        self._options_cmd = ','.join('{0}={1}'.format(key, val) for key, val in options.items())
Example No. 21
def find_maltparser(parser_dirname):
    """
    Find the MaltParser .jar file and its dependencies.
    """
    if os.path.exists(parser_dirname): # If a full path is given.
        _malt_dir = parser_dirname
    else: # Try to find path to maltparser directory in environment variables.
        _malt_dir = find_dir(parser_dirname, env_vars=('MALT_PARSER',))
    # Check that the found directory contains all the necessary .jar files.
    _malt_jars = set(find_jars_within_path(_malt_dir))
    _jars = set(os.path.split(jar)[1] for jar in _malt_jars)
    malt_dependencies = set(['log4j.jar', 'libsvm.jar', 'liblinear-1.8.jar'])

    assert malt_dependencies.issubset(_jars)
    assert any(filter(lambda i: i.startswith('maltparser-') and i.endswith('.jar'), _jars))
    return list(_malt_jars)
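A small usage sketch (an assumption, not from the source); 'maltparser-1.8.1' stands in for wherever the MaltParser distribution was unpacked.

# Hedged usage sketch; 'maltparser-1.8.1' is a hypothetical MaltParser directory.
import os

jars = find_maltparser('maltparser-1.8.1')
print(os.pathsep.join(jars))  # ready to use as a Java classpath string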
Example No. 22
def find_maltparser(parser_dirname):
    """
    Find the MaltParser .jar file and its dependencies.
    """
    if os.path.exists(parser_dirname): # If a full path is given.
        _malt_dir = parser_dirname
    else: # Try to find path to maltparser directory in environment variables.
        _malt_dir = find_dir(parser_dirname, env_vars=('MALT_PARSER',))
    # Check that the found directory contains all the necessary .jar files.
    _malt_jars = set(find_jars_within_path(_malt_dir))
    _jars = set(os.path.split(jar)[1] for jar in _malt_jars)
    malt_dependencies = set(['log4j.jar', 'libsvm.jar', 'liblinear-1.8.jar'])

    assert malt_dependencies.issubset(_jars)
    assert any(filter(lambda i: i.startswith('maltparser-') and i.endswith('.jar'), _jars))
    return list(_malt_jars)
Example No. 23
def check_postag(config):
    train_set, validation_set, test_set, idx2word, word2idx = deserialize_from_file(
        config['dataset'])

    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path) - 10) + 1] + 'stanford-postagger/'
    jar = path + 'stanford-postagger.jar'
    model = path + 'models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)

    # Add the other jars from the Stanford directory to the classpath.
    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    for dataset_name in config['testing_datasets']:
        # override the original test_set
        test_sets = load_additional_testing_data(config['testing_datasets'],
                                                 idx2word, word2idx, config)
        test_set = test_sets[dataset_name]

        test_data_plain = list(zip(test_set['source'], test_set['target']))
        test_size = len(test_data_plain)

        for idx in range(test_size):
            test_s_o, test_t_o = test_data_plain[idx]

            source = keyphrase_utils.cut_zero(test_s_o, idx2word)
            print(source)

            text = pos_tagger.tag(source)
            print(text)
Example No. 24
    def __init__(self, model_filename, path_to_jar=None, encoding='utf8', verbose=False, java_options='-mx1000m'):

        if not self._JAR:
            warnings.warn('The StanfordTagger class is not meant to be '
                    'instantiated directly. Did you mean StanfordPOSTagger or StanfordNERTagger?')
        self._stanford_jar = find_jar(
                self._JAR, path_to_jar,
                searchpath=(), url=_stanford_url,
                verbose=verbose)

        self._stanford_model = find_file(model_filename,
                env_vars=('STANFORD_MODELS',), verbose=verbose)
        
        # Adding logging jar files to classpath 
        stanford_dir = os.path.split(self._stanford_jar)[0]
        self._stanford_jar = tuple(find_jars_within_path(stanford_dir))
        
        self._encoding = encoding
        self.java_options = java_options
Example No. 25
def parser_nltk(word_lists, filename):
    os.environ['JAVAHOME'] = JAVA_PATH
    os.environ["STANFORD_PARSER"] = STANFORD_PARSER_PATH
    os.environ["STANFORD_MODELS"] = STANFORD_PARSER_MODELS
    chinese_parser = StanfordParser(model_path=nltk_parse_model_path)
    STANFORD_DIR = chinese_parser._classpath[0].rpartition('/')[0]
    chinese_parser._classpath = tuple(find_jars_within_path(STANFORD_DIR))
    chinese_parser.java_options = '-mx15000m'
    all_parser_sentence = []
    file = shelve.open(filename)
    flag = 0

    for sentence in word_lists:
        if sentence.strip() != "":
            res = list(chinese_parser.parse((sentence.strip()).split()))
            new_str = return_str_tofile(sentence_parse=str(res[0]))
            file[str(flag)] = res
            all_parser_sentence.append(new_str)
            flag += 1
            print("###### NLTK Dependency Parser Have finished " + str(flag) +
                  " sentences ###")
    return all_parser_sentence
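Example No. 26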
# -*- coding: utf-8 -*-
# export CLASSPATH=$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser.jar:$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar
from __future__ import unicode_literals
import os
import sys
import io
import copy
import nltk
from nltk.internals import find_jars_within_path
from nltk.parse.stanford import StanfordParser

parser = StanfordParser(
    model_path="stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
)
stanford_dir = parser._classpath[0].rpartition('/')[0]
parser._classpath = tuple(find_jars_within_path(stanford_dir))


# from set_parser import parse_it
class Node(object):
    """
    A generic representation of a tree node. Includes a string label and a list of children.
    """
    def __init__(self, label):
        """
        Creates a node with the given label. The label must be a string for use with the PQ-Gram
        algorithm.
        """
        self.label = label
        self.children = list()
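Example No. 27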
# -*- coding: utf-8 -*-
# export CLASSPATH=$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser.jar:$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar
from __future__ import unicode_literals
import os
import sys
import io
import copy
import nltk
from nltk.internals import find_jars_within_path
from nltk.parse.stanford import StanfordParser
parser=StanfordParser(model_path="stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
stanford_dir = parser._classpath[0].rpartition('/')[0]
parser._classpath = tuple(find_jars_within_path(stanford_dir))
# from set_parser import parse_it
class Node(object):
	"""
		A generic representation of a tree node. Includes a string label and a list of a children.
	"""

	def __init__(self, label):
		"""
			Creates a node with the given label. The label must be a string for use with the PQ-Gram
			algorithm.
		"""
		self.label = label
		self.children = list()

	def addkid(self, node, before=False):
		"""
			Adds a child node. When the before flag is true, the child node will be inserted at the
			beginning of the list of children, otherwise the child node is appended.
		"""
		# Body implied by the docstring above (the original snippet breaks off here).
		if before:
			self.children.insert(0, node)
		else:
			self.children.append(node)
		return self
Example No. 28
def preprocess(flist, folder_path):
    """ (file open for reading, str) -> Nonetype

    flist contains one filename per line and folder_path represents a 
    directory. Do preprocessing on each file from flist in folder_path.
    """

    error_log = []
    for i in range(len(flist)):

        path = flist[i]

        stemmer = PorterStemmer()
        parser = StanfordParser(
            model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
            verbose=True)
        stanford_dir = parser._classpath[0].rpartition('/')[0]
        parser._classpath = tuple(find_jars_within_path(stanford_dir))

        with open(path, 'r') as rf:
            try:
                sent = [line.strip('\n ') for line in rf]
            except UnicodeDecodeError as e:
                error_log.append('Unicode Decode Error:\t' + path + '\n')
                pass
            else:
                if not sent:
                    error_log.append('Empty File Error:\t' + path + '\n')
                    pass
                else:
                    # Stemming with Porter Stemmer
                    pars_stem = stemmer.stem(' '.join(sent))
                    stemmed = '\n'.join(sent)

                    wf = open(
                        folder_path + path.split('.')[0].split('/')[-1] +
                        '.stem', 'w')
                    wf.write(stemmed)
                    wf.close()

                    # POS Tagging after tokenizing and stemming
                    pos = nltk.pos_tag(pars_stem.split())
                    wf = open(
                        folder_path + path.split('.')[0].split('/')[-1] +
                        '.pos', 'w')
                    wf.write(str(pos))
                    wf.close()

                    # CFG parser
                    try:
                        parsed = parser.raw_parse(pars_stem)
                    except (TypeError, IndexError, NameError) as e:
                        error_log.append('Unparsable Error:\t' + path + '\n')
                        continue  # 'parsed' would be undefined; skip this file
                    wf = open(
                        folder_path + path.split('.')[0].split('/')[-1] +
                        '.pars', 'w')
                    s_pars = " ".join(str(x) for x in list(parsed))
                    s_pars = s_pars.replace("Tree", "")
                    s_pars = s_pars.replace("[", "")
                    s_pars = s_pars.replace("]", "")
                    s_pars = s_pars.replace("\'", "")
                    wf.write(s_pars)
                    wf.close()

    # Print files paths with Errors
    if error_log:
        wf = open(folder_path + 'error_log', 'w')
        for line in error_log:
            wf.write(line)
        wf.close()
Example No. 29
def update_tagger_jars(tagger):
    stanford_dir = tagger._stanford_jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)

    tagger._stanford_jar = ':'.join(stanford_jars)
    return tagger
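A usage sketch under stated assumptions: the paths are placeholders, and the installed NLTK version is assumed to store _stanford_jar as a plain string, which is what update_tagger_jars expects.

# Hedged usage sketch; placeholder paths, string-valued _stanford_jar assumed.
from nltk.tag import StanfordPOSTagger

tagger = StanfordPOSTagger(
    '/opt/stanford-postagger/models/english-bidirectional-distsim.tagger',
    '/opt/stanford-postagger/stanford-postagger.jar')
tagger = update_tagger_jars(tagger)
print(tagger.tag('Colorless green ideas sleep furiously'.split()))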
Example No. 30
    def __init__(self, libpath='stanford/', verbose=False):
        self._verbose = verbose
        self._libs = find_jars_within_path(libpath)

        config_java(verbose=verbose)
Example No. 31
#!/bin/env python3.5
from nltk.tag.stanford import StanfordNERTagger
from nltk.internals import find_jars_within_path
from nltk.tokenize import sent_tokenize
import os

tagger = StanfordNERTagger('data/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz',
                           'data/stanford-ner-2015-12-09/stanford-ner.jar')
tagger._stanford_jar = ':'.join(find_jars_within_path(os.path.join(os.getcwd(), 'data/stanford-ner-2015-12-09')))

# Strip punctuation from each sentence, then NER-tag all sentences in one batch.
sentences = sent_tokenize(input('Enter a sentence: '))
tokenized = [''.join(c for c in s if c not in '",:.?/!@#$%^&*()][{}~').split() for s in sentences]
print(tagger.tag_sents(tokenized))
Example No. 32
 def __init__(self):
     self.parser = StanfordParser(
         model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
     stanford_dir = self.parser._classpath[0].rpartition('/')[0]
     self.parser._classpath = tuple(find_jars_within_path(stanford_dir))
Example No. 33
    word_tf = []
    for word in unique_terms:
        word_tf.append(collection.tf(word, document))
    return word_tf


stemmer = SnowballStemmer("english")
wordnet_lemmatizer = WordNetLemmatizer()

java_path = 'C:/Program Files (x86)/Java/jre1.8.0_101/bin/'
os.environ['JAVA_HOME'] = java_path
stanford_dir = 'C:/stanford-ner-2016-10-31/'
jarfile = stanford_dir + 'stanford-ner.jar'
modelfile = stanford_dir + 'classifiers/english.muc.7class.distsim.crf.ser.gz'
st = StanfordNERTagger(modelfile, jarfile)
stanford_jars = find_jars_within_path(stanford_dir)
st._stanford_jar = ';'.join(stanford_jars)

if __name__ == "__main__":
    folder = "Thomas_Baker"
    # Empty list to hold text documents.
    texts = []

    listing = os.listdir(folder)
    for file in sorted(listing):
        if file.endswith(".txt"):
            url = folder + "/" + file
            f = open(url, encoding="latin-1")
            raw = f.read()
            f.close()
            tokens = nltk.word_tokenize(raw)
Example No. 34
# Chinese POS tagging
chi_tagger = StanfordPOSTagger(model_filename='/home/hsiao/Develops/nlp/stanford-postagger-full-2016-10-31/models/chinese-distsim.tagger',
                               path_to_jar='/home/hsiao/Develops/nlp/stanford-postagger-full-2016-10-31/stanford-postagger.jar')
print(chi_tagger.tag('四川省 成都 信息 工程 大学 我 在 博客 园 开 了 一个 博客 , 我 的 博客 名叫 伏 草 惟 存 , 写 了 一些 自然语言 处理 的 文章 。'.split()))



# English constituency parsing
#import os
#java_path='/usr/lib/jvm/jdk/jdk1.8.0_121'
#os.environ['JAVAHOME']=java_path
from nltk.internals import find_jars_within_path
from nltk.parse.stanford import StanfordParser
eng_parser = StanfordParser('/home/hsiao/Develops/nlp/stanford-parser-full-2015-04-20/stanford-parser.jar',
                            '/home/hsiao/Develops/nlp/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar',
                            '/home/hsiao/Develops/nlp/stanford-corenlp-full-2016-10-31/englishPCFG.ser.gz')
# The wrapper reads _classpath (single underscore); setting __classpath has no effect.
eng_parser._classpath = tuple(find_jars_within_path('/home/hsiao/Develops/nlp/stanford-parser-full-2016-10-31/'))
print(list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split())))


# English dependency parsing
from nltk.parse.stanford import StanfordDependencyParser
eng_parser=StanfordDependencyParser('/home/hsiao/Develops/nlp/stanford-parser-full-2015-04-20/stanford-parser.jar',
                          '/home/hsiao/Develops/nlp/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar',
                          '/home/hsiao/Develops/nlp/stanford-corenlp-full-2016-10-31/englishPCFG.ser.gz')
res = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))
print (res[0])
for row in res[0].triples():
    print(row)

Example No. 35
#!/bin/env python3.5
#Author: Saurabh Pathak
from nltk.internals import find_jars_within_path
from nltk.parse.stanford import StanfordParser
from nltk.tokenize import sent_tokenize
from nltk import download
from nltk.tree import ParentedTree
import os

#download('punkt', quiet=True)
#download('names', quiet=True)

os.environ['CLASSPATH'] = (os.getenv('CLASSPATH', '') + ':' +
                           os.path.join(os.getcwd(), 'data/stanford-parser-full-2015-12-09/stanford-parser.jar') + ':' +
                           os.path.join(os.getcwd(), 'data/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar'))

parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
parser._classpath = find_jars_within_path(os.path.join(os.getcwd(), 'data/stanford-parser-full-2015-12-09'))

text = input('Enter some text:')

tlist = [ParentedTree.fromstring(str(list(parsetree)[0])) for parsetree in parser.raw_parse_sents(sent_tokenize(text))]

tlist2 = [tree.copy(True) for tree in tlist]
from hobbs import *
from lappinleasse import *

print('Input text was:\n', text)
def resolve(ls, algo):
    print('\nResolving with', algo)
    i = -1
    for parsetree in ls:
        i += 1
Example No. 36
    def __init__(self, libpath='stanford/', verbose=False):
        self._verbose = verbose
        self._libs = find_jars_within_path(libpath)

        config_java(verbose=verbose)
Example No. 37
 def __init__(self):
     self.parser = StanfordParser(
         model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
     stanford_dir = self.parser._classpath[0].rpartition('/')[0]
     self.parser._classpath = tuple(find_jars_within_path(stanford_dir))
Example No. 38
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

st = StanfordNERTagger('/home/ubuntu/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz', path_to_jar='/home/ubuntu/stanford-ner-2015-12-09/stanford-ner.jar')
text = 'While in France'

tokenized_text = word_tokenize(text)
#print tokenized_text
#classified_text = st.tag(tokenized_text)
#print(classified_text)




import nltk
from nltk.tag import StanfordNERTagger
st = StanfordNERTagger('/home/ubuntu/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz', path_to_jar='/home/ubuntu/stanford-ner-2015-12-09/stanford-ner.jar')
print(st._stanford_jar)
stanford_dir = st._stanford_jar.rpartition('/')[0]
from nltk.internals import find_jars_within_path
stanford_jars = find_jars_within_path(stanford_dir)
print(":".join(stanford_jars))
st._stanford_jar = ':'.join(stanford_jars)
print(st._stanford_jar)
text = st.tag('Rami Eid is studying at Stony Brook University in NY'.split())
print(text)